You've already forked linux-packaging-mono
							
							
		
			
	
	
		
			742 lines
		
	
	
		
			26 KiB
		
	
	
	
		
			C++
		
	
	
	
	
	
		
		
			
		
	
	
			742 lines
		
	
	
		
			26 KiB
		
	
	
	
		
			C++
		
	
	
	
	
	
|   | //
 | ||
|  | //                     The LLVM Compiler Infrastructure
 | ||
|  | //
 | ||
|  | // This file is distributed under the University of Illinois Open Source
 | ||
|  | // License. See LICENSE.TXT for details.
 | ||
|  | //
 | ||
|  | //===----------------------------------------------------------------------===//
 | ||
|  | //
 | ||
|  | // This file contains a pass that performs optimization on SIMD instructions
 | ||
|  | // with high latency by splitting them into more efficient series of
 | ||
|  | // instructions.
 | ||
|  | //
 | ||
|  | // 1. Rewrite certain SIMD instructions with vector element due to their
 | ||
|  | // inefficiency on some targets.
 | ||
|  | //
 | ||
|  | // For example:
 | ||
|  | //    fmla v0.4s, v1.4s, v2.s[1]
 | ||
|  | //
 | ||
|  | // Is rewritten into:
 | ||
|  | //    dup v3.4s, v2.s[1]
 | ||
|  | //    fmla v0.4s, v1.4s, v3.4s
 | ||
|  | //
 | ||
|  | // 2. Rewrite interleaved memory access instructions due to their
 | ||
|  | // inefficiency on some targets.
 | ||
|  | //
 | ||
|  | // For example:
 | ||
|  | //    st2 {v0.4s, v1.4s}, addr
 | ||
|  | //
 | ||
|  | // Is rewritten into:
 | ||
|  | //    zip1 v2.4s, v0.4s, v1.4s
 | ||
|  | //    zip2 v3.4s, v0.4s, v1.4s
 | ||
|  | //    stp  q2, q3,  addr
 | ||
|  | //
 | ||
|  | //===----------------------------------------------------------------------===//
 | ||
|  | 
 | ||
|  | #include "AArch64InstrInfo.h"
 | ||
|  | #include "llvm/ADT/SmallVector.h"
 | ||
|  | #include "llvm/ADT/Statistic.h"
 | ||
|  | #include "llvm/ADT/StringRef.h"
 | ||
|  | #include "llvm/CodeGen/MachineBasicBlock.h"
 | ||
|  | #include "llvm/CodeGen/MachineFunction.h"
 | ||
|  | #include "llvm/CodeGen/MachineFunctionPass.h"
 | ||
|  | #include "llvm/CodeGen/MachineInstr.h"
 | ||
|  | #include "llvm/CodeGen/MachineInstrBuilder.h"
 | ||
|  | #include "llvm/CodeGen/MachineOperand.h"
 | ||
|  | #include "llvm/CodeGen/MachineRegisterInfo.h"
 | ||
|  | #include "llvm/CodeGen/TargetInstrInfo.h"
 | ||
|  | #include "llvm/CodeGen/TargetSchedule.h"
 | ||
|  | #include "llvm/CodeGen/TargetSubtargetInfo.h"
 | ||
|  | #include "llvm/MC/MCInstrDesc.h"
 | ||
|  | #include "llvm/MC/MCSchedule.h"
 | ||
|  | #include "llvm/Pass.h"
 | ||
|  | #include <unordered_map>
 | ||
|  | 
 | ||
|  | using namespace llvm; | ||
|  | 
 | ||
|  | #define DEBUG_TYPE "aarch64-simdinstr-opt"
 | ||
|  | 
 | ||
|  | STATISTIC(NumModifiedInstr, | ||
|  |           "Number of SIMD instructions modified"); | ||
|  | 
 | ||
|  | #define AARCH64_VECTOR_BY_ELEMENT_OPT_NAME                                     \
 | ||
|  |   "AArch64 SIMD instructions optimization pass" | ||
|  | 
 | ||
|  | namespace { | ||
|  | 
 | ||
|  | struct AArch64SIMDInstrOpt : public MachineFunctionPass { | ||
|  |   static char ID; | ||
|  | 
 | ||
|  |   const TargetInstrInfo *TII; | ||
|  |   MachineRegisterInfo *MRI; | ||
|  |   TargetSchedModel SchedModel; | ||
|  | 
 | ||
|  |   // The two maps below are used to cache decisions instead of recomputing:
 | ||
|  |   // This is used to cache instruction replacement decisions within function
 | ||
|  |   // units and across function units.
 | ||
|  |   std::map<std::pair<unsigned, std::string>, bool> SIMDInstrTable; | ||
|  |   // This is used to cache the decision of whether to leave the interleaved
 | ||
|  |   // store instructions replacement pass early or not for a particular target.
 | ||
|  |   std::unordered_map<std::string, bool> InterlEarlyExit; | ||
|  | 
 | ||
|  |   typedef enum { | ||
|  |     VectorElem, | ||
|  |     Interleave | ||
|  |   } Subpass; | ||
|  | 
 | ||
|  |   // Instruction represented by OrigOpc is replaced by instructions in ReplOpc.
 | ||
|  |   struct InstReplInfo { | ||
|  |     unsigned OrigOpc; | ||
|  | 		std::vector<unsigned> ReplOpc; | ||
|  |     const TargetRegisterClass RC; | ||
|  |   }; | ||
|  | 
 | ||
|  | #define RuleST2(OpcOrg, OpcR0, OpcR1, OpcR2, RC) \
 | ||
|  |   {OpcOrg, {OpcR0, OpcR1, OpcR2}, RC} | ||
|  | #define RuleST4(OpcOrg, OpcR0, OpcR1, OpcR2, OpcR3, OpcR4, OpcR5, OpcR6, \
 | ||
|  |                 OpcR7, OpcR8, OpcR9, RC) \ | ||
|  |   {OpcOrg, \ | ||
|  |    {OpcR0, OpcR1, OpcR2, OpcR3, OpcR4, OpcR5, OpcR6, OpcR7, OpcR8, OpcR9}, RC} | ||
|  | 
 | ||
|  |   // The Instruction Replacement Table:
 | ||
|  |   std::vector<InstReplInfo> IRT = { | ||
|  |     // ST2 instructions
 | ||
|  |     RuleST2(AArch64::ST2Twov2d, AArch64::ZIP1v2i64, AArch64::ZIP2v2i64, | ||
|  |           AArch64::STPQi, AArch64::FPR128RegClass), | ||
|  |     RuleST2(AArch64::ST2Twov4s, AArch64::ZIP1v4i32, AArch64::ZIP2v4i32, | ||
|  |           AArch64::STPQi, AArch64::FPR128RegClass), | ||
|  |     RuleST2(AArch64::ST2Twov2s, AArch64::ZIP1v2i32, AArch64::ZIP2v2i32, | ||
|  |           AArch64::STPDi, AArch64::FPR64RegClass), | ||
|  |     RuleST2(AArch64::ST2Twov8h, AArch64::ZIP1v8i16, AArch64::ZIP2v8i16, | ||
|  |           AArch64::STPQi, AArch64::FPR128RegClass), | ||
|  |     RuleST2(AArch64::ST2Twov4h, AArch64::ZIP1v4i16, AArch64::ZIP2v4i16, | ||
|  |           AArch64::STPDi, AArch64::FPR64RegClass), | ||
|  |     RuleST2(AArch64::ST2Twov16b, AArch64::ZIP1v16i8, AArch64::ZIP2v16i8, | ||
|  |           AArch64::STPQi, AArch64::FPR128RegClass), | ||
|  |     RuleST2(AArch64::ST2Twov8b, AArch64::ZIP1v8i8, AArch64::ZIP2v8i8, | ||
|  |           AArch64::STPDi, AArch64::FPR64RegClass), | ||
|  |     // ST4 instructions
 | ||
|  |     RuleST4(AArch64::ST4Fourv2d, AArch64::ZIP1v2i64, AArch64::ZIP2v2i64, | ||
|  |           AArch64::ZIP1v2i64, AArch64::ZIP2v2i64, AArch64::ZIP1v2i64, | ||
|  |           AArch64::ZIP2v2i64, AArch64::ZIP1v2i64, AArch64::ZIP2v2i64, | ||
|  |           AArch64::STPQi, AArch64::STPQi, AArch64::FPR128RegClass), | ||
|  |     RuleST4(AArch64::ST4Fourv4s, AArch64::ZIP1v4i32, AArch64::ZIP2v4i32, | ||
|  |           AArch64::ZIP1v4i32, AArch64::ZIP2v4i32, AArch64::ZIP1v4i32, | ||
|  |           AArch64::ZIP2v4i32, AArch64::ZIP1v4i32, AArch64::ZIP2v4i32, | ||
|  |           AArch64::STPQi, AArch64::STPQi, AArch64::FPR128RegClass), | ||
|  |     RuleST4(AArch64::ST4Fourv2s, AArch64::ZIP1v2i32, AArch64::ZIP2v2i32, | ||
|  |           AArch64::ZIP1v2i32, AArch64::ZIP2v2i32, AArch64::ZIP1v2i32, | ||
|  |           AArch64::ZIP2v2i32, AArch64::ZIP1v2i32, AArch64::ZIP2v2i32, | ||
|  |           AArch64::STPDi, AArch64::STPDi, AArch64::FPR64RegClass), | ||
|  |     RuleST4(AArch64::ST4Fourv8h, AArch64::ZIP1v8i16, AArch64::ZIP2v8i16, | ||
|  |           AArch64::ZIP1v8i16, AArch64::ZIP2v8i16, AArch64::ZIP1v8i16, | ||
|  |           AArch64::ZIP2v8i16, AArch64::ZIP1v8i16, AArch64::ZIP2v8i16, | ||
|  |           AArch64::STPQi, AArch64::STPQi, AArch64::FPR128RegClass), | ||
|  |     RuleST4(AArch64::ST4Fourv4h, AArch64::ZIP1v4i16, AArch64::ZIP2v4i16, | ||
|  |           AArch64::ZIP1v4i16, AArch64::ZIP2v4i16, AArch64::ZIP1v4i16, | ||
|  |           AArch64::ZIP2v4i16, AArch64::ZIP1v4i16, AArch64::ZIP2v4i16, | ||
|  |           AArch64::STPDi, AArch64::STPDi, AArch64::FPR64RegClass), | ||
|  |     RuleST4(AArch64::ST4Fourv16b, AArch64::ZIP1v16i8, AArch64::ZIP2v16i8, | ||
|  |           AArch64::ZIP1v16i8, AArch64::ZIP2v16i8, AArch64::ZIP1v16i8, | ||
|  |           AArch64::ZIP2v16i8, AArch64::ZIP1v16i8, AArch64::ZIP2v16i8, | ||
|  |           AArch64::STPQi, AArch64::STPQi, AArch64::FPR128RegClass), | ||
|  |     RuleST4(AArch64::ST4Fourv8b, AArch64::ZIP1v8i8, AArch64::ZIP2v8i8, | ||
|  |           AArch64::ZIP1v8i8, AArch64::ZIP2v8i8, AArch64::ZIP1v8i8, | ||
|  |           AArch64::ZIP2v8i8, AArch64::ZIP1v8i8, AArch64::ZIP2v8i8, | ||
|  |           AArch64::STPDi, AArch64::STPDi, AArch64::FPR64RegClass) | ||
|  |   }; | ||
|  | 
 | ||
|  |   // A costly instruction is replaced in this work by N efficient instructions
 | ||
|  |   // The maximum of N is curently 10 and it is for ST4 case.
 | ||
|  |   static const unsigned MaxNumRepl = 10; | ||
|  | 
 | ||
|  |   AArch64SIMDInstrOpt() : MachineFunctionPass(ID) { | ||
|  |     initializeAArch64SIMDInstrOptPass(*PassRegistry::getPassRegistry()); | ||
|  |   } | ||
|  | 
 | ||
|  |   /// Based only on latency of instructions, determine if it is cost efficient
 | ||
|  |   /// to replace the instruction InstDesc by the instructions stored in the
 | ||
|  |   /// array InstDescRepl.
 | ||
|  |   /// Return true if replacement is expected to be faster.
 | ||
|  |   bool shouldReplaceInst(MachineFunction *MF, const MCInstrDesc *InstDesc, | ||
|  |                          SmallVectorImpl<const MCInstrDesc*> &ReplInstrMCID); | ||
|  | 
 | ||
|  |   /// Determine if we need to exit the instruction replacement optimization
 | ||
|  |   /// passes early. This makes sure that no compile time is spent in this pass
 | ||
|  |   /// for targets with no need for any of these optimizations.
 | ||
|  |   /// Return true if early exit of the pass is recommended.
 | ||
|  |   bool shouldExitEarly(MachineFunction *MF, Subpass SP); | ||
|  | 
 | ||
|  |   /// Check whether an equivalent DUP instruction has already been
 | ||
|  |   /// created or not.
 | ||
|  |   /// Return true when the DUP instruction already exists. In this case,
 | ||
|  |   /// DestReg will point to the destination of the already created DUP.
 | ||
|  |   bool reuseDUP(MachineInstr &MI, unsigned DupOpcode, unsigned SrcReg, | ||
|  |                 unsigned LaneNumber, unsigned *DestReg) const; | ||
|  | 
 | ||
|  |   /// Certain SIMD instructions with vector element operand are not efficient.
 | ||
|  |   /// Rewrite them into SIMD instructions with vector operands. This rewrite
 | ||
|  |   /// is driven by the latency of the instructions.
 | ||
|  |   /// Return true if the SIMD instruction is modified.
 | ||
|  |   bool optimizeVectElement(MachineInstr &MI); | ||
|  | 
 | ||
|  |   /// Process The REG_SEQUENCE instruction, and extract the source
 | ||
|  |   /// operands of the ST2/4 instruction from it.
 | ||
|  |   /// Example of such instructions.
 | ||
|  |   ///    %dest = REG_SEQUENCE %st2_src1, dsub0, %st2_src2, dsub1;
 | ||
|  |   /// Return true when the instruction is processed successfully.
 | ||
|  |   bool processSeqRegInst(MachineInstr *DefiningMI, unsigned* StReg, | ||
|  |                          unsigned* StRegKill, unsigned NumArg) const; | ||
|  | 
 | ||
|  |   /// Load/Store Interleaving instructions are not always beneficial.
 | ||
|  |   /// Replace them by ZIP instructionand classical load/store.
 | ||
|  |   /// Return true if the SIMD instruction is modified.
 | ||
|  |   bool optimizeLdStInterleave(MachineInstr &MI); | ||
|  | 
 | ||
|  |   /// Return the number of useful source registers for this
 | ||
|  |   /// instruction (2 for ST2 and 4 for ST4).
 | ||
|  |   unsigned determineSrcReg(MachineInstr &MI) const; | ||
|  | 
 | ||
|  |   bool runOnMachineFunction(MachineFunction &Fn) override; | ||
|  | 
 | ||
|  |   StringRef getPassName() const override { | ||
|  |     return AARCH64_VECTOR_BY_ELEMENT_OPT_NAME; | ||
|  |   } | ||
|  | }; | ||
|  | 
 | ||
|  | char AArch64SIMDInstrOpt::ID = 0; | ||
|  | 
 | ||
|  | } // end anonymous namespace
 | ||
|  | 
 | ||
|  | INITIALIZE_PASS(AArch64SIMDInstrOpt, "aarch64-simdinstr-opt", | ||
|  |                 AARCH64_VECTOR_BY_ELEMENT_OPT_NAME, false, false) | ||
|  | 
 | ||
|  | /// Based only on latency of instructions, determine if it is cost efficient
 | ||
|  | /// to replace the instruction InstDesc by the instructions stored in the
 | ||
|  | /// array InstDescRepl.
 | ||
|  | /// Return true if replacement is expected to be faster.
 | ||
|  | bool AArch64SIMDInstrOpt:: | ||
|  | shouldReplaceInst(MachineFunction *MF, const MCInstrDesc *InstDesc, | ||
|  |                   SmallVectorImpl<const MCInstrDesc*> &InstDescRepl) { | ||
|  |   // Check if replacement decision is already available in the cached table.
 | ||
|  |   // if so, return it.
 | ||
|  |   std::string Subtarget = SchedModel.getSubtargetInfo()->getCPU(); | ||
|  |   auto InstID = std::make_pair(InstDesc->getOpcode(), Subtarget); | ||
|  |   if (SIMDInstrTable.find(InstID) != SIMDInstrTable.end()) | ||
|  |     return SIMDInstrTable[InstID]; | ||
|  | 
 | ||
|  |   unsigned SCIdx = InstDesc->getSchedClass(); | ||
|  |   const MCSchedClassDesc *SCDesc = | ||
|  |     SchedModel.getMCSchedModel()->getSchedClassDesc(SCIdx); | ||
|  | 
 | ||
|  |   // If a target does not define resources for the instructions
 | ||
|  |   // of interest, then return false for no replacement.
 | ||
|  |   const MCSchedClassDesc *SCDescRepl; | ||
|  |   if (!SCDesc->isValid() || SCDesc->isVariant()) | ||
|  |   { | ||
|  |     SIMDInstrTable[InstID] = false; | ||
|  |     return false; | ||
|  |   } | ||
|  |   for (auto IDesc : InstDescRepl) | ||
|  |   { | ||
|  |     SCDescRepl = SchedModel.getMCSchedModel()->getSchedClassDesc( | ||
|  |       IDesc->getSchedClass()); | ||
|  |     if (!SCDescRepl->isValid() || SCDescRepl->isVariant()) | ||
|  |     { | ||
|  |       SIMDInstrTable[InstID] = false; | ||
|  |       return false; | ||
|  |     } | ||
|  |   } | ||
|  | 
 | ||
|  |   // Replacement cost.
 | ||
|  |   unsigned ReplCost = 0; | ||
|  |   for (auto IDesc :InstDescRepl) | ||
|  |     ReplCost += SchedModel.computeInstrLatency(IDesc->getOpcode()); | ||
|  | 
 | ||
|  |   if (SchedModel.computeInstrLatency(InstDesc->getOpcode()) > ReplCost) | ||
|  |   { | ||
|  |     SIMDInstrTable[InstID] = true; | ||
|  |     return true; | ||
|  |   } | ||
|  |   else | ||
|  |   { | ||
|  |     SIMDInstrTable[InstID] = false; | ||
|  |     return false; | ||
|  |   } | ||
|  | } | ||
|  | 
 | ||
|  | /// Determine if we need to exit this pass for a kind of instruction replacement
 | ||
|  | /// early. This makes sure that no compile time is spent in this pass for
 | ||
|  | /// targets with no need for any of these optimizations beyond performing this
 | ||
|  | /// check.
 | ||
|  | /// Return true if early exit of this pass for a kind of instruction
 | ||
|  | /// replacement is recommended for a target.
 | ||
|  | bool AArch64SIMDInstrOpt::shouldExitEarly(MachineFunction *MF, Subpass SP) { | ||
|  |   const MCInstrDesc* OriginalMCID; | ||
|  |   SmallVector<const MCInstrDesc*, MaxNumRepl> ReplInstrMCID; | ||
|  | 
 | ||
|  |   switch (SP) { | ||
|  |   // For this optimization, check by comparing the latency of a representative
 | ||
|  |   // instruction to that of the replacement instructions.
 | ||
|  |   // TODO: check for all concerned instructions.
 | ||
|  |   case VectorElem: | ||
|  |     OriginalMCID = &TII->get(AArch64::FMLAv4i32_indexed); | ||
|  |     ReplInstrMCID.push_back(&TII->get(AArch64::DUPv4i32lane)); | ||
|  |     ReplInstrMCID.push_back(&TII->get(AArch64::FMLAv4f32)); | ||
|  |     if (shouldReplaceInst(MF, OriginalMCID, ReplInstrMCID)) | ||
|  |       return false; | ||
|  |     break; | ||
|  | 
 | ||
|  |   // For this optimization, check for all concerned instructions.
 | ||
|  |   case Interleave: | ||
|  |     std::string Subtarget = SchedModel.getSubtargetInfo()->getCPU(); | ||
|  |     if (InterlEarlyExit.find(Subtarget) != InterlEarlyExit.end()) | ||
|  |       return InterlEarlyExit[Subtarget]; | ||
|  | 
 | ||
|  |     for (auto &I : IRT) { | ||
|  |       OriginalMCID = &TII->get(I.OrigOpc); | ||
|  |       for (auto &Repl : I.ReplOpc) | ||
|  |         ReplInstrMCID.push_back(&TII->get(Repl)); | ||
|  |       if (shouldReplaceInst(MF, OriginalMCID, ReplInstrMCID)) { | ||
|  |         InterlEarlyExit[Subtarget] = false; | ||
|  |         return false; | ||
|  |       } | ||
|  |       ReplInstrMCID.clear(); | ||
|  |     } | ||
|  |     InterlEarlyExit[Subtarget] = true; | ||
|  |     break; | ||
|  |   } | ||
|  | 
 | ||
|  |   return true; | ||
|  | } | ||
|  | 
 | ||
|  | /// Check whether an equivalent DUP instruction has already been
 | ||
|  | /// created or not.
 | ||
|  | /// Return true when the DUP instruction already exists. In this case,
 | ||
|  | /// DestReg will point to the destination of the already created DUP.
 | ||
|  | bool AArch64SIMDInstrOpt::reuseDUP(MachineInstr &MI, unsigned DupOpcode, | ||
|  |                                          unsigned SrcReg, unsigned LaneNumber, | ||
|  |                                          unsigned *DestReg) const { | ||
|  |   for (MachineBasicBlock::iterator MII = MI, MIE = MI.getParent()->begin(); | ||
|  |        MII != MIE;) { | ||
|  |     MII--; | ||
|  |     MachineInstr *CurrentMI = &*MII; | ||
|  | 
 | ||
|  |     if (CurrentMI->getOpcode() == DupOpcode && | ||
|  |         CurrentMI->getNumOperands() == 3 && | ||
|  |         CurrentMI->getOperand(1).getReg() == SrcReg && | ||
|  |         CurrentMI->getOperand(2).getImm() == LaneNumber) { | ||
|  |       *DestReg = CurrentMI->getOperand(0).getReg(); | ||
|  |       return true; | ||
|  |     } | ||
|  |   } | ||
|  | 
 | ||
|  |   return false; | ||
|  | } | ||
|  | 
 | ||
|  | /// Certain SIMD instructions with vector element operand are not efficient.
 | ||
|  | /// Rewrite them into SIMD instructions with vector operands. This rewrite
 | ||
|  | /// is driven by the latency of the instructions.
 | ||
|  | /// The instruction of concerns are for the time being FMLA, FMLS, FMUL,
 | ||
|  | /// and FMULX and hence they are hardcoded.
 | ||
|  | ///
 | ||
|  | /// For example:
 | ||
|  | ///    fmla v0.4s, v1.4s, v2.s[1]
 | ||
|  | ///
 | ||
|  | /// Is rewritten into
 | ||
|  | ///    dup  v3.4s, v2.s[1]      // DUP not necessary if redundant
 | ||
|  | ///    fmla v0.4s, v1.4s, v3.4s
 | ||
|  | ///
 | ||
|  | /// Return true if the SIMD instruction is modified.
 | ||
|  | bool AArch64SIMDInstrOpt::optimizeVectElement(MachineInstr &MI) { | ||
|  |   const MCInstrDesc *MulMCID, *DupMCID; | ||
|  |   const TargetRegisterClass *RC = &AArch64::FPR128RegClass; | ||
|  | 
 | ||
|  |   switch (MI.getOpcode()) { | ||
|  |   default: | ||
|  |     return false; | ||
|  | 
 | ||
|  |   // 4X32 instructions
 | ||
|  |   case AArch64::FMLAv4i32_indexed: | ||
|  |     DupMCID = &TII->get(AArch64::DUPv4i32lane); | ||
|  |     MulMCID = &TII->get(AArch64::FMLAv4f32); | ||
|  |     break; | ||
|  |   case AArch64::FMLSv4i32_indexed: | ||
|  |     DupMCID = &TII->get(AArch64::DUPv4i32lane); | ||
|  |     MulMCID = &TII->get(AArch64::FMLSv4f32); | ||
|  |     break; | ||
|  |   case AArch64::FMULXv4i32_indexed: | ||
|  |     DupMCID = &TII->get(AArch64::DUPv4i32lane); | ||
|  |     MulMCID = &TII->get(AArch64::FMULXv4f32); | ||
|  |     break; | ||
|  |   case AArch64::FMULv4i32_indexed: | ||
|  |     DupMCID = &TII->get(AArch64::DUPv4i32lane); | ||
|  |     MulMCID = &TII->get(AArch64::FMULv4f32); | ||
|  |     break; | ||
|  | 
 | ||
|  |   // 2X64 instructions
 | ||
|  |   case AArch64::FMLAv2i64_indexed: | ||
|  |     DupMCID = &TII->get(AArch64::DUPv2i64lane); | ||
|  |     MulMCID = &TII->get(AArch64::FMLAv2f64); | ||
|  |     break; | ||
|  |   case AArch64::FMLSv2i64_indexed: | ||
|  |     DupMCID = &TII->get(AArch64::DUPv2i64lane); | ||
|  |     MulMCID = &TII->get(AArch64::FMLSv2f64); | ||
|  |     break; | ||
|  |   case AArch64::FMULXv2i64_indexed: | ||
|  |     DupMCID = &TII->get(AArch64::DUPv2i64lane); | ||
|  |     MulMCID = &TII->get(AArch64::FMULXv2f64); | ||
|  |     break; | ||
|  |   case AArch64::FMULv2i64_indexed: | ||
|  |     DupMCID = &TII->get(AArch64::DUPv2i64lane); | ||
|  |     MulMCID = &TII->get(AArch64::FMULv2f64); | ||
|  |     break; | ||
|  | 
 | ||
|  |   // 2X32 instructions
 | ||
|  |   case AArch64::FMLAv2i32_indexed: | ||
|  |     RC = &AArch64::FPR64RegClass; | ||
|  |     DupMCID = &TII->get(AArch64::DUPv2i32lane); | ||
|  |     MulMCID = &TII->get(AArch64::FMLAv2f32); | ||
|  |     break; | ||
|  |   case AArch64::FMLSv2i32_indexed: | ||
|  |     RC = &AArch64::FPR64RegClass; | ||
|  |     DupMCID = &TII->get(AArch64::DUPv2i32lane); | ||
|  |     MulMCID = &TII->get(AArch64::FMLSv2f32); | ||
|  |     break; | ||
|  |   case AArch64::FMULXv2i32_indexed: | ||
|  |     RC = &AArch64::FPR64RegClass; | ||
|  |     DupMCID = &TII->get(AArch64::DUPv2i32lane); | ||
|  |     MulMCID = &TII->get(AArch64::FMULXv2f32); | ||
|  |     break; | ||
|  |   case AArch64::FMULv2i32_indexed: | ||
|  |     RC = &AArch64::FPR64RegClass; | ||
|  |     DupMCID = &TII->get(AArch64::DUPv2i32lane); | ||
|  |     MulMCID = &TII->get(AArch64::FMULv2f32); | ||
|  |     break; | ||
|  |   } | ||
|  | 
 | ||
|  |   SmallVector<const MCInstrDesc*, 2> ReplInstrMCID; | ||
|  |   ReplInstrMCID.push_back(DupMCID); | ||
|  |   ReplInstrMCID.push_back(MulMCID); | ||
|  |   if (!shouldReplaceInst(MI.getParent()->getParent(), &TII->get(MI.getOpcode()), | ||
|  |                          ReplInstrMCID)) | ||
|  |     return false; | ||
|  | 
 | ||
|  |   const DebugLoc &DL = MI.getDebugLoc(); | ||
|  |   MachineBasicBlock &MBB = *MI.getParent(); | ||
|  |   MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); | ||
|  | 
 | ||
|  |   // Get the operands of the current SIMD arithmetic instruction.
 | ||
|  |   unsigned MulDest = MI.getOperand(0).getReg(); | ||
|  |   unsigned SrcReg0 = MI.getOperand(1).getReg(); | ||
|  |   unsigned Src0IsKill = getKillRegState(MI.getOperand(1).isKill()); | ||
|  |   unsigned SrcReg1 = MI.getOperand(2).getReg(); | ||
|  |   unsigned Src1IsKill = getKillRegState(MI.getOperand(2).isKill()); | ||
|  |   unsigned DupDest; | ||
|  | 
 | ||
|  |   // Instructions of interest have either 4 or 5 operands.
 | ||
|  |   if (MI.getNumOperands() == 5) { | ||
|  |     unsigned SrcReg2 = MI.getOperand(3).getReg(); | ||
|  |     unsigned Src2IsKill = getKillRegState(MI.getOperand(3).isKill()); | ||
|  |     unsigned LaneNumber = MI.getOperand(4).getImm(); | ||
|  |     // Create a new DUP instruction. Note that if an equivalent DUP instruction
 | ||
|  |     // has already been created before, then use that one instead of creating
 | ||
|  |     // a new one.
 | ||
|  |     if (!reuseDUP(MI, DupMCID->getOpcode(), SrcReg2, LaneNumber, &DupDest)) { | ||
|  |       DupDest = MRI.createVirtualRegister(RC); | ||
|  |       BuildMI(MBB, MI, DL, *DupMCID, DupDest) | ||
|  |           .addReg(SrcReg2, Src2IsKill) | ||
|  |           .addImm(LaneNumber); | ||
|  |     } | ||
|  |     BuildMI(MBB, MI, DL, *MulMCID, MulDest) | ||
|  |         .addReg(SrcReg0, Src0IsKill) | ||
|  |         .addReg(SrcReg1, Src1IsKill) | ||
|  |         .addReg(DupDest, Src2IsKill); | ||
|  |   } else if (MI.getNumOperands() == 4) { | ||
|  |     unsigned LaneNumber = MI.getOperand(3).getImm(); | ||
|  |     if (!reuseDUP(MI, DupMCID->getOpcode(), SrcReg1, LaneNumber, &DupDest)) { | ||
|  |       DupDest = MRI.createVirtualRegister(RC); | ||
|  |       BuildMI(MBB, MI, DL, *DupMCID, DupDest) | ||
|  |           .addReg(SrcReg1, Src1IsKill) | ||
|  |           .addImm(LaneNumber); | ||
|  |     } | ||
|  |     BuildMI(MBB, MI, DL, *MulMCID, MulDest) | ||
|  |         .addReg(SrcReg0, Src0IsKill) | ||
|  |         .addReg(DupDest, Src1IsKill); | ||
|  |   } else { | ||
|  |     return false; | ||
|  |   } | ||
|  | 
 | ||
|  |   ++NumModifiedInstr; | ||
|  |   return true; | ||
|  | } | ||
|  | 
 | ||
|  | /// Load/Store Interleaving instructions are not always beneficial.
 | ||
|  | /// Replace them by ZIP instructions and classical load/store.
 | ||
|  | ///
 | ||
|  | /// For example:
 | ||
|  | ///    st2 {v0.4s, v1.4s}, addr
 | ||
|  | ///
 | ||
|  | /// Is rewritten into:
 | ||
|  | ///    zip1 v2.4s, v0.4s, v1.4s
 | ||
|  | ///    zip2 v3.4s, v0.4s, v1.4s
 | ||
|  | ///    stp  q2, q3, addr
 | ||
|  | //
 | ||
|  | /// For example:
 | ||
|  | ///    st4 {v0.4s, v1.4s, v2.4s, v3.4s}, addr
 | ||
|  | ///
 | ||
|  | /// Is rewritten into:
 | ||
|  | ///    zip1 v4.4s, v0.4s, v2.4s
 | ||
|  | ///    zip2 v5.4s, v0.4s, v2.4s
 | ||
|  | ///    zip1 v6.4s, v1.4s, v3.4s
 | ||
|  | ///    zip2 v7.4s, v1.4s, v3.4s
 | ||
|  | ///    zip1 v8.4s, v4.4s, v6.4s
 | ||
|  | ///    zip2 v9.4s, v4.4s, v6.4s
 | ||
|  | ///    zip1 v10.4s, v5.4s, v7.4s
 | ||
|  | ///    zip2 v11.4s, v5.4s, v7.4s
 | ||
|  | ///    stp  q8, q9, addr
 | ||
|  | ///    stp  q10, q11, addr+32
 | ||
|  | ///
 | ||
|  | /// Currently only instructions related to ST2 and ST4 are considered.
 | ||
|  | /// Other may be added later.
 | ||
|  | /// Return true if the SIMD instruction is modified.
 | ||
|  | bool AArch64SIMDInstrOpt::optimizeLdStInterleave(MachineInstr &MI) { | ||
|  | 
 | ||
|  |   unsigned SeqReg, AddrReg; | ||
|  |   unsigned StReg[4], StRegKill[4]; | ||
|  |   MachineInstr *DefiningMI; | ||
|  |   const DebugLoc &DL = MI.getDebugLoc(); | ||
|  |   MachineBasicBlock &MBB = *MI.getParent(); | ||
|  |   SmallVector<unsigned, MaxNumRepl> ZipDest; | ||
|  |   SmallVector<const MCInstrDesc*, MaxNumRepl> ReplInstrMCID; | ||
|  | 
 | ||
|  |   // If current instruction matches any of the rewriting rules, then
 | ||
|  |   // gather information about parameters of the new instructions.
 | ||
|  |   bool Match = false; | ||
|  |   for (auto &I : IRT) { | ||
|  |     if (MI.getOpcode() == I.OrigOpc) { | ||
|  |       SeqReg  = MI.getOperand(0).getReg(); | ||
|  |       AddrReg = MI.getOperand(1).getReg(); | ||
|  |       DefiningMI = MRI->getUniqueVRegDef(SeqReg); | ||
|  |       unsigned NumReg = determineSrcReg(MI); | ||
|  |       if (!processSeqRegInst(DefiningMI, StReg, StRegKill, NumReg)) | ||
|  |         return false; | ||
|  | 
 | ||
|  |       for (auto &Repl : I.ReplOpc) { | ||
|  |         ReplInstrMCID.push_back(&TII->get(Repl)); | ||
|  |         // Generate destination registers but only for non-store instruction.
 | ||
|  |         if (Repl != AArch64::STPQi && Repl != AArch64::STPDi) | ||
|  |           ZipDest.push_back(MRI->createVirtualRegister(&I.RC)); | ||
|  |       } | ||
|  |       Match = true; | ||
|  |       break; | ||
|  |     } | ||
|  |   } | ||
|  | 
 | ||
|  |   if (!Match) | ||
|  |     return false; | ||
|  | 
 | ||
|  |   // Determine if it is profitable to replace MI by the series of instructions
 | ||
|  |   // represented in ReplInstrMCID.
 | ||
|  |   if (!shouldReplaceInst(MI.getParent()->getParent(), &TII->get(MI.getOpcode()), | ||
|  |                          ReplInstrMCID)) | ||
|  |     return false; | ||
|  | 
 | ||
|  |   // Generate the replacement instructions composed of ZIP1, ZIP2, and STP (at
 | ||
|  |   // this point, the code generation is hardcoded and does not rely on the IRT
 | ||
|  |   // table used above given that code generation for ST2 replacement is somewhat
 | ||
|  |   // different than for ST4 replacement. We could have added more info into the
 | ||
|  |   // table related to how we build new instructions but we may be adding more
 | ||
|  |   // complexity with that).
 | ||
|  |   switch (MI.getOpcode()) { | ||
|  |   default: | ||
|  |     return false; | ||
|  | 
 | ||
|  |   case AArch64::ST2Twov16b: | ||
|  |   case AArch64::ST2Twov8b: | ||
|  |   case AArch64::ST2Twov8h: | ||
|  |   case AArch64::ST2Twov4h: | ||
|  |   case AArch64::ST2Twov4s: | ||
|  |   case AArch64::ST2Twov2s: | ||
|  |   case AArch64::ST2Twov2d: | ||
|  |     // ZIP instructions
 | ||
|  |     BuildMI(MBB, MI, DL, *ReplInstrMCID[0], ZipDest[0]) | ||
|  |         .addReg(StReg[0]) | ||
|  |         .addReg(StReg[1]); | ||
|  |     BuildMI(MBB, MI, DL, *ReplInstrMCID[1], ZipDest[1]) | ||
|  |         .addReg(StReg[0], StRegKill[0]) | ||
|  |         .addReg(StReg[1], StRegKill[1]); | ||
|  |     // STP instructions
 | ||
|  |     BuildMI(MBB, MI, DL, *ReplInstrMCID[2]) | ||
|  |         .addReg(ZipDest[0]) | ||
|  |         .addReg(ZipDest[1]) | ||
|  |         .addReg(AddrReg) | ||
|  |         .addImm(0); | ||
|  |     break; | ||
|  | 
 | ||
|  |   case AArch64::ST4Fourv16b: | ||
|  |   case AArch64::ST4Fourv8b: | ||
|  |   case AArch64::ST4Fourv8h: | ||
|  |   case AArch64::ST4Fourv4h: | ||
|  |   case AArch64::ST4Fourv4s: | ||
|  |   case AArch64::ST4Fourv2s: | ||
|  |   case AArch64::ST4Fourv2d: | ||
|  |     // ZIP instructions
 | ||
|  |     BuildMI(MBB, MI, DL, *ReplInstrMCID[0], ZipDest[0]) | ||
|  |         .addReg(StReg[0]) | ||
|  |         .addReg(StReg[2]); | ||
|  |     BuildMI(MBB, MI, DL, *ReplInstrMCID[1], ZipDest[1]) | ||
|  |         .addReg(StReg[0], StRegKill[0]) | ||
|  |         .addReg(StReg[2], StRegKill[2]); | ||
|  |     BuildMI(MBB, MI, DL, *ReplInstrMCID[2], ZipDest[2]) | ||
|  |         .addReg(StReg[1]) | ||
|  |         .addReg(StReg[3]); | ||
|  |     BuildMI(MBB, MI, DL, *ReplInstrMCID[3], ZipDest[3]) | ||
|  |         .addReg(StReg[1], StRegKill[1]) | ||
|  |         .addReg(StReg[3], StRegKill[3]); | ||
|  |     BuildMI(MBB, MI, DL, *ReplInstrMCID[4], ZipDest[4]) | ||
|  |         .addReg(ZipDest[0]) | ||
|  |         .addReg(ZipDest[2]); | ||
|  |     BuildMI(MBB, MI, DL, *ReplInstrMCID[5], ZipDest[5]) | ||
|  |         .addReg(ZipDest[0]) | ||
|  |         .addReg(ZipDest[2]); | ||
|  |     BuildMI(MBB, MI, DL, *ReplInstrMCID[6], ZipDest[6]) | ||
|  |         .addReg(ZipDest[1]) | ||
|  |         .addReg(ZipDest[3]); | ||
|  |     BuildMI(MBB, MI, DL, *ReplInstrMCID[7], ZipDest[7]) | ||
|  |         .addReg(ZipDest[1]) | ||
|  |         .addReg(ZipDest[3]); | ||
|  |     // stp instructions
 | ||
|  |     BuildMI(MBB, MI, DL, *ReplInstrMCID[8]) | ||
|  |         .addReg(ZipDest[4]) | ||
|  |         .addReg(ZipDest[5]) | ||
|  |         .addReg(AddrReg) | ||
|  |         .addImm(0); | ||
|  |     BuildMI(MBB, MI, DL, *ReplInstrMCID[9]) | ||
|  |         .addReg(ZipDest[6]) | ||
|  |         .addReg(ZipDest[7]) | ||
|  |         .addReg(AddrReg) | ||
|  |         .addImm(2); | ||
|  |     break; | ||
|  |   } | ||
|  | 
 | ||
|  |   ++NumModifiedInstr; | ||
|  |   return true; | ||
|  | } | ||
|  | 
 | ||
|  | /// Process The REG_SEQUENCE instruction, and extract the source
 | ||
|  | /// operands of the ST2/4 instruction from it.
 | ||
|  | /// Example of such instruction.
 | ||
|  | ///    %dest = REG_SEQUENCE %st2_src1, dsub0, %st2_src2, dsub1;
 | ||
|  | /// Return true when the instruction is processed successfully.
 | ||
|  | bool AArch64SIMDInstrOpt::processSeqRegInst(MachineInstr *DefiningMI, | ||
|  |      unsigned* StReg, unsigned* StRegKill, unsigned NumArg) const { | ||
|  |   assert (DefiningMI != NULL); | ||
|  |   if (DefiningMI->getOpcode() != AArch64::REG_SEQUENCE) | ||
|  |     return false; | ||
|  | 
 | ||
|  |   for (unsigned i=0; i<NumArg; i++) { | ||
|  |     StReg[i]     = DefiningMI->getOperand(2*i+1).getReg(); | ||
|  |     StRegKill[i] = getKillRegState(DefiningMI->getOperand(2*i+1).isKill()); | ||
|  | 
 | ||
|  |     // Sanity check for the other arguments.
 | ||
|  |     if (DefiningMI->getOperand(2*i+2).isImm()) { | ||
|  |       switch (DefiningMI->getOperand(2*i+2).getImm()) { | ||
|  |       default: | ||
|  |         return false; | ||
|  | 
 | ||
|  |       case AArch64::dsub0: | ||
|  |       case AArch64::dsub1: | ||
|  |       case AArch64::dsub2: | ||
|  |       case AArch64::dsub3: | ||
|  |       case AArch64::qsub0: | ||
|  |       case AArch64::qsub1: | ||
|  |       case AArch64::qsub2: | ||
|  |       case AArch64::qsub3: | ||
|  |         break; | ||
|  |       } | ||
|  |     } | ||
|  |     else | ||
|  |       return false; | ||
|  |   } | ||
|  |   return true; | ||
|  | } | ||
|  | 
 | ||
|  | /// Return the number of useful source registers for this instruction
 | ||
|  | /// (2 for ST2 and 4 for ST4).
 | ||
|  | unsigned AArch64SIMDInstrOpt::determineSrcReg(MachineInstr &MI) const { | ||
|  |   switch (MI.getOpcode()) { | ||
|  |   default: | ||
|  |     llvm_unreachable("Unsupported instruction for this pass"); | ||
|  | 
 | ||
|  |   case AArch64::ST2Twov16b: | ||
|  |   case AArch64::ST2Twov8b: | ||
|  |   case AArch64::ST2Twov8h: | ||
|  |   case AArch64::ST2Twov4h: | ||
|  |   case AArch64::ST2Twov4s: | ||
|  |   case AArch64::ST2Twov2s: | ||
|  |   case AArch64::ST2Twov2d: | ||
|  |     return 2; | ||
|  | 
 | ||
|  |   case AArch64::ST4Fourv16b: | ||
|  |   case AArch64::ST4Fourv8b: | ||
|  |   case AArch64::ST4Fourv8h: | ||
|  |   case AArch64::ST4Fourv4h: | ||
|  |   case AArch64::ST4Fourv4s: | ||
|  |   case AArch64::ST4Fourv2s: | ||
|  |   case AArch64::ST4Fourv2d: | ||
|  |     return 4; | ||
|  |   } | ||
|  | } | ||
|  | 
 | ||
|  | bool AArch64SIMDInstrOpt::runOnMachineFunction(MachineFunction &MF) { | ||
|  |   if (skipFunction(MF.getFunction())) | ||
|  |     return false; | ||
|  | 
 | ||
|  |   TII = MF.getSubtarget().getInstrInfo(); | ||
|  |   MRI = &MF.getRegInfo(); | ||
|  |   const TargetSubtargetInfo &ST = MF.getSubtarget(); | ||
|  |   const AArch64InstrInfo *AAII = | ||
|  |       static_cast<const AArch64InstrInfo *>(ST.getInstrInfo()); | ||
|  |   if (!AAII) | ||
|  |     return false; | ||
|  |   SchedModel.init(ST.getSchedModel(), &ST, AAII); | ||
|  |   if (!SchedModel.hasInstrSchedModel()) | ||
|  |     return false; | ||
|  | 
 | ||
|  |   bool Changed = false; | ||
|  |   for (auto OptimizationKind : {VectorElem, Interleave}) { | ||
|  |     if (!shouldExitEarly(&MF, OptimizationKind)) { | ||
|  |       SmallVector<MachineInstr *, 8> RemoveMIs; | ||
|  |       for (MachineBasicBlock &MBB : MF) { | ||
|  |         for (MachineBasicBlock::iterator MII = MBB.begin(), MIE = MBB.end(); | ||
|  |              MII != MIE;) { | ||
|  |           MachineInstr &MI = *MII; | ||
|  |           bool InstRewrite; | ||
|  |           if (OptimizationKind == VectorElem) | ||
|  |             InstRewrite = optimizeVectElement(MI) ; | ||
|  |           else | ||
|  |             InstRewrite = optimizeLdStInterleave(MI); | ||
|  |           if (InstRewrite) { | ||
|  |             // Add MI to the list of instructions to be removed given that it
 | ||
|  |             // has been replaced.
 | ||
|  |             RemoveMIs.push_back(&MI); | ||
|  |             Changed = true; | ||
|  |           } | ||
|  |           ++MII; | ||
|  |         } | ||
|  |       } | ||
|  |       for (MachineInstr *MI : RemoveMIs) | ||
|  |         MI->eraseFromParent(); | ||
|  |     } | ||
|  |   } | ||
|  | 
 | ||
|  |   return Changed; | ||
|  | } | ||
|  | 
 | ||
|  | /// Returns an instance of the high cost ASIMD instruction replacement
 | ||
|  | /// optimization pass.
 | ||
|  | FunctionPass *llvm::createAArch64SIMDInstrOptPass() { | ||
|  |   return new AArch64SIMDInstrOpt(); | ||
|  | } |