Files
acceptance-tests
data
debian
docs
external
Newtonsoft.Json
api-doc-tools
api-snapshot
aspnetwebstack
bdwgc
binary-reference-assemblies
bockbuild
boringssl
cecil
cecil-legacy
corefx
corert
helix-binaries
ikdasm
ikvm
illinker-test-assets
linker
llvm-project
clang
clang-tools-extra
compiler-rt
eng
libcxx
libcxxabi
libunwind
lld
lldb
llvm
bindings
cmake
docs
examples
include
lib
Analysis
AsmParser
BinaryFormat
Bitcode
CodeGen
DebugInfo
Demangle
ExecutionEngine
FuzzMutate
Fuzzer
IR
IRReader
LTO
LineEditor
Linker
MC
Object
ObjectYAML
Option
Passes
ProfileData
Support
TableGen
Target
AArch64
AsmParser
Disassembler
InstPrinter
MCTargetDesc
TargetInfo
Utils
AArch64.h
AArch64.td
AArch64A53Fix835769.cpp
AArch64A57FPLoadBalancing.cpp
AArch64AdvSIMDScalarPass.cpp
AArch64AsmPrinter.cpp
AArch64CallLowering.cpp
AArch64CallLowering.h
AArch64CallingConvention.h
AArch64CallingConvention.td
AArch64CleanupLocalDynamicTLSPass.cpp
AArch64CollectLOH.cpp
AArch64CondBrTuning.cpp
AArch64ConditionOptimizer.cpp
AArch64ConditionalCompares.cpp
AArch64DeadRegisterDefinitionsPass.cpp
AArch64ExpandPseudoInsts.cpp
AArch64FalkorHWPFFix.cpp
AArch64FastISel.cpp.REMOVED.git-id
AArch64FrameLowering.cpp
AArch64FrameLowering.h
AArch64GenRegisterBankInfo.def
AArch64ISelDAGToDAG.cpp.REMOVED.git-id
AArch64ISelLowering.cpp.REMOVED.git-id
AArch64ISelLowering.h
AArch64InstrAtomics.td
AArch64InstrFormats.td.REMOVED.git-id
AArch64InstrInfo.cpp.REMOVED.git-id
AArch64InstrInfo.h
AArch64InstrInfo.td.REMOVED.git-id
AArch64InstructionSelector.cpp
AArch64LegalizerInfo.cpp
AArch64LegalizerInfo.h
AArch64LoadStoreOptimizer.cpp
AArch64MCInstLower.cpp
AArch64MCInstLower.h
AArch64MachineFunctionInfo.h
AArch64MacroFusion.cpp
AArch64MacroFusion.h
AArch64PBQPRegAlloc.cpp
AArch64PBQPRegAlloc.h
AArch64PerfectShuffle.h.REMOVED.git-id
AArch64PromoteConstant.cpp
AArch64RedundantCopyElimination.cpp
AArch64RegisterBankInfo.cpp
AArch64RegisterBankInfo.h
AArch64RegisterBanks.td
AArch64RegisterInfo.cpp
AArch64RegisterInfo.h
AArch64RegisterInfo.td
AArch64SIMDInstrOpt.cpp
AArch64SVEInstrInfo.td
AArch64SchedA53.td
AArch64SchedA57.td
AArch64SchedA57WriteRes.td
AArch64SchedCyclone.td
AArch64SchedFalkor.td
AArch64SchedFalkorDetails.td
AArch64SchedKryo.td
AArch64SchedKryoDetails.td
AArch64SchedM1.td
AArch64SchedThunderX.td
AArch64SchedThunderX2T99.td
AArch64Schedule.td
AArch64SelectionDAGInfo.cpp
AArch64SelectionDAGInfo.h
AArch64StorePairSuppress.cpp
AArch64Subtarget.cpp
AArch64Subtarget.h
AArch64SystemOperands.td
AArch64TargetMachine.cpp
AArch64TargetMachine.h
AArch64TargetObjectFile.cpp
AArch64TargetObjectFile.h
AArch64TargetTransformInfo.cpp
AArch64TargetTransformInfo.h
CMakeLists.txt
LLVMBuild.txt
SVEInstrFormats.td
AMDGPU
ARC
ARM
AVR
BPF
Hexagon
Lanai
MSP430
Mips
NVPTX
Nios2
PowerPC
RISCV
Sparc
SystemZ
WebAssembly
X86
XCore
CMakeLists.txt
LLVMBuild.txt
README.txt
Target.cpp
TargetIntrinsicInfo.cpp
TargetLoweringObjectFile.cpp
TargetMachine.cpp
TargetMachineC.cpp
Testing
ToolDrivers
Transforms
WindowsManifest
XRay
CMakeLists.txt
LLVMBuild.txt
projects
resources
runtimes
scripts
test
tools
unittests
utils
.arcconfig
.clang-format
.clang-tidy
.gitattributes
.gitignore
CMakeLists.txt
CODE_OWNERS.TXT
CREDITS.TXT
LICENSE.TXT
LLVMBuild.txt
README.txt
RELEASE_TESTERS.TXT
configure
llvm.spec.in
version.txt.in
nuget
openmp
polly
Directory.Build.props
Directory.Build.targets
NuGet.config
azure-pipelines.yml
build.cmd
build.sh
dir.common.props
global.json
llvm.proj
mxe-Win64.cmake.in
nuget-buildtasks
nunit-lite
roslyn-binaries
rx
xunit-binaries
how-to-bump-roslyn-binaries.md
ikvm-native
llvm
m4
man
mcs
mono
msvc
netcore
po
runtime
samples
scripts
support
tools
COPYING.LIB
LICENSE
Makefile.am
Makefile.in
NEWS
README.md
acinclude.m4
aclocal.m4
autogen.sh
code_of_conduct.md
compile
config.guess
config.h.in
config.rpath
config.sub
configure.REMOVED.git-id
configure.ac.REMOVED.git-id
depcomp
install-sh
ltmain.sh.REMOVED.git-id
missing
mkinstalldirs
mono-uninstalled.pc.in
test-driver
winconfig.h
linux-packaging-mono/external/llvm-project/llvm/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp

842 lines
23 KiB
C++
Raw Normal View History

//===- AArch64FalkorHWPFFix.cpp - Avoid HW prefetcher pitfalls on Falkor --===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
/// \file For Falkor, we want to avoid HW prefetcher instruction tag collisions
/// that may inhibit the HW prefetching. This is done in two steps. Before
/// ISel, we mark strided loads (i.e. those that will likely benefit from
/// prefetching) with metadata. Then, after opcodes have been finalized, we
/// insert MOVs and re-write loads to prevent unintentional tag collisions.
// ===---------------------------------------------------------------------===//
#include "AArch64.h"
#include "AArch64InstrInfo.h"
#include "AArch64Subtarget.h"
#include "AArch64TargetMachine.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/CodeGen/LiveRegUnits.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Metadata.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/DebugCounter.h"
#include "llvm/Support/raw_ostream.h"
#include <cassert>
#include <iterator>
#include <utility>
using namespace llvm;
#define DEBUG_TYPE "falkor-hwpf-fix"
// Counters maintained by the two phases of this file: the IR-level marking of
// strided loads and the late machine-level collision fix-up.
STATISTIC(NumStridedLoadsMarked, "Number of strided loads marked");
STATISTIC(NumCollisionsAvoided,
"Number of HW prefetch tag collisions avoided");
STATISTIC(NumCollisionsNotAvoided,
"Number of HW prefetch tag collisions not avoided due to lack of regsiters");
// Debug counter consulted before each individual collision fix is attempted.
DEBUG_COUNTER(FixCounter, "falkor-hwpf",
"Controls which tag collisions are avoided");
namespace {
/// IR-level helper that tags strided loads with FALKOR_STRIDED_ACCESS_MD
/// metadata. It only borrows the analyses handed to it; it owns neither.
class FalkorMarkStridedAccesses {
public:
  FalkorMarkStridedAccesses(LoopInfo &Loops, ScalarEvolution &Evolution)
      : LI(Loops), SE(Evolution) {}

  /// Walk every loop known to LI and mark the qualifying loads.
  /// \returns true if at least one load was marked.
  bool run();

private:
  /// Mark the qualifying loads of a single loop.
  /// \returns true if at least one load in \p L was marked.
  bool runOnLoop(Loop &L);

  LoopInfo &LI;
  ScalarEvolution &SE;
};
class FalkorMarkStridedAccessesLegacy : public FunctionPass {
public:
static char ID; // Pass ID, replacement for typeid
FalkorMarkStridedAccessesLegacy() : FunctionPass(ID) {
initializeFalkorMarkStridedAccessesLegacyPass(
*PassRegistry::getPassRegistry());
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<TargetPassConfig>();
AU.addPreserved<DominatorTreeWrapperPass>();
AU.addRequired<LoopInfoWrapperPass>();
AU.addPreserved<LoopInfoWrapperPass>();
AU.addRequired<ScalarEvolutionWrapperPass>();
AU.addPreserved<ScalarEvolutionWrapperPass>();
}
bool runOnFunction(Function &F) override;
};
} // end anonymous namespace
// Storage for the pass identifier declared in FalkorMarkStridedAccessesLegacy.
char FalkorMarkStridedAccessesLegacy::ID = 0;
// Pass registration under the DEBUG_TYPE name, listing the same three
// analyses that getAnalysisUsage() marks as required.
INITIALIZE_PASS_BEGIN(FalkorMarkStridedAccessesLegacy, DEBUG_TYPE,
"Falkor HW Prefetch Fix", false, false)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
INITIALIZE_PASS_END(FalkorMarkStridedAccessesLegacy, DEBUG_TYPE,
"Falkor HW Prefetch Fix", false, false)
/// Factory for the IR-level strided-load marking pass. The returned pass is
/// heap-allocated with new and handed to the caller.
FunctionPass *llvm::createFalkorMarkStridedAccessesPass() {
  FunctionPass *Pass = new FalkorMarkStridedAccessesLegacy();
  return Pass;
}
/// Run the strided-load marking on \p F.
///
/// Does nothing (and returns false) unless the function's subtarget is Falkor
/// and the function is not skipped.
/// \returns true if any load in \p F received the strided-access metadata.
bool FalkorMarkStridedAccessesLegacy::runOnFunction(Function &F) {
  auto &TM = getAnalysis<TargetPassConfig>().getTM<AArch64TargetMachine>();
  const AArch64Subtarget *Subtarget = TM.getSubtargetImpl(F);

  // The processor check comes first so that skipFunction() is only consulted
  // for Falkor.
  if (Subtarget->getProcFamily() != AArch64Subtarget::Falkor ||
      skipFunction(F))
    return false;

  LoopInfo &Loops = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
  ScalarEvolution &Evolution =
      getAnalysis<ScalarEvolutionWrapperPass>().getSE();
  FalkorMarkStridedAccesses Marker(Loops, Evolution);
  return Marker.run();
}
bool FalkorMarkStridedAccesses::run() {
bool MadeChange = false;
for (Loop *L : LI)
for (auto LIt = df_begin(L), LE = df_end(L); LIt != LE; ++LIt)
MadeChange |= runOnLoop(**LIt);
return MadeChange;
}
/// Attach FALKOR_STRIDED_ACCESS_MD to every load in \p L whose address is not
/// loop-invariant and whose SCEV is an affine add-recurrence.
///
/// Loops that contain sub-loops are left untouched.
/// \returns true if at least one load was marked.
bool FalkorMarkStridedAccesses::runOnLoop(Loop &L) {
  // Only mark strided loads in the inner-most loop
  if (!L.empty())
    return false;

  bool Changed = false;
  for (BasicBlock *Block : L.blocks()) {
    for (Instruction &Inst : *Block) {
      auto *Load = dyn_cast<LoadInst>(&Inst);
      if (Load == nullptr)
        continue;

      Value *Addr = Load->getPointerOperand();
      if (L.isLoopInvariant(Addr))
        continue;

      // The address must evolve as an affine recurrence to count as strided.
      const auto *AddRec = dyn_cast<SCEVAddRecExpr>(SE.getSCEV(Addr));
      const bool IsAffineRec = AddRec != nullptr && AddRec->isAffine();
      if (!IsAffineRec)
        continue;

      // An empty metadata node is enough: only its presence is recorded.
      Load->setMetadata(FALKOR_STRIDED_ACCESS_MD,
                        MDNode::get(Load->getContext(), {}));
      ++NumStridedLoadsMarked;
      DEBUG(dbgs() << "Load: " << Inst << " marked as strided\n");
      Changed = true;
    }
  }
  return Changed;
}
namespace {
/// Late machine-level pass that rewrites strided loads whose HW prefetcher
/// tag collides with that of another load in the same loop.
class FalkorHWPFFix : public MachineFunctionPass {
public:
  static char ID;

  FalkorHWPFFix() : MachineFunctionPass(ID) {
    initializeFalkorHWPFFixPass(*PassRegistry::getPassRegistry());
  }

  bool runOnMachineFunction(MachineFunction &Fn) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<MachineLoopInfo>();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

  /// The pass requires a function with the NoVRegs property set.
  MachineFunctionProperties getRequiredProperties() const override {
    return MachineFunctionProperties().set(
        MachineFunctionProperties::Property::NoVRegs);
  }

private:
  /// Detect and fix tag collisions among the loads of one loop.
  void runOnLoop(MachineLoop &L, MachineFunction &Fn);

  // Per-function state, (re)assigned at the start of runOnMachineFunction().
  // Default member initializers keep the object in a defined state before the
  // first run instead of leaving the pointers and the flag indeterminate.
  const AArch64InstrInfo *TII = nullptr;
  const TargetRegisterInfo *TRI = nullptr;

  /// Maps a prefetcher tag to the load instructions that produce it; cleared
  /// and rebuilt for each processed loop.
  DenseMap<unsigned, SmallVector<MachineInstr *, 4>> TagMap;

  /// Set once any instruction of the current function has been rewritten.
  bool Modified = false;
};
/// Operands of a load instruction that feed the computation of its HW
/// prefetcher instruction tag.
struct LoadInfo {
  LoadInfo() = default;

  /// Destination register, or 0 when none is recorded for the opcode.
  unsigned DestReg{0};
  /// Base address register.
  unsigned BaseReg{0};
  /// Operand index of the base register within the instruction.
  int BaseRegIdx{-1};
  /// Offset operand, or nullptr when the opcode has none recorded.
  const MachineOperand *OffsetOpnd{nullptr};
  /// True for the pre/post-indexed opcode forms.
  bool IsPrePost{false};
};
} // end anonymous namespace
// Storage for the pass identifier declared in FalkorHWPFFix.
char FalkorHWPFFix::ID = 0;
// Pass registration under the name "falkor-hwpf-fix-late", listing
// MachineLoopInfo, the analysis that getAnalysisUsage() marks as required.
INITIALIZE_PASS_BEGIN(FalkorHWPFFix, "falkor-hwpf-fix-late",
"Falkor HW Prefetch Fix Late Phase", false, false)
INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
INITIALIZE_PASS_END(FalkorHWPFFix, "falkor-hwpf-fix-late",
"Falkor HW Prefetch Fix Late Phase", false, false)
/// Pack the three tag components into a single value:
///   bits [3:0]  - low 4 bits of \p Dest
///   bits [7:4]  - low 4 bits of \p Base
///   bits [13:8] - low 6 bits of \p Offset
/// Higher bits of each argument are discarded.
static unsigned makeTag(unsigned Dest, unsigned Base, unsigned Offset) {
  const unsigned DestField = Dest & 0xf;
  const unsigned BaseField = (Base & 0xf) << 4;
  const unsigned OffsetField = (Offset & 0x3f) << 8;
  return DestField | BaseField | OffsetField;
}
/// Classify \p MI as one of the load opcodes handled by this file and collect
/// the operands that feed its HW prefetcher tag.
///
/// The switch maps every recognized opcode to the operand indices of its
/// destination register, base register and offset operand, plus a flag saying
/// whether it is a pre/post-indexed form. An index of -1 means that no such
/// operand is recorded for the opcode (the destination is then stored as 0 and
/// the offset operand as nullptr).
///
/// \returns None when the opcode is not listed in the switch or when the base
/// register is SP/WSP; otherwise the populated LoadInfo.
static Optional<LoadInfo> getLoadInfo(const MachineInstr &MI) {
int DestRegIdx;
int BaseRegIdx;
int OffsetIdx;
bool IsPrePost;
switch (MI.getOpcode()) {
default:
return None;
// Destination is operand 0, base is operand 3, no offset operand.
case AArch64::LD1i64:
case AArch64::LD2i64:
DestRegIdx = 0;
BaseRegIdx = 3;
OffsetIdx = -1;
IsPrePost = false;
break;
// No destination recorded, base is operand 3, no offset operand.
case AArch64::LD1i8:
case AArch64::LD1i16:
case AArch64::LD1i32:
case AArch64::LD2i8:
case AArch64::LD2i16:
case AArch64::LD2i32:
case AArch64::LD3i8:
case AArch64::LD3i16:
case AArch64::LD3i32:
case AArch64::LD3i64:
case AArch64::LD4i8:
case AArch64::LD4i16:
case AArch64::LD4i32:
case AArch64::LD4i64:
DestRegIdx = -1;
BaseRegIdx = 3;
OffsetIdx = -1;
IsPrePost = false;
break;
// Destination is operand 0, base is operand 1, no offset operand.
case AArch64::LD1Onev1d:
case AArch64::LD1Onev2s:
case AArch64::LD1Onev4h:
case AArch64::LD1Onev8b:
case AArch64::LD1Onev2d:
case AArch64::LD1Onev4s:
case AArch64::LD1Onev8h:
case AArch64::LD1Onev16b:
case AArch64::LD1Rv1d:
case AArch64::LD1Rv2s:
case AArch64::LD1Rv4h:
case AArch64::LD1Rv8b:
case AArch64::LD1Rv2d:
case AArch64::LD1Rv4s:
case AArch64::LD1Rv8h:
case AArch64::LD1Rv16b:
DestRegIdx = 0;
BaseRegIdx = 1;
OffsetIdx = -1;
IsPrePost = false;
break;
// No destination recorded, base is operand 1, no offset operand.
case AArch64::LD1Twov1d:
case AArch64::LD1Twov2s:
case AArch64::LD1Twov4h:
case AArch64::LD1Twov8b:
case AArch64::LD1Twov2d:
case AArch64::LD1Twov4s:
case AArch64::LD1Twov8h:
case AArch64::LD1Twov16b:
case AArch64::LD1Threev1d:
case AArch64::LD1Threev2s:
case AArch64::LD1Threev4h:
case AArch64::LD1Threev8b:
case AArch64::LD1Threev2d:
case AArch64::LD1Threev4s:
case AArch64::LD1Threev8h:
case AArch64::LD1Threev16b:
case AArch64::LD1Fourv1d:
case AArch64::LD1Fourv2s:
case AArch64::LD1Fourv4h:
case AArch64::LD1Fourv8b:
case AArch64::LD1Fourv2d:
case AArch64::LD1Fourv4s:
case AArch64::LD1Fourv8h:
case AArch64::LD1Fourv16b:
case AArch64::LD2Twov2s:
case AArch64::LD2Twov4s:
case AArch64::LD2Twov8b:
case AArch64::LD2Twov2d:
case AArch64::LD2Twov4h:
case AArch64::LD2Twov8h:
case AArch64::LD2Twov16b:
case AArch64::LD2Rv1d:
case AArch64::LD2Rv2s:
case AArch64::LD2Rv4s:
case AArch64::LD2Rv8b:
case AArch64::LD2Rv2d:
case AArch64::LD2Rv4h:
case AArch64::LD2Rv8h:
case AArch64::LD2Rv16b:
case AArch64::LD3Threev2s:
case AArch64::LD3Threev4h:
case AArch64::LD3Threev8b:
case AArch64::LD3Threev2d:
case AArch64::LD3Threev4s:
case AArch64::LD3Threev8h:
case AArch64::LD3Threev16b:
case AArch64::LD3Rv1d:
case AArch64::LD3Rv2s:
case AArch64::LD3Rv4h:
case AArch64::LD3Rv8b:
case AArch64::LD3Rv2d:
case AArch64::LD3Rv4s:
case AArch64::LD3Rv8h:
case AArch64::LD3Rv16b:
case AArch64::LD4Fourv2s:
case AArch64::LD4Fourv4h:
case AArch64::LD4Fourv8b:
case AArch64::LD4Fourv2d:
case AArch64::LD4Fourv4s:
case AArch64::LD4Fourv8h:
case AArch64::LD4Fourv16b:
case AArch64::LD4Rv1d:
case AArch64::LD4Rv2s:
case AArch64::LD4Rv4h:
case AArch64::LD4Rv8b:
case AArch64::LD4Rv2d:
case AArch64::LD4Rv4s:
case AArch64::LD4Rv8h:
case AArch64::LD4Rv16b:
DestRegIdx = -1;
BaseRegIdx = 1;
OffsetIdx = -1;
IsPrePost = false;
break;
// Pre/post form: destination is operand 1, base is operand 4, offset is
// operand 5.
case AArch64::LD1i64_POST:
case AArch64::LD2i64_POST:
DestRegIdx = 1;
BaseRegIdx = 4;
OffsetIdx = 5;
IsPrePost = true;
break;
// Pre/post form: no destination recorded, base is operand 4, offset is
// operand 5.
case AArch64::LD1i8_POST:
case AArch64::LD1i16_POST:
case AArch64::LD1i32_POST:
case AArch64::LD2i8_POST:
case AArch64::LD2i16_POST:
case AArch64::LD2i32_POST:
case AArch64::LD3i8_POST:
case AArch64::LD3i16_POST:
case AArch64::LD3i32_POST:
case AArch64::LD3i64_POST:
case AArch64::LD4i8_POST:
case AArch64::LD4i16_POST:
case AArch64::LD4i32_POST:
case AArch64::LD4i64_POST:
DestRegIdx = -1;
BaseRegIdx = 4;
OffsetIdx = 5;
IsPrePost = true;
break;
// Pre/post form: destination is operand 1, base is operand 2, offset is
// operand 3.
case AArch64::LD1Onev1d_POST:
case AArch64::LD1Onev2s_POST:
case AArch64::LD1Onev4h_POST:
case AArch64::LD1Onev8b_POST:
case AArch64::LD1Onev2d_POST:
case AArch64::LD1Onev4s_POST:
case AArch64::LD1Onev8h_POST:
case AArch64::LD1Onev16b_POST:
case AArch64::LD1Rv1d_POST:
case AArch64::LD1Rv2s_POST:
case AArch64::LD1Rv4h_POST:
case AArch64::LD1Rv8b_POST:
case AArch64::LD1Rv2d_POST:
case AArch64::LD1Rv4s_POST:
case AArch64::LD1Rv8h_POST:
case AArch64::LD1Rv16b_POST:
DestRegIdx = 1;
BaseRegIdx = 2;
OffsetIdx = 3;
IsPrePost = true;
break;
// Pre/post form: no destination recorded, base is operand 2, offset is
// operand 3.
case AArch64::LD1Twov1d_POST:
case AArch64::LD1Twov2s_POST:
case AArch64::LD1Twov4h_POST:
case AArch64::LD1Twov8b_POST:
case AArch64::LD1Twov2d_POST:
case AArch64::LD1Twov4s_POST:
case AArch64::LD1Twov8h_POST:
case AArch64::LD1Twov16b_POST:
case AArch64::LD1Threev1d_POST:
case AArch64::LD1Threev2s_POST:
case AArch64::LD1Threev4h_POST:
case AArch64::LD1Threev8b_POST:
case AArch64::LD1Threev2d_POST:
case AArch64::LD1Threev4s_POST:
case AArch64::LD1Threev8h_POST:
case AArch64::LD1Threev16b_POST:
case AArch64::LD1Fourv1d_POST:
case AArch64::LD1Fourv2s_POST:
case AArch64::LD1Fourv4h_POST:
case AArch64::LD1Fourv8b_POST:
case AArch64::LD1Fourv2d_POST:
case AArch64::LD1Fourv4s_POST:
case AArch64::LD1Fourv8h_POST:
case AArch64::LD1Fourv16b_POST:
case AArch64::LD2Twov2s_POST:
case AArch64::LD2Twov4s_POST:
case AArch64::LD2Twov8b_POST:
case AArch64::LD2Twov2d_POST:
case AArch64::LD2Twov4h_POST:
case AArch64::LD2Twov8h_POST:
case AArch64::LD2Twov16b_POST:
case AArch64::LD2Rv1d_POST:
case AArch64::LD2Rv2s_POST:
case AArch64::LD2Rv4s_POST:
case AArch64::LD2Rv8b_POST:
case AArch64::LD2Rv2d_POST:
case AArch64::LD2Rv4h_POST:
case AArch64::LD2Rv8h_POST:
case AArch64::LD2Rv16b_POST:
case AArch64::LD3Threev2s_POST:
case AArch64::LD3Threev4h_POST:
case AArch64::LD3Threev8b_POST:
case AArch64::LD3Threev2d_POST:
case AArch64::LD3Threev4s_POST:
case AArch64::LD3Threev8h_POST:
case AArch64::LD3Threev16b_POST:
case AArch64::LD3Rv1d_POST:
case AArch64::LD3Rv2s_POST:
case AArch64::LD3Rv4h_POST:
case AArch64::LD3Rv8b_POST:
case AArch64::LD3Rv2d_POST:
case AArch64::LD3Rv4s_POST:
case AArch64::LD3Rv8h_POST:
case AArch64::LD3Rv16b_POST:
case AArch64::LD4Fourv2s_POST:
case AArch64::LD4Fourv4h_POST:
case AArch64::LD4Fourv8b_POST:
case AArch64::LD4Fourv2d_POST:
case AArch64::LD4Fourv4s_POST:
case AArch64::LD4Fourv8h_POST:
case AArch64::LD4Fourv16b_POST:
case AArch64::LD4Rv1d_POST:
case AArch64::LD4Rv2s_POST:
case AArch64::LD4Rv4h_POST:
case AArch64::LD4Rv8b_POST:
case AArch64::LD4Rv2d_POST:
case AArch64::LD4Rv4s_POST:
case AArch64::LD4Rv8h_POST:
case AArch64::LD4Rv16b_POST:
DestRegIdx = -1;
BaseRegIdx = 2;
OffsetIdx = 3;
IsPrePost = true;
break;
// Destination is operand 0, base is operand 1, offset is operand 2.
case AArch64::LDRBBroW:
case AArch64::LDRBBroX:
case AArch64::LDRBBui:
case AArch64::LDRBroW:
case AArch64::LDRBroX:
case AArch64::LDRBui:
case AArch64::LDRDl:
case AArch64::LDRDroW:
case AArch64::LDRDroX:
case AArch64::LDRDui:
case AArch64::LDRHHroW:
case AArch64::LDRHHroX:
case AArch64::LDRHHui:
case AArch64::LDRHroW:
case AArch64::LDRHroX:
case AArch64::LDRHui:
case AArch64::LDRQl:
case AArch64::LDRQroW:
case AArch64::LDRQroX:
case AArch64::LDRQui:
case AArch64::LDRSBWroW:
case AArch64::LDRSBWroX:
case AArch64::LDRSBWui:
case AArch64::LDRSBXroW:
case AArch64::LDRSBXroX:
case AArch64::LDRSBXui:
case AArch64::LDRSHWroW:
case AArch64::LDRSHWroX:
case AArch64::LDRSHWui:
case AArch64::LDRSHXroW:
case AArch64::LDRSHXroX:
case AArch64::LDRSHXui:
case AArch64::LDRSWl:
case AArch64::LDRSWroW:
case AArch64::LDRSWroX:
case AArch64::LDRSWui:
case AArch64::LDRSl:
case AArch64::LDRSroW:
case AArch64::LDRSroX:
case AArch64::LDRSui:
case AArch64::LDRWl:
case AArch64::LDRWroW:
case AArch64::LDRWroX:
case AArch64::LDRWui:
case AArch64::LDRXl:
case AArch64::LDRXroW:
case AArch64::LDRXroX:
case AArch64::LDRXui:
case AArch64::LDURBBi:
case AArch64::LDURBi:
case AArch64::LDURDi:
case AArch64::LDURHHi:
case AArch64::LDURHi:
case AArch64::LDURQi:
case AArch64::LDURSBWi:
case AArch64::LDURSBXi:
case AArch64::LDURSHWi:
case AArch64::LDURSHXi:
case AArch64::LDURSWi:
case AArch64::LDURSi:
case AArch64::LDURWi:
case AArch64::LDURXi:
DestRegIdx = 0;
BaseRegIdx = 1;
OffsetIdx = 2;
IsPrePost = false;
break;
// Pre/post form: destination is operand 1, base is operand 2, offset is
// operand 3.
case AArch64::LDRBBpost:
case AArch64::LDRBBpre:
case AArch64::LDRBpost:
case AArch64::LDRBpre:
case AArch64::LDRDpost:
case AArch64::LDRDpre:
case AArch64::LDRHHpost:
case AArch64::LDRHHpre:
case AArch64::LDRHpost:
case AArch64::LDRHpre:
case AArch64::LDRQpost:
case AArch64::LDRQpre:
case AArch64::LDRSBWpost:
case AArch64::LDRSBWpre:
case AArch64::LDRSBXpost:
case AArch64::LDRSBXpre:
case AArch64::LDRSHWpost:
case AArch64::LDRSHWpre:
case AArch64::LDRSHXpost:
case AArch64::LDRSHXpre:
case AArch64::LDRSWpost:
case AArch64::LDRSWpre:
case AArch64::LDRSpost:
case AArch64::LDRSpre:
case AArch64::LDRWpost:
case AArch64::LDRWpre:
case AArch64::LDRXpost:
case AArch64::LDRXpre:
DestRegIdx = 1;
BaseRegIdx = 2;
OffsetIdx = 3;
IsPrePost = true;
break;
// No destination recorded, base is operand 2, offset is operand 3.
case AArch64::LDNPDi:
case AArch64::LDNPQi:
case AArch64::LDNPSi:
case AArch64::LDPQi:
case AArch64::LDPDi:
case AArch64::LDPSi:
DestRegIdx = -1;
BaseRegIdx = 2;
OffsetIdx = 3;
IsPrePost = false;
break;
// Destination is operand 0, base is operand 2, offset is operand 3.
case AArch64::LDPSWi:
case AArch64::LDPWi:
case AArch64::LDPXi:
DestRegIdx = 0;
BaseRegIdx = 2;
OffsetIdx = 3;
IsPrePost = false;
break;
// Pre/post form: no destination recorded, base is operand 3, offset is
// operand 4.
case AArch64::LDPQpost:
case AArch64::LDPQpre:
case AArch64::LDPDpost:
case AArch64::LDPDpre:
case AArch64::LDPSpost:
case AArch64::LDPSpre:
DestRegIdx = -1;
BaseRegIdx = 3;
OffsetIdx = 4;
IsPrePost = true;
break;
// Pre/post form: destination is operand 1, base is operand 3, offset is
// operand 4.
case AArch64::LDPSWpost:
case AArch64::LDPSWpre:
case AArch64::LDPWpost:
case AArch64::LDPWpre:
case AArch64::LDPXpost:
case AArch64::LDPXpre:
DestRegIdx = 1;
BaseRegIdx = 3;
OffsetIdx = 4;
IsPrePost = true;
break;
}
// Loads from the stack pointer don't get prefetched.
unsigned BaseReg = MI.getOperand(BaseRegIdx).getReg();
if (BaseReg == AArch64::SP || BaseReg == AArch64::WSP)
return None;
// Translate the operand indices chosen above into the actual register and
// operand values; a -1 index yields 0 / nullptr respectively.
LoadInfo LI;
LI.DestReg = DestRegIdx == -1 ? 0 : MI.getOperand(DestRegIdx).getReg();
LI.BaseReg = BaseReg;
LI.BaseRegIdx = BaseRegIdx;
LI.OffsetOpnd = OffsetIdx == -1 ? nullptr : &MI.getOperand(OffsetIdx);
LI.IsPrePost = IsPrePost;
return LI;
}
/// Compute the HW prefetcher tag for the load described by \p LI.
///
/// The destination and base components are the registers' encoding values
/// (0 when no destination is recorded). The offset component is:
///   - 0 when there is no offset operand,
///   - (1 << 5) | encoding of the register for a register offset,
///   - the immediate shifted right by 2 for every other operand kind.
///
/// \p MI is accepted for interface symmetry but is not consulted.
/// \returns None when the offset operand is a global, symbol or constant-pool
/// index; otherwise the packed tag.
static Optional<unsigned> getTag(const TargetRegisterInfo *TRI,
                                 const MachineInstr &MI, const LoadInfo &LI) {
  const unsigned DestBits = LI.DestReg ? TRI->getEncodingValue(LI.DestReg) : 0;
  const unsigned BaseBits = TRI->getEncodingValue(LI.BaseReg);

  unsigned OffsetBits = 0;
  if (const MachineOperand *Opnd = LI.OffsetOpnd) {
    // Offsets only known at link time cannot be turned into a tag.
    if (Opnd->isGlobal() || Opnd->isSymbol() || Opnd->isCPI())
      return None;

    if (Opnd->isReg())
      OffsetBits = (1 << 5) | TRI->getEncodingValue(Opnd->getReg());
    else
      OffsetBits = Opnd->getImm() >> 2;
  }

  return makeTag(DestBits, BaseBits, OffsetBits);
}
/// Detect HW prefetcher tag collisions among the loads of loop \p L and try to
/// resolve them.
///
/// For each strided load whose tag is shared with another load, the base
/// register is copied into a free scratch register whose resulting tag is not
/// yet present in TagMap, and the load is rewritten to use that scratch
/// register. Sets Modified when any instruction is rewritten and updates the
/// NumCollisionsAvoided / NumCollisionsNotAvoided statistics.
void FalkorHWPFFix::runOnLoop(MachineLoop &L, MachineFunction &Fn) {
// Build the initial tag map for the whole loop.
TagMap.clear();
for (MachineBasicBlock *MBB : L.getBlocks())
for (MachineInstr &MI : *MBB) {
Optional<LoadInfo> LInfo = getLoadInfo(MI);
if (!LInfo)
continue;
Optional<unsigned> Tag = getTag(TRI, MI, *LInfo);
if (!Tag)
continue;
TagMap[*Tag].push_back(&MI);
}
// A collision only matters when at least one of the loads sharing a tag is a
// strided access; stop scanning at the first such case.
bool AnyCollisions = false;
for (auto &P : TagMap) {
auto Size = P.second.size();
if (Size > 1) {
for (auto *MI : P.second) {
if (TII->isStridedAccess(*MI)) {
AnyCollisions = true;
break;
}
}
}
if (AnyCollisions)
break;
}
// Nothing to fix.
if (!AnyCollisions)
return;
MachineRegisterInfo &MRI = Fn.getRegInfo();
// Go through all the basic blocks in the current loop and fix any streaming
// loads to avoid collisions with any other loads.
LiveRegUnits LR(*TRI);
for (MachineBasicBlock *MBB : L.getBlocks()) {
LR.clear();
LR.addLiveOuts(*MBB);
// Walk the block bottom-up. LR is stepped over the current instruction in
// the loop increment, so it also runs on every `continue` below.
for (auto I = MBB->rbegin(); I != MBB->rend(); LR.stepBackward(*I), ++I) {
MachineInstr &MI = *I;
if (!TII->isStridedAccess(MI))
continue;
Optional<LoadInfo> OptLdI = getLoadInfo(MI);
if (!OptLdI)
continue;
LoadInfo LdI = *OptLdI;
Optional<unsigned> OptOldTag = getTag(TRI, MI, LdI);
if (!OptOldTag)
continue;
// Only loads that still share their tag with another load need fixing.
auto &OldCollisions = TagMap[*OptOldTag];
if (OldCollisions.size() <= 1)
continue;
bool Fixed = false;
DEBUG(dbgs() << "Attempting to fix tag collision: " << MI);
if (!DebugCounter::shouldExecute(FixCounter)) {
DEBUG(dbgs() << "Skipping fix due to debug counter:\n " << MI);
continue;
}
// Add the non-base registers of MI as live so we don't use them as
// scratch registers.
for (unsigned OpI = 0, OpE = MI.getNumOperands(); OpI < OpE; ++OpI) {
if (OpI == static_cast<unsigned>(LdI.BaseRegIdx))
continue;
MachineOperand &MO = MI.getOperand(OpI);
if (MO.isReg() && MO.readsReg())
LR.addReg(MO.getReg());
}
// Try each 64-bit GPR as scratch register until one is both free here and
// yields a tag that is not in use.
for (unsigned ScratchReg : AArch64::GPR64RegClass) {
if (!LR.available(ScratchReg) || MRI.isReserved(ScratchReg))
continue;
LoadInfo NewLdI(LdI);
NewLdI.BaseReg = ScratchReg;
// getTag() already succeeded for this load above and only the base register
// differs, so the result is dereferenced without a check.
unsigned NewTag = *getTag(TRI, MI, NewLdI);
// Scratch reg tag would collide too, so don't use it.
if (TagMap.count(NewTag))
continue;
DEBUG(dbgs() << "Changing base reg to: " << printReg(ScratchReg, TRI)
<< '\n');
// Rewrite:
// Xd = LOAD Xb, off
// to:
// Xc = MOV Xb
// Xd = LOAD Xc, off
DebugLoc DL = MI.getDebugLoc();
BuildMI(*MBB, &MI, DL, TII->get(AArch64::ORRXrs), ScratchReg)
.addReg(AArch64::XZR)
.addReg(LdI.BaseReg)
.addImm(0);
MachineOperand &BaseOpnd = MI.getOperand(LdI.BaseRegIdx);
BaseOpnd.setReg(ScratchReg);
// If the load does a pre/post increment, then insert a MOV after as
// well to update the real base register.
if (LdI.IsPrePost) {
DEBUG(dbgs() << "Doing post MOV of incremented reg: "
<< printReg(ScratchReg, TRI) << '\n');
MI.getOperand(0).setReg(
ScratchReg); // Change tied operand pre/post update dest.
BuildMI(*MBB, std::next(MachineBasicBlock::iterator(MI)), DL,
TII->get(AArch64::ORRXrs), LdI.BaseReg)
.addReg(AArch64::XZR)
.addReg(ScratchReg)
.addImm(0);
}
// Remove MI from the list of loads sharing its old tag (swap with the last
// element, then pop; order within the list is not preserved).
for (int I = 0, E = OldCollisions.size(); I != E; ++I)
if (OldCollisions[I] == &MI) {
std::swap(OldCollisions[I], OldCollisions[E - 1]);
OldCollisions.pop_back();
break;
}
// Update TagMap to reflect instruction changes to reduce the number
// of later MOVs to be inserted. This needs to be done after
// OldCollisions is updated since it may be relocated by this
// insertion.
TagMap[NewTag].push_back(&MI);
++NumCollisionsAvoided;
Fixed = true;
Modified = true;
break;
}
// No usable scratch register was found for this load.
if (!Fixed)
++NumCollisionsNotAvoided;
}
}
}
/// Run the late tag-collision fix on \p Fn.
///
/// Does nothing (and returns false) unless the subtarget is Falkor and the
/// function is not skipped. Otherwise every loop without sub-loops is handed
/// to runOnLoop().
/// \returns true if any instruction was rewritten.
bool FalkorHWPFFix::runOnMachineFunction(MachineFunction &Fn) {
  const auto &Subtarget =
      static_cast<const AArch64Subtarget &>(Fn.getSubtarget());

  // The processor check comes first so that skipFunction() is only consulted
  // for Falkor.
  if (Subtarget.getProcFamily() != AArch64Subtarget::Falkor ||
      skipFunction(Fn.getFunction()))
    return false;

  TII = static_cast<const AArch64InstrInfo *>(Subtarget.getInstrInfo());
  TRI = Subtarget.getRegisterInfo();
  assert(TRI->trackLivenessAfterRegAlloc(Fn) &&
         "Register liveness not available!");

  Modified = false;
  MachineLoopInfo &Loops = getAnalysis<MachineLoopInfo>();
  for (MachineLoop *TopLevel : Loops) {
    for (MachineLoop *Nested : depth_first(TopLevel)) {
      // Only process inner-loops
      if (Nested->empty())
        runOnLoop(*Nested, Fn);
    }
  }
  return Modified;
}
/// Factory for the late machine-level tag-collision fix pass. The returned
/// pass is heap-allocated with new and handed to the caller.
FunctionPass *llvm::createFalkorHWPFFixPass() {
  FunctionPass *Pass = new FalkorHWPFFix();
  return Pass;
}