Imported Upstream version 6.10.0.49

Former-commit-id: 1d6753294b2993e1fbf92de9366bb9544db4189b
This commit is contained in:
Xamarin Public Jenkins (auto-signing)
2020-01-16 16:38:04 +00:00
parent d94e79959b
commit 468663ddbb
48518 changed files with 2789335 additions and 61176 deletions

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,400 @@
//===- CodeGeneration.cpp - Code generate the Scops using ISL. ---------======//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// The CodeGeneration pass takes a Scop created by ScopInfo and translates it
// back to LLVM-IR using the ISL code generator.
//
// The Scop describes the high level memory behavior of a control flow region.
// Transformation passes can update the schedule (execution order) of statements
// in the Scop. ISL is used to generate an abstract syntax tree that reflects
// the updated execution order. This clast is used to create new LLVM-IR that is
// computationally equivalent to the original control flow region, but executes
// its code in the new execution order defined by the changed schedule.
//
//===----------------------------------------------------------------------===//
#include "polly/CodeGen/CodeGeneration.h"
#include "polly/CodeGen/IRBuilder.h"
#include "polly/CodeGen/IslAst.h"
#include "polly/CodeGen/IslNodeBuilder.h"
#include "polly/CodeGen/PerfMonitor.h"
#include "polly/CodeGen/Utils.h"
#include "polly/DependenceInfo.h"
#include "polly/LinkAllPasses.h"
#include "polly/Options.h"
#include "polly/ScopDetectionDiagnostic.h"
#include "polly/ScopInfo.h"
#include "polly/Support/ScopHelper.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/RegionInfo.h"
#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/Verifier.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include "isl/ast.h"
#include <cassert>
#include <utility>
using namespace llvm;
using namespace polly;
#define DEBUG_TYPE "polly-codegen"
// Command-line flag to run the LLVM IR verifier on every function Polly
// generates (see verifyGeneratedFunction below).
static cl::opt<bool> Verify("polly-codegen-verify",
                            cl::desc("Verify the function generated by Polly"),
                            cl::Hidden, cl::init(false), cl::ZeroOrMore,
                            cl::cat(PollyCategory));

// Backing storage for -polly-codegen-perf-monitoring; read in CodeGen() to
// decide whether run-time performance counters are inserted.
bool polly::PerfMonitoring;

static cl::opt<bool, true>
    XPerfMonitoring("polly-codegen-perf-monitoring",
                    cl::desc("Add run-time performance monitoring"), cl::Hidden,
                    cl::location(polly::PerfMonitoring), cl::init(false),
                    cl::ZeroOrMore, cl::cat(PollyCategory));

// Statistics reported with -stats; all of them are updated in CodeGen().
STATISTIC(ScopsProcessed, "Number of SCoP processed");
STATISTIC(CodegenedScops, "Number of successfully generated SCoPs");
STATISTIC(CodegenedAffineLoops,
          "Number of original affine loops in SCoPs that have been generated");
STATISTIC(CodegenedBoxedLoops,
          "Number of original boxed loops in SCoPs that have been generated");
namespace polly {
/// Mark a basic block unreachable.
///
/// Marks the basic block @p Block unreachable by equipping it with an
/// UnreachableInst.
///
/// @param Block   The basic block to cut off from normal control flow.
/// @param Builder The builder used to create the UnreachableInst.
void markBlockUnreachable(BasicBlock &Block, PollyIRBuilder &Builder) {
  auto *OrigTerminator = Block.getTerminator();
  Builder.SetInsertPoint(OrigTerminator);
  Builder.CreateUnreachable();
  // Erase the old terminator only after the UnreachableInst is in place so
  // the block is never left without a terminator.
  OrigTerminator->eraseFromParent();
}
} // namespace polly
/// Run the LLVM IR verifier on @p F when -polly-codegen-verify is set.
///
/// If verification fails, the SCoP, the isl AST and the broken function are
/// dumped (in debug builds) and the process aborts. Without the flag, or
/// when the function verifies cleanly, this is a no-op.
static void verifyGeneratedFunction(Scop &S, Function &F, IslAstInfo &AI) {
  if (!Verify)
    return;
  if (!verifyFunction(F, &errs()))
    return;

  DEBUG({
    errs() << "== ISL Codegen created an invalid function ==\n\n== The "
              "SCoP ==\n";
    errs() << S;
    errs() << "\n== The isl AST ==\n";
    AI.print(errs());
    errs() << "\n== The invalid function ==\n";
    F.print(errs());
  });

  llvm_unreachable("Polly generated function could not be verified. Add "
                   "-polly-codegen-verify=false to disable this assertion.");
}
/// Attach every basic block unknown to RegionInfo to @p ParentRegion.
///
/// CodeGeneration creates many new basic blocks without updating RegionInfo.
/// To keep the RegionInfo verifier happy we hang all of them flatly off the
/// scop's parent region instead of rebuilding a nested region structure.
static void fixRegionInfo(Function &F, Region &ParentRegion, RegionInfo &RI) {
  for (BasicBlock &BB : F)
    if (!RI.getRegionFor(&BB))
      RI.setRegionFor(&BB, &ParentRegion);
}
/// Remove all lifetime markers (llvm.lifetime.start, llvm.lifetime.end) from
/// @R.
///
/// CodeGeneration does not copy lifetime markers into the optimized SCoP,
/// which would leave them only on the original code path. That can turn
///
///   llvm.lifetime.start(%p)
///   llvm.lifetime.end(%p)
///
/// into
///
///   if (RTC) {
///     // generated code
///   } else {
///     // original code
///     llvm.lifetime.start(%p)
///   }
///   llvm.lifetime.end(%p)
///
/// The current StackColoring algorithm cannot handle the case where only
/// some, but not all, paths from an end marker to the entry block cross the
/// matching start marker (and vice versa). We sidestep any such issue by
/// dropping all lifetime markers, including those in the original code.
///
/// A better solution could be to hoist all llvm.lifetime.start to the split
/// node and all llvm.lifetime.end to the merge node, which should be
/// conservatively correct.
static void removeLifetimeMarkers(Region *R) {
  for (auto *BB : R->blocks()) {
    for (auto It = BB->begin(), End = BB->end(); It != End;) {
      // Advance before a potential erase; erasing the current instruction
      // would otherwise invalidate the iterator.
      auto Cur = It++;

      auto *Intr = dyn_cast<IntrinsicInst>(&*Cur);
      if (!Intr)
        continue;

      Intrinsic::ID ID = Intr->getIntrinsicID();
      if (ID == Intrinsic::lifetime_start || ID == Intrinsic::lifetime_end)
        BB->getInstList().erase(Cur);
    }
  }
}
/// Generate new LLVM-IR for the SCoP @p S behind a run-time check.
///
/// The original region is kept as a fallback path. On success the enclosing
/// function is tagged with the "polly-optimized" attribute so that cleanup
/// passes (see CodegenCleanup) run on it afterwards.
///
/// @param S  The SCoP to code generate.
/// @param AI The isl AST to generate code from.
/// @param LI/DT/SE/RI Analyses kept up to date while emitting IR.
///
/// @return True iff the IR was modified.
static bool CodeGen(Scop &S, IslAstInfo &AI, LoopInfo &LI, DominatorTree &DT,
                    ScalarEvolution &SE, RegionInfo &RI) {
  // Check whether IslAstInfo uses the same isl_ctx. Since -polly-codegen
  // reports itself to preserve DependenceInfo and IslAstInfo, we might get
  // those analysis that were computed by a different ScopInfo for a different
  // Scop structure. When the ScopInfo/Scop object is freed, there is a high
  // probability that the new ScopInfo/Scop object will be created at the same
  // heap position with the same address. Comparing whether the Scop or
  // ScopInfo address is the expected therefore is unreliable.
  // Instead, we compare the address of the isl_ctx object. Both,
  // DependenceInfo and IslAstInfo must hold a reference to the isl_ctx object
  // to ensure it is not freed before the destruction of those analyses which
  // might happen after the destruction of the Scop/ScopInfo they refer to.
  // Hence, the isl_ctx will not be freed and its space not reused as long
  // there is a DependenceInfo or IslAstInfo around.
  IslAst &Ast = AI.getIslAst();
  if (Ast.getSharedIslCtx() != S.getSharedIslCtx()) {
    DEBUG(dbgs() << "Got an IstAst for a different Scop/isl_ctx\n");
    return false;
  }

  // Check if we created an isl_ast root node, otherwise exit.
  isl_ast_node *AstRoot = Ast.getAst();
  if (!AstRoot)
    return false;

  // Collect statistics. Do it before we modify the IR to avoid having it any
  // influence on the result.
  auto ScopStats = S.getStatistics();
  ScopsProcessed++;

  auto &DL = S.getFunction().getParent()->getDataLayout();
  Region *R = &S.getRegion();
  assert(!R->isTopLevelRegion() && "Top level regions are not supported");

  ScopAnnotator Annotator;

  simplifyRegion(R, &DT, &LI, &RI);
  assert(R->isSimple());
  BasicBlock *EnteringBB = S.getEnteringBlock();
  assert(EnteringBB);
  PollyIRBuilder Builder = createPollyIRBuilder(EnteringBB, Annotator);

  // Only build the run-time condition and parameters _after_ having
  // introduced the conditional branch. This is important as the conditional
  // branch will guard the original scop from new induction variables that
  // the SCEVExpander may introduce while code generating the parameters and
  // which may introduce scalar dependences that prevent us from correctly
  // code generating this scop.
  BBPair StartExitBlocks =
      std::get<0>(executeScopConditionally(S, Builder.getTrue(), DT, RI, LI));
  BasicBlock *StartBlock = std::get<0>(StartExitBlocks);
  BasicBlock *ExitBlock = std::get<1>(StartExitBlocks);

  removeLifetimeMarkers(R);
  auto *SplitBlock = StartBlock->getSinglePredecessor();

  IslNodeBuilder NodeBuilder(Builder, Annotator, DL, LI, SE, DT, S, StartBlock);

  // All arrays must have their base pointers known before
  // ScopAnnotator::buildAliasScopes.
  NodeBuilder.allocateNewArrays(StartExitBlocks);
  Annotator.buildAliasScopes(S);

  // Optionally instrument the region with run-time performance counters
  // (-polly-codegen-perf-monitoring).
  if (PerfMonitoring) {
    PerfMonitor P(S, EnteringBB->getParent()->getParent());
    P.initialize();
    P.insertRegionStart(SplitBlock->getTerminator());

    BasicBlock *MergeBlock = ExitBlock->getUniqueSuccessor();
    P.insertRegionEnd(MergeBlock->getTerminator());
  }

  // First generate code for the hoisted invariant loads and transitively the
  // parameters they reference. Afterwards, for the remaining parameters that
  // might reference the hoisted loads. Finally, build the runtime check
  // that might reference both hoisted loads as well as parameters.
  // If the hoisting fails we have to bail and execute the original code.
  Builder.SetInsertPoint(SplitBlock->getTerminator());
  if (!NodeBuilder.preloadInvariantLoads()) {
    // Patch the introduced branch condition to ensure that we always execute
    // the original SCoP.
    auto *FalseI1 = Builder.getFalse();
    auto *SplitBBTerm = Builder.GetInsertBlock()->getTerminator();
    SplitBBTerm->setOperand(0, FalseI1);

    // Since the other branch is hence ignored we mark it as unreachable and
    // adjust the dominator tree accordingly.
    auto *ExitingBlock = StartBlock->getUniqueSuccessor();
    assert(ExitingBlock);
    auto *MergeBlock = ExitingBlock->getUniqueSuccessor();
    assert(MergeBlock);
    markBlockUnreachable(*StartBlock, Builder);
    markBlockUnreachable(*ExitingBlock, Builder);
    auto *ExitingBB = S.getExitingBlock();
    assert(ExitingBB);
    DT.changeImmediateDominator(MergeBlock, ExitingBB);
    DT.eraseNode(ExitingBlock);

    // The AST will not be consumed by NodeBuilder.create(); free it here.
    isl_ast_node_free(AstRoot);
  } else {
    NodeBuilder.addParameters(S.getContext().release());
    Value *RTC = NodeBuilder.createRTC(AI.getRunCondition());

    Builder.GetInsertBlock()->getTerminator()->setOperand(0, RTC);

    // Explicitly set the insert point to the end of the block to avoid that a
    // split at the builder's current
    // insert position would move the malloc calls to the wrong BasicBlock.
    // Ideally we would just split the block during allocation of the new
    // arrays, but this would break the assumption that there are no blocks
    // between polly.start and polly.exiting (at this point).
    Builder.SetInsertPoint(StartBlock->getTerminator());

    NodeBuilder.create(AstRoot);
    NodeBuilder.finalize();
    fixRegionInfo(*EnteringBB->getParent(), *R->getParent(), RI);

    CodegenedScops++;
    CodegenedAffineLoops += ScopStats.NumAffineLoops;
    CodegenedBoxedLoops += ScopStats.NumBoxedLoops;
  }

  Function *F = EnteringBB->getParent();
  verifyGeneratedFunction(S, *F, AI);
  for (auto *SubF : NodeBuilder.getParallelSubfunctions())
    verifyGeneratedFunction(S, *SubF, AI);

  // Mark the function such that we run additional cleanup passes on this
  // function (e.g. mem2reg to rediscover phi nodes).
  F->addFnAttr("polly-optimized");
  return true;
}
namespace {

/// Legacy pass-manager wrapper that drives CodeGen() for each SCoP.
class CodeGeneration : public ScopPass {
public:
  static char ID;

  /// The data layout used.
  const DataLayout *DL;

  /// @name The analysis passes we need to generate code.
  ///
  ///{
  LoopInfo *LI;
  IslAstInfo *AI;
  DominatorTree *DT;
  ScalarEvolution *SE;
  RegionInfo *RI;
  ///}

  CodeGeneration() : ScopPass(ID) {}

  /// Generate LLVM-IR for the SCoP @p S.
  bool runOnScop(Scop &S) override {
    // Skip SCoPs in case they're already code-generated by PPCGCodeGeneration.
    if (S.isToBeSkipped())
      return false;

    AI = &getAnalysis<IslAstInfoWrapperPass>().getAI();
    LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
    DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
    SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
    DL = &S.getFunction().getParent()->getDataLayout();
    RI = &getAnalysis<RegionInfoPass>().getRegionInfo();
    return CodeGen(S, *AI, *LI, *DT, *SE, *RI);
  }

  /// Register all analyses and transformation required.
  void getAnalysisUsage(AnalysisUsage &AU) const override {
    ScopPass::getAnalysisUsage(AU);

    AU.addRequired<DominatorTreeWrapperPass>();
    AU.addRequired<IslAstInfoWrapperPass>();
    AU.addRequired<RegionInfoPass>();
    AU.addRequired<ScalarEvolutionWrapperPass>();
    AU.addRequired<ScopDetectionWrapperPass>();
    AU.addRequired<ScopInfoRegionPass>();
    AU.addRequired<LoopInfoWrapperPass>();

    // These are preserved so that CodeGen() can detect stale results via the
    // isl_ctx comparison (see the comment at the top of CodeGen()).
    AU.addPreserved<DependenceInfo>();
    AU.addPreserved<IslAstInfoWrapperPass>();

    // FIXME: We do not yet add regions for the newly generated code to the
    // region tree.
  }
};

} // namespace
/// New pass-manager entry point: run CodeGen() on @p S and report which
/// analyses survive.
PreservedAnalyses CodeGenerationPass::run(Scop &S, ScopAnalysisManager &SAM,
                                          ScopStandardAnalysisResults &AR,
                                          SPMUpdater &U) {
  auto &AI = SAM.getResult<IslAstAnalysis>(S, AR);
  bool Changed = CodeGen(S, AI, AR.LI, AR.DT, AR.SE, AR.RI);
  if (!Changed)
    return PreservedAnalyses::all();

  // The SCoP was rewritten; nothing computed about it remains valid.
  U.invalidateScop(S);
  return PreservedAnalyses::none();
}
char CodeGeneration::ID = 1;

/// Create a new instance of the legacy CodeGeneration pass.
Pass *polly::createCodeGenerationPass() { return new CodeGeneration(); }

// Register the pass and its analysis dependencies with the legacy pass
// manager under the name -polly-codegen.
INITIALIZE_PASS_BEGIN(CodeGeneration, "polly-codegen",
                      "Polly - Create LLVM-IR from SCoPs", false, false);
INITIALIZE_PASS_DEPENDENCY(DependenceInfo);
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass);
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass);
INITIALIZE_PASS_DEPENDENCY(RegionInfoPass);
INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass);
INITIALIZE_PASS_DEPENDENCY(ScopDetectionWrapperPass);
INITIALIZE_PASS_END(CodeGeneration, "polly-codegen",
                    "Polly - Create LLVM-IR from SCoPs", false, false)

View File

@@ -0,0 +1,139 @@
//===- CodegenCleanup.cpp -------------------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
#include "polly/CodeGen/CodegenCleanup.h"
#include "llvm/Analysis/ScopedNoAliasAA.h"
#include "llvm/Analysis/TypeBasedAliasAnalysis.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/PassInfo.h"
#include "llvm/PassRegistry.h"
#include "llvm/PassSupport.h"
#include "llvm/Support/Debug.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Scalar/GVN.h"
#define DEBUG_TYPE "polly-cleanup"
using namespace llvm;
using namespace polly;
namespace {

/// Function pass that re-runs a fixed mid-level optimization pipeline, but
/// only on functions Polly actually changed (detected via the
/// "polly-optimized" attribute set by CodeGen()).
class CodegenCleanup : public FunctionPass {
private:
  CodegenCleanup(const CodegenCleanup &) = delete;
  const CodegenCleanup &operator=(const CodegenCleanup &) = delete;

  // Nested pass manager holding the cleanup pipeline. Created in
  // doInitialization() and destroyed in doFinalization().
  llvm::legacy::FunctionPassManager *FPM;

public:
  static char ID;

  explicit CodegenCleanup() : FunctionPass(ID), FPM(nullptr) {}

  /// @name FunctionPass interface
  //@{
  virtual void getAnalysisUsage(llvm::AnalysisUsage &AU) const override {}

  /// Build the cleanup pipeline once per module.
  virtual bool doInitialization(Module &M) override {
    assert(!FPM);

    FPM = new llvm::legacy::FunctionPassManager(&M);

    // TODO: How to make parent passes discoverable?
    // TODO: Should be sensitive to compiler options in PassManagerBuilder, to
    // which we do not have access here.
    FPM->add(createScopedNoAliasAAWrapperPass());
    FPM->add(createTypeBasedAAWrapperPass());
    FPM->add(createAAResultsWrapperPass());

    // TODO: These are non-conditional passes that run between
    // EP_ModuleOptimizerEarly and EP_VectorizerStart just to ensure we do not
    // miss any optimization that would have run after Polly with
    // -polly-position=early. This can probably be reduced to a more compact
    // set of passes.
    // NOTE: The order of the passes below mirrors the default pipeline and is
    // intentional; do not reorder without a reason.
    FPM->add(createCFGSimplificationPass());
    FPM->add(createSROAPass());
    FPM->add(createEarlyCSEPass());
    FPM->add(createPromoteMemoryToRegisterPass());
    FPM->add(createInstructionCombiningPass(true));
    FPM->add(createCFGSimplificationPass());
    FPM->add(createSROAPass());
    FPM->add(createEarlyCSEPass(true));
    FPM->add(createSpeculativeExecutionIfHasBranchDivergencePass());
    FPM->add(createJumpThreadingPass());
    FPM->add(createCorrelatedValuePropagationPass());
    FPM->add(createCFGSimplificationPass());
    FPM->add(createInstructionCombiningPass(true));
    FPM->add(createLibCallsShrinkWrapPass());
    FPM->add(createTailCallEliminationPass());
    FPM->add(createCFGSimplificationPass());
    FPM->add(createReassociatePass());
    FPM->add(createLoopRotatePass(-1));
    FPM->add(createGVNPass());
    FPM->add(createLICMPass());
    FPM->add(createLoopUnswitchPass());
    FPM->add(createCFGSimplificationPass());
    FPM->add(createInstructionCombiningPass(true));
    FPM->add(createIndVarSimplifyPass());
    FPM->add(createLoopIdiomPass());
    FPM->add(createLoopDeletionPass());
    FPM->add(createCFGSimplificationPass());
    FPM->add(createSimpleLoopUnrollPass(3));
    FPM->add(createMergedLoadStoreMotionPass());
    FPM->add(createGVNPass());
    FPM->add(createMemCpyOptPass());
    FPM->add(createSCCPPass());
    FPM->add(createBitTrackingDCEPass());
    FPM->add(createInstructionCombiningPass(true));
    FPM->add(createJumpThreadingPass());
    FPM->add(createCorrelatedValuePropagationPass());
    FPM->add(createDeadStoreEliminationPass());
    FPM->add(createLICMPass());
    FPM->add(createAggressiveDCEPass());
    FPM->add(createCFGSimplificationPass());
    FPM->add(createInstructionCombiningPass(true));
    FPM->add(createFloat2IntPass());

    return FPM->doInitialization();
  }

  /// Tear down the nested pass manager.
  virtual bool doFinalization(Module &M) override {
    bool Result = FPM->doFinalization();

    delete FPM;
    FPM = nullptr;

    return Result;
  }

  /// Run the cleanup pipeline, but only on Polly-optimized functions.
  virtual bool runOnFunction(llvm::Function &F) override {
    if (!F.hasFnAttribute("polly-optimized")) {
      DEBUG(dbgs() << F.getName()
                   << ": Skipping cleanup because Polly did not optimize it.");
      return false;
    }

    DEBUG(dbgs() << F.getName() << ": Running codegen cleanup...");
    return FPM->run(F);
  }
  //@}
};

char CodegenCleanup::ID;
} // namespace
/// Create a new instance of the CodegenCleanup pass.
FunctionPass *polly::createCodegenCleanupPass() { return new CodegenCleanup(); }
// Register the pass with the legacy pass manager under -polly-cleanup.
INITIALIZE_PASS_BEGIN(CodegenCleanup, "polly-cleanup",
                      "Polly - Cleanup after code generation", false, false)
INITIALIZE_PASS_END(CodegenCleanup, "polly-cleanup",
                    "Polly - Cleanup after code generation", false, false)

View File

@@ -0,0 +1,256 @@
//===------ PollyIRBuilder.cpp --------------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// The Polly IRBuilder file contains Polly specific extensions for the IRBuilder
// that are used e.g. to emit the llvm.loop.parallel metadata.
//
//===----------------------------------------------------------------------===//
#include "polly/CodeGen/IRBuilder.h"
#include "polly/ScopInfo.h"
#include "polly/Support/ScopHelper.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Metadata.h"
#include "llvm/Support/Debug.h"
using namespace llvm;
using namespace polly;
// Upper bound on the number of arrays for which alias scopes are built; the
// construction in buildAliasScopes() is quadratic in the number of arrays.
static const int MaxArraysInAliasScops = 10;
/// Get a self referencing id metadata node.
///
/// The MDNode looks like this (if arg0/arg1 are not null):
///
///    '!n = metadata !{metadata !n, arg0, arg1}'
///
/// @return The self referencing id metadata node.
static MDNode *getID(LLVMContext &Ctx, Metadata *arg0 = nullptr,
                     Metadata *arg1 = nullptr) {
  SmallVector<Metadata *, 3> Ops;

  // A temporary node provides a unique pointer that occupies operand 0 until
  // the final node exists; it is then swapped for the self-reference.
  auto Placeholder = MDNode::getTemporary(Ctx, None);
  Ops.push_back(Placeholder.get());

  if (arg0)
    Ops.push_back(arg0);
  if (arg1)
    Ops.push_back(arg1);

  MDNode *Self = MDNode::get(Ctx, Ops);
  Self->replaceOperandWith(0, Self);
  return Self;
}
// SE and AliasScopeDomain stay null until buildAliasScopes() is called.
ScopAnnotator::ScopAnnotator() : SE(nullptr), AliasScopeDomain(nullptr) {}
/// Build one alias scope per array of @p S plus, for each array, the list of
/// all other arrays' scopes ("noalias" list). Results are stored in
/// AliasScopeMap / OtherAliasScopeListMap keyed by base pointer.
void ScopAnnotator::buildAliasScopes(Scop &S) {
  SE = S.getSE();

  LLVMContext &Ctx = SE->getContext();
  AliasScopeDomain = getID(Ctx, MDString::get(Ctx, "polly.alias.scope.domain"));

  AliasScopeMap.clear();
  OtherAliasScopeListMap.clear();

  // We are only interested in arrays, but no scalar references. Scalars
  // should be handled easily by basicaa.
  SmallVector<ScopArrayInfo *, 10> ArrayInfos;
  for (ScopArrayInfo *SAI : S.arrays())
    if (SAI->isArrayKind())
      ArrayInfos.push_back(SAI);

  // The construction of alias scopes is quadratic in the number of arrays
  // involved. In case of too many arrays, skip the construction of alias
  // information to avoid quadratic increases in compile time and code size.
  if (ArrayInfos.size() > MaxArraysInAliasScops)
    return;

  std::string Prefix = "polly.alias.scope.";
  for (const ScopArrayInfo *SAI : ArrayInfos) {
    assert(SAI->getBasePtr() && "Base pointer must be present");
    AliasScopeMap[SAI->getBasePtr()] =
        getID(Ctx, AliasScopeDomain,
              MDString::get(Ctx, (Prefix + SAI->getName()).c_str()));
  }

  for (const ScopArrayInfo *SAI : ArrayInfos) {
    MDNode *Others = MDNode::get(Ctx, {});
    for (const auto &Entry : AliasScopeMap) {
      if (SAI->getBasePtr() == Entry.first)
        continue;
      Metadata *Scope = Entry.second;
      Others = MDNode::concatenate(Others, MDNode::get(Ctx, Scope));
    }
    OtherAliasScopeListMap[SAI->getBasePtr()] = Others;
  }
}
/// Record that code generation entered loop @p L; for parallel loops, also
/// extend the stack of parallel-loop id lists used to annotate accesses.
void ScopAnnotator::pushLoop(Loop *L, bool IsParallel) {
  ActiveLoops.push_back(L);

  if (!IsParallel)
    return;

  BasicBlock *Header = L->getHeader();
  MDNode *Id = getID(Header->getContext());
  assert(Id->getOperand(0) == Id && "Expected Id to be a self-reference");
  assert(Id->getNumOperands() == 1 && "Unexpected extra operands in Id");

  // Each entry holds the ids of all parallel loops enclosing this one.
  MDNode *AllIds;
  if (ParallelLoops.empty())
    AllIds = Id;
  else
    AllIds = MDNode::concatenate(ParallelLoops.back(), Id);
  ParallelLoops.push_back(AllIds);
}
/// Undo the bookkeeping of the matching pushLoop() call.
void ScopAnnotator::popLoop(bool IsParallel) {
  ActiveLoops.pop_back();

  if (IsParallel) {
    assert(!ParallelLoops.empty() && "Expected a parallel loop to pop");
    ParallelLoops.pop_back();
  }
}
/// Attach "llvm.loop" metadata to the latch branch @p B: a vectorizer-disable
/// entry when @p IsLoopVectorizerDisabled, and this loop's parallel id when
/// @p IsParallel.
void ScopAnnotator::annotateLoopLatch(BranchInst *B, Loop *L, bool IsParallel,
                                      bool IsLoopVectorizerDisabled) const {
  MDNode *LoopMD = nullptr;

  if (IsLoopVectorizerDisabled) {
    LLVMContext &Ctx = SE->getContext();
    SmallVector<Metadata *, 3> Ops;
    Ops.push_back(MDString::get(Ctx, "llvm.loop.vectorize.enable"));
    auto *FalseValue = ConstantInt::get(Type::getInt1Ty(Ctx), 0);
    Ops.push_back(ValueAsMetadata::get(FalseValue));
    LoopMD = MDNode::concatenate(LoopMD, getID(Ctx, MDNode::get(Ctx, Ops)));
  }

  if (IsParallel) {
    assert(!ParallelLoops.empty() && "Expected a parallel loop to annotate");
    MDNode *Ids = ParallelLoops.back();
    // The innermost loop's id is the last operand of the id list.
    MDNode *Self = cast<MDNode>(Ids->getOperand(Ids->getNumOperands() - 1));
    LoopMD = MDNode::concatenate(LoopMD, Self);
  }

  B->setMetadata("llvm.loop", LoopMD);
}
/// Get the pointer operand of a memory access.
///
/// @param Inst The instruction to be analyzed.
/// @return The pointer operand in case @p Inst is a memory access
///         instruction and nullptr otherwise.
static llvm::Value *getMemAccInstPointerOperand(Instruction *Inst) {
  if (auto MemInst = MemAccInst::dyn_cast(Inst))
    return MemInst.getPointerOperand();
  return nullptr;
}
/// Attach second-level alias metadata to @p Inst.
///
/// Creates (and caches, keyed by the access's pointer SCEV) a more precise
/// alias scope derived from the base pointer's scope, then annotates the
/// instruction with it.
///
/// @param Inst    The memory access instruction to annotate.
/// @param BasePtr The base pointer of the accessed array.
void ScopAnnotator::annotateSecondLevel(llvm::Instruction *Inst,
                                        llvm::Value *BasePtr) {
  // Bail out before querying ScalarEvolution: getMemAccInstPointerOperand()
  // may return nullptr, and the original code only checked for null AFTER
  // the value had already been passed to getSCEV/getPointerBase.
  Value *Ptr = getMemAccInstPointerOperand(Inst);
  if (!Ptr)
    return;

  auto *PtrSCEV = SE->getSCEV(Ptr);
  auto *BasePtrSCEV = SE->getPointerBase(PtrSCEV);

  auto SecondLevelAliasScope = SecondLevelAliasScopeMap.lookup(PtrSCEV);
  auto SecondLevelOtherAliasScopeList =
      SecondLevelOtherAliasScopeListMap.lookup(PtrSCEV);
  if (!SecondLevelAliasScope) {
    // No cached scope yet: derive one from the base pointer's first-level
    // scope and remember it for subsequent accesses to the same pointer.
    auto AliasScope = AliasScopeMap.lookup(BasePtr);
    if (!AliasScope)
      return;
    LLVMContext &Ctx = SE->getContext();
    SecondLevelAliasScope = getID(
        Ctx, AliasScope, MDString::get(Ctx, "second level alias metadata"));
    SecondLevelAliasScopeMap[PtrSCEV] = SecondLevelAliasScope;
    Metadata *Args = {SecondLevelAliasScope};
    auto SecondLevelBasePtrAliasScopeList =
        SecondLevelAliasScopeMap.lookup(BasePtrSCEV);
    SecondLevelAliasScopeMap[BasePtrSCEV] = MDNode::concatenate(
        SecondLevelBasePtrAliasScopeList, MDNode::get(Ctx, Args));
    auto OtherAliasScopeList = OtherAliasScopeListMap.lookup(BasePtr);
    SecondLevelOtherAliasScopeList = MDNode::concatenate(
        OtherAliasScopeList, SecondLevelBasePtrAliasScopeList);
    SecondLevelOtherAliasScopeListMap[PtrSCEV] = SecondLevelOtherAliasScopeList;
  }
  Inst->setMetadata("alias.scope", SecondLevelAliasScope);
  Inst->setMetadata("noalias", SecondLevelOtherAliasScopeList);
}
/// Annotate the memory access @p Inst with parallel-loop and alias-scope
/// metadata, if applicable.
void ScopAnnotator::annotate(Instruction *Inst) {
  if (!Inst->mayReadOrWriteMemory())
    return;

  // Inside parallel loops, mark the access as safe to execute in parallel.
  if (!ParallelLoops.empty())
    Inst->setMetadata("llvm.mem.parallel_loop_access", ParallelLoops.back());

  // TODO: Use the ScopArrayInfo once available here.
  if (!AliasScopeDomain)
    return;

  // Do not apply annotations on memory operations that take more than one
  // pointer. It would be ambiguous to which pointer the annotation applies.
  // FIXME: How can we specify annotations for all pointer arguments?
  if (isa<CallInst>(Inst) && !isa<MemSetInst>(Inst))
    return;

  auto *Ptr = getMemAccInstPointerOperand(Inst);
  if (!Ptr)
    return;

  // Find the base pointer of the access via ScalarEvolution.
  auto *PtrSCEV = SE->getSCEV(Ptr);
  auto *BaseSCEV = SE->getPointerBase(PtrSCEV);
  auto *SU = dyn_cast<SCEVUnknown>(BaseSCEV);

  if (!SU)
    return;

  auto *BasePtr = SU->getValue();

  if (!BasePtr)
    return;

  auto AliasScope = AliasScopeMap.lookup(BasePtr);

  if (!AliasScope) {
    // Fall back to an alternative base pointer — presumably one remapped
    // during code generation; TODO confirm against the producer of
    // AlternativeAliasBases.
    BasePtr = AlternativeAliasBases.lookup(BasePtr);
    if (!BasePtr)
      return;

    AliasScope = AliasScopeMap.lookup(BasePtr);
    if (!AliasScope)
      return;
  }

  assert(OtherAliasScopeListMap.count(BasePtr) &&
         "BasePtr either expected in AliasScopeMap and OtherAlias...Map");
  auto *OtherAliasScopeList = OtherAliasScopeListMap[BasePtr];

  // Base pointers known to be free of inter-iteration aliasing get the more
  // precise second-level scopes instead.
  if (InterIterationAliasFreeBasePtrs.count(BasePtr)) {
    annotateSecondLevel(Inst, BasePtr);
    return;
  }

  Inst->setMetadata("alias.scope", AliasScope);
  Inst->setMetadata("noalias", OtherAliasScopeList);
}
/// Register @p BasePtr as free of inter-iteration aliasing so its accesses
/// get second-level alias metadata; null pointers are ignored.
void ScopAnnotator::addInterIterationAliasFreeBasePtr(llvm::Value *BasePtr) {
  if (BasePtr)
    InterIterationAliasFreeBasePtrs.insert(BasePtr);
}

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,381 @@
//===------ LoopGenerators.cpp - IR helper to create loops ---------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file contains functions to create scalar and parallel loops as LLVM-IR.
//
//===----------------------------------------------------------------------===//
#include "polly/CodeGen/LoopGenerators.h"
#include "polly/ScopDetection.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
using namespace llvm;
using namespace polly;
// Thread count handed to GOMP_parallel_loop_runtime_start in
// createCallSpawnThreads(); 0 lets the runtime decide.
static cl::opt<int>
    PollyNumThreads("polly-num-threads",
                    cl::desc("Number of threads to use (0 = auto)"), cl::Hidden,
                    cl::init(0));
// We generate a loop of either of the following structures:
//
// BeforeBB BeforeBB
// | |
// v v
// GuardBB PreHeaderBB
// / | | _____
// __ PreHeaderBB | v \/ |
// / \ / | HeaderBB latch
// latch HeaderBB | |\ |
// \ / \ / | \------/
// < \ / |
// \ / v
// ExitBB ExitBB
//
// depending on whether or not we know that it is executed at least once. If
// not, GuardBB checks if the loop is executed at least once. If this is the
// case we branch to PreHeaderBB and subsequently to the HeaderBB, which
// contains the loop iv 'polly.indvar', the incremented loop iv
// 'polly.indvar_next' as well as the condition to check if we execute another
// iteration of the loop. After the loop has finished, we branch to ExitBB.
// We expect the type of UB, LB, UB+Stride to be large enough for values that
// UB may take throughout the execution of the loop, including the computation
// of indvar + Stride before the final abort.
/// Create a scalar loop from LB to UB with step Stride (see the diagram in
/// the comment above for the two possible CFG shapes).
///
/// @param LB/UB/Stride Loop bounds and step; LB and UB must share a type.
/// @param ExitBB       Out-parameter: receives the created exit block.
/// @param Predicate    Comparison used for both the guard and the latch.
/// @param Annotator    Optional annotator notified about the new loop.
/// @param UseGuard     Whether a guard block must check for zero iterations.
///
/// @return The induction variable; the builder is left positioned inside the
///         header so the loop body can be emitted.
Value *polly::createLoop(Value *LB, Value *UB, Value *Stride,
                         PollyIRBuilder &Builder, LoopInfo &LI,
                         DominatorTree &DT, BasicBlock *&ExitBB,
                         ICmpInst::Predicate Predicate,
                         ScopAnnotator *Annotator, bool Parallel, bool UseGuard,
                         bool LoopVectDisabled) {
  Function *F = Builder.GetInsertBlock()->getParent();
  LLVMContext &Context = F->getContext();

  assert(LB->getType() == UB->getType() && "Types of loop bounds do not match");
  IntegerType *LoopIVType = dyn_cast<IntegerType>(UB->getType());
  assert(LoopIVType && "UB is not integer?");

  BasicBlock *BeforeBB = Builder.GetInsertBlock();
  // GuardBB is only needed when the loop may execute zero times.
  BasicBlock *GuardBB =
      UseGuard ? BasicBlock::Create(Context, "polly.loop_if", F) : nullptr;
  BasicBlock *HeaderBB = BasicBlock::Create(Context, "polly.loop_header", F);
  BasicBlock *PreHeaderBB =
      BasicBlock::Create(Context, "polly.loop_preheader", F);

  // Update LoopInfo
  Loop *OuterLoop = LI.getLoopFor(BeforeBB);
  Loop *NewLoop = LI.AllocateLoop();

  if (OuterLoop)
    OuterLoop->addChildLoop(NewLoop);
  else
    LI.addTopLevelLoop(NewLoop);

  if (OuterLoop) {
    if (GuardBB)
      OuterLoop->addBasicBlockToLoop(GuardBB, LI);
    OuterLoop->addBasicBlockToLoop(PreHeaderBB, LI);
  }

  NewLoop->addBasicBlockToLoop(HeaderBB, LI);

  // Notify the annotator (if present) that we have a new loop, but only
  // after the header block is set.
  if (Annotator)
    Annotator->pushLoop(NewLoop, Parallel);

  // ExitBB
  ExitBB = SplitBlock(BeforeBB, &*Builder.GetInsertPoint(), &DT, &LI);
  ExitBB->setName("polly.loop_exit");

  // BeforeBB
  if (GuardBB) {
    BeforeBB->getTerminator()->setSuccessor(0, GuardBB);
    DT.addNewBlock(GuardBB, BeforeBB);

    // GuardBB: skip the loop entirely when it would run zero iterations.
    Builder.SetInsertPoint(GuardBB);
    Value *LoopGuard;
    LoopGuard = Builder.CreateICmp(Predicate, LB, UB);
    LoopGuard->setName("polly.loop_guard");
    Builder.CreateCondBr(LoopGuard, PreHeaderBB, ExitBB);
    DT.addNewBlock(PreHeaderBB, GuardBB);
  } else {
    BeforeBB->getTerminator()->setSuccessor(0, PreHeaderBB);
    DT.addNewBlock(PreHeaderBB, BeforeBB);
  }

  // PreHeaderBB
  Builder.SetInsertPoint(PreHeaderBB);
  Builder.CreateBr(HeaderBB);

  // HeaderBB: induction variable, increment, and exit condition.
  DT.addNewBlock(HeaderBB, PreHeaderBB);
  Builder.SetInsertPoint(HeaderBB);
  PHINode *IV = Builder.CreatePHI(LoopIVType, 2, "polly.indvar");
  IV->addIncoming(LB, PreHeaderBB);
  Stride = Builder.CreateZExtOrBitCast(Stride, LoopIVType);
  Value *IncrementedIV = Builder.CreateNSWAdd(IV, Stride, "polly.indvar_next");
  Value *LoopCondition =
      Builder.CreateICmp(Predicate, IncrementedIV, UB, "polly.loop_cond");

  // Create the loop latch and annotate it as such.
  BranchInst *B = Builder.CreateCondBr(LoopCondition, HeaderBB, ExitBB);
  if (Annotator)
    Annotator->annotateLoopLatch(B, NewLoop, Parallel, LoopVectDisabled);

  IV->addIncoming(IncrementedIV, HeaderBB);
  if (GuardBB)
    DT.changeImmediateDominator(ExitBB, GuardBB);
  else
    DT.changeImmediateDominator(ExitBB, HeaderBB);

  // The loop body should be added here.
  Builder.SetInsertPoint(HeaderBB->getFirstNonPHI());
  return IV;
}
/// Outline the loop body into a subfunction and emit the GOMP runtime calls
/// that execute it in parallel.
///
/// @param UsedValues Values referenced by the body; they are marshalled into
///                   a struct and passed to the subfunction.
/// @param Map        Receives the mapping from old to new values.
/// @param LoopBody   Out-parameter: insertion point for the loop body inside
///                   the subfunction.
///
/// @return The induction variable of the generated loop.
Value *ParallelLoopGenerator::createParallelLoop(
    Value *LB, Value *UB, Value *Stride, SetVector<Value *> &UsedValues,
    ValueMapT &Map, BasicBlock::iterator *LoopBody) {
  Function *SubFn;

  // Marshal all values the body needs into a single struct parameter.
  AllocaInst *Struct = storeValuesIntoStruct(UsedValues);
  BasicBlock::iterator BeforeLoop = Builder.GetInsertPoint();
  Value *IV = createSubFn(Stride, Struct, UsedValues, Map, &SubFn);
  *LoopBody = Builder.GetInsertPoint();
  Builder.SetInsertPoint(&*BeforeLoop);

  Value *SubFnParam = Builder.CreateBitCast(Struct, Builder.getInt8PtrTy(),
                                            "polly.par.userContext");

  // Add one as the upper bound provided by OpenMP is a < comparison
  // whereas the codegenForSequential function creates a <= comparison.
  UB = Builder.CreateAdd(UB, ConstantInt::get(LongType, 1));

  // Tell the runtime we start a parallel loop
  createCallSpawnThreads(SubFn, SubFnParam, LB, UB, Stride);
  // The calling thread takes part in the work by invoking the subfunction
  // directly, then waits for the spawned threads.
  Builder.CreateCall(SubFn, SubFnParam);
  createCallJoinThreads();

  return IV;
}
// Emit a call to GOMP_parallel_loop_runtime_start, declaring the runtime
// entry point on first use.
void ParallelLoopGenerator::createCallSpawnThreads(Value *SubFn,
                                                   Value *SubFnParam, Value *LB,
                                                   Value *UB, Value *Stride) {
  const std::string Name = "GOMP_parallel_loop_runtime_start";
  Function *F = M->getFunction(Name);

  if (!F) {
    // Declare:
    //   void GOMP_parallel_loop_runtime_start(
    //       void (*)(i8*), i8*, i32, long, long, long)
    Type *SubFnTy = PointerType::getUnqual(FunctionType::get(
        Builder.getVoidTy(), Builder.getInt8PtrTy(), false));
    Type *Params[] = {SubFnTy,
                      Builder.getInt8PtrTy(),
                      Builder.getInt32Ty(),
                      LongType,
                      LongType,
                      LongType};
    FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Params, false);
    F = Function::Create(Ty, Function::ExternalLinkage, Name, M);
  }

  Value *NumThreads = Builder.getInt32(PollyNumThreads);
  Value *Args[] = {SubFn, SubFnParam, NumThreads, LB, UB, Stride};
  Builder.CreateCall(F, Args);
}
// Ask the OpenMP runtime for the next chunk of iterations via
// GOMP_loop_runtime_next; the chunk bounds are written through LBPtr/UBPtr.
// Returns an i1-like comparison that is true iff more work exists.
Value *ParallelLoopGenerator::createCallGetWorkItem(Value *LBPtr,
                                                    Value *UBPtr) {
  const std::string Name = "GOMP_loop_runtime_next";
  Function *F = M->getFunction(Name);

  if (!F) {
    // Declare: i8 GOMP_loop_runtime_next(long*, long*)
    Type *Params[] = {LongType->getPointerTo(), LongType->getPointerTo()};
    FunctionType *Ty = FunctionType::get(Builder.getInt8Ty(), Params, false);
    F = Function::Create(Ty, Function::ExternalLinkage, Name, M);
  }

  Value *Ret = Builder.CreateCall(F, {LBPtr, UBPtr});
  // Compare against zero: non-zero means another schedule chunk exists.
  Value *Zero = Builder.CreateZExt(Builder.getFalse(), Ret->getType());
  return Builder.CreateICmpNE(Ret, Zero);
}
// Emit a call to GOMP_parallel_end, which waits for all worker threads of
// the current parallel region to finish. Declares the function on first use.
void ParallelLoopGenerator::createCallJoinThreads() {
  const std::string Name = "GOMP_parallel_end";
  Function *F = M->getFunction(Name);

  if (!F) {
    // Declare: void GOMP_parallel_end(void)
    FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), false);
    F = Function::Create(Ty, Function::ExternalLinkage, Name, M);
  }

  Builder.CreateCall(F, {});
}
// Emit a call to GOMP_loop_end_nowait, notifying the runtime that this
// thread finished its share of the loop without a barrier.
void ParallelLoopGenerator::createCallCleanupThread() {
  const std::string Name = "GOMP_loop_end_nowait";
  Function *F = M->getFunction(Name);

  if (!F) {
    // Declare: void GOMP_loop_end_nowait(void)
    FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), false);
    F = Function::Create(Ty, Function::ExternalLinkage, Name, M);
  }

  Builder.CreateCall(F, {});
}
// Create the (empty) definition of the worker subfunction. It has internal
// linkage, takes a single i8* context argument, and returns void.
Function *ParallelLoopGenerator::createSubFnDefinition() {
  Function *Caller = Builder.GetInsertBlock()->getParent();
  FunctionType *FT = FunctionType::get(Builder.getVoidTy(),
                                       {Builder.getInt8PtrTy()}, false);
  Function *SubFn = Function::Create(FT, Function::InternalLinkage,
                                     Caller->getName() + "_polly_subfn", M);

  // Certain backends (e.g., NVPTX) do not support '.'s in function names.
  // Hence, we ensure that all '.'s are replaced by '_'s.
  std::string FixedName = SubFn->getName();
  std::replace(FixedName.begin(), FixedName.end(), '.', '_');
  SubFn->setName(FixedName);

  // Do not run any polly pass on the new function.
  SubFn->addFnAttr(PollySkipFnAttr);

  SubFn->arg_begin()->setName("polly.par.userContext");
  return SubFn;
}
// Pack `Values` into a newly allocated struct (one field per value, in
// order) and return the alloca holding it.
AllocaInst *
ParallelLoopGenerator::storeValuesIntoStruct(SetVector<Value *> &Values) {
  SmallVector<Type *, 8> FieldTys;
  for (Value *V : Values)
    FieldTys.push_back(V->getType());
  StructType *Ty = StructType::get(Builder.getContext(), FieldTys);

  const DataLayout &DL = Builder.GetInsertBlock()->getModule()->getDataLayout();

  // We do not want to allocate the alloca inside any loop, thus we allocate
  // it in the entry block of the function and use annotations to denote the
  // actual live span (similar to clang).
  BasicBlock &EntryBB = Builder.GetInsertBlock()->getParent()->getEntryBlock();
  Instruction *IP = &*EntryBB.getFirstInsertionPt();
  AllocaInst *Struct = new AllocaInst(Ty, DL.getAllocaAddrSpace(), nullptr,
                                      "polly.par.userContext", IP);

  unsigned Idx = 0;
  for (Value *V : Values) {
    Value *FieldAddr = Builder.CreateStructGEP(Ty, Struct, Idx++);
    FieldAddr->setName("polly.subfn.storeaddr." + V->getName());
    Builder.CreateStore(V, FieldAddr);
  }

  return Struct;
}
// Reload each value previously packed by storeValuesIntoStruct and record
// the old-value -> reloaded-value mapping in `Map`. Field order must match
// the order used when storing.
void ParallelLoopGenerator::extractValuesFromStruct(
    SetVector<Value *> OldValues, Type *Ty, Value *Struct, ValueMapT &Map) {
  unsigned Idx = 0;
  for (Value *Old : OldValues) {
    Value *FieldAddr = Builder.CreateStructGEP(Ty, Struct, Idx++);
    Value *Reloaded = Builder.CreateLoad(FieldAddr);
    Reloaded->setName("polly.subfunc.arg." + Old->getName());
    Map[Old] = Reloaded;
  }
}
// Create the subfunction executed by every worker thread of the parallel
// loop. Its CFG is:
//
//   polly.par.setup:        allocas for chunk bounds, unpack context struct
//   polly.par.checkNext:    ask the runtime for the next chunk of work
//   polly.par.loadIVBounds: load [LB, UB] of the chunk, fall into the loop
//   polly.par.exit:         notify the runtime and return
//
// @param Stride     The loop increment.
// @param StructData The alloca holding all values used inside the loop.
// @param Data       The packed values, in the same order as the struct.
// @param Map        Output: maps original values to their reloaded copies
//                   valid inside the subfunction.
// @param SubFnPtr   Output: the newly created subfunction.
// @returns the induction variable of the generated loop. On return, the
//          builder is positioned inside the loop body.
Value *ParallelLoopGenerator::createSubFn(Value *Stride, AllocaInst *StructData,
                                          SetVector<Value *> Data,
                                          ValueMapT &Map, Function **SubFnPtr) {
  BasicBlock *PrevBB, *HeaderBB, *ExitBB, *CheckNextBB, *PreHeaderBB, *AfterBB;
  Value *LBPtr, *UBPtr, *UserContext, *Ret1, *HasNextSchedule, *LB, *UB, *IV;
  Function *SubFn = createSubFnDefinition();
  LLVMContext &Context = SubFn->getContext();

  // Store the previous basic block.
  PrevBB = Builder.GetInsertBlock();

  // Create basic blocks.
  HeaderBB = BasicBlock::Create(Context, "polly.par.setup", SubFn);
  ExitBB = BasicBlock::Create(Context, "polly.par.exit", SubFn);
  CheckNextBB = BasicBlock::Create(Context, "polly.par.checkNext", SubFn);
  PreHeaderBB = BasicBlock::Create(Context, "polly.par.loadIVBounds", SubFn);

  DT.addNewBlock(HeaderBB, PrevBB);
  DT.addNewBlock(ExitBB, HeaderBB);
  DT.addNewBlock(CheckNextBB, HeaderBB);
  DT.addNewBlock(PreHeaderBB, HeaderBB);

  // Fill up basic block HeaderBB.
  Builder.SetInsertPoint(HeaderBB);
  LBPtr = Builder.CreateAlloca(LongType, nullptr, "polly.par.LBPtr");
  UBPtr = Builder.CreateAlloca(LongType, nullptr, "polly.par.UBPtr");
  UserContext = Builder.CreateBitCast(
      &*SubFn->arg_begin(), StructData->getType(), "polly.par.userContext");

  extractValuesFromStruct(Data, StructData->getAllocatedType(), UserContext,
                          Map);
  Builder.CreateBr(CheckNextBB);

  // Add code to check if another set of iterations will be executed.
  Builder.SetInsertPoint(CheckNextBB);
  Ret1 = createCallGetWorkItem(LBPtr, UBPtr);
  HasNextSchedule = Builder.CreateTrunc(Ret1, Builder.getInt1Ty(),
                                        "polly.par.hasNextScheduleBlock");
  Builder.CreateCondBr(HasNextSchedule, PreHeaderBB, ExitBB);

  // Add code to load the iv bounds for this set of iterations.
  Builder.SetInsertPoint(PreHeaderBB);
  LB = Builder.CreateLoad(LBPtr, "polly.par.LB");
  UB = Builder.CreateLoad(UBPtr, "polly.par.UB");

  // Subtract one as the upper bound provided by OpenMP is a < comparison
  // whereas the codegenForSequential function creates a <= comparison.
  UB = Builder.CreateSub(UB, ConstantInt::get(LongType, 1),
                         "polly.par.UBAdjusted");

  Builder.CreateBr(CheckNextBB);
  // Step the insert point back by one so the loop is emitted before the
  // branch just created; the branch remains the block terminator.
  Builder.SetInsertPoint(&*--Builder.GetInsertPoint());
  IV = createLoop(LB, UB, Stride, Builder, LI, DT, AfterBB, ICmpInst::ICMP_SLE,
                  nullptr, true, /* UseGuard */ false);

  BasicBlock::iterator LoopBody = Builder.GetInsertPoint();

  // Add code to terminate this subfunction.
  Builder.SetInsertPoint(ExitBB);
  createCallCleanupThread();
  Builder.CreateRetVoid();

  // Leave the builder inside the loop body so the caller can emit the
  // statements of the loop.
  Builder.SetInsertPoint(&*LoopBody);
  *SubFnPtr = SubFn;
  return IV;
}

View File

@@ -0,0 +1,442 @@
//===---- ManagedMemoryRewrite.cpp - Rewrite global & malloc'd memory -----===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// Take a module and rewrite:
// 1. `malloc` -> `polly_mallocManaged`
// 2. `free` -> `polly_freeManaged`
// 3. global arrays with initializers -> global arrays that are initialized
// with a constructor call to
// `polly_mallocManaged`.
//
//===----------------------------------------------------------------------===//
#include "polly/CodeGen/CodeGeneration.h"
#include "polly/CodeGen/IslAst.h"
#include "polly/CodeGen/IslNodeBuilder.h"
#include "polly/CodeGen/PPCGCodeGeneration.h"
#include "polly/CodeGen/Utils.h"
#include "polly/DependenceInfo.h"
#include "polly/LinkAllPasses.h"
#include "polly/Options.h"
#include "polly/ScopDetection.h"
#include "polly/ScopInfo.h"
#include "polly/Support/SCEVValidator.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/CaptureTracking.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Verifier.h"
#include "llvm/IRReader/IRReader.h"
#include "llvm/Linker/Linker.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/TargetSelect.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/IPO/PassManagerBuilder.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/ModuleUtils.h"
// Command-line switch: additionally rewrite alloca instructions to managed
// memory allocations (off by default; see getAllocasToBeManaged).
static cl::opt<bool> RewriteAllocas(
    "polly-acc-rewrite-allocas",
    cl::desc(
        "Ask the managed memory rewriter to also rewrite alloca instructions"),
    cl::Hidden, cl::init(false), cl::ZeroOrMore, cl::cat(PollyCategory));

// Command-line switch: rewrite globals regardless of linkage. By default
// only internal/private globals are rewritten, since retyping an externally
// visible global would be illegal across modules.
static cl::opt<bool> IgnoreLinkageForGlobals(
    "polly-acc-rewrite-ignore-linkage-for-globals",
    cl::desc(
        "By default, we only rewrite globals with internal linkage. This flag "
        "enables rewriting of globals regardless of linkage"),
    cl::Hidden, cl::init(false), cl::ZeroOrMore, cl::cat(PollyCategory));
#define DEBUG_TYPE "polly-acc-rewrite-managed-memory"
namespace {
// Return the `polly_mallocManaged` declaration, creating
// `i8* polly_mallocManaged(i64)` if the module does not have one yet.
static llvm::Function *getOrCreatePollyMallocManaged(Module &M) {
  const char *Name = "polly_mallocManaged";
  if (Function *Existing = M.getFunction(Name))
    return Existing;

  PollyIRBuilder Builder(M.getContext());
  // TODO: How do I get `size_t`? I assume from DataLayout?
  FunctionType *Ty = FunctionType::get(Builder.getInt8PtrTy(),
                                       {Builder.getInt64Ty()}, false);
  return Function::Create(Ty, Function::ExternalLinkage, Name, &M);
}
// Return the `polly_freeManaged` declaration, creating
// `void polly_freeManaged(i8*)` if the module does not have one yet.
static llvm::Function *getOrCreatePollyFreeManaged(Module &M) {
  const char *Name = "polly_freeManaged";
  if (Function *Existing = M.getFunction(Name))
    return Existing;

  PollyIRBuilder Builder(M.getContext());
  // TODO: How do I get `size_t`? I assume from DataLayout?
  FunctionType *Ty =
      FunctionType::get(Builder.getVoidTy(), {Builder.getInt8PtrTy()}, false);
  return Function::Create(Ty, Function::ExternalLinkage, Name, &M);
}
// Expand a constant expression `Cur`, which is used at instruction `Parent`
// at index `index`.
// Since a constant expression can expand to multiple instructions, store all
// the expands into a set called `Expands`.
// Note that this goes inorder on the constant expression tree.
// A * ((B * D) + C)
// will be processed with first A, then B * D, then B, then D, and then C.
// Though ConstantExprs are not treated as "trees" but as DAGs, since you can
// have something like this:
// *
// / \
// \ /
// (D)
//
// For the purposes of this expansion, we expand the two occurrences of D
// separately. Therefore, we expand the DAG into the tree:
// *
// / \
// D D
// TODO: We don't _have_to do this, but this is the simplest solution.
// We can write a solution that keeps track of which constants have been
// already expanded.
// Expand the constant expression `Cur`, used as operand `index` of
// instruction `Parent`, into a real instruction placed before `Parent`.
// Recurses into constant-expression operands; every created instruction is
// recorded in `Expands`.
static void expandConstantExpr(ConstantExpr *Cur, PollyIRBuilder &Builder,
                               Instruction *Parent, int index,
                               SmallPtrSet<Instruction *, 4> &Expands) {
  assert(Cur && "invalid constant expression passed");
  Instruction *I = Cur->getAsInstruction();
  assert(I && "unable to convert ConstantExpr to Instruction");

  DEBUG(dbgs() << "Expanding ConstantExpression: (" << *Cur
               << ") in Instruction: (" << *I << ")\n";);

  // From here on, only `I` may be mutated; null out `Cur` to enforce that.
  Cur = nullptr;

  Expands.insert(I);
  Parent->setOperand(index, I);

  // Operands must be created before their user, so materialize `I` right
  // before `Parent`.
  Builder.SetInsertPoint(Parent);
  Builder.Insert(I);

  // Recurse into any operands that are themselves constant expressions.
  for (unsigned Idx = 0, E = I->getNumOperands(); Idx != E; Idx++) {
    Value *Op = I->getOperand(Idx);
    assert(isa<Constant>(Op) && "constant must have a constant operand");
    if (auto *SubExpr = dyn_cast<ConstantExpr>(Op))
      expandConstantExpr(SubExpr, Builder, I, Idx, Expands);
  }
}
// Edit all uses of `OldVal` to NewVal` in `Inst`. This will rewrite
// `ConstantExpr`s that are used in the `Inst`.
// Note that `replaceAllUsesWith` is insufficient for this purpose because it
// does not rewrite values in `ConstantExpr`s.
// Replace every use of `OldVal` with `NewVal` inside `Inst`, including uses
// buried in `ConstantExpr` operands (which `replaceAllUsesWith` cannot
// reach). ConstantExpr operands are first expanded into instructions.
static void rewriteOldValToNew(Instruction *Inst, Value *OldVal, Value *NewVal,
                               PollyIRBuilder &Builder) {
  // Seed the worklist with `Inst` itself, then add every instruction that
  // results from expanding its ConstantExpr operands.
  SmallPtrSet<Instruction *, 4> InstsToVisit = {Inst};

  for (unsigned Idx = 0, E = Inst->getNumOperands(); Idx != E; Idx++) {
    Value *Op = Inst->getOperand(Idx);
    if (auto *OpExpr = dyn_cast<ConstantExpr>(Op))
      expandConstantExpr(OpExpr, Builder, Inst, Idx, InstsToVisit);
  }

  // All collected instructions are now ConstantExpr-free, so a plain
  // `replaceUsesOfWith` suffices.
  for (Instruction *I : InstsToVisit)
    I->replaceUsesOfWith(OldVal, NewVal);
}
// Given a value `Current`, return all Instructions that may contain `Current`
// in an expression.
// We need this auxiliary function, because if we have a
// `Constant` that is a user of `V`, we need to recurse into the
// `Constant`'s uses to gather the root instruction.
// Collect into `Owners` all instructions that (possibly transitively,
// through `Constant` users) use `V`.
static void getInstructionUsersOfValue(Value *V,
                                       SmallVector<Instruction *, 4> &Owners) {
  if (auto *I = dyn_cast<Instruction>(V)) {
    Owners.push_back(I);
    return;
  }
  // Anything that is a `User` must be a constant or an instruction; for a
  // constant, recurse through its users until instructions are reached.
  auto *C = cast<Constant>(V);
  for (Use &CUse : C->uses())
    getInstructionUsersOfValue(CUse.getUser(), Owners);
}
// Rewrite the zero-initialized global array `Array` of type [N x T] into a
// global pointer T* that a generated module constructor fills with managed
// memory via polly_mallocManaged. All uses of the array are redirected
// through a load of that pointer. Globals actually rewritten are recorded
// in `ReplacedGlobals` so the caller can erase them.
static void
replaceGlobalArray(Module &M, const DataLayout &DL, GlobalVariable &Array,
                   SmallPtrSet<GlobalVariable *, 4> &ReplacedGlobals) {
  // We only want arrays.
  ArrayType *ArrayTy = dyn_cast<ArrayType>(Array.getType()->getElementType());
  if (!ArrayTy)
    return;
  Type *ElemTy = ArrayTy->getElementType();
  PointerType *ElemPtrTy = ElemTy->getPointerTo();

  // We only wish to replace arrays that are visible in the module they
  // inhabit. Otherwise, our type edit from [T] to T* would be illegal across
  // modules.
  const bool OnlyVisibleInsideModule = Array.hasPrivateLinkage() ||
                                       Array.hasInternalLinkage() ||
                                       IgnoreLinkageForGlobals;
  if (!OnlyVisibleInsideModule) {
    DEBUG(dbgs() << "Not rewriting (" << Array
                 << ") to managed memory "
                    "because it could be visible externally. To force rewrite, "
                    "use -polly-acc-rewrite-ignore-linkage-for-globals.\n");
    return;
  }

  // Only zero-initialized arrays are handled: a nontrivial initializer would
  // have to be copied into the managed allocation, which is not implemented.
  if (!Array.hasInitializer() ||
      !isa<ConstantAggregateZero>(Array.getInitializer())) {
    DEBUG(dbgs() << "Not rewriting (" << Array
                 << ") to managed memory "
                    "because it has an initializer which is "
                    "not a zeroinitializer.\n");
    return;
  }

  // At this point, we have committed to replacing this array.
  ReplacedGlobals.insert(&Array);

  // Create the replacement global: a T* named "<array>.toptr", initially
  // null, to be filled in by the constructor below.
  std::string NewName = Array.getName();
  NewName += ".toptr";
  GlobalVariable *ReplacementToArr =
      cast<GlobalVariable>(M.getOrInsertGlobal(NewName, ElemPtrTy));
  ReplacementToArr->setInitializer(ConstantPointerNull::get(ElemPtrTy));

  // Build "<array>.constructor": a void() function that allocates
  // sizeof([N x T]) bytes of managed memory and stores the typed pointer
  // into the replacement global.
  Function *PollyMallocManaged = getOrCreatePollyMallocManaged(M);
  std::string FnName = Array.getName();
  FnName += ".constructor";
  PollyIRBuilder Builder(M.getContext());
  FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), false);
  const GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
  Function *F = Function::Create(Ty, Linkage, FnName, &M);
  BasicBlock *Start = BasicBlock::Create(M.getContext(), "entry", F);
  Builder.SetInsertPoint(Start);

  const uint64_t ArraySizeInt = DL.getTypeAllocSize(ArrayTy);
  Value *ArraySize = Builder.getInt64(ArraySizeInt);
  ArraySize->setName("array.size");

  Value *AllocatedMemRaw =
      Builder.CreateCall(PollyMallocManaged, {ArraySize}, "mem.raw");
  Value *AllocatedMemTyped =
      Builder.CreatePointerCast(AllocatedMemRaw, ElemPtrTy, "mem.typed");
  Builder.CreateStore(AllocatedMemTyped, ReplacementToArr);
  Builder.CreateRetVoid();

  // Register the constructor in llvm.global_ctors so it runs at startup.
  const int Priority = 0;
  appendToGlobalCtors(M, F, Priority, ReplacementToArr);

  SmallVector<Instruction *, 4> ArrayUserInstructions;
  // Get all instructions that use array. We need to do this weird thing
  // because `Constant`s that contain this array need to be expanded into
  // instructions so that we can replace their parameters. `Constant`s cannot
  // be edited easily, so we choose to convert all `Constant`s to
  // `Instruction`s and handle all of the uses of `Array` uniformly.
  for (Use &ArrayUse : Array.uses())
    getInstructionUsersOfValue(ArrayUse.getUser(), ArrayUserInstructions);

  // Rewrite each user: load the managed pointer, cast it back to the
  // original array-pointer type, and substitute it for the array.
  for (Instruction *UserOfArrayInst : ArrayUserInstructions) {
    Builder.SetInsertPoint(UserOfArrayInst);
    // <ty>** -> <ty>*
    Value *ArrPtrLoaded = Builder.CreateLoad(ReplacementToArr, "arrptr.load");
    // <ty>* -> [ty]*
    Value *ArrPtrLoadedBitcasted = Builder.CreateBitCast(
        ArrPtrLoaded, ArrayTy->getPointerTo(), "arrptr.bitcast");
    rewriteOldValToNew(UserOfArrayInst, &Array, ArrPtrLoadedBitcasted, Builder);
  }
}
// We return all `allocas` that may need to be converted to a call to
// cudaMallocManaged.
// Collect into `Allocas` every alloca in `F` whose pointer may be captured
// (per PointerMayBeCaptured with StoreCaptures); those are the candidates
// for conversion to managed memory.
static void getAllocasToBeManaged(Function &F,
                                  SmallSet<AllocaInst *, 4> &Allocas) {
  for (BasicBlock &BB : F)
    for (Instruction &I : BB) {
      auto *Alloca = dyn_cast<AllocaInst>(&I);
      if (!Alloca)
        continue;
      DEBUG(dbgs() << "Checking if (" << *Alloca << ") may be captured: ");

      if (PointerMayBeCaptured(Alloca, /* ReturnCaptures */ false,
                               /* StoreCaptures */ true)) {
        Allocas.insert(Alloca);
        DEBUG(dbgs() << "YES (captured).\n");
      } else {
        DEBUG(dbgs() << "NO (not captured).\n");
      }
    }
}
// Replace `Alloca` with a call to polly_mallocManaged of the same byte
// size, redirect all uses to the (bitcast) managed pointer, and free the
// allocation before every return of the enclosing function.
static void rewriteAllocaAsManagedMemory(AllocaInst *Alloca,
                                         const DataLayout &DL) {
  DEBUG(dbgs() << "rewriting: (" << *Alloca << ") to managed mem.\n");
  Module *M = Alloca->getModule();
  assert(M && "Alloca does not have a module");

  PollyIRBuilder Builder(M->getContext());
  Builder.SetInsertPoint(Alloca);

  // Allocate the same number of bytes from the managed heap and cast the
  // raw i8* back to the alloca's pointer type.
  Value *MallocManagedFn = getOrCreatePollyMallocManaged(*M);
  const uint64_t Size =
      DL.getTypeAllocSize(Alloca->getType()->getElementType());
  Value *RawManagedMem =
      Builder.CreateCall(MallocManagedFn, {Builder.getInt64(Size)});
  Value *Bitcasted = Builder.CreateBitCast(RawManagedMem, Alloca->getType());

  Function *F = Alloca->getFunction();
  assert(F && "Alloca has invalid function");

  Bitcasted->takeName(Alloca);
  Alloca->replaceAllUsesWith(Bitcasted);
  Alloca->eraseFromParent();

  // Release the managed memory on every function exit.
  for (BasicBlock &BB : *F) {
    auto *Return = dyn_cast<ReturnInst>(BB.getTerminator());
    if (!Return)
      continue;
    Builder.SetInsertPoint(Return);
    Builder.CreateCall(getOrCreatePollyFreeManaged(*M), {RawManagedMem});
  }
}
// Replace all uses of `Old` with `New`, even inside `ConstantExpr`.
//
// `replaceAllUsesWith` does NOT replace values in `ConstantExpr`. This
// function actually does replace it in `ConstantExpr`. The caveat is that
// if there is a use that is *outside* a function (say, at global
// declarations), we fail. So, this is meant to be used on values which we
// know will only be used within functions.
//
// This process works by looking through the uses of `Old`. If it finds a
// `ConstantExpr`, it recursively looks for the owning instruction.
// Then, it expands all the `ConstantExpr` to instructions and replaces
// `Old` with `New` in the expanded instructions.
static void replaceAllUsesAndConstantUses(Value *Old, Value *New,
                                          PollyIRBuilder &Builder) {
  SmallVector<Instruction *, 4> UserInstructions;
  // Get all instructions that use array. We need to do this weird thing
  // because `Constant`s that contain this array need to be expanded into
  // instructions so that we can replace their parameters. `Constant`s cannot
  // be edited easily, so we choose to convert all `Constant`s to
  // `Instruction`s and handle all of the uses of `Array` uniformly.
  for (Use &ArrayUse : Old->uses())
    getInstructionUsersOfValue(ArrayUse.getUser(), UserInstructions);

  for (Instruction *I : UserInstructions)
    rewriteOldValToNew(I, Old, New, Builder);
}
class ManagedMemoryRewritePass : public ModulePass {
public:
static char ID;
GPUArch Architecture;
GPURuntime Runtime;
ManagedMemoryRewritePass() : ModulePass(ID) {}
virtual bool runOnModule(Module &M) {
const DataLayout &DL = M.getDataLayout();
Function *Malloc = M.getFunction("malloc");
if (Malloc) {
PollyIRBuilder Builder(M.getContext());
Function *PollyMallocManaged = getOrCreatePollyMallocManaged(M);
assert(PollyMallocManaged && "unable to create polly_mallocManaged");
replaceAllUsesAndConstantUses(Malloc, PollyMallocManaged, Builder);
Malloc->eraseFromParent();
}
Function *Free = M.getFunction("free");
if (Free) {
PollyIRBuilder Builder(M.getContext());
Function *PollyFreeManaged = getOrCreatePollyFreeManaged(M);
assert(PollyFreeManaged && "unable to create polly_freeManaged");
replaceAllUsesAndConstantUses(Free, PollyFreeManaged, Builder);
Free->eraseFromParent();
}
SmallPtrSet<GlobalVariable *, 4> GlobalsToErase;
for (GlobalVariable &Global : M.globals())
replaceGlobalArray(M, DL, Global, GlobalsToErase);
for (GlobalVariable *G : GlobalsToErase)
G->eraseFromParent();
// Rewrite allocas to cudaMallocs if we are asked to do so.
if (RewriteAllocas) {
SmallSet<AllocaInst *, 4> AllocasToBeManaged;
for (Function &F : M.functions())
getAllocasToBeManaged(F, AllocasToBeManaged);
for (AllocaInst *Alloca : AllocasToBeManaged)
rewriteAllocaAsManagedMemory(Alloca, DL);
}
return true;
}
};
} // namespace
char ManagedMemoryRewritePass::ID = 42;

// Factory used by polly to construct the pass for the requested GPU
// architecture and runtime.
Pass *polly::createManagedMemoryRewritePassPass(GPUArch Arch,
                                                GPURuntime Runtime) {
  auto *P = new ManagedMemoryRewritePass();
  P->Runtime = Runtime;
  P->Architecture = Arch;
  return P;
}
// Register the pass (and the analyses it depends on) with LLVM's pass
// registry under the name "polly-acc-rewrite-managed-memory".
INITIALIZE_PASS_BEGIN(
    ManagedMemoryRewritePass, "polly-acc-rewrite-managed-memory",
    "Polly - Rewrite all allocations in heap & data section to managed memory",
    false, false)
INITIALIZE_PASS_DEPENDENCY(PPCGCodeGeneration);
INITIALIZE_PASS_DEPENDENCY(DependenceInfo);
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass);
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass);
INITIALIZE_PASS_DEPENDENCY(RegionInfoPass);
INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass);
INITIALIZE_PASS_DEPENDENCY(ScopDetectionWrapperPass);
INITIALIZE_PASS_END(
    ManagedMemoryRewritePass, "polly-acc-rewrite-managed-memory",
    "Polly - Rewrite all allocations in heap & data section to managed memory",
    false, false)

View File

@@ -0,0 +1 @@
d6652146483f6426c13e661b939c1fc78abc4008

View File

@@ -0,0 +1,305 @@
//===------ PerfMonitor.cpp - Generate a run-time performance monitor. -======//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
//===----------------------------------------------------------------------===//
#include "polly/CodeGen/PerfMonitor.h"
#include "polly/CodeGen/RuntimeDebugBuilder.h"
#include "polly/ScopInfo.h"
#include "llvm/ADT/Triple.h"
#include "llvm/IR/Intrinsics.h"
#include <sstream>
using namespace llvm;
using namespace polly;
// Return the libc `atexit` declaration (i32 (i8*)), creating it in the
// module if it does not exist yet.
Function *PerfMonitor::getAtExit() {
  const char *Name = "atexit";
  if (Function *Existing = M->getFunction(Name))
    return Existing;

  FunctionType *Ty = FunctionType::get(Builder.getInt32Ty(),
                                       {Builder.getInt8PtrTy()}, false);
  return Function::Create(Ty, Function::ExternalLinkage, Name, M);
}
// Append `Fn` (with priority 10) to the module's llvm.global_ctors list so
// it runs at program startup. Existing entries are preserved; the array has
// to be recreated because its length is part of its type.
void PerfMonitor::addToGlobalConstructors(Function *Fn) {
  const char *Name = "llvm.global_ctors";
  std::vector<Constant *> Ctors;

  if (GlobalVariable *GV = M->getGlobalVariable(Name)) {
    Constant *Init = GV->getInitializer();
    for (Value *Entry : Init->operand_values())
      Ctors.push_back(cast<Constant>(Entry));
    GV->eraseFromParent();
  }

  // Each entry is { i32 priority, ctor, i8* associated-data }.
  StructType *ST = StructType::get(Builder.getInt32Ty(), Fn->getType(),
                                   Builder.getInt8PtrTy());
  Ctors.push_back(
      ConstantStruct::get(ST, Builder.getInt32(10), Fn,
                          ConstantPointerNull::get(Builder.getInt8PtrTy())));

  ArrayType *ArrTy = ArrayType::get(ST, Ctors.size());
  new GlobalVariable(*M, ArrTy, true, GlobalValue::AppendingLinkage,
                     ConstantArray::get(ArrTy, Ctors), Name, nullptr,
                     GlobalVariable::NotThreadLocal);
}
// Return (inserting into the module if needed) the declaration of the x86
// rdtscp intrinsic used to read the time-stamp counter.
Function *PerfMonitor::getRDTSCP() {
  return Intrinsic::getDeclaration(M, Intrinsic::x86_rdtscp);
}
// Timing uses the x86 rdtscp intrinsic, so monitoring is only supported on
// x86-64 targets; everywhere else only a notice is emitted.
PerfMonitor::PerfMonitor(const Scop &S, Module *M)
    : M(M), Builder(M->getContext()), S(S) {
  Supported = Triple(M->getTargetTriple()).getArch() == llvm::Triple::x86_64;
}
// Look up the global `Name`, creating it with weak linkage and the given
// initial value if it does not exist; store the result in *Location.
static void TryRegisterGlobal(Module *M, const char *Name,
                              Constant *InitialValue, Value **Location) {
  *Location = M->getGlobalVariable(Name);
  if (*Location)
    return;

  *Location = new GlobalVariable(
      *M, InitialValue->getType(), true, GlobalValue::WeakAnyLinkage,
      InitialValue, Name, nullptr, GlobalVariable::InitialExecTLSModel);
}
// Generate a unique name that is usable as a LLVM name for a scop to name its
// performance counter.
// Generate a unique name that is usable as a LLVM name for a scop to name
// its performance counter: combines the function name with the scop's
// entry/exit block names.
static std::string GetScopUniqueVarname(const Scop &S) {
  std::string EntryString, ExitString;
  std::tie(EntryString, ExitString) = S.getEntryExitStr();

  std::string Name = "__polly_perf_in_";
  Name += std::string(S.getFunction().getName());
  Name += "_from__" + EntryString + "__to__" + ExitString;
  return Name;
}
// Create (or look up) the two per-scop counters: accumulated cycles and
// trip count, both named after the scop's unique identifier.
void PerfMonitor::addScopCounter() {
  const std::string Varname = GetScopUniqueVarname(S);

  TryRegisterGlobal(M, (Varname + "_cycles").c_str(), Builder.getInt64(0),
                    &CyclesInCurrentScopPtr);

  TryRegisterGlobal(M, (Varname + "_trip_count").c_str(), Builder.getInt64(0),
                    &TripCountForCurrentScopPtr);
}
// Create (or look up) the module-level globals backing the performance
// counters. TryRegisterGlobal gives them weak linkage, so multiple
// translation units share a single copy of each.
void PerfMonitor::addGlobalVariables() {
  TryRegisterGlobal(M, "__polly_perf_cycles_total_start", Builder.getInt64(0),
                    &CyclesTotalStartPtr);
  TryRegisterGlobal(M, "__polly_perf_initialized", Builder.getInt1(0),
                    &AlreadyInitializedPtr);
  TryRegisterGlobal(M, "__polly_perf_cycles_in_scops", Builder.getInt64(0),
                    &CyclesInScopsPtr);
  TryRegisterGlobal(M, "__polly_perf_cycles_in_scop_start", Builder.getInt64(0),
                    &CyclesInScopStartPtr);
  // NOTE(review): the symbol name below is misspelled ("loation"). It is
  // only referenced via RDTSCPWriteLocation, so behavior is unaffected, but
  // renaming it would change the emitted (weak) symbol and could stop it
  // merging with objects built from other versions — left unchanged.
  TryRegisterGlobal(M, "__polly_perf_write_loation", Builder.getInt32(0),
                    &RDTSCPWriteLocation);
}
// Names of the module-level init/report functions emitted by this monitor.
static const char *InitFunctionName = "__polly_perf_init";
static const char *FinalReportingFunctionName = "__polly_perf_final";

// File-static state shared by all PerfMonitor instances in one compiler
// invocation: the entry block and current return of the final reporting
// function, so later scops can append their report line before the return.
static BasicBlock *FinalStartBB = nullptr;
static ReturnInst *ReturnFromFinal = nullptr;
// Create the function that prints the collected timing information; it is
// registered to run at program exit (see insertInitFunction). On targets
// without rdtscp support it only prints a notice. The function is left
// "open" via the file-static FinalStartBB/ReturnFromFinal so that
// AppendScopReporting can add one CSV line per scop later.
Function *PerfMonitor::insertFinalReporting() {
  // Create new function.
  GlobalValue::LinkageTypes Linkage = Function::WeakODRLinkage;
  FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), {}, false);
  Function *ExitFn =
      Function::Create(Ty, Linkage, FinalReportingFunctionName, M);
  FinalStartBB = BasicBlock::Create(M->getContext(), "start", ExitFn);
  Builder.SetInsertPoint(FinalStartBB);

  if (!Supported) {
    RuntimeDebugBuilder::createCPUPrinter(
        Builder, "Polly runtime information generation not supported\n");
    Builder.CreateRetVoid();
    return ExitFn;
  }

  // Measure current cycles and compute final timings.
  Function *RDTSCPFn = getRDTSCP();
  Value *CurrentCycles = Builder.CreateCall(
      RDTSCPFn,
      Builder.CreatePointerCast(RDTSCPWriteLocation, Builder.getInt8PtrTy()));
  Value *CyclesStart = Builder.CreateLoad(CyclesTotalStartPtr, true);
  Value *CyclesTotal = Builder.CreateSub(CurrentCycles, CyclesStart);
  Value *CyclesInScops = Builder.CreateLoad(CyclesInScopsPtr, true);

  // Print the runtime information.
  RuntimeDebugBuilder::createCPUPrinter(Builder, "Polly runtime information\n");
  RuntimeDebugBuilder::createCPUPrinter(Builder, "-------------------------\n");
  RuntimeDebugBuilder::createCPUPrinter(Builder, "Total: ", CyclesTotal, "\n");
  RuntimeDebugBuilder::createCPUPrinter(Builder, "Scops: ", CyclesInScops,
                                        "\n");

  // Print the preamble for per-scop information.
  RuntimeDebugBuilder::createCPUPrinter(Builder, "\n");
  RuntimeDebugBuilder::createCPUPrinter(Builder, "Per SCoP information\n");
  RuntimeDebugBuilder::createCPUPrinter(Builder, "--------------------\n");
  RuntimeDebugBuilder::createCPUPrinter(
      Builder, "scop function, "
               "entry block name, exit block name, total time, trip count\n");

  // Keep the return around so AppendScopReporting can replace it.
  ReturnFromFinal = Builder.CreateRetVoid();
  return ExitFn;
}
// Append one CSV line reporting this scop's accumulated cycles and trip
// count to the final reporting function. The previous return is erased and
// a fresh one emitted afterwards, so the function grows by one line per
// scop.
void PerfMonitor::AppendScopReporting() {
  if (!Supported)
    return;

  assert(FinalStartBB && "Expected FinalStartBB to be initialized by "
                         "PerfMonitor::insertFinalReporting.");
  assert(ReturnFromFinal && "Expected ReturnFromFinal to be initialized by "
                            "PerfMonitor::insertFinalReporting.");

  Builder.SetInsertPoint(FinalStartBB);
  ReturnFromFinal->eraseFromParent();

  Value *CyclesInCurrentScop =
      Builder.CreateLoad(this->CyclesInCurrentScopPtr, true);
  Value *TripCountForCurrentScop =
      Builder.CreateLoad(this->TripCountForCurrentScopPtr, true);

  std::string EntryName, ExitName;
  std::tie(EntryName, ExitName) = S.getEntryExitStr();

  // print in CSV for easy parsing with other tools.
  RuntimeDebugBuilder::createCPUPrinter(
      Builder, S.getFunction().getName(), ", ", EntryName, ", ", ExitName, ", ",
      CyclesInCurrentScop, ", ", TripCountForCurrentScop, "\n");

  ReturnFromFinal = Builder.CreateRetVoid();
}
// One final reporting function is shared by all scops of a compilation;
// this file-static guards its one-time creation.
static Function *FinalReporting = nullptr;

// Set up all monitoring machinery for the current scop: the global and
// per-scop counters, the (once-only) final reporting function with its
// startup registration, and this scop's line in the report.
void PerfMonitor::initialize() {
  addGlobalVariables();
  addScopCounter();

  // Ensure that we only add the final reporting function once.
  // On later invocations, append to the reporting function.
  if (!FinalReporting) {
    FinalReporting = insertFinalReporting();
    Function *InitFn = insertInitFunction(FinalReporting);
    addToGlobalConstructors(InitFn);
  }

  AppendScopReporting();
}
// Create the run-once initialization function: it registers the final
// reporting function with atexit() and (on supported targets) records the
// program's starting cycle count. Returns the created function so the
// caller can add it to llvm.global_ctors.
Function *PerfMonitor::insertInitFunction(Function *FinalReporting) {
  // Insert function definition and BBs.
  GlobalValue::LinkageTypes Linkage = Function::WeakODRLinkage;
  FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), {}, false);
  Function *InitFn = Function::Create(Ty, Linkage, InitFunctionName, M);
  BasicBlock *Start = BasicBlock::Create(M->getContext(), "start", InitFn);
  BasicBlock *EarlyReturn =
      BasicBlock::Create(M->getContext(), "earlyreturn", InitFn);
  BasicBlock *InitBB = BasicBlock::Create(M->getContext(), "initbb", InitFn);

  Builder.SetInsertPoint(Start);

  // Check if this function was already run. If yes, return.
  //
  // In case profiling has been enabled in multiple translation units, the
  // initializer function will be added to the global constructors list of
  // each translation unit. When merging translation units, the global
  // constructor lists are just appended, such that the initializer will appear
  // multiple times. To avoid initializations being run multiple times (and
  // especially to avoid that atExitFn is called more than once), we bail
  // out if the initializer is run more than once.
  Value *HasRunBefore = Builder.CreateLoad(AlreadyInitializedPtr);
  Builder.CreateCondBr(HasRunBefore, EarlyReturn, InitBB);
  Builder.SetInsertPoint(EarlyReturn);
  Builder.CreateRetVoid();

  // Keep track that this function has been run once.
  Builder.SetInsertPoint(InitBB);
  Value *True = Builder.getInt1(true);
  Builder.CreateStore(True, AlreadyInitializedPtr);

  // Register the final reporting function with atexit().
  Value *FinalReportingPtr =
      Builder.CreatePointerCast(FinalReporting, Builder.getInt8PtrTy());
  Function *AtExitFn = getAtExit();
  Builder.CreateCall(AtExitFn, {FinalReportingPtr});

  if (Supported) {
    // Read the currently cycle counter and store the result for later.
    Function *RDTSCPFn = getRDTSCP();
    Value *CurrentCycles = Builder.CreateCall(
        RDTSCPFn,
        Builder.CreatePointerCast(RDTSCPWriteLocation, Builder.getInt8PtrTy()));
    Builder.CreateStore(CurrentCycles, CyclesTotalStartPtr, true);
  }
  Builder.CreateRetVoid();

  return InitFn;
}
// Record the cycle count at scop entry by storing the current rdtscp value
// into __polly_perf_cycles_in_scop_start. No-op on unsupported targets.
void PerfMonitor::insertRegionStart(Instruction *InsertBefore) {
  if (!Supported)
    return;

  Builder.SetInsertPoint(InsertBefore);
  Value *Now = Builder.CreateCall(
      getRDTSCP(),
      Builder.CreatePointerCast(RDTSCPWriteLocation, Builder.getInt8PtrTy()));
  Builder.CreateStore(Now, CyclesInScopStartPtr, true);
}
// Read the cycle counter at the end of the SCoP and update the statistics.
//
// Computes the cycles elapsed since the matching insertRegionStart and adds
// them to both the global cycles-in-scops counter and the per-scop counter;
// additionally bumps the per-scop trip count by one. All counter accesses are
// volatile. No-op if rdtscp is unsupported.
void PerfMonitor::insertRegionEnd(Instruction *InsertBefore) {
  if (!Supported)
    return;

  Builder.SetInsertPoint(InsertBefore);
  Value *WriteLocation =
      Builder.CreatePointerCast(RDTSCPWriteLocation, Builder.getInt8PtrTy());
  LoadInst *EntryCycles = Builder.CreateLoad(CyclesInScopStartPtr, true);
  Value *ExitCycles = Builder.CreateCall(getRDTSCP(), WriteLocation);
  Value *ElapsedCycles = Builder.CreateSub(ExitCycles, EntryCycles);

  // Accumulate into the global cycles-spent-in-any-scop counter.
  Value *TotalInScops = Builder.CreateLoad(CyclesInScopsPtr, true);
  Builder.CreateStore(Builder.CreateAdd(TotalInScops, ElapsedCycles),
                      CyclesInScopsPtr, true);

  // Accumulate into the counter of this particular scop.
  Value *InCurrentScop = Builder.CreateLoad(CyclesInCurrentScopPtr, true);
  Builder.CreateStore(Builder.CreateAdd(InCurrentScop, ElapsedCycles),
                      CyclesInCurrentScopPtr, true);

  // Count one more execution of this scop.
  Value *TripCount = Builder.CreateLoad(TripCountForCurrentScopPtr, true);
  Builder.CreateStore(Builder.CreateAdd(TripCount, Builder.getInt64(1)),
                      TripCountForCurrentScopPtr, true);
}

View File

@@ -0,0 +1,275 @@
//===--- RuntimeDebugBuilder.cpp - Helper to insert prints into LLVM-IR ---===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
//===----------------------------------------------------------------------===//
#include "polly/CodeGen/RuntimeDebugBuilder.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/Debug.h"
#include <string>
#include <vector>
using namespace llvm;
using namespace polly;
// Return (or lazily declare) the vprintf function in the current module.
//
// Declared as: i32 vprintf(i8 *Format, i8 *Valist)
Function *RuntimeDebugBuilder::getVPrintF(PollyIRBuilder &Builder) {
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();

  if (Function *Existing = M->getFunction("vprintf"))
    return Existing;

  FunctionType *FTy = FunctionType::get(
      Builder.getInt32Ty(), {Builder.getInt8PtrTy(), Builder.getInt8PtrTy()},
      false);
  return Function::Create(FTy, Function::ExternalLinkage, "vprintf", M);
}
// Return (or lazily declare) the NVVM constant-to-generic address space cast
// intrinsic for the requested source/destination address spaces and pointee
// bit widths. The mangled intrinsic name encodes both pointer types.
Function *RuntimeDebugBuilder::getAddressSpaceCast(PollyIRBuilder &Builder,
                                                   unsigned Src, unsigned Dst,
                                                   unsigned SrcBits,
                                                   unsigned DstBits) {
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  std::string Name = std::string("llvm.nvvm.ptr.constant.to.gen.p") +
                     std::to_string(Dst) + "i" + std::to_string(DstBits) +
                     ".p" + std::to_string(Src) + "i" + std::to_string(SrcBits);

  if (Function *Existing = M->getFunction(Name))
    return Existing;

  FunctionType *FTy =
      FunctionType::get(PointerType::get(Builder.getIntNTy(DstBits), Dst),
                        PointerType::get(Builder.getIntNTy(SrcBits), Src),
                        false);
  return Function::Create(FTy, Function::ExternalLinkage, Name, M);
}
// Build the list of values that identify the executing GPU thread.
//
// Produces a sequence suitable for printing: a "> block-id: " label followed
// by the three ctaid sreg values, a "| " separator, a "thread-id: " label and
// the three tid sreg values. Each ID is zero-extended to i64 and followed by
// a single-space separator string. All strings live in the constant address
// space (4).
//
// Improvement over the previous version: the identical read-cast-append loop
// that was duplicated for block IDs and thread IDs is factored into one
// helper lambda.
std::vector<Value *>
RuntimeDebugBuilder::getGPUThreadIdentifiers(PollyIRBuilder &Builder) {
  std::vector<Value *> Identifiers;
  auto M = Builder.GetInsertBlock()->getParent()->getParent();

  // Append, for each given sreg intrinsic, its value widened to i64 plus a
  // separating space.
  auto AppendIDs = [&](ArrayRef<Intrinsic::ID> IDs) {
    for (Intrinsic::ID ID : IDs) {
      Function *GetID = Intrinsic::getDeclaration(M, ID);
      Value *Id = Builder.CreateCall(GetID, {});
      Id = Builder.CreateIntCast(Id, Builder.getInt64Ty(), false);
      Identifiers.push_back(Id);
      Identifiers.push_back(Builder.CreateGlobalStringPtr(" ", "", 4));
    }
  };

  Identifiers.push_back(Builder.CreateGlobalStringPtr("> block-id: ", "", 4));
  AppendIDs({Intrinsic::nvvm_read_ptx_sreg_ctaid_x,
             Intrinsic::nvvm_read_ptx_sreg_ctaid_y,
             Intrinsic::nvvm_read_ptx_sreg_ctaid_z});

  Identifiers.push_back(Builder.CreateGlobalStringPtr("| ", "", 4));

  Identifiers.push_back(Builder.CreateGlobalStringPtr("thread-id: ", "", 4));
  AppendIDs({Intrinsic::nvvm_read_ptx_sreg_tid_x,
             Intrinsic::nvvm_read_ptx_sreg_tid_y,
             Intrinsic::nvvm_read_ptx_sreg_tid_z});

  return Identifiers;
}
// Dispatch to the device-specific debug-print implementation.
void RuntimeDebugBuilder::createPrinter(PollyIRBuilder &Builder, bool IsGPU,
                                        ArrayRef<Value *> Values) {
  if (!IsGPU) {
    createCPUPrinterT(Builder, Values);
    return;
  }
  createGPUPrinterT(Builder, Values);
}
// Prepare the given values for being passed to (v)printf.
//
// Floats are extended to double ("%f"), integers narrower than 64 bit are
// sign-extended to i64 ("%ld"). Pointers to i8 in the constant address space
// (4) are treated as strings ("%s"); all other pointers are converted to i64
// and printed as integers.
//
// Returns the accumulated format string together with the converted values,
// one conversion specifier per value.
static std::tuple<std::string, std::vector<Value *>>
prepareValuesForPrinting(PollyIRBuilder &Builder, ArrayRef<Value *> Values) {
  std::string FormatString;
  std::vector<Value *> ValuesToPrint;

  for (auto Val : Values) {
    Type *Ty = Val->getType();

    if (Ty->isFloatingPointTy()) {
      if (!Ty->isDoubleTy())
        Val = Builder.CreateFPExt(Val, Builder.getDoubleTy());
    } else if (Ty->isIntegerTy()) {
      if (Ty->getIntegerBitWidth() < 64)
        Val = Builder.CreateSExt(Val, Builder.getInt64Ty());
      else
        // Fixed: the previous condition (Ty->getIntegerBitWidth()) is nonzero
        // for every integer type, so the assert could never fire. Integers
        // wider than 64 bit are genuinely unsupported here.
        assert(Ty->getIntegerBitWidth() == 64 &&
               "Integer types larger 64 bit not supported");
    } else if (isa<PointerType>(Ty)) {
      if (Ty->getPointerElementType() == Builder.getInt8Ty() &&
          Ty->getPointerAddressSpace() == 4) {
        // i8* in constant address space: printed as a string.
        Val = Builder.CreateGEP(Val, Builder.getInt64(0));
      } else {
        Val = Builder.CreatePtrToInt(Val, Builder.getInt64Ty());
      }
    } else {
      llvm_unreachable("Unknown type");
    }

    Ty = Val->getType();

    if (Ty->isFloatingPointTy())
      FormatString += "%f";
    else if (Ty->isIntegerTy())
      FormatString += "%ld";
    else
      FormatString += "%s";

    ValuesToPrint.push_back(Val);
  }

  return std::make_tuple(FormatString, ValuesToPrint);
}
// Emit a host-side debug print: convert the values, call printf with the
// derived format string, then flush all output streams.
void RuntimeDebugBuilder::createCPUPrinterT(PollyIRBuilder &Builder,
                                            ArrayRef<Value *> Values) {
  auto Prepared = prepareValuesForPrinting(Builder, Values);

  createPrintF(Builder, std::get<0>(Prepared), std::get<1>(Prepared));
  createFlush(Builder);
}
// Emit a device-side debug print via NVPTX vprintf.
//
// vprintf takes a format string and a buffer holding the arguments, so the
// (thread-identifier-prefixed) values are marshalled into a stack buffer that
// is allocated in the function's entry block. Strings are created in the
// constant address space (4) and cast to the generic address space before
// being handed to vprintf.
void RuntimeDebugBuilder::createGPUPrinterT(PollyIRBuilder &Builder,
                                            ArrayRef<Value *> Values) {
  std::string str;

  auto *Zero = Builder.getInt64(0);

  auto ToPrint = getGPUThreadIdentifiers(Builder);

  ToPrint.push_back(Builder.CreateGlobalStringPtr("\n ", "", 4));
  ToPrint.insert(ToPrint.end(), Values.begin(), Values.end());

  const DataLayout &DL = Builder.GetInsertBlock()->getModule()->getDataLayout();

  // Allocate print buffer (assuming 2*32 bit per element)
  auto T = ArrayType::get(Builder.getInt32Ty(), ToPrint.size() * 2);
  Value *Data = new AllocaInst(
      T, DL.getAllocaAddrSpace(), "polly.vprint.buffer",
      &Builder.GetInsertBlock()->getParent()->getEntryBlock().front());
  auto *DataPtr = Builder.CreateGEP(Data, {Zero, Zero});

  int Offset = 0;
  for (auto Val : ToPrint) {
    auto Ptr = Builder.CreateGEP(DataPtr, Builder.getInt64(Offset));
    Type *Ty = Val->getType();

    if (Ty->isFloatingPointTy()) {
      if (!Ty->isDoubleTy())
        Val = Builder.CreateFPExt(Val, Builder.getDoubleTy());
    } else if (Ty->isIntegerTy()) {
      if (Ty->getIntegerBitWidth() < 64)
        Val = Builder.CreateSExt(Val, Builder.getInt64Ty());
      else
        // Fixed: the previous condition (Ty->getIntegerBitWidth()) is nonzero
        // for every integer type, so the assert could never fire. Integers
        // wider than 64 bit are genuinely unsupported here.
        assert(Ty->getIntegerBitWidth() == 64 &&
               "Integer types larger 64 bit not supported");
    } else if (auto PtTy = dyn_cast<PointerType>(Ty)) {
      if (PtTy->getAddressSpace() == 4) {
        // Pointers in constant address space are printed as strings
        Val = Builder.CreateGEP(Val, Builder.getInt64(0));
        auto F = RuntimeDebugBuilder::getAddressSpaceCast(Builder, 4, 0);
        Val = Builder.CreateCall(F, Val);
      } else {
        Val = Builder.CreatePtrToInt(Val, Builder.getInt64Ty());
      }
    } else {
      llvm_unreachable("Unknown type");
    }

    Ty = Val->getType();
    // Store each argument at a fixed 8-byte (2*i32) slot; the store pointer
    // is cast into the alloca's address space (5).
    Ptr = Builder.CreatePointerBitCastOrAddrSpaceCast(Ptr, Ty->getPointerTo(5));
    Builder.CreateAlignedStore(Val, Ptr, 4);

    if (Ty->isFloatingPointTy())
      str += "%f";
    else if (Ty->isIntegerTy())
      str += "%ld";
    else
      str += "%s";

    Offset += 2;
  }

  Value *Format = Builder.CreateGlobalStringPtr(str, "polly.vprintf.buffer", 4);
  Format = Builder.CreateCall(getAddressSpaceCast(Builder, 4, 0), Format);

  Data = Builder.CreateBitCast(Data, Builder.getInt8PtrTy());

  Builder.CreateCall(getVPrintF(Builder), {Format, Data});
}
// Return (or lazily declare) printf in the current module.
//
// Declared fully variadic (i32 printf(...)); the format string is passed as
// the first variadic argument by createPrintF.
Function *RuntimeDebugBuilder::getPrintF(PollyIRBuilder &Builder) {
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();

  if (Function *Existing = M->getFunction("printf"))
    return Existing;

  FunctionType *FTy = FunctionType::get(Builder.getInt32Ty(), true);
  return Function::Create(FTy, Function::ExternalLinkage, "printf", M);
}
// Emit a call to printf with the given format string followed by Values.
void RuntimeDebugBuilder::createPrintF(PollyIRBuilder &Builder,
                                       std::string Format,
                                       ArrayRef<Value *> Values) {
  std::vector<Value *> Arguments;
  Arguments.reserve(Values.size() + 1);

  // The format string becomes printf's first argument.
  Arguments.push_back(Builder.CreateGlobalStringPtr(Format));
  Arguments.insert(Arguments.end(), Values.begin(), Values.end());

  Builder.CreateCall(getPrintF(Builder), Arguments);
}
void RuntimeDebugBuilder::createFlush(PollyIRBuilder &Builder) {
Module *M = Builder.GetInsertBlock()->getParent()->getParent();
const char *Name = "fflush";
Function *F = M->getFunction(Name);
if (!F) {
GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
FunctionType *Ty =
FunctionType::get(Builder.getInt32Ty(), Builder.getInt8PtrTy(), false);
F = Function::Create(Ty, Linkage, Name, M);
}
// fflush(NULL) flushes _all_ open output streams.
//
// fflush is declared as 'int fflush(FILE *stream)'. As we only pass on a NULL
// pointer, the type we point to does conceptually not matter. However, if
// fflush is already declared in this translation unit, we use the very same
// type to ensure that LLVM does not complain about mismatching types.
Builder.CreateCall(F, Constant::getNullValue(F->arg_begin()->getType()));
}

View File

@@ -0,0 +1,221 @@
//===--- Utils.cpp - Utility functions for the code generation --*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file contains utility functions for the code generation.
//
//===----------------------------------------------------------------------===//
#include "polly/CodeGen/Utils.h"
#include "polly/CodeGen/IRBuilder.h"
#include "polly/ScopInfo.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/RegionInfo.h"
#include "llvm/Support/Debug.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
using namespace llvm;
// Alternative to llvm::SplitCriticalEdge.
//
// Creates a new block which branches to Succ. The edge to split is redirected
// to the new block.
//
// The issue with llvm::SplitCriticalEdge is that it does nothing if the edge is
// not critical.
// The issue with llvm::SplitEdge is that it does not always create the middle
// block, but reuses Prev/Succ if it can. We always want a new middle block.
static BasicBlock *splitEdge(BasicBlock *Prev, BasicBlock *Succ,
                             const char *Suffix, DominatorTree *DT,
                             LoopInfo *LI, RegionInfo *RI) {
  assert(Prev && Succ);

  // Before:
  //
  //      \    /     //
  //       Prev      //
  //      |  \__/    //
  //      |   __     //
  //      |  /  \    //
  //       Succ      //
  //      /    \     //

  // The DominatorTree/LoopInfo update algorithm of llvm::SplitCriticalEdge is
  // more efficient than the more general llvm::SplitBlockPredecessors used
  // here. In the future we might either modify llvm::SplitCriticalEdge to
  // allow skipping the critical edge check, or copy&paste its implementation.
  BasicBlock *MiddleBlock = SplitBlockPredecessors(
      Succ, ArrayRef<BasicBlock *>(Prev), Suffix, DT, LI);

  // Assign the new block to a region: Prev's region if the split put it
  // inside that region, otherwise Succ's region.
  if (RI) {
    Region *PrevRegion = RI->getRegionFor(Prev);
    Region *SuccRegion = RI->getRegionFor(Succ);
    RI->setRegionFor(MiddleBlock, PrevRegion->contains(MiddleBlock)
                                      ? PrevRegion
                                      : SuccRegion);
  }

  // After:
  //
  //      \    /     //
  //       Prev      //
  //      |  \__/    //
  //      |          //
  //  MiddleBlock    //
  //      |   __     //
  //      |  /  \    //
  //       Succ      //
  //      /    \     //

  return MiddleBlock;
}
// Wrap the SCoP region in a runtime check.
//
// Builds a CFG skeleton around the original region: a conditional branch on
// RTC in a new fork block decides between the original (unoptimized) region
// and the new code that will later be placed between the returned StartBlock
// and ExitingBlock. DominatorTree, RegionInfo and LoopInfo are updated for
// every block created or re-targeted here.
//
// Returns the pair (StartBlock, ExitingBlock) delimiting the slot for the
// generated version, plus the conditional branch so callers can adjust the
// condition later.
std::pair<polly::BBPair, BranchInst *>
polly::executeScopConditionally(Scop &S, Value *RTC, DominatorTree &DT,
                                RegionInfo &RI, LoopInfo &LI) {
  Region &R = S.getRegion();
  PollyIRBuilder Builder(S.getEntry());

  // Before:
  //
  //      \   /            //
  //    EnteringBB         //
  //   _____|_____         //
  //  /  EntryBB  \        //
  //  |  (region) |        //
  //  \_ExitingBB_/        //
  //        |              //
  //      ExitBB           //
  //      /    \           //

  // Create a fork block.
  BasicBlock *EnteringBB = S.getEnteringBlock();
  BasicBlock *EntryBB = S.getEntry();
  assert(EnteringBB && "Must be a simple region");
  BasicBlock *SplitBlock =
      splitEdge(EnteringBB, EntryBB, ".split_new_and_old", &DT, &LI, &RI);
  SplitBlock->setName("polly.split_new_and_old");

  // If EntryBB is the exit block of the region that includes Prev, exclude
  // SplitBlock from that region by making it itself the exit block. This is
  // trivially possible because there is just one edge to EnteringBB.
  // This is necessary because we will add an outgoing edge from SplitBlock,
  // which would violate the single exit block requirement of PrevRegion.
  // Walk upwards: several nested regions may share EntryBB as their exit.
  Region *PrevRegion = RI.getRegionFor(EnteringBB);
  while (PrevRegion->getExit() == EntryBB) {
    PrevRegion->replaceExit(SplitBlock);
    PrevRegion = PrevRegion->getParent();
  }
  RI.setRegionFor(SplitBlock, PrevRegion);

  // Create a join block
  BasicBlock *ExitingBB = S.getExitingBlock();
  BasicBlock *ExitBB = S.getExit();
  assert(ExitingBB && "Must be a simple region");
  BasicBlock *MergeBlock =
      splitEdge(ExitingBB, ExitBB, ".merge_new_and_old", &DT, &LI, &RI);
  MergeBlock->setName("polly.merge_new_and_old");

  // Exclude the join block from the region.
  R.replaceExitRecursive(MergeBlock);
  RI.setRegionFor(MergeBlock, R.getParent());

  //      \   /            //
  //    EnteringBB         //
  //        |              //
  //    SplitBlock         //
  //   _____|_____         //
  //  /  EntryBB  \        //
  //  |  (region) |        //
  //  \_ExitingBB_/        //
  //        |              //
  //    MergeBlock         //
  //        |              //
  //      ExitBB           //
  //      /    \           //

  // Create the start and exiting block.
  Function *F = SplitBlock->getParent();
  BasicBlock *StartBlock =
      BasicBlock::Create(F->getContext(), "polly.start", F);
  BasicBlock *ExitingBlock =
      BasicBlock::Create(F->getContext(), "polly.exiting", F);
  // Replace SplitBlock's unconditional terminator with the runtime check:
  // branch to the new code (StartBlock) if RTC holds, otherwise fall back to
  // the original region entry.
  SplitBlock->getTerminator()->eraseFromParent();
  Builder.SetInsertPoint(SplitBlock);
  BranchInst *CondBr = Builder.CreateCondBr(RTC, StartBlock, S.getEntry());

  // The new blocks live in whatever loop the fork block lives in.
  if (Loop *L = LI.getLoopFor(SplitBlock)) {
    L->addBasicBlockToLoop(StartBlock, LI);
    L->addBasicBlockToLoop(ExitingBlock, LI);
  }

  DT.addNewBlock(StartBlock, SplitBlock);
  DT.addNewBlock(ExitingBlock, StartBlock);
  RI.setRegionFor(StartBlock, RI.getRegionFor(SplitBlock));
  RI.setRegionFor(ExitingBlock, RI.getRegionFor(SplitBlock));

  //      \   /                    //
  //    EnteringBB                 //
  //        |                      //
  //    SplitBlock---------\       //
  //   _____|_____         |       //
  //  /  EntryBB  \    StartBlock  //
  //  |  (region) |        |       //
  //  \_ExitingBB_/   ExitingBlock //
  //        |                      //
  //    MergeBlock                 //
  //        |                      //
  //      ExitBB                   //
  //      /    \                   //

  // Connect start block to exiting block.
  Builder.SetInsertPoint(StartBlock);
  Builder.CreateBr(ExitingBlock);
  DT.changeImmediateDominator(ExitingBlock, StartBlock);

  // Connect exiting block to join block.
  Builder.SetInsertPoint(ExitingBlock);
  Builder.CreateBr(MergeBlock);
  // MergeBlock is now reached from both ExitingBB and ExitingBlock, so its
  // immediate dominator is their common dominator, SplitBlock.
  DT.changeImmediateDominator(MergeBlock, SplitBlock);

  //      \   /                    //
  //    EnteringBB                 //
  //        |                      //
  //    SplitBlock---------\       //
  //   _____|_____         |       //
  //  /  EntryBB  \    StartBlock  //
  //  |  (region) |        |       //
  //  \_ExitingBB_/   ExitingBlock //
  //        |              |       //
  //    MergeBlock---------/       //
  //        |                      //
  //      ExitBB                   //
  //      /    \                   //
  //

  // Split the edge between SplitBlock and EntryBB, to avoid a critical edge.
  splitEdge(SplitBlock, EntryBB, ".pre_entry_bb", &DT, &LI, &RI);

  //      \   /                    //
  //    EnteringBB                 //
  //        |                      //
  //    SplitBlock---------\       //
  //        |              |       //
  //    PreEntryBB         |       //
  //   _____|_____         |       //
  //  /  EntryBB  \    StartBlock  //
  //  |  (region) |        |       //
  //  \_ExitingBB_/   ExitingBlock //
  //        |              |       //
  //    MergeBlock---------/       //
  //        |                      //
  //      ExitBB                   //
  //      /    \                   //

  return std::make_pair(std::make_pair(StartBlock, ExitingBlock), CondBr);
}