[Statepoints] Support lowering gc relocations to virtual registers

(Disabled under flag for the moment)

This is part of a larger project wherein we are finally integrating lowering of gc live operands with the register allocator.  Today, we force spill all operands in SelectionDAG.  The code to do so is distinctly non-optimal.  The approach this patch is working towards is to instead lower the relocations directly into the MI form, and let the register allocator pick which ones get spilled and which stack slots they get spilled to.  In terms of performance, the latter part is actually more important as it avoids redundant shuffling of values between stack slots.

This particular change adds ISEL support to produce the variadic def STATEPOINT form required by the above.  In particular, the first N relocations are lowered to variadic tied def/use pairs, so the new statepoint looks like this:
reloc1,reloc2,... = STATEPOINT ..., base1, derived1<tied-def0>, base2, derived2<tied-def1>, ...

N is limited by the maximum number of tied registers a machine instruction can have (15 at the moment).

The current patch is restricted to handling relocations within a single basic block.  Cross block relocations (e.g. invokes) are handled via the legacy mechanism.  This restriction will be relaxed in future patches.

Patch By: dantrushin
Differential Revision: https://reviews.llvm.org/D81648
This commit is contained in:
Philip Reames
2020-07-11 10:50:34 -07:00
parent 9182dc7814
commit 3da1a9634e
6 changed files with 1074 additions and 24 deletions
@@ -82,6 +82,28 @@ static unsigned countOperands(SDNode *Node, unsigned NumExpUses,
return N;
}
/// Compute the operand index at which the GC operand list of a STATEPOINT
/// machine instruction begins, i.e. the index just past all deopt arguments.
// FIXME: need a better place for this. Put it in StackMaps?
static unsigned getStatepointGCArgStartIdx(MachineInstr *MI) {
  assert(MI->getOpcode() == TargetOpcode::STATEPOINT &&
         "STATEPOINT node expected");
  const unsigned NumDeoptsIdx = StatepointOpers(MI).getNumDeoptArgsIdx();
  // The deopt arguments start right after the operand holding their count.
  unsigned Idx = NumDeoptsIdx + 1;
  // At this point stack references have not been lowered yet, so each of
  // them still takes a single operand.
  for (unsigned Remaining = MI->getOperand(NumDeoptsIdx).getImm();
       Remaining != 0; --Remaining) {
    const MachineOperand &Op = MI->getOperand(Idx);
    const bool IsConstantMarker =
        Op.isImm() && Op.getImm() == StackMaps::ConstantOp;
    if (IsConstantMarker) {
      // A ConstantOp marker is followed by one extra immediate operand;
      // step over the marker so the common increment below skips that one.
      ++Idx;
      assert(MI->getOperand(Idx).isImm() &&
             "Unexpected statepoint operand");
    }
    ++Idx;
  }
  return Idx;
}
/// EmitCopyFromReg - Generate machine code for an CopyFromReg node or an
/// implicit physical register output.
void InstrEmitter::
@@ -200,6 +222,8 @@ void InstrEmitter::CreateVirtualRegisters(SDNode *Node,
bool HasVRegVariadicDefs = !MF->getTarget().usesPhysRegsForValues() &&
II.isVariadic() && II.variadicOpsAreDefs();
unsigned NumVRegs = HasVRegVariadicDefs ? NumResults : II.getNumDefs();
if (Node->getMachineOpcode() == TargetOpcode::STATEPOINT)
NumVRegs = NumResults;
for (unsigned i = 0; i < NumVRegs; ++i) {
// If the specific node value is only used by a CopyToReg and the dest reg
// is a vreg in the same register class, use the CopyToReg'd destination
@@ -821,6 +845,8 @@ EmitMachineNode(SDNode *Node, bool IsClone, bool IsCloned,
NumDefs = NumResults;
}
ScratchRegs = TLI->getScratchRegisters((CallingConv::ID) CC);
} else if (Opc == TargetOpcode::STATEPOINT) {
NumDefs = NumResults;
}
unsigned NumImpUses = 0;
@@ -970,6 +996,20 @@ EmitMachineNode(SDNode *Node, bool IsClone, bool IsCloned,
if (!UsedRegs.empty() || II.getImplicitDefs() || II.hasOptionalDef())
MIB->setPhysRegsDeadExcept(UsedRegs, *TRI);
// STATEPOINT is too 'dynamic' to have meaningful machine description.
// We have to manually tie operands.
if (Opc == TargetOpcode::STATEPOINT && NumDefs > 0) {
assert(!HasPhysRegOuts && "STATEPOINT mishandled");
MachineInstr *MI = MIB;
unsigned Def = 0;
unsigned Use = getStatepointGCArgStartIdx(MI) + 1;
while (Def < NumDefs) {
if (MI->getOperand(Use).isReg())
MI->tieOperands(Def++, Use);
Use += 2;
}
}
// Run post-isel target hook to adjust this instruction if needed.
if (II.hasPostISelHook())
TLI->AdjustInstrPostInstrSelection(*MIB, Node);
@@ -125,8 +125,7 @@ static void CheckForPhysRegDependency(SDNode *Def, SDNode *User, unsigned Op,
PhysReg = Reg;
} else if (Def->isMachineOpcode()) {
const MCInstrDesc &II = TII->get(Def->getMachineOpcode());
if (ResNo >= II.getNumDefs() &&
II.ImplicitDefs[ResNo - II.getNumDefs()] == Reg)
if (ResNo >= II.getNumDefs() && II.hasImplicitDefOfPhysReg(Reg))
PhysReg = Reg;
}
@@ -67,6 +67,10 @@ cl::opt<bool> UseRegistersForDeoptValues(
"use-registers-for-deopt-values", cl::Hidden, cl::init(false),
cl::desc("Allow using registers for non pointer deopt args"));
cl::opt<unsigned> MaxRegistersForGCPointers(
"max-registers-for-gc-values", cl::Hidden, cl::init(0),
cl::desc("Max number of VRegs allowed to pass GC pointer meta args in"));
static void pushStackMapConstant(SmallVectorImpl<SDValue>& Ops,
SelectionDAGBuilder &Builder, uint64_t Value) {
SDLoc L = Builder.getCurSDLoc();
@@ -86,11 +90,13 @@ void StatepointLoweringState::startNewStatepoint(SelectionDAGBuilder &Builder) {
// FunctionLoweringInfo. Also need to ensure used bits get cleared.
AllocatedStackSlots.clear();
AllocatedStackSlots.resize(Builder.FuncInfo.StatepointStackSlots.size());
DerivedPtrMap.clear();
}
/// Drop all lowering state held by this object: the derived pointer map,
/// the allocated stack slot set and the recorded value locations.
/// Asserts that no gc.relocate calls are still pending when this is called.
void StatepointLoweringState::clear() {
  // The three containers are independent of each other, so the order in
  // which they are emptied does not matter.
  DerivedPtrMap.clear();
  AllocatedStackSlots.clear();
  Locations.clear();
  assert(PendingGCRelocateCalls.empty() &&
         "cleared before statepoint sequence completed");
}
@@ -221,7 +227,6 @@ static Optional<int> findPreviousSpillSlot(const Value *Val,
return None;
}
/// Return true if-and-only-if the given SDValue can be lowered as either a
/// constant argument or a stack reference. The key point is that the value
/// doesn't need to be spilled or tracked as a vreg use.
@@ -242,7 +247,6 @@ static bool willLowerDirectly(SDValue Incoming) {
Incoming.isUndef());
}
/// Try to find existing copies of the incoming values in stack slots used for
/// statepoint spilling. If we can find a spill slot for the incoming value,
/// mark that slot as allocated, and reuse the same slot for this safepoint.
@@ -388,7 +392,7 @@ spillIncomingStatepointValue(SDValue Incoming, SDValue Chain,
StoreMMO);
MMO = getMachineMemOperand(MF, *cast<FrameIndexSDNode>(Loc));
Builder.StatepointLowering.setLocation(Incoming, Loc);
}
@@ -485,7 +489,9 @@ lowerIncomingStatepointValue(SDValue Incoming, bool RequireSpillSlot,
/// will be set to the last value spilled (if any were).
static void
lowerStatepointMetaArgs(SmallVectorImpl<SDValue> &Ops,
SmallVectorImpl<MachineMemOperand*> &MemRefs, SelectionDAGBuilder::StatepointLoweringInfo &SI,
SmallVectorImpl<MachineMemOperand *> &MemRefs,
DenseMap<SDValue, int> &LowerAsVReg,
SelectionDAGBuilder::StatepointLoweringInfo &SI,
SelectionDAGBuilder &Builder) {
// Lower the deopt and gc arguments for this statepoint. Layout will be:
// deopt argument length, deopt arguments.., gc arguments...
@@ -531,6 +537,37 @@ lowerStatepointMetaArgs(SmallVectorImpl<SDValue> &Ops,
const bool LiveInDeopt =
SI.StatepointFlags & (uint64_t)StatepointFlags::DeoptLiveIn;
// Decide which derived pointers will go in VRegs
const unsigned MaxTiedRegs = 15; // Max number of tied regs MI can have.
unsigned MaxVRegPtrs =
std::min(MaxTiedRegs, MaxRegistersForGCPointers.getValue());
// Use old spill scheme for cross-block relocates.
if (SI.StatepointInstr) {
const BasicBlock *BB = SI.StatepointInstr->getParent();
bool NonLocalReloc =
llvm::any_of(SI.GCRelocates, [BB](const GCRelocateInst *R) {
return R->getParent() != BB;
});
if (NonLocalReloc)
MaxVRegPtrs = 0;
}
LLVM_DEBUG(dbgs() << "Desiding how to lower GC Pointers:\n");
unsigned CurNumVRegs = 0;
for (const Value *P : SI.Ptrs) {
if (LowerAsVReg.size() == MaxVRegPtrs)
break;
SDValue PtrSD = Builder.getValue(P);
if (willLowerDirectly(PtrSD) || P->getType()->isVectorTy()) {
LLVM_DEBUG(dbgs() << "direct/spill "; PtrSD.dump(&Builder.DAG));
continue;
}
LLVM_DEBUG(dbgs() << "vreg "; PtrSD.dump(&Builder.DAG));
LowerAsVReg[PtrSD] = CurNumVRegs++;
}
LLVM_DEBUG(dbgs() << LowerAsVReg.size()
<< " derived pointers will go in vregs\n");
auto isGCValue = [&](const Value *V) {
auto *Ty = V->getType();
if (!Ty->isPtrOrPtrVectorTy())
@@ -542,7 +579,9 @@ lowerStatepointMetaArgs(SmallVectorImpl<SDValue> &Ops,
};
auto requireSpillSlot = [&](const Value *V) {
return !(LiveInDeopt || UseRegistersForDeoptValues) || isGCValue(V);
if (isGCValue(V))
return !LowerAsVReg.count(Builder.getValue(V));
return !(LiveInDeopt || UseRegistersForDeoptValues);
};
// Before we actually start lowering (and allocating spill slots for values),
@@ -554,9 +593,14 @@ lowerStatepointMetaArgs(SmallVectorImpl<SDValue> &Ops,
if (requireSpillSlot(V))
reservePreviousStackSlotForValue(V, Builder);
}
for (unsigned i = 0; i < SI.Bases.size(); ++i) {
reservePreviousStackSlotForValue(SI.Bases[i], Builder);
reservePreviousStackSlotForValue(SI.Ptrs[i], Builder);
SDValue SDV = Builder.getValue(SI.Bases[i]);
if (!LowerAsVReg.count(SDV))
reservePreviousStackSlotForValue(SI.Bases[i], Builder);
SDV = Builder.getValue(SI.Ptrs[i]);
if (!LowerAsVReg.count(SDV))
reservePreviousStackSlotForValue(SI.Ptrs[i], Builder);
}
// First, prefix the list with the number of unique values to be
@@ -567,6 +611,7 @@ lowerStatepointMetaArgs(SmallVectorImpl<SDValue> &Ops,
// The vm state arguments are lowered in an opaque manner. We do not know
// what type of values are contained within.
LLVM_DEBUG(dbgs() << "Lowering deopt state\n");
for (const Value *V : SI.DeoptState) {
SDValue Incoming;
// If this is a function argument at a static frame index, generate it as
@@ -578,6 +623,8 @@ lowerStatepointMetaArgs(SmallVectorImpl<SDValue> &Ops,
}
if (!Incoming.getNode())
Incoming = Builder.getValue(V);
LLVM_DEBUG(dbgs() << "Value " << *V
<< " requireSpillSlot = " << requireSpillSlot(V) << "\n");
lowerIncomingStatepointValue(Incoming, requireSpillSlot(V), Ops, MemRefs,
Builder);
}
@@ -588,14 +635,15 @@ lowerStatepointMetaArgs(SmallVectorImpl<SDValue> &Ops,
// it's (lowered) derived pointer. i.e
// (base[0], ptr[0], base[1], ptr[1], ...)
for (unsigned i = 0; i < SI.Bases.size(); ++i) {
const Value *Base = SI.Bases[i];
lowerIncomingStatepointValue(Builder.getValue(Base),
/*RequireSpillSlot*/ true, Ops, MemRefs,
bool RequireSpillSlot;
SDValue Base = Builder.getValue(SI.Bases[i]);
RequireSpillSlot = !LowerAsVReg.count(Base);
lowerIncomingStatepointValue(Base, RequireSpillSlot, Ops, MemRefs,
Builder);
const Value *Ptr = SI.Ptrs[i];
lowerIncomingStatepointValue(Builder.getValue(Ptr),
/*RequireSpillSlot*/ true, Ops, MemRefs,
SDValue Derived = Builder.getValue(SI.Ptrs[i]);
RequireSpillSlot = !LowerAsVReg.count(Derived);
lowerIncomingStatepointValue(Derived, RequireSpillSlot, Ops, MemRefs,
Builder);
}
@@ -630,7 +678,9 @@ lowerStatepointMetaArgs(SmallVectorImpl<SDValue> &Ops,
SDValue SDV = Builder.getValue(V);
SDValue Loc = Builder.StatepointLowering.getLocation(SDV);
if (Loc.getNode()) {
if (LowerAsVReg.count(SDV)) {
SpillMap[V] = None;
} else if (Loc.getNode()) {
SpillMap[V] = cast<FrameIndexSDNode>(Loc)->getIndex();
} else {
// Record value as visited, but not spilled. This is case for allocas
@@ -665,6 +715,7 @@ SDValue SelectionDAGBuilder::LowerAsSTATEPOINT(
assert(SI.Bases.size() == SI.Ptrs.size() &&
SI.Ptrs.size() <= SI.GCRelocates.size());
LLVM_DEBUG(dbgs() << "Lowering statepoint " << *SI.StatepointInstr << "\n");
#ifndef NDEBUG
for (auto *Reloc : SI.GCRelocates)
if (Reloc->getParent() == SI.StatepointInstr->getParent())
@@ -674,7 +725,9 @@ SDValue SelectionDAGBuilder::LowerAsSTATEPOINT(
// Lower statepoint vmstate and gcstate arguments
SmallVector<SDValue, 10> LoweredMetaArgs;
SmallVector<MachineMemOperand*, 16> MemRefs;
lowerStatepointMetaArgs(LoweredMetaArgs, MemRefs, SI, *this);
// Maps derived pointer SDValue to statepoint result of relocated pointer.
DenseMap<SDValue, int> LowerAsVReg;
lowerStatepointMetaArgs(LoweredMetaArgs, MemRefs, LowerAsVReg, SI, *this);
// Now that we've emitted the spills, we need to update the root so that the
// call sequence is ordered correctly.
@@ -788,14 +841,35 @@ SDValue SelectionDAGBuilder::LowerAsSTATEPOINT(
// Compute return values. Provide a glue output since we consume one as
// input. This allows someone else to chain off us as needed.
SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
SmallVector<EVT, 8> NodeTys;
for (auto &Ptr : SI.Ptrs) {
SDValue SD = getValue(Ptr);
if (LowerAsVReg.count(SD)) {
NodeTys.push_back(SD.getValueType());
}
}
LLVM_DEBUG(dbgs() << "Statepoint has " << NodeTys.size() << " results\n");
assert(NodeTys.size() == LowerAsVReg.size() && "Inconsistent GC Ptr lowering");
NodeTys.push_back(MVT::Other);
NodeTys.push_back(MVT::Glue);
unsigned NumResults = NodeTys.size();
MachineSDNode *StatepointMCNode =
DAG.getMachineNode(TargetOpcode::STATEPOINT, getCurSDLoc(), NodeTys, Ops);
DAG.setNodeMemRefs(StatepointMCNode, MemRefs);
SDNode *SinkNode = StatepointMCNode;
// Fill mapping from derived pointer to statepoint result denoting its
// relocated value.
auto &DPtrMap = StatepointLowering.DerivedPtrMap;
for (const auto *Relocate : SI.GCRelocates) {
Value *Derived = Relocate->getDerivedPtr();
SDValue SD = getValue(Derived);
if (LowerAsVReg.count(SD))
DPtrMap[Derived] = SDValue(StatepointMCNode, LowerAsVReg[SD]);
}
// Build the GC_TRANSITION_END node if necessary.
//
// See the comment above regarding GC_TRANSITION_START for the layout of
@@ -804,7 +878,7 @@ SDValue SelectionDAGBuilder::LowerAsSTATEPOINT(
SmallVector<SDValue, 8> TEOps;
// Add chain
TEOps.push_back(SDValue(StatepointMCNode, 0));
TEOps.push_back(SDValue(StatepointMCNode, NumResults - 2));
// Add GC transition arguments
for (const Value *V : SI.GCTransitionArgs) {
@@ -814,7 +888,7 @@ SDValue SelectionDAGBuilder::LowerAsSTATEPOINT(
}
// Add glue
TEOps.push_back(SDValue(StatepointMCNode, 1));
TEOps.push_back(SDValue(StatepointMCNode, NumResults - 1));
SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
@@ -825,7 +899,12 @@ SDValue SelectionDAGBuilder::LowerAsSTATEPOINT(
}
// Replace original call
DAG.ReplaceAllUsesWith(CallNode, SinkNode); // This may update Root
// Call: ch,glue = CALL ...
// Statepoint: [gc relocates],ch,glue = STATEPOINT ...
unsigned NumSinkValues = SinkNode->getNumValues();
SDValue StatepointValues[2] = {SDValue(SinkNode, NumSinkValues - 2),
SDValue(SinkNode, NumSinkValues - 1)};
DAG.ReplaceAllUsesWith(CallNode, StatepointValues);
// Remove original call node
DAG.DeleteNode(CallNode);
@@ -927,7 +1006,7 @@ SelectionDAGBuilder::LowerStatepoint(const GCStatepointInst &I,
setValue(&I, ReturnValue);
return;
}
// Result value will be used in a different basic block so we need to export
// it now. Default exporting mechanism will not work here because statepoint
// call has a different type than the actual call. It means that by default
@@ -1010,12 +1089,13 @@ void SelectionDAGBuilder::visitGCResult(const GCResultInst &CI) {
}
void SelectionDAGBuilder::visitGCRelocate(const GCRelocateInst &Relocate) {
const BasicBlock *StatepointBB = Relocate.getStatepoint()->getParent();
#ifndef NDEBUG
// Consistency check
// We skip this check for relocates not in the same basic block as their
// statepoint. It would be too expensive to preserve validation info through
// different basic blocks.
if (Relocate.getStatepoint()->getParent() == Relocate.getParent())
if (StatepointBB == Relocate.getParent())
StatepointLowering.relocCallVisited(Relocate);
auto *Ty = Relocate.getType()->getScalarType();
@@ -1033,6 +1113,16 @@ void SelectionDAGBuilder::visitGCRelocate(const GCRelocateInst &Relocate) {
return;
}
// Relocate is local to statepoint block and its pointer was assigned
// to VReg. Use corresponding statepoint result.
auto &DPtrMap = StatepointLowering.DerivedPtrMap;
auto It = DPtrMap.find(DerivedPtr);
if (It != DPtrMap.end()) {
setValue(&Relocate, It->second);
assert(Relocate.getParent() == StatepointBB && "unexpected DPtrMap entry");
return;
}
auto &SpillMap = FuncInfo.StatepointSpillMaps[Relocate.getStatepoint()];
auto SlotIt = SpillMap.find(DerivedPtr);
assert(SlotIt != SpillMap.end() && "Relocating not lowered gc value");
@@ -103,6 +103,10 @@ public:
return AllocatedStackSlots.test(Offset);
}
/// For each statepoint keep mapping from original derived pointer to
/// the statepoint node result defining its new value.
DenseMap<const Value *, SDValue> DerivedPtrMap;
private:
/// Maps pre-relocation value (gc pointer directly incoming into statepoint)
/// to its location (currently only stack slots)
+11 -1
View File
@@ -1041,9 +1041,19 @@ TargetLoweringBase::emitPatchPoint(MachineInstr &InitialMI,
// Inherit previous memory operands.
MIB.cloneMemRefs(*MI);
for (auto &MO : MI->operands()) {
for (unsigned i = 0; i < MI->getNumOperands(); ++i) {
MachineOperand &MO = MI->getOperand(i);
if (!MO.isFI()) {
// Index of the Def operand this Use is tied to.
// Since Defs come before Uses, if a Use is tied, then the
// index of its Def must be smaller than the index of that Use.
// Also, Defs preserve their position in the new MI.
unsigned TiedTo = i;
if (MO.isReg() && MO.isTied())
TiedTo = MI->findTiedOperandIdx(i);
MIB.add(MO);
if (TiedTo < i)
MIB->tieOperands(TiedTo, MIB->getNumOperands() - 1);
continue;
}
File diff suppressed because it is too large Load Diff