Bug 885186 - Optimize x86/x64 register moves using xchg, xor swap, and push/pop. r=jandem

Dan Gohman 2013-06-26 10:32:55 -07:00
parent 1973753238
commit b2f0e66ef3
8 changed files with 208 additions and 124 deletions


@@ -99,6 +99,9 @@ class MoveResolver
return disp_ == other.disp_;
return true;
}
bool operator !=(const MoveOperand &other) const {
return !operator==(other);
}
};
class Move


@@ -287,6 +287,10 @@ class AssemblerX86Shared
}
}
void xchgl(const Register &src, const Register &dest) {
masm.xchgl_rr(src.code(), dest.code());
}
void movsd(const FloatRegister &src, const FloatRegister &dest) {
JS_ASSERT(HasSSE2());
masm.movsd_rr(src.code(), dest.code());


@@ -14,24 +14,127 @@ using namespace js::ion;
MoveEmitterX86::MoveEmitterX86(MacroAssemblerSpecific &masm)
: inCycle_(false),
masm(masm),
pushedAtCycle_(-1),
pushedAtSpill_(-1),
spilledReg_(InvalidReg)
pushedAtCycle_(-1)
{
pushedAtStart_ = masm.framePushed();
}
// Examine the cycle in moves starting at position i. Determine if it's a
// simple cycle consisting of all register-to-register moves in a single class,
// and whether it can be implemented entirely by swaps.
size_t
MoveEmitterX86::characterizeCycle(const MoveResolver &moves, size_t i,
bool *allGeneralRegs, bool *allFloatRegs)
{
size_t swapCount = 0;
for (size_t j = i; ; j++) {
const Move &move = moves.getMove(j);
// If it isn't a cycle of registers of the same kind, we won't be able
// to optimize it.
if (!move.to().isGeneralReg())
*allGeneralRegs = false;
if (!move.to().isFloatReg())
*allFloatRegs = false;
if (!*allGeneralRegs && !*allFloatRegs)
return -1;
// The first and last move of the cycle are marked with inCycle(). Stop
// iterating when we see the last one.
if (j != i && move.inCycle())
break;
// Check that this move is actually part of the cycle. This is
// over-conservative when there are multiple reads from the same source,
// but that's expected to be rare.
if (move.from() != moves.getMove(j + 1).to()) {
*allGeneralRegs = false;
*allFloatRegs = false;
return -1;
}
swapCount++;
}
// Check that the last move cycles back to the first move.
const Move &move = moves.getMove(i + swapCount);
if (move.from() != moves.getMove(i).to()) {
*allGeneralRegs = false;
*allFloatRegs = false;
return -1;
}
return swapCount;
}
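
As an aside on what characterizeCycle is counting: a cycle of N register-to-register moves can always be resolved with N - 1 exchanges and no scratch storage. The following standalone sketch is not part of this patch; the register names and values are illustrative, and std::swap stands in for the xchg instruction.

#include <cassert>
#include <utility>

int main() {
    // Pretend register file for the cycle eax -> ebx, ebx -> ecx, ecx -> eax:
    // ebx should end up with eax's old value, ecx with ebx's old value, and
    // eax with ecx's old value.
    int eax = 1, ebx = 2, ecx = 3;

    // swapCount == 2: two exchanges resolve the three-element cycle.
    std::swap(eax, ebx);   // ebx now holds old eax
    std::swap(eax, ecx);   // ecx now holds old ebx, eax now holds old ecx

    assert(ebx == 1 && ecx == 2 && eax == 3);
    return 0;
}

The exact pairing of registers chosen by maybeEmitOptimizedCycle follows the order of the moves; the sketch only demonstrates why swapCount exchanges suffice.
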
// If we can emit optimized code for the cycle in moves starting at position i,
// do so, and return true.
bool
MoveEmitterX86::maybeEmitOptimizedCycle(const MoveResolver &moves, size_t i,
bool allGeneralRegs, bool allFloatRegs, size_t swapCount)
{
if (allGeneralRegs && swapCount <= 2) {
// Use x86's swap-integer-registers instruction if we only have a few
// swaps. (x86 also has a swap between registers and memory but it's
// slow.)
for (size_t k = 0; k < swapCount; k++)
masm.xchg(moves.getMove(i + k).to().reg(), moves.getMove(i + k + 1).to().reg());
return true;
}
if (allFloatRegs && swapCount == 1) {
// There's no xchg for xmm registers, but if we only need a single swap,
// it's cheap to do an XOR swap.
FloatRegister a = moves.getMove(i).to().floatReg();
FloatRegister b = moves.getMove(i + 1).to().floatReg();
masm.xorpd(a, b);
masm.xorpd(b, a);
masm.xorpd(a, b);
return true;
}
return false;
}
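
The single-swap float case relies on the classic XOR-swap identity, applied to xmm registers through three xorpd instructions. Here is a standalone sketch of the identity, not from this patch, using plain 64-bit integers and illustrative bit patterns:

#include <cassert>
#include <cstdint>

int main() {
    uint64_t a = 0x3ff0000000000000ull;  // bit pattern of the double 1.0
    uint64_t b = 0x4000000000000000ull;  // bit pattern of the double 2.0

    a ^= b;  // a == a0 ^ b0
    b ^= a;  // b == b0 ^ (a0 ^ b0) == a0
    a ^= b;  // a == (a0 ^ b0) ^ a0 == b0

    assert(a == 0x4000000000000000ull && b == 0x3ff0000000000000ull);
    return 0;
}

The identity only works for two distinct locations (XORing a register with itself would zero it), which is fine here since a two-element cycle always names two different registers.
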
void
MoveEmitterX86::emit(const MoveResolver &moves)
{
if (moves.hasCycles()) {
// Reserve stack for cycle resolution
masm.reserveStack(sizeof(double));
pushedAtCycle_ = masm.framePushed();
}
for (size_t i = 0; i < moves.numMoves(); i++) {
const Move &move = moves.getMove(i);
const MoveOperand &from = move.from();
const MoveOperand &to = move.to();
for (size_t i = 0; i < moves.numMoves(); i++)
emit(moves.getMove(i));
if (move.inCycle()) {
// If this is the end of a cycle for which we're using the stack,
// handle the end.
if (inCycle_) {
completeCycle(to, move.kind());
inCycle_ = false;
continue;
}
// Characterize the cycle.
bool allGeneralRegs = true, allFloatRegs = true;
size_t swapCount = characterizeCycle(moves, i, &allGeneralRegs, &allFloatRegs);
// Attempt to optimize it to avoid using the stack.
if (maybeEmitOptimizedCycle(moves, i, allGeneralRegs, allFloatRegs, swapCount)) {
i += swapCount;
continue;
}
// Otherwise use the stack.
breakCycle(to, move.kind());
inCycle_ = true;
}
// A normal move which is not part of a cycle.
if (move.kind() == Move::DOUBLE)
emitDoubleMove(from, to);
else
emitGeneralMove(from, to);
}
}
MoveEmitterX86::~MoveEmitterX86()
@@ -40,17 +143,20 @@ MoveEmitterX86::~MoveEmitterX86()
}
Operand
MoveEmitterX86::cycleSlot() const
MoveEmitterX86::cycleSlot()
{
if (pushedAtCycle_ == -1) {
// Reserve stack for cycle resolution
masm.reserveStack(sizeof(double));
pushedAtCycle_ = masm.framePushed();
}
return Operand(StackPointer, masm.framePushed() - pushedAtCycle_);
}
Operand
MoveEmitterX86::spillSlot() const
{
return Operand(StackPointer, masm.framePushed() - pushedAtSpill_);
}
// Warning, do not use the resulting operand with pop instructions, since they
// compute the effective destination address after altering the stack pointer.
// Use toPopOperand if an Operand is needed for a pop.
Operand
MoveEmitterX86::toOperand(const MoveOperand &operand) const
{
@@ -70,31 +176,32 @@ MoveEmitterX86::toOperand(const MoveOperand &operand) const
return Operand(operand.floatReg());
}
Register
MoveEmitterX86::tempReg()
// This is the same as toOperand except that it computes an Operand suitable for
// use in a pop.
Operand
MoveEmitterX86::toPopOperand(const MoveOperand &operand) const
{
if (spilledReg_ != InvalidReg)
return spilledReg_;
if (operand.isMemory()) {
if (operand.base() != StackPointer)
return Operand(operand.base(), operand.disp());
// For now, just pick edx/rdx as the eviction point. This is totally
// random, and if it ends up being bad, we can use actual heuristics later.
spilledReg_ = edx;
JS_ASSERT(operand.disp() >= 0);
#ifdef JS_CPU_X64
JS_ASSERT(edx == rdx);
#endif
if (pushedAtSpill_ == -1) {
masm.Push(spilledReg_);
pushedAtSpill_ = masm.framePushed();
} else {
masm.mov(spilledReg_, spillSlot());
// Otherwise, the stack offset may need to be adjusted.
Note the extra adjustment by one stack slot here; it compensates for the fact
that pop computes its effective address after incrementing the stack pointer.
return Operand(StackPointer,
operand.disp() + (masm.framePushed() - sizeof(void *) - pushedAtStart_));
}
return spilledReg_;
if (operand.isGeneralReg())
return Operand(operand.reg());
JS_ASSERT(operand.isFloatReg());
return Operand(operand.floatReg());
}
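
The displacement arithmetic in toPopOperand can be checked with a small standalone model; it is not from this patch and the numbers are illustrative. A destination slot described relative to the stack pointer as it was at pushedAtStart_ has to be rebased onto the current stack pointer, and then reduced by one slot because pop increments the stack pointer before forming its effective address.

#include <cassert>
#include <cstdint>

int main() {
    const uintptr_t slotSize = sizeof(void *);
    uintptr_t sp0    = 0x1000;            // stack pointer at pushedAtStart_
    uintptr_t disp   = 8;                 // destination slot lives at sp0 + disp
    uintptr_t pushed = 2 * slotSize;      // bytes pushed since pushedAtStart_
    uintptr_t sp     = sp0 - pushed;      // current stack pointer

    // Displacement handed to "pop [sp + d]", mirroring toPopOperand.
    uintptr_t d = disp + pushed - slotSize;

    // pop bumps the stack pointer first, then adds the displacement.
    uintptr_t popTarget = (sp + slotSize) + d;
    assert(popTarget == sp0 + disp);      // it lands on the intended slot
    return 0;
}
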
void
MoveEmitterX86::breakCycle(const MoveOperand &from, const MoveOperand &to, Move::Kind kind)
MoveEmitterX86::breakCycle(const MoveOperand &to, Move::Kind kind)
{
// There is some pattern:
// (A -> B)
@@ -110,23 +217,15 @@ MoveEmitterX86::breakCycle(const MoveOperand &from, const MoveOperand &to, Move:
masm.movsd(to.floatReg(), cycleSlot());
}
} else {
if (to.isMemory()) {
Register temp = tempReg();
masm.mov(toOperand(to), temp);
masm.mov(temp, cycleSlot());
} else {
if (to.reg() == spilledReg_) {
// If the destination was spilled, restore it first.
masm.mov(spillSlot(), spilledReg_);
spilledReg_ = InvalidReg;
}
masm.mov(to.reg(), cycleSlot());
}
if (to.isMemory())
masm.Push(toOperand(to));
else
masm.Push(to.reg());
}
}
void
MoveEmitterX86::completeCycle(const MoveOperand &from, const MoveOperand &to, Move::Kind kind)
MoveEmitterX86::completeCycle(const MoveOperand &to, Move::Kind kind)
{
// There is some pattern:
// (A -> B)
@@ -143,35 +242,17 @@ MoveEmitterX86::completeCycle(const MoveOperand &from, const MoveOperand &to, Mo
}
} else {
if (to.isMemory()) {
Register temp = tempReg();
masm.mov(cycleSlot(), temp);
masm.mov(temp, toOperand(to));
masm.Pop(toPopOperand(to));
} else {
if (to.reg() == spilledReg_) {
// Make sure we don't re-clobber the spilled register later.
spilledReg_ = InvalidReg;
}
masm.mov(cycleSlot(), to.reg());
masm.Pop(to.reg());
}
}
}
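
Taken together, breakCycle and completeCycle now bracket a cycle with a push and a matching pop instead of parking a value in a spilled temporary register. A standalone sketch of the scheme, not from this patch, for a two-element cycle between memory slots A and B, with a std::vector standing in for the machine stack:

#include <cassert>
#include <vector>

int main() {
    int A = 1, B = 2;                 // two memory slots forming the cycle
    std::vector<int> stack;           // stands in for push/pop on the real stack

    stack.push_back(B);               // breakCycle: Push(toOperand(B)), saving B
    B = A;                            // the ordinary (A -> B) move runs as usual
    A = stack.back();                 // completeCycle: Pop(toPopOperand(A))
    stack.pop_back();

    assert(A == 2 && B == 1);         // values were exchanged via the stack
    return 0;
}
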
void
MoveEmitterX86::emitMove(const MoveOperand &from, const MoveOperand &to)
MoveEmitterX86::emitGeneralMove(const MoveOperand &from, const MoveOperand &to)
{
if (to.isGeneralReg() && to.reg() == spilledReg_) {
// If the destination is the spilled register, make sure we
// don't re-clobber its value.
spilledReg_ = InvalidReg;
}
if (from.isGeneralReg()) {
if (from.reg() == spilledReg_) {
// If the source is a register that has been spilled, make sure
// to load the source back into that register.
masm.mov(spillSlot(), spilledReg_);
spilledReg_ = InvalidReg;
}
masm.mov(from.reg(), toOperand(to));
} else if (to.isGeneralReg()) {
JS_ASSERT(from.isMemory() || from.isEffectiveAddress());
@@ -179,20 +260,31 @@ MoveEmitterX86::emitMove(const MoveOperand &from, const MoveOperand &to)
masm.mov(toOperand(from), to.reg());
else
masm.lea(toOperand(from), to.reg());
} else {
} else if (from.isMemory()) {
// Memory to memory gpr move.
Register reg = tempReg();
// Reload its previous value from the stack.
if (reg == from.base())
masm.mov(spillSlot(), from.base());
JS_ASSERT(from.isMemory() || from.isEffectiveAddress());
if (from.isMemory())
masm.mov(toOperand(from), reg);
else
masm.lea(toOperand(from), reg);
JS_ASSERT(to.base() != reg);
masm.mov(reg, toOperand(to));
#ifdef JS_CPU_X64
// x64 has a ScratchReg. Use it.
masm.mov(toOperand(from), ScratchReg);
masm.mov(ScratchReg, toOperand(to));
#else
// No ScratchReg; bounce it off the stack.
masm.Push(toOperand(from));
masm.Pop(toPopOperand(to));
#endif
} else {
// Effective address to memory move.
JS_ASSERT(from.isEffectiveAddress());
#ifdef JS_CPU_X64
// x64 has a ScratchReg. Use it.
masm.lea(toOperand(from), ScratchReg);
masm.mov(ScratchReg, toOperand(to));
#else
// This is tricky without a ScratchReg. We can't do an lea. Bounce the
// base register off the stack, then add the offset in place.
masm.Push(from.base());
masm.Pop(toPopOperand(to));
masm.addPtr(Imm32(from.disp()), toOperand(to));
#endif
}
}
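
On x86, which has no dedicated ScratchReg, both new fallback paths above also route values through the stack. The following standalone sketch is not from this patch (addresses and values are illustrative); it models a memory-to-memory move bounced off the stack, and an effective-address store done as push base, pop into the destination, then add the displacement in place, since lea cannot write to memory.

#include <cassert>
#include <cstdint>
#include <vector>

int main() {
    std::vector<intptr_t> stack;       // stands in for push/pop on the real stack

    // Memory -> memory: Push(toOperand(from)); Pop(toPopOperand(to));
    intptr_t fromSlot = 42, toSlot = 0;
    stack.push_back(fromSlot);
    toSlot = stack.back();
    stack.pop_back();
    assert(toSlot == 42);

    // Effective address (base + disp) -> memory:
    // Push(from.base()); Pop(toPopOperand(to)); addPtr(Imm32(from.disp()), toOperand(to));
    intptr_t base = 0x2000, disp = 24, dest = 0;
    stack.push_back(base);
    dest = stack.back();               // dest temporarily holds the base
    stack.pop_back();
    dest += disp;                      // dest now holds base + disp
    assert(dest == 0x2000 + 24);
    return 0;
}
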
@@ -211,29 +303,6 @@ MoveEmitterX86::emitDoubleMove(const MoveOperand &from, const MoveOperand &to)
}
}
void
MoveEmitterX86::emit(const Move &move)
{
const MoveOperand &from = move.from();
const MoveOperand &to = move.to();
if (move.inCycle()) {
if (inCycle_) {
completeCycle(from, to, move.kind());
inCycle_ = false;
return;
}
breakCycle(from, to, move.kind());
inCycle_ = true;
}
if (move.kind() == Move::DOUBLE)
emitDoubleMove(from, to);
else
emitMove(from, to);
}
void
MoveEmitterX86::assertDone()
{
@@ -245,9 +314,6 @@ MoveEmitterX86::finish()
{
assertDone();
if (pushedAtSpill_ != -1 && spilledReg_ != InvalidReg)
masm.mov(spillSlot(), spilledReg_);
masm.freeStack(masm.framePushed() - pushedAtStart_);
}


@@ -26,28 +26,23 @@ class MoveEmitterX86
// Original stack push value.
uint32_t pushedAtStart_;
// These store stack offsets to spill locations, snapshotting
// codegen->framePushed_ at the time they were allocated. They are -1 if no
// stack space has been allocated for that particular spill.
This stores the stack offset of the cycle-break spill slot, snapshotting
codegen->framePushed_ at the time it is allocated. It is -1 if not allocated.
int32_t pushedAtCycle_;
int32_t pushedAtSpill_;
// Register that is available for temporary use. It may be assigned
// InvalidReg. If no corresponding spill space has been assigned,
then this register does not need to be spilled.
Register spilledReg_;
void assertDone();
Register tempReg();
Operand cycleSlot() const;
Operand spillSlot() const;
Operand cycleSlot();
Operand toOperand(const MoveOperand &operand) const;
Operand toPopOperand(const MoveOperand &operand) const;
void emitMove(const MoveOperand &from, const MoveOperand &to);
size_t characterizeCycle(const MoveResolver &moves, size_t i,
bool *allGeneralRegs, bool *allFloatRegs);
bool maybeEmitOptimizedCycle(const MoveResolver &moves, size_t i,
bool allGeneralRegs, bool allFloatRegs, size_t swapCount);
void emitGeneralMove(const MoveOperand &from, const MoveOperand &to);
void emitDoubleMove(const MoveOperand &from, const MoveOperand &to);
void breakCycle(const MoveOperand &from, const MoveOperand &to, Move::Kind kind);
void completeCycle(const MoveOperand &from, const MoveOperand &to, Move::Kind kind);
void emit(const Move &move);
void breakCycle(const MoveOperand &to, Move::Kind kind);
void completeCycle(const MoveOperand &to, Move::Kind kind);
public:
MoveEmitterX86(MacroAssemblerSpecific &masm);


@@ -428,6 +428,10 @@ class Assembler : public AssemblerX86Shared
masm.movq_rr(src.code(), dest.code());
}
void xchgq(const Register &src, const Register &dest) {
masm.xchgq_rr(src.code(), dest.code());
}
void andq(const Register &src, const Register &dest) {
masm.andq_rr(src.code(), dest.code());
}
@@ -552,6 +556,9 @@ class Assembler : public AssemblerX86Shared
masm.movq_i64r(label->prev(), dest.code());
label->setPrev(masm.size());
}
void xchg(const Register &src, const Register &dest) {
xchgq(src, dest);
}
void lea(const Operand &src, const Register &dest) {
switch (src.kind()) {
case Operand::REG_DISP:


@@ -450,6 +450,9 @@ class MacroAssemblerX64 : public MacroAssemblerX86Shared
void addPtr(Imm32 imm, const Address &dest) {
addq(imm, Operand(dest));
}
void addPtr(Imm32 imm, const Operand &dest) {
addq(imm, dest);
}
void addPtr(ImmWord imm, const Register &dest) {
JS_ASSERT(dest != ScratchReg);
if ((intptr_t)imm.value <= INT32_MAX && (intptr_t)imm.value >= INT32_MIN) {


@@ -343,6 +343,9 @@ class Assembler : public AssemblerX86Shared
void mov(const Register &src, const Register &dest) {
movl(src, dest);
}
void xchg(const Register &src, const Register &dest) {
xchgl(src, dest);
}
void lea(const Operand &src, const Register &dest) {
return leal(src, dest);
}


@@ -488,6 +488,9 @@ class MacroAssemblerX86 : public MacroAssemblerX86Shared
void addPtr(Imm32 imm, const Address &dest) {
addl(imm, Operand(dest));
}
void addPtr(Imm32 imm, const Operand &dest) {
addl(imm, dest);
}
void addPtr(const Address &src, const Register &dest) {
addl(Operand(src), dest);
}