Bug 885186 - Optimize x86/x64 register moves using xchg, xor swap, and push/pop. r=jandem

Dan Gohman 2013-06-26 10:32:55 -07:00
parent 1973753238
commit b2f0e66ef3
8 changed files with 208 additions and 124 deletions


@@ -99,6 +99,9 @@ class MoveResolver
return disp_ == other.disp_;
return true;
}
bool operator !=(const MoveOperand &other) const {
return !operator==(other);
}
};
class Move


@@ -287,6 +287,10 @@ class AssemblerX86Shared
}
}
void xchgl(const Register &src, const Register &dest) {
masm.xchgl_rr(src.code(), dest.code());
}
void movsd(const FloatRegister &src, const FloatRegister &dest) {
JS_ASSERT(HasSSE2());
masm.movsd_rr(src.code(), dest.code());


@@ -14,24 +14,127 @@ using namespace js::ion;
MoveEmitterX86::MoveEmitterX86(MacroAssemblerSpecific &masm)
: inCycle_(false),
masm(masm),
pushedAtCycle_(-1),
pushedAtSpill_(-1),
spilledReg_(InvalidReg)
pushedAtCycle_(-1)
{
pushedAtStart_ = masm.framePushed();
}
// Examine the cycle in moves starting at position i. Determine if it's a
// simple cycle consisting of all register-to-register moves in a single class,
// and whether it can be implemented entirely by swaps.
size_t
MoveEmitterX86::characterizeCycle(const MoveResolver &moves, size_t i,
bool *allGeneralRegs, bool *allFloatRegs)
{
size_t swapCount = 0;
for (size_t j = i; ; j++) {
const Move &move = moves.getMove(j);
// If it isn't a cycle of registers of the same kind, we won't be able
// to optimize it.
if (!move.to().isGeneralReg())
*allGeneralRegs = false;
if (!move.to().isFloatReg())
*allFloatRegs = false;
if (!*allGeneralRegs && !*allFloatRegs)
return -1;
// The first and last move of the cycle are marked with inCycle(). Stop
// iterating when we see the last one.
if (j != i && move.inCycle())
break;
// Check that this move is actually part of the cycle. This is
// over-conservative when there are multiple reads from the same source,
// but that's expected to be rare.
if (move.from() != moves.getMove(j + 1).to()) {
*allGeneralRegs = false;
*allFloatRegs = false;
return -1;
}
swapCount++;
}
// Check that the last move cycles back to the first move.
const Move &move = moves.getMove(i + swapCount);
if (move.from() != moves.getMove(i).to()) {
*allGeneralRegs = false;
*allFloatRegs = false;
return -1;
}
return swapCount;
}
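
As an aside on what characterizeCycle is counting: a cycle of N register-to-register moves can always be resolved with N - 1 exchanges and no scratch storage. The following standalone sketch is not part of this patch; the register names and values are illustrative, and std::swap stands in for the xchg instruction.

#include <cassert>
#include <utility>

int main() {
    // Pretend register file for the cycle eax -> ebx, ebx -> ecx, ecx -> eax:
    // ebx should end up with eax's old value, ecx with ebx's old value, and
    // eax with ecx's old value.
    int eax = 1, ebx = 2, ecx = 3;

    // swapCount == 2: two exchanges resolve the three-element cycle.
    std::swap(eax, ebx);   // ebx now holds old eax
    std::swap(eax, ecx);   // ecx now holds old ebx, eax now holds old ecx

    assert(ebx == 1 && ecx == 2 && eax == 3);
    return 0;
}

The exact pairing of registers chosen by maybeEmitOptimizedCycle follows the order of the moves; the sketch only demonstrates why swapCount exchanges suffice.
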
// If we can emit optimized code for the cycle in moves starting at position i,
// do so, and return true.
bool
MoveEmitterX86::maybeEmitOptimizedCycle(const MoveResolver &moves, size_t i,
bool allGeneralRegs, bool allFloatRegs, size_t swapCount)
{
if (allGeneralRegs && swapCount <= 2) {
// Use x86's swap-integer-registers instruction if we only have a few
// swaps. (x86 also has a swap between registers and memory but it's
// slow.)
for (size_t k = 0; k < swapCount; k++)
masm.xchg(moves.getMove(i + k).to().reg(), moves.getMove(i + k + 1).to().reg());
return true;
}
if (allFloatRegs && swapCount == 1) {
// There's no xchg for xmm registers, but if we only need a single swap,
// it's cheap to do an XOR swap.
FloatRegister a = moves.getMove(i).to().floatReg();
FloatRegister b = moves.getMove(i + 1).to().floatReg();
masm.xorpd(a, b);
masm.xorpd(b, a);
masm.xorpd(a, b);
return true;
}
return false;
}
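
The single-swap float case relies on the classic XOR-swap identity, applied to xmm registers through three xorpd instructions. Here is a standalone sketch of the identity, not from this patch, using plain 64-bit integers and illustrative bit patterns:

#include <cassert>
#include <cstdint>

int main() {
    uint64_t a = 0x3ff0000000000000ull;  // bit pattern of the double 1.0
    uint64_t b = 0x4000000000000000ull;  // bit pattern of the double 2.0

    a ^= b;  // a == a0 ^ b0
    b ^= a;  // b == b0 ^ (a0 ^ b0) == a0
    a ^= b;  // a == (a0 ^ b0) ^ a0 == b0

    assert(a == 0x4000000000000000ull && b == 0x3ff0000000000000ull);
    return 0;
}

The identity only works for two distinct locations (XORing a register with itself would zero it), which is fine here since a two-element cycle always names two different registers.
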
void
MoveEmitterX86::emit(const MoveResolver &moves)
{
if (moves.hasCycles()) {
// Reserve stack for cycle resolution
masm.reserveStack(sizeof(double));
pushedAtCycle_ = masm.framePushed();
}
for (size_t i = 0; i < moves.numMoves(); i++) {
const Move &move = moves.getMove(i);
const MoveOperand &from = move.from();
const MoveOperand &to = move.to();
for (size_t i = 0; i < moves.numMoves(); i++)
emit(moves.getMove(i));
if (move.inCycle()) {
// If this is the end of a cycle for which we're using the stack,
// handle the end.
if (inCycle_) {
completeCycle(to, move.kind());
inCycle_ = false;
continue;
}
// Characterize the cycle.
bool allGeneralRegs = true, allFloatRegs = true;
size_t swapCount = characterizeCycle(moves, i, &allGeneralRegs, &allFloatRegs);
// Attempt to optimize it to avoid using the stack.
if (maybeEmitOptimizedCycle(moves, i, allGeneralRegs, allFloatRegs, swapCount)) {
i += swapCount;
continue;
}
// Otherwise use the stack.
breakCycle(to, move.kind());
inCycle_ = true;
}
// A normal move which is not part of a cycle.
if (move.kind() == Move::DOUBLE)
emitDoubleMove(from, to);
else
emitGeneralMove(from, to);
}
}
MoveEmitterX86::~MoveEmitterX86()
@@ -40,17 +143,20 @@ MoveEmitterX86::~MoveEmitterX86()
}
Operand
MoveEmitterX86::cycleSlot() const
MoveEmitterX86::cycleSlot()
{
if (pushedAtCycle_ == -1) {
// Reserve stack for cycle resolution
masm.reserveStack(sizeof(double));
pushedAtCycle_ = masm.framePushed();
}
return Operand(StackPointer, masm.framePushed() - pushedAtCycle_);
}
Operand
MoveEmitterX86::spillSlot() const
{
return Operand(StackPointer, masm.framePushed() - pushedAtSpill_);
}
// Warning, do not use the resulting operand with pop instructions, since they
// compute the effective destination address after altering the stack pointer.
// Use toPopOperand if an Operand is needed for a pop.
Operand
MoveEmitterX86::toOperand(const MoveOperand &operand) const
{
@@ -70,31 +176,32 @@ MoveEmitterX86::toOperand(const MoveOperand &operand) const
return Operand(operand.floatReg());
}
Register
MoveEmitterX86::tempReg()
// This is the same as toOperand except that it computes an Operand suitable for
// use in a pop.
Operand
MoveEmitterX86::toPopOperand(const MoveOperand &operand) const
{
if (spilledReg_ != InvalidReg)
return spilledReg_;
if (operand.isMemory()) {
if (operand.base() != StackPointer)
return Operand(operand.base(), operand.disp());
// For now, just pick edx/rdx as the eviction point. This is totally
// random, and if it ends up being bad, we can use actual heuristics later.
spilledReg_ = edx;
JS_ASSERT(operand.disp() >= 0);
#ifdef JS_CPU_X64
JS_ASSERT(edx == rdx);
#endif
if (pushedAtSpill_ == -1) {
masm.Push(spilledReg_);
pushedAtSpill_ = masm.framePushed();
} else {
masm.mov(spilledReg_, spillSlot());
// Otherwise, the stack offset may need to be adjusted.
Note the extra adjustment by one stack slot here; it compensates for the fact
that pop computes its effective address after incrementing the stack pointer.
return Operand(StackPointer,
operand.disp() + (masm.framePushed() - sizeof(void *) - pushedAtStart_));
}
return spilledReg_;
if (operand.isGeneralReg())
return Operand(operand.reg());
JS_ASSERT(operand.isFloatReg());
return Operand(operand.floatReg());
}
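
The displacement arithmetic in toPopOperand can be checked with a small standalone model; it is not from this patch and the numbers are illustrative. A destination slot described relative to the stack pointer as it was at pushedAtStart_ has to be rebased onto the current stack pointer, and then reduced by one slot because pop increments the stack pointer before forming its effective address.

#include <cassert>
#include <cstdint>

int main() {
    const uintptr_t slotSize = sizeof(void *);
    uintptr_t sp0    = 0x1000;            // stack pointer at pushedAtStart_
    uintptr_t disp   = 8;                 // destination slot lives at sp0 + disp
    uintptr_t pushed = 2 * slotSize;      // bytes pushed since pushedAtStart_
    uintptr_t sp     = sp0 - pushed;      // current stack pointer

    // Displacement handed to "pop [sp + d]", mirroring toPopOperand.
    uintptr_t d = disp + pushed - slotSize;

    // pop bumps the stack pointer first, then adds the displacement.
    uintptr_t popTarget = (sp + slotSize) + d;
    assert(popTarget == sp0 + disp);      // it lands on the intended slot
    return 0;
}
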
void
MoveEmitterX86::breakCycle(const MoveOperand &from, const MoveOperand &to, Move::Kind kind)
MoveEmitterX86::breakCycle(const MoveOperand &to, Move::Kind kind)
{
// There is some pattern:
// (A -> B)
@@ -110,23 +217,15 @@ MoveEmitterX86::breakCycle(const MoveOperand &from, const MoveOperand &to, Move:
masm.movsd(to.floatReg(), cycleSlot());
}
} else {
if (to.isMemory()) {
Register temp = tempReg();
masm.mov(toOperand(to), temp);
masm.mov(temp, cycleSlot());
} else {
if (to.reg() == spilledReg_) {
// If the destination was spilled, restore it first.
masm.mov(spillSlot(), spilledReg_);
spilledReg_ = InvalidReg;
}
masm.mov(to.reg(), cycleSlot());
}
if (to.isMemory())
masm.Push(toOperand(to));
else
masm.Push(to.reg());
}
}
void
MoveEmitterX86::completeCycle(const MoveOperand &from, const MoveOperand &to, Move::Kind kind)
MoveEmitterX86::completeCycle(const MoveOperand &to, Move::Kind kind)
{
// There is some pattern:
// (A -> B)
@@ -143,35 +242,17 @@ MoveEmitterX86::completeCycle(const MoveOperand &from, const MoveOperand &to, Mo
}
} else {
if (to.isMemory()) {
Register temp = tempReg();
masm.mov(cycleSlot(), temp);
masm.mov(temp, toOperand(to));
masm.Pop(toPopOperand(to));
} else {
if (to.reg() == spilledReg_) {
// Make sure we don't re-clobber the spilled register later.
spilledReg_ = InvalidReg;
}
masm.mov(cycleSlot(), to.reg());
masm.Pop(to.reg());
}
}
}
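
Taken together, breakCycle and completeCycle now bracket a cycle with a push and a matching pop instead of parking a value in a spilled temporary register. A standalone sketch of the scheme, not from this patch, for a two-element cycle between memory slots A and B, with a std::vector standing in for the machine stack:

#include <cassert>
#include <vector>

int main() {
    int A = 1, B = 2;                 // two memory slots forming the cycle
    std::vector<int> stack;           // stands in for push/pop on the real stack

    stack.push_back(B);               // breakCycle: Push(toOperand(B)), saving B
    B = A;                            // the ordinary (A -> B) move runs as usual
    A = stack.back();                 // completeCycle: Pop(toPopOperand(A))
    stack.pop_back();

    assert(A == 2 && B == 1);         // values were exchanged via the stack
    return 0;
}
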
void
MoveEmitterX86::emitMove(const MoveOperand &from, const MoveOperand &to)
MoveEmitterX86::emitGeneralMove(const MoveOperand &from, const MoveOperand &to)
{
if (to.isGeneralReg() && to.reg() == spilledReg_) {
// If the destination is the spilled register, make sure we
// don't re-clobber its value.
spilledReg_ = InvalidReg;
}
if (from.isGeneralReg()) {
if (from.reg() == spilledReg_) {
// If the source is a register that has been spilled, make sure
// to load the source back into that register.
masm.mov(spillSlot(), spilledReg_);
spilledReg_ = InvalidReg;
}
masm.mov(from.reg(), toOperand(to));
} else if (to.isGeneralReg()) {
JS_ASSERT(from.isMemory() || from.isEffectiveAddress());
@@ -179,20 +260,31 @@ MoveEmitterX86::emitMove(const MoveOperand &from, const MoveOperand &to)
masm.mov(toOperand(from), to.reg());
else
masm.lea(toOperand(from), to.reg());
} else {
} else if (from.isMemory()) {
// Memory to memory gpr move.
Register reg = tempReg();
// Reload its previous value from the stack.
if (reg == from.base())
masm.mov(spillSlot(), from.base());
JS_ASSERT(from.isMemory() || from.isEffectiveAddress());
if (from.isMemory())
masm.mov(toOperand(from), reg);
else
masm.lea(toOperand(from), reg);
JS_ASSERT(to.base() != reg);
masm.mov(reg, toOperand(to));
#ifdef JS_CPU_X64
// x64 has a ScratchReg. Use it.
masm.mov(toOperand(from), ScratchReg);
masm.mov(ScratchReg, toOperand(to));
#else
// No ScratchReg; bounce it off the stack.
masm.Push(toOperand(from));
masm.Pop(toPopOperand(to));
#endif
} else {
// Effective address to memory move.
JS_ASSERT(from.isEffectiveAddress());
#ifdef JS_CPU_X64
// x64 has a ScratchReg. Use it.
masm.lea(toOperand(from), ScratchReg);
masm.mov(ScratchReg, toOperand(to));
#else
// This is tricky without a ScratchReg. We can't do an lea. Bounce the
// base register off the stack, then add the offset in place.
masm.Push(from.base());
masm.Pop(toPopOperand(to));
masm.addPtr(Imm32(from.disp()), toOperand(to));
#endif
}
}
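
On x86, which has no dedicated ScratchReg, both new fallback paths above also route values through the stack. The following standalone sketch is not from this patch (addresses and values are illustrative); it models a memory-to-memory move bounced off the stack, and an effective-address store done as push base, pop into the destination, then add the displacement in place, since lea cannot write to memory.

#include <cassert>
#include <cstdint>
#include <vector>

int main() {
    std::vector<intptr_t> stack;       // stands in for push/pop on the real stack

    // Memory -> memory: Push(toOperand(from)); Pop(toPopOperand(to));
    intptr_t fromSlot = 42, toSlot = 0;
    stack.push_back(fromSlot);
    toSlot = stack.back();
    stack.pop_back();
    assert(toSlot == 42);

    // Effective address (base + disp) -> memory:
    // Push(from.base()); Pop(toPopOperand(to)); addPtr(Imm32(from.disp()), toOperand(to));
    intptr_t base = 0x2000, disp = 24, dest = 0;
    stack.push_back(base);
    dest = stack.back();               // dest temporarily holds the base
    stack.pop_back();
    dest += disp;                      // dest now holds base + disp
    assert(dest == 0x2000 + 24);
    return 0;
}
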
@@ -211,29 +303,6 @@ MoveEmitterX86::emitDoubleMove(const MoveOperand &from, const MoveOperand &to)
}
}
void
MoveEmitterX86::emit(const Move &move)
{
const MoveOperand &from = move.from();
const MoveOperand &to = move.to();
if (move.inCycle()) {
if (inCycle_) {
completeCycle(from, to, move.kind());
inCycle_ = false;
return;
}
breakCycle(from, to, move.kind());
inCycle_ = true;
}
if (move.kind() == Move::DOUBLE)
emitDoubleMove(from, to);
else
emitMove(from, to);
}
void
MoveEmitterX86::assertDone()
{
@@ -245,9 +314,6 @@ MoveEmitterX86::finish()
{
assertDone();
if (pushedAtSpill_ != -1 && spilledReg_ != InvalidReg)
masm.mov(spillSlot(), spilledReg_);
masm.freeStack(masm.framePushed() - pushedAtStart_);
}


@@ -26,28 +26,23 @@ class MoveEmitterX86
// Original stack push value.
uint32_t pushedAtStart_;
// These store stack offsets to spill locations, snapshotting
// codegen->framePushed_ at the time they were allocated. They are -1 if no
// stack space has been allocated for that particular spill.
This stores the stack offset of the cycle-break spill slot, snapshotting
codegen->framePushed_ at the time it is allocated. It is -1 if not allocated.
int32_t pushedAtCycle_;
int32_t pushedAtSpill_;
// Register that is available for temporary use. It may be assigned
// InvalidReg. If no corresponding spill space has been assigned,
then this register does not need to be spilled.
Register spilledReg_;
void assertDone();
Register tempReg();
Operand cycleSlot() const;
Operand spillSlot() const;
Operand cycleSlot();
Operand toOperand(const MoveOperand &operand) const;
Operand toPopOperand(const MoveOperand &operand) const;
void emitMove(const MoveOperand &from, const MoveOperand &to);
size_t characterizeCycle(const MoveResolver &moves, size_t i,
bool *allGeneralRegs, bool *allFloatRegs);
bool maybeEmitOptimizedCycle(const MoveResolver &moves, size_t i,
bool allGeneralRegs, bool allFloatRegs, size_t swapCount);
void emitGeneralMove(const MoveOperand &from, const MoveOperand &to);
void emitDoubleMove(const MoveOperand &from, const MoveOperand &to);
void breakCycle(const MoveOperand &from, const MoveOperand &to, Move::Kind kind);
void completeCycle(const MoveOperand &from, const MoveOperand &to, Move::Kind kind);
void emit(const Move &move);
void breakCycle(const MoveOperand &to, Move::Kind kind);
void completeCycle(const MoveOperand &to, Move::Kind kind);
public:
MoveEmitterX86(MacroAssemblerSpecific &masm);


@@ -428,6 +428,10 @@ class Assembler : public AssemblerX86Shared
masm.movq_rr(src.code(), dest.code());
}
void xchgq(const Register &src, const Register &dest) {
masm.xchgq_rr(src.code(), dest.code());
}
void andq(const Register &src, const Register &dest) {
masm.andq_rr(src.code(), dest.code());
}
@@ -552,6 +556,9 @@ class Assembler : public AssemblerX86Shared
masm.movq_i64r(label->prev(), dest.code());
label->setPrev(masm.size());
}
void xchg(const Register &src, const Register &dest) {
xchgq(src, dest);
}
void lea(const Operand &src, const Register &dest) {
switch (src.kind()) {
case Operand::REG_DISP:


@@ -450,6 +450,9 @@ class MacroAssemblerX64 : public MacroAssemblerX86Shared
void addPtr(Imm32 imm, const Address &dest) {
addq(imm, Operand(dest));
}
void addPtr(Imm32 imm, const Operand &dest) {
addq(imm, dest);
}
void addPtr(ImmWord imm, const Register &dest) {
JS_ASSERT(dest != ScratchReg);
if ((intptr_t)imm.value <= INT32_MAX && (intptr_t)imm.value >= INT32_MIN) {


@@ -343,6 +343,9 @@ class Assembler : public AssemblerX86Shared
void mov(const Register &src, const Register &dest) {
movl(src, dest);
}
void xchg(const Register &src, const Register &dest) {
xchgl(src, dest);
}
void lea(const Operand &src, const Register &dest) {
return leal(src, dest);
}


@@ -488,6 +488,9 @@ class MacroAssemblerX86 : public MacroAssemblerX86Shared
void addPtr(Imm32 imm, const Address &dest) {
addl(imm, Operand(dest));
}
void addPtr(Imm32 imm, const Operand &dest) {
addl(imm, dest);
}
void addPtr(const Address &src, const Register &dest) {
addl(Operand(src), dest);
}