Bug 535706 - nanojit: fix regstate updates for X64. r=edwsmith.
--HG-- extra : convert_revision : 6fb5f71fba861a333e17de644fe49de60bb7a1bd

parent 62a7855349, commit 23edcdcbce
@@ -2088,7 +2088,7 @@ namespace nanojit
     /**
      * move regs around so the SavedRegs contains the highest priority regs.
      */
-    void Assembler::evictScratchRegs()
+    void Assembler::evictScratchRegsExcept(RegisterMask ignore)
     {
         // find the top GpRegs that are candidates to put in SavedRegs

@@ -2099,7 +2099,7 @@ namespace nanojit
         int len=0;
         RegAlloc *regs = &_allocator;
         for (Register r = FirstReg; r <= LastReg; r = nextreg(r)) {
-            if (rmask(r) & GpRegs) {
+            if (rmask(r) & GpRegs & ~ignore) {
                 LIns *ins = regs->getActive(r);
                 if (ins) {
                     if (canRemat(ins)) {

@@ -2156,7 +2156,7 @@ namespace nanojit
        }

        // now evict everything else.
-       evictSomeActiveRegs(~SavedRegs);
+       evictSomeActiveRegs(~(SavedRegs | ignore));
    }

    void Assembler::evictAllActiveRegs()
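Taken together, the three hunks above turn evictScratchRegs() into evictScratchRegsExcept(RegisterMask ignore): the priority loop now only considers GpRegs & ~ignore, and the final sweep evicts ~(SavedRegs | ignore), so a caller can keep chosen registers (such as a call's result register) live across the eviction. A minimal, self-contained sketch of that mask logic; the uint32_t masks and toy values below are invented stand-ins for nanojit's RegisterMask/RegAlloc machinery, not the real implementation:

    #include <cstdint>
    #include <cstdio>

    typedef uint32_t RegisterMask;

    static const RegisterMask SavedRegs = 0x0000F000;  // toy callee-saved subset
    static RegisterMask active;                        // toy "currently allocated" set

    static void evictSomeActiveRegs(RegisterMask regs) {
        active &= ~regs;                   // spill everything whose bit is in 'regs'
    }

    static void evictScratchRegsExcept(RegisterMask ignore) {
        // (the real code first moves high-priority values into SavedRegs,
        //  considering only GpRegs & ~ignore)
        evictSomeActiveRegs(~(SavedRegs | ignore));    // never touches 'ignore'
    }

    int main() {
        active = 0x0000000F;
        evictScratchRegsExcept(0);         // behaves like the old evictScratchRegs()
        printf("except(0):  %#x\n", active);           // 0: all scratch regs gone

        active = 0x0000000F;
        evictScratchRegsExcept(1u << 0);   // keep r0, e.g. a call's result register
        printf("except(r0): %#x\n", active);           // 0x1: r0 survived
        return 0;
    }

Note the two calling patterns visible in the rest of the patch: back ends still on the deprecated result-register model pass 0 (equivalent to the old behaviour), while the converted X64 asm_call() passes rmask(rr) to protect its result register.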
@@ -337,7 +337,7 @@ namespace nanojit
            void registerResetAll();
            void evictAllActiveRegs();
            void evictSomeActiveRegs(RegisterMask regs);
-           void evictScratchRegs();
+           void evictScratchRegsExcept(RegisterMask ignore);
            void intersectRegisterState(RegAlloc& saved);
            void unionRegisterState(RegAlloc& saved);
            void assignSaved(RegAlloc &saved, RegisterMask skip);
@@ -826,14 +826,14 @@ Assembler::asm_call(LInsp ins)
             * sequence we'd get would be something like:
             *     MOV {R0-R3},params        [from below]
             *     BL function               [from below]
-            *     MOV {R0-R3},spilled data  [from evictScratchRegs()]
+            *     MOV {R0-R3},spilled data  [from evictScratchRegsExcept()]
             *     MOV Dx,{R0,R1}            [from here]
             * which is clearly broken.
             *
             * This is not a problem for non-floating point calls, because the
             * restoring of spilled data into R0 is done via a call to
             * deprecated_prepResultReg(R0) in the other branch of this if-then-else,
-            * meaning that evictScratchRegs() will not modify R0. However,
+            * meaning that evictScratchRegsExcept() will not modify R0. However,
             * deprecated_prepResultReg is not aware of the concept of using a register pair
             * (R0,R1) for the result of a single operation, so it can only be
             * used here with the ultimate VFP register, and not R0/R1, which

@@ -846,7 +846,7 @@ Assembler::asm_call(LInsp ins)
        // Do this after we've handled the call result, so we don't
        // force the call result to be spilled unnecessarily.

-       evictScratchRegs();
+       evictScratchRegsExcept(0);

        const CallInfo* call = ins->callInfo();
        ArgSize sizes[MAXARGS];
@@ -1568,7 +1568,7 @@ namespace nanojit
        // Do this after we've handled the call result, so we don't
        // force the call result to be spilled unnecessarily.

-       evictScratchRegs();
+       evictScratchRegsExcept(0);

        const CallInfo* call = ins->callInfo();
        ArgSize sizes[MAXARGS];
@@ -688,7 +688,7 @@ namespace nanojit
        // Do this after we've handled the call result, so we don't
        // force the call result to be spilled unnecessarily.

-       evictScratchRegs();
+       evictScratchRegsExcept(0);

        const CallInfo* call = ins->callInfo();
        ArgSize sizes[MAXARGS];
@@ -159,7 +159,7 @@ namespace nanojit
        // Do this after we've handled the call result, so we don't
        // force the call result to be spilled unnecessarily.

-       evictScratchRegs();
+       evictScratchRegsExcept(0);

        const CallInfo* call = ins->callInfo();

@@ -620,63 +620,35 @@ namespace nanojit
            }
        }

-   // register allocation for 2-address style ops of the form R = R (op) B
-   void Assembler::regalloc_binary(LIns *ins, RegisterMask allow, Register &rr, Register &ra, Register &rb) {
-   #ifdef _DEBUG
-       RegisterMask originalAllow = allow;
-   #endif
-       LIns *a = ins->oprnd1();
-       LIns *b = ins->oprnd2();
-       if (a != b) {
-           rb = findRegFor(b, allow);
-           allow &= ~rmask(rb);
-       }
-       rr = deprecated_prepResultReg(ins, allow);
-       // if this is last use of a in reg, we can re-use result reg
-       if (!a->isInReg()) {
-           ra = findSpecificRegForUnallocated(a, rr);
-       } else if (!(allow & rmask(a->getReg()))) {
-           // 'a' already has a register assigned, but it's not valid.
-           // To make sure floating point operations stay in FPU registers
-           // as much as possible, make sure that only a few opcodes are
-           // reserving GPRs.
-           NanoAssert(a->isop(LIR_quad) || a->isop(LIR_float) ||
-                      a->isop(LIR_ldf) || a->isop(LIR_ldfc) ||
-                      a->isop(LIR_ldq) || a->isop(LIR_ldqc) ||
-                      a->isop(LIR_ld32f) || a->isop(LIR_ldc32f) ||
-                      a->isop(LIR_u2f) || a->isop(LIR_fcall));
-           allow &= ~rmask(rr);
-           ra = findRegFor(a, allow);
-       } else {
-           ra = a->getReg();
-       }
-       if (a == b) {
-           rb = ra;
-       }
-       NanoAssert(originalAllow & rmask(rr));
-       NanoAssert(originalAllow & rmask(ra));
-       NanoAssert(originalAllow & rmask(rb));
-   }
-
    void Assembler::asm_qbinop(LIns *ins) {
        asm_arith(ins);
    }

    void Assembler::asm_shift(LIns *ins) {
-       // shift require rcx for shift count
+       // Shift requires rcx for shift count.
+       LIns *a = ins->oprnd1();
        LIns *b = ins->oprnd2();
        if (b->isconst()) {
            asm_shift_imm(ins);
            return;
        }

        Register rr, ra;
-       if (b != ins->oprnd1()) {
+       if (a != b) {
            findSpecificRegFor(b, RCX);
-           regalloc_unary(ins, GpRegs & ~rmask(RCX), rr, ra);
+           beginOp1Regs(ins, GpRegs & ~rmask(RCX), rr, ra);
        } else {
-           // a == b means both must be in RCX
-           regalloc_unary(ins, rmask(RCX), rr, ra);
+           // Nb: this is just like beginOp1Regs() except that it asserts
+           // that ra is in GpRegs instead of rmask(RCX)) -- this is
+           // necessary for the a==b case because 'a' might not be in RCX
+           // (which is ok, the MR(rr, ra) below will move it into RCX).
+           rr = prepareResultReg(ins, rmask(RCX));
+
+           // If 'a' isn't in a register, it can be clobbered by 'ins'.
+           ra = a->isInReg() ? a->getReg() : rr;
+           NanoAssert(rmask(ra) & GpRegs);
        }

        switch (ins->opcode()) {
        default:
            TODO(asm_shift);
@@ -689,11 +661,14 @@ namespace nanojit
        }
        if (rr != ra)
            MR(rr, ra);
+
+       endOpRegs(ins, rr, ra);
    }

    void Assembler::asm_shift_imm(LIns *ins) {
        Register rr, ra;
-       regalloc_unary(ins, GpRegs, rr, ra);
+       beginOp1Regs(ins, GpRegs, rr, ra);
+
        int shift = ins->oprnd2()->imm32() & 63;
        switch (ins->opcode()) {
        default: TODO(shiftimm);

@@ -706,6 +681,8 @@ namespace nanojit
        }
        if (rr != ra)
            MR(rr, ra);
+
+       endOpRegs(ins, rr, ra);
    }

    static bool isImm32(LIns *ins) {
@@ -715,21 +692,22 @@ namespace nanojit
        return ins->isconst() ? ins->imm32() : int32_t(ins->imm64());
    }

-   // binary op, integer regs, rhs is int32 const
+   // Binary op, integer regs, rhs is int32 constant.
    void Assembler::asm_arith_imm(LIns *ins) {
        LIns *b = ins->oprnd2();
        int32_t imm = getImm32(b);
        LOpcode op = ins->opcode();
        Register rr, ra;
        if (op == LIR_mul || op == LIR_mulxov) {
-           // imul has true 3-addr form, it doesn't clobber ra
-           rr = deprecated_prepResultReg(ins, GpRegs);
-           LIns *a = ins->oprnd1();
-           ra = findRegFor(a, GpRegs);
+           // Special case: imul-by-imm has true 3-addr form.  So we don't
+           // need the MR(rr, ra) after the IMULI.
+           beginOp1Regs(ins, GpRegs, rr, ra);
            IMULI(rr, ra, imm);
+           endOpRegs(ins, rr, ra);
            return;
        }
-       regalloc_unary(ins, GpRegs, rr, ra);
+
+       beginOp1Regs(ins, GpRegs, rr, ra);
        if (isS8(imm)) {
            switch (ins->opcode()) {
            default: TODO(arith_imm8);
@@ -765,35 +743,63 @@ namespace nanojit
        }
        if (rr != ra)
            MR(rr, ra);
+
+       endOpRegs(ins, rr, ra);
    }

-   void Assembler::asm_div_mod(LIns *ins) {
-       LIns *div;
-       if (ins->opcode() == LIR_mod) {
-           // LIR_mod expects the LIR_div to be near
-           div = ins->oprnd1();
-           deprecated_prepResultReg(ins, rmask(RDX));
-       } else {
-           div = ins;
-           evictIfActive(RDX);
-       }
-
+   // Generates code for a LIR_div that doesn't have a subsequent LIR_mod.
+   void Assembler::asm_div(LIns *div) {
        NanoAssert(div->isop(LIR_div));
-       LIns *a = div->oprnd1();
-       LIns *b = div->oprnd2();
-
-       deprecated_prepResultReg(div, rmask(RAX));
-       Register rb = findRegFor(b, GpRegs & ~(rmask(RAX)|rmask(RDX)));
-       Register ra = a->isInReg() ? a->getReg() : RAX;
-
-       IDIV(rb);
+       LIns *lhs = div->oprnd1();
+       LIns *rhs = div->oprnd2();
+
+       evictIfActive(RDX);
+       prepareResultReg(div, rmask(RAX));
+
+       Register rhsReg = findRegFor(rhs, GpRegs & ~(rmask(RAX)|rmask(RDX)));
+       Register lhsReg = !lhs->isInReg()
+                       ? findSpecificRegForUnallocated(lhs, RAX)
+                       : lhs->getReg();
+       IDIV(rhsReg);
        SARI(RDX, 31);
        MR(RDX, RAX);
-       if (RAX != ra)
-           MR(RAX, ra);
-       if (!a->isInReg()) {
-           NanoAssert(ra == RAX);
-           findSpecificRegForUnallocated(a, RAX);
-       }
+       if (RAX != lhsReg)
+           MR(RAX, lhsReg);
+
+       freeResourcesOf(div);
+   }

+   // Generates code for a LIR_mod(LIR_div(divL, divR)) sequence.
+   void Assembler::asm_div_mod(LIns *mod) {
+       LIns *div = mod->oprnd1();
+
+       NanoAssert(mod->isop(LIR_mod));
+       NanoAssert(div->isop(LIR_div));
+
+       LIns *divL = div->oprnd1();
+       LIns *divR = div->oprnd2();
+
+       prepareResultReg(mod, rmask(RDX));
+       prepareResultReg(div, rmask(RAX));
+
+       Register rDivR = findRegFor(divR, GpRegs & ~(rmask(RAX)|rmask(RDX)));
+       Register rDivL = divL->isInReg() ? divL->getReg() : RAX;
+
+       IDIV(rDivR);
+       SARI(RDX, 31);
+       MR(RDX, RAX);
+       if (RAX != rDivL)
+           MR(RAX, rDivL);
+
+       freeResourcesOf(mod);
+       freeResourcesOf(div);
+       if (!divL->isInReg()) {
+           NanoAssert(rDivL == RAX);
+           findSpecificRegForUnallocated(divL, RAX);
+       }
    }

    // binary op with integer registers
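A note on reading the new asm_div()/asm_div_mod() bodies: nanojit emits machine code backwards (last instruction first), so the run-time order is MR(RDX,RAX); SARI(RDX,31); IDIV(...), i.e. sign-extend RAX into RDX:RAX and then divide, which is what the i386 back end expresses with CDQ. A small C++ model of what the emitted sequence computes, assuming standard x86-64 idiv semantics (variable names are illustrative, not nanojit code):

    #include <cstdint>
    #include <cstdio>

    int main() {
        int32_t rax = -7, rcx = 2;          // dividend in RAX, divisor in another GPR

        // MR(RDX, RAX); SARI(RDX, 31): replicate the sign bit of RAX into RDX.
        int32_t rdx = rax < 0 ? -1 : 0;
        int64_t dividend = ((int64_t)rdx << 32) | (uint32_t)rax;

        // IDIV: quotient to RAX, remainder to RDX; truncates toward zero.
        int32_t quotient  = (int32_t)(dividend / rcx);   // -3
        int32_t remainder = (int32_t)(dividend % rcx);   // -1, sign follows dividend

        printf("quot=%d rem=%d\n", quotient, remainder);
        return 0;
    }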
@@ -807,9 +813,13 @@ namespace nanojit
            asm_shift(ins);
            return;
        case LIR_mod:
-       case LIR_div:
            asm_div_mod(ins);
            return;
+       case LIR_div:
+           // Nb: if the div feeds into a mod it will be handled by
+           // asm_div_mod() rather than here.
+           asm_div(ins);
+           return;
        default:
            break;
        }
@@ -819,7 +829,7 @@ namespace nanojit
            asm_arith_imm(ins);
            return;
        }
-       regalloc_binary(ins, GpRegs, rr, ra, rb);
+       beginOp2Regs(ins, GpRegs, rr, ra, rb);
        switch (ins->opcode()) {
        default:      TODO(asm_arith);
        case LIR_or:  ORLRR(rr, rb); break;
@@ -838,16 +848,15 @@ namespace nanojit
        case LIR_qaddp: ADDQRR(rr, rb); break;
        }
        if (rr != ra)
-           MR(rr,ra);
+           MR(rr, ra);
+
+       endOpRegs(ins, rr, ra);
    }

-   // binary op with fp registers
+   // Binary op with fp registers.
    void Assembler::asm_fop(LIns *ins) {
-       // NB, rb is always filled in by regalloc_binary,
-       // but compilers can't always tell that: init to UnspecifiedReg
-       // to avoid a warning.
-       Register rr, ra, rb = UnspecifiedReg;
-       regalloc_binary(ins, FpRegs, rr, ra, rb);
+       Register rr, ra, rb = UnspecifiedReg;   // init to shut GCC up
+       beginOp2Regs(ins, FpRegs, rr, ra, rb);
        switch (ins->opcode()) {
        default:       TODO(asm_fop);
        case LIR_fdiv: DIVSD(rr, rb); break;
@@ -858,28 +867,29 @@ namespace nanojit
        if (rr != ra) {
            asm_nongp_copy(rr, ra);
        }
+
+       endOpRegs(ins, rr, ra);
    }

    void Assembler::asm_neg_not(LIns *ins) {
        Register rr, ra;
-       regalloc_unary(ins, GpRegs, rr, ra);
-       NanoAssert(IsGpReg(ra));
+       beginOp1Regs(ins, GpRegs, rr, ra);

        if (ins->isop(LIR_not))
            NOT(rr);
        else
            NEG(rr);
        if (rr != ra)
            MR(rr, ra);
+
+       endOpRegs(ins, rr, ra);
    }

    void Assembler::asm_call(LIns *ins) {
-       Register retReg = ( ins->isop(LIR_fcall) ? XMM0 : retRegs[0] );
-       deprecated_prepResultReg(ins, rmask(retReg));
+       Register rr = ( ins->isop(LIR_fcall) ? XMM0 : retRegs[0] );
+       prepareResultReg(ins, rmask(rr));

        // Do this after we've handled the call result, so we don't
        // force the call result to be spilled unnecessarily.

-       evictScratchRegs();
+       evictScratchRegsExcept(rmask(rr));

        const CallInfo *call = ins->callInfo();
        ArgSize sizes[MAXARGS];
@@ -906,6 +916,9 @@ namespace nanojit
            CALLRAX();
        }

+       // Call this now so that the arg setup can involve 'rr'.
+       freeResourcesOf(ins);
+
    #ifdef _WIN64
        int stk_used = 32; // always reserve 32byte shadow area
    #else
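The two comments above ("Do this after we've handled the call result ..." and "Call this now so that the arg setup can involve 'rr'") only make sense given nanojit's backwards code generation: instructions are written from the end of the buffer toward the start, so whatever is emitted first in C++ source order executes last. A toy model of that discipline (all names invented for illustration):

    #include <cstdio>
    #include <string>
    #include <vector>

    struct BackwardsEmitter {
        std::vector<std::string> code;            // front = first to execute
        void emit(const std::string& insn) {      // like *(--_nIns) = ... in nanojit
            code.insert(code.begin(), insn);
        }
    };

    int main() {
        BackwardsEmitter a;
        a.emit("mov rr, rax");      // emitted first in asm_call() -> runs last
        a.emit("call f");
        a.emit("mov rdi, arg0");    // arg setup, emitted last -> runs first
        for (const std::string& s : a.code)
            printf("%s\n", s.c_str());
        return 0;
    }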
@@ -994,14 +1007,15 @@ namespace nanojit

    void Assembler::asm_q2i(LIns *ins) {
        Register rr, ra;
-       regalloc_unary(ins, GpRegs, rr, ra);
+       beginOp1Regs(ins, GpRegs, rr, ra);
        NanoAssert(IsGpReg(ra));
        MOVLR(rr, ra);  // 32bit mov zeros the upper 32bits of the target
+       endOpRegs(ins, rr, ra);
    }

    void Assembler::asm_promote(LIns *ins) {
        Register rr, ra;
-       regalloc_unary(ins, GpRegs, rr, ra);
+       beginOp1Regs(ins, GpRegs, rr, ra);
        NanoAssert(IsGpReg(ra));
        if (ins->isop(LIR_u2q)) {
            MOVLR(rr, ra);  // 32bit mov zeros the upper 32bits of the target
@@ -1009,38 +1023,44 @@ namespace nanojit
            NanoAssert(ins->isop(LIR_i2q));
            MOVSXDR(rr, ra);    // sign extend 32->64
        }
+       endOpRegs(ins, rr, ra);
    }

-   // the CVTSI2SD instruction only writes to the low 64bits of the target
+   // The CVTSI2SD instruction only writes to the low 64bits of the target
    // XMM register, which hinders register renaming and makes dependence
    // chains longer.  So we precede with XORPS to clear the target register.

    void Assembler::asm_i2f(LIns *ins) {
-       Register r = deprecated_prepResultReg(ins, FpRegs);
-       Register b = findRegFor(ins->oprnd1(), GpRegs);
-       CVTSI2SD(r, b);     // cvtsi2sd xmmr, b  only writes xmm:0:64
-       XORPS(r);           // xorps xmmr,xmmr to break dependency chains
+       LIns *a = ins->oprnd1();
+       NanoAssert(ins->isF64() && a->isI32());
+
+       Register rr = prepareResultReg(ins, FpRegs);
+       Register ra = findRegFor(a, GpRegs);
+       CVTSI2SD(rr, ra);   // cvtsi2sd xmmr, b  only writes xmm:0:64
+       XORPS(rr);          // xorps xmmr,xmmr to break dependency chains
+       freeResourcesOf(ins);
    }

    void Assembler::asm_u2f(LIns *ins) {
-       Register r = deprecated_prepResultReg(ins, FpRegs);
-       Register b = findRegFor(ins->oprnd1(), GpRegs);
-       NanoAssert(ins->oprnd1()->isI32());
-       // since oprnd1 value is 32bit, its okay to zero-extend the value without worrying about clobbering.
-       CVTSQ2SD(r, b);     // convert int64 to double
-       XORPS(r);           // xorps xmmr,xmmr to break dependency chains
-       MOVLR(b, b);        // zero extend u32 to int64
+       LIns *a = ins->oprnd1();
+       NanoAssert(ins->isF64() && a->isI32());
+
+       Register rr = prepareResultReg(ins, FpRegs);
+       Register ra = findRegFor(a, GpRegs);
+       // Because oprnd1 is 32bit, it's ok to zero-extend it without worrying about clobbering.
+       CVTSQ2SD(rr, ra);   // convert int64 to double
+       XORPS(rr);          // xorps xmmr,xmmr to break dependency chains
+       MOVLR(ra, ra);      // zero extend u32 to int64
+       freeResourcesOf(ins);
    }

    void Assembler::asm_f2i(LIns *ins) {
-       LIns *lhs = ins->oprnd1();
-
-       NanoAssert(ins->isI32() && lhs->isF64());
-       Register r = prepareResultReg(ins, GpRegs);
-       Register b = findRegFor(lhs, FpRegs);
-
-       CVTSD2SI(r, b);
+       LIns *a = ins->oprnd1();
+       NanoAssert(ins->isI32() && a->isF64());
+
+       Register rr = prepareResultReg(ins, GpRegs);
+       Register rb = findRegFor(a, FpRegs);
+       CVTSD2SI(rr, rb);
        freeResourcesOf(ins);
    }
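Why the asm_u2f() sequence is sound (reading backwards again, the run-time order is MOVLR, XORPS, CVTSQ2SD): zero-extending the unsigned 32-bit value yields a non-negative int64, and the signed int64-to-double conversion is exact for any value below 2^53. The same reasoning restated in plain C++, purely illustrative:

    #include <cstdint>
    #include <cstdio>

    int main() {
        uint32_t u = 0xFFFFFFFFu;                // largest u32
        int64_t widened = (int64_t)(uint64_t)u;  // MOVLR(ra, ra): zero-extend, never negative
        double d = (double)widened;              // CVTSQ2SD: signed conversion, exact here
        printf("%.1f\n", d);                     // 4294967295.0
        return 0;
    }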
@@ -1052,11 +1072,16 @@ namespace nanojit
        NanoAssert((ins->isop(LIR_cmov)  && iftrue->isI32() && iffalse->isI32()) ||
                   (ins->isop(LIR_qcmov) && iftrue->isI64() && iffalse->isI64()));

-       // this code assumes that neither LD nor MR nor MRcc set any of the condition flags.
-       // (This is true on Intel, is it true on all architectures?)
-       const Register rr = deprecated_prepResultReg(ins, GpRegs);
-       const Register rf = findRegFor(iffalse, GpRegs & ~rmask(rr));
+       Register rr = prepareResultReg(ins, GpRegs);
+
+       Register rf = findRegFor(iffalse, GpRegs & ~rmask(rr));
+
+       // If 'iftrue' isn't in a register, it can be clobbered by 'ins'.
+       Register rt = iftrue->isInReg() ? iftrue->getReg() : rr;
+
+       // WARNING: We cannot generate any code that affects the condition
+       // codes between the MRcc generation here and the asm_cmp() call
+       // below.  See asm_cmp() for more details.
        LOpcode condop = cond->opcode();
        if (ins->opcode() == LIR_cmov) {
            switch (condop) {
@@ -1085,7 +1110,15 @@ namespace nanojit
            default: NanoAssert(0); break;
            }
        }
-       /*const Register rt =*/ findSpecificRegFor(iftrue, rr);
+       if (rr != rt)
+           MR(rr, rt);
+
+       freeResourcesOf(ins);
+       if (!iftrue->isInReg()) {
+           NanoAssert(rt == rr);
+           findSpecificRegForUnallocated(iftrue, rr);
+       }
+
        asm_cmp(cond);
    }
@@ -1099,7 +1132,7 @@ namespace nanojit
        if (condop >= LIR_feq && condop <= LIR_fge)
            return asm_fbranch(onFalse, cond, target);

-       // We must ensure there's room for the instr before calculating
+       // We must ensure there's room for the instruction before calculating
        // the offset.  And the offset determines the opcode (8bit or 32bit).
        if (target && isTargetWithinS8(target)) {
            if (onFalse) {
@@ -1158,7 +1191,7 @@ namespace nanojit
            }
        }
        }
-       NIns *patch = _nIns;    // addr of instr to patch
+       NIns *patch = _nIns;    // address of instruction to patch
        asm_cmp(cond);
        return patch;
    }
@@ -1176,6 +1209,9 @@ namespace nanojit
        JO( 8, target);
    }

+   // WARNING: this function cannot generate code that will affect the
+   // condition codes prior to the generation of the test/cmp.  See
+   // Nativei386.cpp:asm_cmp() for details.
    void Assembler::asm_cmp(LIns *cond) {
        LIns *b = cond->oprnd2();
        if (isImm32(b)) {
@@ -1220,8 +1256,8 @@ namespace nanojit
        }
    }

-   // compiling floating point branches
-   // discussion in https://bugzilla.mozilla.org/show_bug.cgi?id=443886
+   // Compiling floating point branches.
+   // Discussion in https://bugzilla.mozilla.org/show_bug.cgi?id=443886.
    //
    // fucom/p/pp: c3 c2 c0   jae ja jbe jb je jne
    // ucomisd:    Z  P  C    !C  !C&!Z  C|Z  C  Z  !Z
@@ -1231,7 +1267,7 @@ namespace nanojit
    // less    <   0  0  1    T  T       T
    // equal   =   1  0  0       T  T    T
    //
-   // here's the cases, using conditionals:
+   // Here are the cases, using conditionals:
    //
    //   branch  >=  >   <=   <    =
    //   ------  --- --- ---  ---  ---
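The table above encodes why LIR_feq needs the SETNP/AND dance seen below in asm_fcond(): ucomisd sets ZF for both 'equal' and 'unordered' (NaN), so equality must be computed as ZF & !PF. A sketch of that flag logic in C++, assuming IEEE-754 comparison semantics as tabulated above:

    #include <cmath>
    #include <cstdio>

    struct Flags { bool zf, pf, cf; };

    static Flags ucomisd(double a, double b) {
        if (std::isnan(a) || std::isnan(b)) return {true,  true,  true };  // unordered
        if (a > b)                          return {false, false, false};  // greater
        if (a < b)                          return {false, false, true };  // less
        return                                     {true,  false, false};  // equal
    }

    int main() {
        Flags eq  = ucomisd(1.0, 1.0);
        Flags nan = ucomisd(std::nan(""), std::nan(""));
        printf("1==1:     %d\n", eq.zf  && !eq.pf);   // 1
        printf("nan==nan: %d\n", nan.zf && !nan.pf);  // 0: ZF is set, but so is PF
        return 0;
    }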
@@ -1281,7 +1317,7 @@ namespace nanojit
            }
            patch = _nIns;
        }
-       fcmp(a, b);
+       asm_fcmp(a, b);
        return patch;
    }

@@ -1292,7 +1328,7 @@ namespace nanojit
        if (op == LIR_feq) {
            // result = ZF & !PF, must do logic on flags
            // r = al|bl|cl|dl, can only use rh without rex prefix
-           Register r = deprecated_prepResultReg(ins, 1<<RAX|1<<RCX|1<<RDX|1<<RBX);
+           Register r = prepareResultReg(ins, 1<<RAX|1<<RCX|1<<RDX|1<<RBX);
            MOVZX8(r, r);   // movzx8 r,rl    r[8:63] = 0
            X86_AND8R(r);   // and rl,rh      rl &= rh
            X86_SETNP(r);   // setnp rh       rh = !PF
@@ -1305,22 +1341,30 @@ namespace nanojit
            op = LIR_fge;
            LIns *t = a; a = b; b = t;
        }
-       Register r = deprecated_prepResultReg(ins, GpRegs); // x64 can use any GPR as setcc target
+       Register r = prepareResultReg(ins, GpRegs); // x64 can use any GPR as setcc target
        MOVZX8(r, r);
        if (op == LIR_fgt)
            SETA(r);
        else
            SETAE(r);
        }
-       fcmp(a, b);
+
+       freeResourcesOf(ins);
+
+       asm_fcmp(a, b);
    }

-   void Assembler::fcmp(LIns *a, LIns *b) {
+   // WARNING: This function cannot generate any code that will affect the
+   // condition codes prior to the generation of the ucomisd.  See asm_cmp()
+   // for more details.
+   void Assembler::asm_fcmp(LIns *a, LIns *b) {
        Register ra, rb;
        findRegFor2(FpRegs, a, ra, FpRegs, b, rb);
        UCOMISD(ra, rb);
    }

+   // WARNING: the code generated by this function must not affect the
+   // condition codes.  See asm_cmp() for details.
    void Assembler::asm_restore(LIns *ins, Register r) {
        if (ins->isop(LIR_alloc)) {
            int d = arDisp(ins);
@@ -1336,11 +1380,10 @@ namespace nanojit
        }
        else {
            int d = findMemFor(ins);
-           if (IsFpReg(r)) {
-               NanoAssert(ins->isN64());
-               // load 64bits into XMM.  don't know if double or int64, assume double.
+           if (ins->isF64()) {
+               NanoAssert(IsFpReg(r));
                MOVSDRM(r, d, FP);
-           } else if (ins->isN64()) {
+           } else if (ins->isI64()) {
                NanoAssert(IsGpReg(r));
                MOVQRM(r, d, FP);
            } else {
@@ -1353,8 +1396,10 @@ namespace nanojit

    void Assembler::asm_cond(LIns *ins) {
        LOpcode op = ins->opcode();
+
        // unlike x86-32, with a rex prefix we can use any GP register as an 8bit target
-       Register r = deprecated_prepResultReg(ins, GpRegs);
+       Register r = prepareResultReg(ins, GpRegs);
+
        // SETcc only sets low 8 bits, so extend
        MOVZX8(r, r);
        switch (op) {
@@ -1379,6 +1424,8 @@ namespace nanojit
        case LIR_quge:
        case LIR_uge:  SETAE(r);   break;
        }
+       freeResourcesOf(ins);
+
        asm_cmp(ins);
    }

@@ -1409,18 +1456,17 @@ namespace nanojit
        }
    }

-   void Assembler::regalloc_load(LIns *ins, RegisterMask allow, Register &rr, int32_t &dr, Register &rb) {
+   // Register setup for load ops.  Pairs with endLoadRegs().
+   void Assembler::beginLoadRegs(LIns *ins, RegisterMask allow, Register &rr, int32_t &dr, Register &rb) {
        dr = ins->disp();
        LIns *base = ins->oprnd1();
        rb = getBaseReg(base, dr, BaseRegs);
-       if (!ins->isInRegMask(allow)) {
-           rr = deprecated_prepResultReg(ins, allow & ~rmask(rb));
-       } else {
-           // keep already assigned register
-           rr = ins->getReg();
-           NanoAssert(allow & rmask(rr));
-           deprecated_freeRsrcOf(ins, false);
-       }
+       rr = prepareResultReg(ins, allow & ~rmask(rb));
+   }
+
+   // Register clean-up for load ops.  Pairs with beginLoadRegs().
+   void Assembler::endLoadRegs(LIns* ins) {
+       freeResourcesOf(ins);
    }

    void Assembler::asm_load64(LIns *ins) {
@@ -1429,19 +1475,19 @@ namespace nanojit
        switch (ins->opcode()) {
            case LIR_ldq:
            case LIR_ldqc:
-               regalloc_load(ins, GpRegs, rr, dr, rb);
+               beginLoadRegs(ins, GpRegs, rr, dr, rb);
                NanoAssert(IsGpReg(rr));
                MOVQRM(rr, dr, rb);     // general 64bit load, 32bit const displacement
                break;
            case LIR_ldf:
            case LIR_ldfc:
-               regalloc_load(ins, FpRegs, rr, dr, rb);
+               beginLoadRegs(ins, FpRegs, rr, dr, rb);
                NanoAssert(IsFpReg(rr));
                MOVSDRM(rr, dr, rb);    // load 64bits into XMM
                break;
            case LIR_ld32f:
            case LIR_ldc32f:
-               regalloc_load(ins, FpRegs, rr, dr, rb);
+               beginLoadRegs(ins, FpRegs, rr, dr, rb);
                NanoAssert(IsFpReg(rr));
                CVTSS2SD(rr, rr);
                MOVSSRM(rr, dr, rb);
@@ -1450,14 +1496,14 @@ namespace nanojit
                NanoAssertMsg(0, "asm_load64 should never receive this LIR opcode");
                break;
        }
-
+       endLoadRegs(ins);
    }

    void Assembler::asm_load32(LIns *ins) {
        NanoAssert(ins->isI32());
        Register r, b;
        int32_t d;
-       regalloc_load(ins, GpRegs, r, d, b);
+       beginLoadRegs(ins, GpRegs, r, d, b);
        LOpcode op = ins->opcode();
        switch (op) {
            case LIR_ldzb:
@@ -1484,6 +1530,7 @@ namespace nanojit
                NanoAssertMsg(0, "asm_load32 should never receive this LIR opcode");
                break;
        }
+       endLoadRegs(ins);
    }

    void Assembler::asm_store64(LOpcode op, LIns *value, int d, LIns *base) {
@@ -1542,8 +1589,6 @@ namespace nanojit
                NanoAssertMsg(0, "asm_store32 should never receive this LIR opcode");
                break;
        }
-
-
    }

    void Assembler::asm_int(LIns *ins) {
@@ -1597,11 +1642,11 @@ namespace nanojit
        uint32_t a = ins->paramArg();
        uint32_t kind = ins->paramKind();
        if (kind == 0) {
-           // ordinary param
-           // first four or six args always in registers for x86_64 ABI
+           // Ordinary param.  First four or six args always in registers for x86_64 ABI.
            if (a < (uint32_t)NumArgRegs) {
                // incoming arg in register
-               deprecated_prepResultReg(ins, rmask(argRegs[a]));
+               prepareResultReg(ins, rmask(argRegs[a]));
+               // No code to generate.
            } else {
                // todo: support stack based args, arg 0 is at [FP+off] where off
                // is the # of regs to be pushed in genProlog()
@@ -1609,24 +1654,60 @@ namespace nanojit
            }
        }
        else {
-           // saved param
-           deprecated_prepResultReg(ins, rmask(savedRegs[a]));
+           // Saved param.
+           prepareResultReg(ins, rmask(savedRegs[a]));
+           // No code to generate.
        }
+       freeResourcesOf(ins);
    }

+   // Register setup for 2-address style unary ops of the form R = (op) R.
+   // Pairs with endOpRegs().
+   void Assembler::beginOp1Regs(LIns* ins, RegisterMask allow, Register &rr, Register &ra) {
+       LIns* a = ins->oprnd1();
+
+       rr = prepareResultReg(ins, allow);
+
+       // If 'a' isn't in a register, it can be clobbered by 'ins'.
+       ra = a->isInReg() ? a->getReg() : rr;
+       NanoAssert(rmask(ra) & allow);
+   }
+
+   // Register setup for 2-address style binary ops of the form R = R (op) B.
+   // Pairs with endOpRegs().
+   void Assembler::beginOp2Regs(LIns *ins, RegisterMask allow, Register &rr, Register &ra,
+                                Register &rb) {
+       LIns *a = ins->oprnd1();
+       LIns *b = ins->oprnd2();
+       if (a != b) {
+           rb = findRegFor(b, allow);
+           allow &= ~rmask(rb);
+       }
+       rr = prepareResultReg(ins, allow);
+
+       // If 'a' isn't in a register, it can be clobbered by 'ins'.
+       ra = a->isInReg() ? a->getReg() : rr;
+       NanoAssert(rmask(ra) & allow);
+
+       if (a == b) {
+           rb = ra;
+       }
+   }

-   // register allocation for 2-address style unary ops of the form R = (op) R
-   void Assembler::regalloc_unary(LIns *ins, RegisterMask allow, Register &rr, Register &ra) {
-       LIns *a = ins->oprnd1();
-       rr = deprecated_prepResultReg(ins, allow);
-       // if this is last use of a in reg, we can re-use result reg
-       if (!a->isInReg()) {
-           ra = findSpecificRegForUnallocated(a, rr);
-       } else {
-           // 'a' already has a register assigned.  Caller must emit a copy
-           // to rr once instr code is generated.  (ie  mov rr,ra ; op rr)
-           ra = a->getReg();
-       }
-       NanoAssert(allow & rmask(rr));
+   // Register clean-up for 2-address style unary ops of the form R = (op) R.
+   // Pairs with beginOp1Regs() and beginOp2Regs().
+   void Assembler::endOpRegs(LIns* ins, Register rr, Register ra) {
+       LIns* a = ins->oprnd1();
+
+       // We're finished with 'ins'.
+       NanoAssert(ins->getReg() == rr);
+       freeResourcesOf(ins);
+
+       // If 'a' isn't in a register yet, that means it's clobbered by 'ins'.
+       if (!a->isInReg()) {
+           NanoAssert(ra == rr);
+           findSpecificRegForUnallocated(a, ra);
+       }
    }

    static const AVMPLUS_ALIGN16(int64_t) negateMask[] = {0x8000000000000000LL,0};
@@ -1634,7 +1715,7 @@ namespace nanojit
    void Assembler::asm_fneg(LIns *ins) {
        Register rr, ra;
        if (isS32((uintptr_t)negateMask) || isTargetWithinS32((NIns*)negateMask)) {
-           regalloc_unary(ins, FpRegs, rr, ra);
+           beginOp1Regs(ins, FpRegs, rr, ra);
            if (isS32((uintptr_t)negateMask)) {
                // builtin code is in bottom or top 2GB addr space, use absolute addressing
                XORPSA(rr, (int32_t)(uintptr_t)negateMask);

@@ -1644,14 +1725,17 @@ namespace nanojit
            }
            if (ra != rr)
                asm_nongp_copy(rr,ra);
+           endOpRegs(ins, rr, ra);
+
        } else {
-           // this is just hideous - can't use RIP-relative load, can't use
+           // This is just hideous - can't use RIP-relative load, can't use
            // absolute-address load, and cant move imm64 const to XMM.
            // so do it all in a GPR.  hrmph.
-           rr = deprecated_prepResultReg(ins, GpRegs);
+           rr = prepareResultReg(ins, GpRegs);
            ra = findRegFor(ins->oprnd1(), GpRegs & ~rmask(rr));
            XORQRR(rr, ra);     // xor rr, ra
            asm_quad(rr, negateMask[0], /*canClobberCCs*/true);  // mov rr, 0x8000000000000000
+           freeResourcesOf(ins);
        }
    }
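The negateMask trick in asm_fneg(), XORPSA against the mask on the fast path and XORQRR in a GPR otherwise, is just flipping the IEEE-754 sign bit. The same operation in portable C++, for illustration only:

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    int main() {
        double x = 3.5;
        uint64_t bits;
        std::memcpy(&bits, &x, sizeof bits);
        bits ^= 0x8000000000000000ULL;      // negateMask[0]: flip the sign bit
        std::memcpy(&x, &bits, sizeof x);
        printf("%.1f\n", x);                // -3.5
        return 0;
    }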
|
@ -399,15 +399,18 @@ namespace nanojit
|
||||
void asm_shift(LIns*);\
|
||||
void asm_shift_imm(LIns*);\
|
||||
void asm_arith_imm(LIns*);\
|
||||
void regalloc_unary(LIns *ins, RegisterMask allow, Register &rr, Register &ra);\
|
||||
void regalloc_binary(LIns *ins, RegisterMask allow, Register &rr, Register &ra, Register &rb);\
|
||||
void regalloc_load(LIns *ins, RegisterMask allow, Register &rr, int32_t &d, Register &rb);\
|
||||
void beginOp1Regs(LIns *ins, RegisterMask allow, Register &rr, Register &ra);\
|
||||
void beginOp2Regs(LIns *ins, RegisterMask allow, Register &rr, Register &ra, Register &rb);\
|
||||
void endOpRegs(LIns *ins, Register rr, Register ra);\
|
||||
void beginLoadRegs(LIns *ins, RegisterMask allow, Register &rr, int32_t &d, Register &rb);\
|
||||
void endLoadRegs(LIns *ins);\
|
||||
void dis(NIns *p, int bytes);\
|
||||
void asm_cmp(LIns*);\
|
||||
void asm_cmp_imm(LIns*);\
|
||||
void fcmp(LIns*, LIns*);\
|
||||
void asm_fcmp(LIns*, LIns*);\
|
||||
NIns* asm_fbranch(bool, LIns*, NIns*);\
|
||||
void asm_div_mod(LIns *i);\
|
||||
void asm_div(LIns *ins);\
|
||||
void asm_div_mod(LIns *ins);\
|
||||
int max_stk_used;\
|
||||
void PUSHR(Register r);\
|
||||
void POPR(Register r);\
|
||||
|
@@ -167,7 +167,7 @@ namespace nanojit
        // Do this after we've handled the call result, so we don't
        // force the call result to be spilled unnecessarily.

-       evictScratchRegs();
+       evictScratchRegsExcept(0);

        const CallInfo* call = ins->callInfo();
        // must be signed, not unsigned
@@ -393,13 +393,17 @@ namespace nanojit

        } else {
            int d = findMemFor(ins);
-           if (rmask(r) & GpRegs) {
+           if (ins->isI32()) {
+               NanoAssert(rmask(r) & GpRegs);
                LD(r, d, FP);
-           } else if (rmask(r) & XmmRegs) {
-               SSE_LDQ(r, d, FP);
            } else {
-               NanoAssert(rmask(r) & x87Regs);
-               FLDQ(d, FP);
+               NanoAssert(ins->isF64());
+               if (rmask(r) & XmmRegs) {
+                   SSE_LDQ(r, d, FP);
+               } else {
+                   NanoAssert(rmask(r) & x87Regs);
+                   FLDQ(d, FP);
+               }
            }
        }
    }
@@ -976,7 +980,7 @@ namespace nanojit
        }
    }

-   // This is called when we have a mod(div(divL, divR)) sequence.
+   // Generates code for a LIR_mod(LIR_div(divL, divR)) sequence.
    void Assembler::asm_div_mod(LInsp mod)
    {
        LInsp div = mod->oprnd1();
@@ -992,12 +996,10 @@ namespace nanojit
        prepareResultReg(div, rmask(EAX));

        Register rDivR = findRegFor(divR, (GpRegs & ~(rmask(EAX)|rmask(EDX))));
-
        Register rDivL = !divL->isInReg() ? EAX : divL->getReg();
-
        DIV(rDivR);
        CDQ();  // sign-extend EAX into EDX:EAX

        if (EAX != rDivL)
            MR(EAX, rDivL);

@@ -1028,7 +1030,6 @@ namespace nanojit
    //
    void Assembler::asm_neg_not(LInsp ins)
    {
-       LOpcode op = ins->opcode();
        LIns* lhs = ins->oprnd1();

        Register rr = prepareResultReg(ins, GpRegs);
@@ -1036,11 +1037,12 @@ namespace nanojit
        // If 'lhs' isn't in a register, it can be clobbered by 'ins'.
        Register ra = !lhs->isInReg() ? rr : lhs->getReg();

-       if (op == LIR_not)
+       if (ins->isop(LIR_not)) {
            NOT(rr);
-       else
+       } else {
+           NanoAssert(ins->isop(LIR_neg));
            NEG(rr);
-
+       }
        if (rr != ra)
            MR(rr, ra);

@@ -121,7 +121,7 @@ extern void* _tprof_before_id;
 #ifndef DOPROF
 #ifndef VMCFG_SYMBIAN
 #define _vprof(v,...)
-#define _nvprof(e,v,...)
+#define _nvprof(e,v)
 #define _hprof(h,n,...)
 #define _nhprof(e,v,n,...)
 #define _ntprof(e)
@@ -139,7 +139,7 @@ extern void* _tprof_before_id;
     ;\
 }

-#define _nvprof(e,v,...) \
+#define _nvprof(e,v) \
 { \
     static void* id = 0; \
     (id != 0) ? \