/* -*- Mode: C++; c-basic-offset: 4; indent-tabs-mode: nil; tab-width: 4 -*- */
/* vi: set ts=4 sw=4 expandtab: (add to ~/.vimrc: set modeline modelines=5) */
/* ***** BEGIN LICENSE BLOCK *****
 * Version: MPL 1.1/GPL 2.0/LGPL 2.1
 *
 * The contents of this file are subject to the Mozilla Public License Version
 * 1.1 (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 * http://www.mozilla.org/MPL/
 *
 * Software distributed under the License is distributed on an "AS IS" basis,
 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
 * for the specific language governing rights and limitations under the
 * License.
 *
 * The Original Code is [Open Source Virtual Machine].
 *
 * The Initial Developer of the Original Code is
 * Adobe System Incorporated.
 * Portions created by the Initial Developer are Copyright (C) 2004-2007
 * the Initial Developer. All Rights Reserved.
 *
 * Contributor(s):
 *   Adobe AS3 Team
 *   Mozilla TraceMonkey Team
 *   Asko Tontti <atontti@cc.hut.fi>
 *
 * Alternatively, the contents of this file may be used under the terms of
 * either the GNU General Public License Version 2 or later (the "GPL"), or
 * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
 * in which case the provisions of the GPL or the LGPL are applicable instead
 * of those above. If you wish to allow use of your version of this file only
 * under the terms of either the GPL or the LGPL, and not to allow others to
 * use your version of this file under the terms of the MPL, indicate your
 * decision by deleting the provisions above and replace them with the notice
 * and other provisions required by the GPL or the LGPL. If you do not delete
 * the provisions above, a recipient may use your version of this file under
 * the terms of any one of the MPL, the GPL or the LGPL.
 *
 * ***** END LICENSE BLOCK ***** */

#ifdef _MAC
// for MakeDataExecutable
#include <CoreServices/CoreServices.h>
#endif

#if defined AVMPLUS_UNIX || defined AVMPLUS_MAC
#include <sys/mman.h>
#include <errno.h>
#include <stdlib.h>
#endif
#include "nanojit.h"

namespace nanojit
{
    #ifdef FEATURE_NANOJIT

    #ifdef NJ_VERBOSE
        const char *regNames[] = {
            "eax", "ecx", "edx", "ebx", "esp", "ebp", "esi", "edi",
            "xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7",
            "f0",  "f1",  "f2",  "f3",  "f4",  "f5",  "f6",  "f7"
        };
    #endif

    const Register Assembler::argRegs[] = { ECX, EDX };
    const Register Assembler::retRegs[] = { EAX, EDX };
    const Register Assembler::savedRegs[] = { EBX, ESI, EDI };

    const static uint8_t max_abi_regs[] = {
        2, /* ABI_FASTCALL */
        1, /* ABI_THISCALL */
        0, /* ABI_STDCALL */
        0  /* ABI_CDECL */
    };


    void Assembler::nInit(AvmCore* core)
    {
        (void) core;
        OSDep::getDate();
    }

    NIns* Assembler::genPrologue()
    {
        /**
         * Prologue
         */
        uint32_t stackNeeded = STACK_GRANULARITY * _activation.highwatermark;

        uint32_t stackPushed =
            STACK_GRANULARITY + // returnaddr
            STACK_GRANULARITY + // ebp
            STACK_GRANULARITY;  // dummy

        if (!_thisfrag->lirbuf->explicitSavedRegs)
            stackPushed += NumSavedRegs * STACK_GRANULARITY;

        uint32_t aligned = alignUp(stackNeeded + stackPushed, NJ_ALIGN_STACK);
        uint32_t amt = aligned - stackPushed;

        // Reserve stackNeeded bytes, padded
        // to preserve NJ_ALIGN_STACK-byte alignment.
        if (amt)
        {
            SUBi(SP, amt);
        }

        verbose_only( outputAddr=true; asm_output("[frag entry]"); )
        NIns *fragEntry = _nIns;
        MR(FP, SP); // Establish our own FP.
        PUSHr(FP); // Save caller's FP.

        if (!_thisfrag->lirbuf->explicitSavedRegs) {
            PUSHr(FP); // dummy
            for (int i = 0; i < NumSavedRegs; ++i)
                PUSHr(savedRegs[i]);
        }

        return fragEntry;
    }

    void Assembler::nFragExit(LInsp guard)
    {
        SideExit *exit = guard->record()->exit;
        bool trees = _frago->core()->config.tree_opt;
        Fragment *frag = exit->target;
        GuardRecord *lr = 0;
        bool destKnown = (frag && frag->fragEntry);
        // Generate jump to epilog and initialize lr.
        // If the guard is LIR_xtbl, use a jump table with epilog in every entry
        if (guard->isop(LIR_xtbl)) {
            lr = guard->record();
            Register r = EBX;
            SwitchInfo* si = guard->record()->exit->switchInfo;
            emitJumpTable(si, _epilogue);
            JMP_indirect(r);
            LEAmi4(r, si->table, r);
        } else {
            // If the guard already exists, use a simple jump.
            if (destKnown && !trees) {
                JMP(frag->fragEntry);
                lr = 0;
            } else {  // target doesn't exist. Use 0 jump offset and patch later
                lr = guard->record();
                JMP_long(_epilogue);
                lr->jmp = _nIns;
            }
        }

        // first restore ESP from EBP, undoing SUBi(SP,amt) from genPrologue
        MR(SP,FP);

        // return value is GuardRecord*
        LDi(EAX, int(lr));
    }

    NIns *Assembler::genEpilogue()
    {
        RET();
        if (!_thisfrag->lirbuf->explicitSavedRegs) {
            for (int i = NumSavedRegs - 1; i >= 0; --i)
                POPr(savedRegs[i]);
            POPr(FP); // dummy
        }
        POPr(FP); // Restore caller's FP.
        MR(SP,FP); // pop the stack frame
        return  _nIns;
    }

    void Assembler::asm_call(LInsp ins)
    {
        const CallInfo* call = ins->callInfo();
        // must be signed, not unsigned
        uint32_t iargs = call->count_iargs();
        int32_t fargs = call->count_args() - iargs;

        uint32_t max_regs = max_abi_regs[call->_abi];
        if (max_regs > iargs)
            max_regs = iargs;

        int32_t istack = iargs-max_regs;  // first 2 4B args are in registers
        int32_t extra = 0;
        const int32_t pushsize = 4*istack + 8*fargs; // actual stack space used

#if _MSC_VER
        // msc is slack, and MIR doesn't do anything extra, so lets use this
        // call-site alignment to at least have code size parity with MIR.
        uint32_t align = 4;//NJ_ALIGN_STACK;
#else
        uint32_t align = NJ_ALIGN_STACK;
#endif

        if (pushsize) {
            // stack re-alignment
            // only pop our adjustment amount since callee pops args in FASTCALL mode
            extra = alignUp(pushsize, align) - pushsize;
            if (call->_abi == ABI_CDECL) {
                // with CDECL only, caller pops args
                ADDi(SP, extra+pushsize);
            } else if (extra > 0) {
                ADDi(SP, extra);
            }
        }

        NanoAssert(ins->isop(LIR_call) || ins->isop(LIR_fcall));
        CALL(call);

        // make sure fpu stack is empty before call (restoreCallerSaved)
        NanoAssert(_allocator.isFree(FST0));
        // note: this code requires that ref arguments (ARGSIZE_Q)
        // be one of the first two arguments
        // pre-assign registers to the first N 4B args based on the calling convention
        uint32_t n = 0;

        ArgSize sizes[2*MAXARGS];
        uint32_t argc = call->get_sizes(sizes);

        for(uint32_t i=0; i < argc; i++)
        {
            uint32_t j = argc-i-1;
            ArgSize sz = sizes[j];
            Register r = UnknownReg;
            if (n < max_regs && sz != ARGSIZE_F) {
                r = argRegs[n++]; // tell asm_arg what reg to use
            }
            asm_arg(sz, ins->arg(j), r);
        }

        if (extra > 0)
            SUBi(SP, extra);
    }

    void Assembler::nMarkExecute(Page* page, int flags)
    {
        NanoAssert(sizeof(Page) == NJ_PAGE_SIZE);
        #if defined WIN32 || defined WIN64
            DWORD dwIgnore;
            static const DWORD kProtFlags[4] =
            {
                PAGE_READONLY,            // 0
                PAGE_READWRITE,            // PAGE_WRITE
                PAGE_EXECUTE_READ,        // PAGE_EXEC
                PAGE_EXECUTE_READWRITE    // PAGE_EXEC|PAGE_WRITE
            };
            DWORD prot = kProtFlags[flags & (PAGE_WRITE|PAGE_EXEC)];
            BOOL res = VirtualProtect(page, NJ_PAGE_SIZE, prot, &dwIgnore);
            if (!res)
            {
                // todo: we can't abort or assert here, we have to fail gracefully.
                NanoAssertMsg(false, "FATAL ERROR: VirtualProtect() failed\n");
            }
        #elif defined AVMPLUS_UNIX || defined AVMPLUS_MAC
            static const int kProtFlags[4] =
            {
                PROT_READ,                        // 0
                PROT_READ|PROT_WRITE,            // PAGE_WRITE
                PROT_READ|PROT_EXEC,            // PAGE_EXEC
                PROT_READ|PROT_WRITE|PROT_EXEC    // PAGE_EXEC|PAGE_WRITE
            };
            int prot = kProtFlags[flags & (PAGE_WRITE|PAGE_EXEC)];
            intptr_t addr = (intptr_t)page;
            addr &= ~((uintptr_t)NJ_PAGE_SIZE - 1);
            NanoAssert(addr == (intptr_t)page);
            #if defined SOLARIS
            if (mprotect((char *)addr, NJ_PAGE_SIZE, prot) == -1)
            #else
            if (mprotect((void *)addr, NJ_PAGE_SIZE, prot) == -1)
            #endif
            {
                // todo: we can't abort or assert here, we have to fail gracefully.
                NanoAssertMsg(false, "FATAL ERROR: mprotect(PROT_EXEC) failed\n");
                abort();
            }
        #else
            (void)page;
        #endif
    }

    Register Assembler::nRegisterAllocFromSet(int set)
    {
        Register r;
        RegAlloc &regs = _allocator;
    #ifdef WIN32
        _asm
        {
            mov ecx, regs
            bsf eax, set                    // i = first bit set
            btr RegAlloc::free[ecx], eax    // free &= ~rmask(i)
            mov r, eax
        }
    #elif defined WIN64
        unsigned long tr, fr;
        _BitScanForward(&tr, set);
        _bittestandreset(&fr, tr);
        regs.free = fr;
        r = tr;
    #else
        asm(
            "bsf    %1, %%eax\n\t"
            "btr    %%eax, %2\n\t"
            "movl   %%eax, %0\n\t"
            : "=m"(r) : "m"(set), "m"(regs.free) : "%eax", "memory" );
    #endif /* WIN32 */
        return r;
    }

    void Assembler::nRegisterResetAll(RegAlloc& a)
    {
        // add scratch registers to our free list for the allocator
        a.clear();
        a.used = 0;
        a.free = SavedRegs | ScratchRegs;
        if (!config.sse2)
            a.free &= ~XmmRegs;
        debug_only( a.managed = a.free; )
    }

    NIns* Assembler::nPatchBranch(NIns* branch, NIns* targ)
    {
        NIns* was = 0;
        intptr_t offset = intptr_t(targ) - intptr_t(branch);
        if (branch[0] == JMP32) {
            was = branch + *(int32_t*)&branch[1] + 5;
            *(int32_t*)&branch[1] = offset - 5;
            VALGRIND_DISCARD_TRANSLATIONS(&branch[1], sizeof(int32_t));
        } else if (branch[0] == JCC32) {
            was = branch + *(int32_t*)&branch[2] + 6;
            *(int32_t*)&branch[2] = offset - 6;
            VALGRIND_DISCARD_TRANSLATIONS(&branch[2], sizeof(int32_t));
        } else
            NanoAssertMsg(0, "Unknown branch type in nPatchBranch");
        return was;
    }

    RegisterMask Assembler::hint(LIns* i, RegisterMask allow)
    {
        uint32_t op = i->opcode();
        int prefer = allow;
        if (op == LIR_call) {
            prefer &= rmask(retRegs[0]);
        }
        else if (op == LIR_fcall) {
            prefer &= rmask(FST0);
        }
        else if (op == LIR_iparam) {
            uint32_t max_regs = max_abi_regs[_thisfrag->lirbuf->abi];
            if (i->paramArg() < max_regs)
                prefer &= rmask(Register(i->paramArg()));
        }
        else if (op == LIR_callh || (op == LIR_rsh && i->oprnd1()->opcode()==LIR_callh)) {
            prefer &= rmask(retRegs[1]);
        }
        else if (i->isCmp()) {
            prefer &= AllowableFlagRegs;
        }
        else if (i->isconst()) {
            prefer &= ScratchRegs;
        }
        return (_allocator.free & prefer) ? prefer : allow;
    }

    void Assembler::asm_qjoin(LIns *ins)
    {
        int d = findMemFor(ins);
        AvmAssert(d);
        LIns* lo = ins->oprnd1();
        LIns* hi = ins->oprnd2();

        Reservation *resv = getresv(ins);
        Register rr = resv->reg;

        if (rr != UnknownReg && (rmask(rr) & FpRegs))
            evict(rr);

        if (hi->isconst())
        {
            STi(FP, d+4, hi->imm32());
        }
        else
        {
            Register r = findRegFor(hi, GpRegs);
            ST(FP, d+4, r);
        }

        if (lo->isconst())
        {
            STi(FP, d, lo->imm32());
        }
        else
        {
            // okay if r gets recycled.
            Register r = findRegFor(lo, GpRegs);
            ST(FP, d, r);
        }

        freeRsrcOf(ins, false); // if we had a reg in use, emit a ST to flush it to mem
    }

    void Assembler::asm_load(int d, Register r)
    {
        if (rmask(r) & FpRegs)
        {
            if (rmask(r) & XmmRegs) {
                SSE_LDQ(r, d, FP);
            } else {
                FLDQ(d, FP);
            }
        }
        else
        {
            LD(r, d, FP);
        }
    }

    void Assembler::asm_restore(LInsp i, Reservation *resv, Register r)
    {
        if (i->isop(LIR_ialloc)) {
            verbose_only( if (_logc->lcbits & LC_RegAlloc) {
                            outputForEOL("  <= remat %s size %d",
                            _thisfrag->lirbuf->names->formatRef(i), i->size()); } )
            LEA(r, disp(resv), FP);
        }
        else if (i->isconst()) {
            if (!resv->arIndex) {
                i->resv()->clear();
            }
            LDi(r, i->imm32());
        }
        else {
            int d = findMemFor(i);
            verbose_only( if (_logc->lcbits & LC_RegAlloc) {
                            outputForEOL("  <= restore %s",
                            _thisfrag->lirbuf->names->formatRef(i)); } )
            asm_load(d,r);
        }
    }

    void Assembler::asm_store32(LIns *value, int dr, LIns *base)
    {
        if (value->isconst())
        {
            Register rb = getBaseReg(base, dr, GpRegs);
            int c = value->imm32();
            STi(rb, dr, c);
        }
        else
        {
            // make sure what is in a register
            Reservation *rA, *rB;
            Register ra, rb;
            if (base->isop(LIR_ialloc)) {
                rb = FP;
                dr += findMemFor(base);
                ra = findRegFor(value, GpRegs);
            } else if (base->isconst()) {
                // absolute address
                dr += base->imm32();
                ra = findRegFor(value, GpRegs);
                rb = UnknownReg;
            } else {
                findRegFor2(GpRegs, value, rA, base, rB);
                ra = rA->reg;
                rb = rB->reg;
            }
            ST(rb, dr, ra);
        }
    }

    void Assembler::asm_spill(Register rr, int d, bool pop, bool quad)
    {
        (void)quad;
        if (d)
        {
            // save to spill location
            if (rmask(rr) & FpRegs)
            {
                if (rmask(rr) & XmmRegs) {
                    SSE_STQ(d, FP, rr);
                } else {
                    FSTQ((pop?1:0), d, FP);
                }
            }
            else
            {
                ST(FP, d, rr);
            }
        }
        else if (pop && (rmask(rr) & x87Regs))
        {
            // pop the fpu result since it isn't used
            FSTP(FST0);
        }
    }

    void Assembler::asm_load64(LInsp ins)
    {
        LIns* base = ins->oprnd1();
        int db = ins->disp();
        Reservation *resv = getresv(ins);
        Register rr = resv->reg;

        if (rr != UnknownReg && rmask(rr) & XmmRegs)
        {
            freeRsrcOf(ins, false);
            Register rb = getBaseReg(base, db, GpRegs);
            SSE_LDQ(rr, db, rb);
        }
        else
        {
            int dr = disp(resv);
            Register rb;
            if (base->isop(LIR_ialloc)) {
                rb = FP;
                db += findMemFor(base);
            } else {
                rb = findRegFor(base, GpRegs);
            }
            resv->reg = UnknownReg;

            // don't use an fpu reg to simply load & store the value.
            if (dr)
                asm_mmq(FP, dr, rb, db);

            freeRsrcOf(ins, false);

            if (rr != UnknownReg)
            {
                NanoAssert(rmask(rr)&FpRegs);
                _allocator.retire(rr);
                FLDQ(db, rb);
            }
        }
    }

    void Assembler::asm_store64(LInsp value, int dr, LInsp base)
    {
        if (value->isconstq())
        {
            // if a constant 64-bit value just store it now rather than
            // generating a pointless store/load/store sequence
            Register rb;
            if (base->isop(LIR_ialloc)) {
                rb = FP;
                dr += findMemFor(base);
            } else {
                rb = findRegFor(base, GpRegs);
            }
            STi(rb, dr+4, value->imm64_1());
            STi(rb, dr,   value->imm64_0());
            return;
        }

        if (value->isop(LIR_ldq) || value->isop(LIR_ldqc) || value->isop(LIR_qjoin))
        {
            // value is 64bit struct or int64_t, or maybe a double.
            // it may be live in an FPU reg.  Either way, don't
            // put it in an FPU reg just to load & store it.

            // a) if we know it's not a double, this is right.
            // b) if we guarded that its a double, this store could be on
            // the side exit, copying a non-double.
            // c) maybe its a double just being stored.  oh well.

            if (config.sse2) {
                Register rv = findRegFor(value, XmmRegs);
                Register rb;
                if (base->isop(LIR_ialloc)) {
                    rb = FP;
                    dr += findMemFor(base);
                } else {
                    rb = findRegFor(base, GpRegs);
                }
                SSE_STQ(dr, rb, rv);
                return;
            }

            int da = findMemFor(value);
            Register rb;
            if (base->isop(LIR_ialloc)) {
                rb = FP;
                dr += findMemFor(base);
            } else {
                rb = findRegFor(base, GpRegs);
            }
            asm_mmq(rb, dr, FP, da);
            return;
        }

        Register rb;
        if (base->isop(LIR_ialloc)) {
            rb = FP;
            dr += findMemFor(base);
        } else {
            rb = findRegFor(base, GpRegs);
        }

        // if value already in a reg, use that, otherwise
        // try to get it into XMM regs before FPU regs.
        Reservation* rA = getresv(value);
        Register rv;
        int pop = !rA || rA->reg==UnknownReg;
        if (pop) {
            rv = findRegFor(value, config.sse2 ? XmmRegs : FpRegs);
        } else {
            rv = rA->reg;
        }

        if (rmask(rv) & XmmRegs) {
            SSE_STQ(dr, rb, rv);
        } else {
            FSTQ(pop, dr, rb);
        }
    }

    /**
     * copy 64 bits: (rd+dd) <- (rs+ds)
     */
    void Assembler::asm_mmq(Register rd, int dd, Register rs, int ds)
    {
        // value is either a 64bit struct or maybe a float
        // that isn't live in an FPU reg.  Either way, don't
        // put it in an FPU reg just to load & store it.
        if (config.sse2)
        {
            // use SSE to load+store 64bits
            Register t = registerAlloc(XmmRegs);
            _allocator.addFree(t);
            SSE_STQ(dd, rd, t);
            SSE_LDQ(t, ds, rs);
        }
        else
        {
            // get a scratch reg
            Register t = registerAlloc(GpRegs & ~(rmask(rd)|rmask(rs)));
            _allocator.addFree(t);
            ST(rd, dd+4, t);
            LD(t, ds+4, rs);
            ST(rd, dd, t);
            LD(t, ds, rs);
        }
    }

    NIns* Assembler::asm_branch(bool branchOnFalse, LInsp cond, NIns* targ)
    {
        NIns* at = 0;
        LOpcode condop = cond->opcode();
        NanoAssert(cond->isCond());
        if (condop >= LIR_feq && condop <= LIR_fge)
        {
            return asm_jmpcc(branchOnFalse, cond, targ);
        }
        // produce the branch
        if (branchOnFalse)
        {
            if (condop == LIR_eq)
                JNE(targ);
            else if (condop == LIR_ov)
                JNO(targ);
            else if (condop == LIR_lt)
                JNL(targ);
            else if (condop == LIR_le)
                JNLE(targ);
            else if (condop == LIR_gt)
                JNG(targ);
            else if (condop == LIR_ge)
                JNGE(targ);
            else if (condop == LIR_ult)
                JNB(targ);
            else if (condop == LIR_ule)
                JNBE(targ);
            else if (condop == LIR_ugt)
                JNA(targ);
            else //if (condop == LIR_uge)
                JNAE(targ);
        }
        else // op == LIR_xt
        {
            if (condop == LIR_eq)
                JE(targ);
            else if (condop == LIR_ov)
                JO(targ);
            else if (condop == LIR_lt)
                JL(targ);
            else if (condop == LIR_le)
                JLE(targ);
            else if (condop == LIR_gt)
                JG(targ);
            else if (condop == LIR_ge)
                JGE(targ);
            else if (condop == LIR_ult)
                JB(targ);
            else if (condop == LIR_ule)
                JBE(targ);
            else if (condop == LIR_ugt)
                JA(targ);
            else //if (condop == LIR_uge)
                JAE(targ);
        }
        at = _nIns;
        asm_cmp(cond);
        return at;
    }

    void Assembler::asm_switch(LIns* ins, NIns* exit)
    {
        LIns* diff = ins->oprnd1();
        findSpecificRegFor(diff, EBX);
        JMP(exit);
    }

    void Assembler::asm_cmp(LIns *cond)
    {
        LOpcode condop = cond->opcode();

        // LIR_ov recycles the flags set by arithmetic ops
        if (condop == LIR_ov)
            return;

        LInsp lhs = cond->oprnd1();
        LInsp rhs = cond->oprnd2();
        Reservation *rA, *rB;

        NanoAssert((!lhs->isQuad() && !rhs->isQuad()) || (lhs->isQuad() && rhs->isQuad()));

        // Not supported yet.
        NanoAssert(!lhs->isQuad() && !rhs->isQuad());

        // ready to issue the compare
        if (rhs->isconst())
        {
            int c = rhs->imm32();
            if (c == 0 && cond->isop(LIR_eq)) {
                Register r = findRegFor(lhs, GpRegs);
                TEST(r,r);
            }
            else if (!rhs->isQuad()) {
                Register r = getBaseReg(lhs, c, GpRegs);
                CMPi(r, c);
            }
        }
        else
        {
            findRegFor2(GpRegs, lhs, rA, rhs, rB);
            Register ra = rA->reg;
            Register rb = rB->reg;
            CMP(ra, rb);
        }
    }

    void Assembler::asm_loop(LInsp ins, NInsList& loopJumps)
    {
        JMP_long(0);
        loopJumps.add(_nIns);

        // If the target we are looping to is in a different fragment, we have to restore
        // SP since we will target fragEntry and not loopEntry.
        if (ins->record()->exit->target != _thisfrag)
            MR(SP,FP);
    }

    void Assembler::asm_fcond(LInsp ins)
    {
        // only want certain regs
        Register r = prepResultReg(ins, AllowableFlagRegs);
        asm_setcc(r, ins);

        // SETcc only sets low 8 bits, so extend
        MOVZX8(r,r);
        SETNP(r);

        asm_fcmp(ins);
    }

    void Assembler::asm_cond(LInsp ins)
    {
        // only want certain regs
        LOpcode op = ins->opcode();
        Register r = prepResultReg(ins, AllowableFlagRegs);
        // SETcc only sets low 8 bits, so extend
        MOVZX8(r,r);
        if (op == LIR_eq)
            SETE(r);
        else if (op == LIR_ov)
            SETO(r);
        else if (op == LIR_lt)
            SETL(r);
        else if (op == LIR_le)
            SETLE(r);
        else if (op == LIR_gt)
            SETG(r);
        else if (op == LIR_ge)
            SETGE(r);
        else if (op == LIR_ult)
            SETB(r);
        else if (op == LIR_ule)
            SETBE(r);
        else if (op == LIR_ugt)
            SETA(r);
        else // if (op == LIR_uge)
            SETAE(r);
        asm_cmp(ins);
    }

    void Assembler::asm_arith(LInsp ins)
    {
        LOpcode op = ins->opcode();
        LInsp lhs = ins->oprnd1();

        if (op == LIR_mod) {
            /* LIR_mod expects the LIR_div to be near (no interference from the register allocator) */
            findSpecificRegFor(lhs, EDX);
            prepResultReg(ins, 1<<EDX);
            evict(EAX);
            return;
        }

        LInsp rhs = ins->oprnd2();

        bool forceReg;
        RegisterMask allow = GpRegs;
        Register rb = UnknownReg;

        switch (op) {
        case LIR_div:
            forceReg = true;
            rb = findRegFor(rhs, (GpRegs ^ (rmask(EAX)|rmask(EDX))));
            allow = 1<<EAX;
            evict(EDX);
            break;
        case LIR_mul:
            forceReg = true;
            break;
        case LIR_lsh:
        case LIR_rsh:
        case LIR_ush:
            forceReg = !rhs->isconst();
            if (forceReg) {
                rb = findSpecificRegFor(rhs, ECX);
                allow &= ~rmask(rb);
            }
            break;
        case LIR_add:
        case LIR_iaddp:
            if (lhs->isop(LIR_ialloc) && rhs->isconst()) {
                // add alloc+const, use lea
                Register rr = prepResultReg(ins, allow);
                int d = findMemFor(lhs) + rhs->imm32();
                LEA(rr, d, FP);
                return;
            }
            /* fall through */
        default:
            forceReg = !rhs->isconst();
            break;
        }

        // if we need a register for the rhs and don't have one yet, get it
        if (forceReg && lhs != rhs && rb == UnknownReg) {
            rb = findRegFor(rhs, allow);
            allow &= ~rmask(rb);
        }

        Register rr = prepResultReg(ins, allow);
        Reservation* rA = getresv(lhs);
        Register ra;
        // if this is last use of lhs in reg, we can re-use result reg
        if (rA == 0 || (ra = rA->reg) == UnknownReg)
            ra = findSpecificRegFor(lhs, rr);
        // else, rA already has a register assigned.

        if (forceReg)
        {
            if (lhs == rhs)
                rb = ra;

            switch (op) {
            case LIR_add:
            case LIR_iaddp:
                ADD(rr, rb);
                break;
            case LIR_sub:
                SUB(rr, rb);
                break;
            case LIR_mul:
                MUL(rr, rb);
                break;
            case LIR_and:
                AND(rr, rb);
                break;
            case LIR_or:
                OR(rr, rb);
                break;
            case LIR_xor:
                XOR(rr, rb);
                break;
            case LIR_lsh:
                SHL(rr, rb);
                break;
            case LIR_rsh:
                SAR(rr, rb);
                break;
            case LIR_ush:
                SHR(rr, rb);
                break;
            case LIR_div:
            case LIR_mod:
                 DIV(rb);
                 CDQ();
                break;
            default:
                NanoAssertMsg(0, "Unsupported");
            }
        }
        else
        {
            int c = rhs->imm32();
            switch (op) {
            case LIR_iaddp:
                // this doesn't set cc's, only use it when cc's not required.
                LEA(rr, c, ra);
                ra = rr; // suppress mov
                break;
            case LIR_add:
                ADDi(rr, c);
                break;
            case LIR_sub:
                SUBi(rr, c);
                break;
            case LIR_and:
                ANDi(rr, c);
                break;
            case LIR_or:
                ORi(rr, c);
                break;
            case LIR_xor:
                XORi(rr, c);
                break;
            case LIR_lsh:
                SHLi(rr, c);
                break;
            case LIR_rsh:
                SARi(rr, c);
                break;
            case LIR_ush:
                SHRi(rr, c);
                break;
            default:
                NanoAssertMsg(0, "Unsupported");
                break;
            }
        }

        if ( rr != ra )
            MR(rr,ra);
    }

    void Assembler::asm_neg_not(LInsp ins)
    {
        LOpcode op = ins->opcode();
        Register rr = prepResultReg(ins, GpRegs);

        LIns* lhs = ins->oprnd1();
        Reservation *rA = getresv(lhs);
        // if this is last use of lhs in reg, we can re-use result reg
        Register ra;
        if (rA == 0 || (ra=rA->reg) == UnknownReg)
            ra = findSpecificRegFor(lhs, rr);
        // else, rA already has a register assigned.

        if (op == LIR_not)
            NOT(rr);
        else
            NEG(rr);

        if ( rr != ra )
            MR(rr,ra);
    }

    void Assembler::asm_ld(LInsp ins)
    {
        LOpcode op = ins->opcode();
        LIns* base = ins->oprnd1();
        int32_t d = ins->disp();
        Register rr = prepResultReg(ins, GpRegs);

        if (base->isconst()) {
            intptr_t addr = base->imm32();
            addr += d;
            if (op == LIR_ldcb)
                LD8Zdm(rr, addr);
            else if (op == LIR_ldcs)
                LD16Zdm(rr, addr);
            else
                LDdm(rr, addr);
            return;
        }

        /* Search for add(X,Y) */
        if (base->opcode() == LIR_piadd) {
            int scale = 0;
            LIns *lhs = base->oprnd1();
            LIns *rhs = base->oprnd2();

            /* See if we can bypass any SHLs, by searching for
             * add(X, shl(Y,Z)) -> mov r, [X+Y*Z]
             */
            if (rhs->opcode() == LIR_pilsh && rhs->oprnd2()->isconst()) {
                scale = rhs->oprnd2()->imm32();
                if (scale >= 1 && scale <= 3)
                    rhs = rhs->oprnd1();
                else
                    scale = 0;
            }

            Register rleft;
            Reservation *rL = getresv(lhs);

            /* Does LHS have a register yet? If not, re-use the result reg.
             * @todo -- If LHS is const, we could eliminate a register use.
             */
            if (rL == NULL || rL->reg == UnknownReg)
                rleft = findSpecificRegFor(lhs, rr);
            else
                rleft = rL->reg;

            Register rright = UnknownReg;
            Reservation *rR = getresv(rhs);

            /* Does RHS have a register yet? If not, try to re-use the result reg. */
            if (rr != rleft && (rR == NULL || rR->reg == UnknownReg))
                rright = findSpecificRegFor(rhs, rr);
            if (rright == UnknownReg)
                rright = findRegFor(rhs, GpRegs & ~(rmask(rleft)));

            if (op == LIR_ldcb)
                LD8Zsib(rr, d, rleft, rright, scale);
            else if (op == LIR_ldcs)
                LD16Zsib(rr, d, rleft, rright, scale);
            else
                LDsib(rr, d, rleft, rright, scale);

            return;
        }

        Register ra = getBaseReg(base, d, GpRegs);
        if (op == LIR_ldcb)
            LD8Z(rr, d, ra);
        else if (op == LIR_ldcs)
            LD16Z(rr, d, ra);
        else
            LD(rr, d, ra);
    }

    void Assembler::asm_cmov(LInsp ins)
    {
        LOpcode op = ins->opcode();
        LIns* condval = ins->oprnd1();
        LIns* iftrue  = ins->oprnd2();
        LIns* iffalse = ins->oprnd3();

        NanoAssert(condval->isCmp());
        NanoAssert(op == LIR_qcmov || (!iftrue->isQuad() && !iffalse->isQuad()));

        const Register rr = prepResultReg(ins, GpRegs);

        // this code assumes that neither LD nor MR nor MRcc set any of the condition flags.
        // (This is true on Intel, is it true on all architectures?)
        const Register iffalsereg = findRegFor(iffalse, GpRegs & ~rmask(rr));
        if (op == LIR_cmov) {
            switch (condval->opcode())
            {
                // note that these are all opposites...
                case LIR_eq:    MRNE(rr, iffalsereg);   break;
                case LIR_ov:    MRNO(rr, iffalsereg);   break;
                case LIR_lt:    MRGE(rr, iffalsereg);   break;
                case LIR_le:    MRG(rr, iffalsereg);    break;
                case LIR_gt:    MRLE(rr, iffalsereg);   break;
                case LIR_ge:    MRL(rr, iffalsereg);    break;
                case LIR_ult:   MRAE(rr, iffalsereg);   break;
                case LIR_ule:   MRA(rr, iffalsereg);    break;
                case LIR_ugt:   MRBE(rr, iffalsereg);   break;
                case LIR_uge:   MRB(rr, iffalsereg);    break;
                debug_only( default: NanoAssert(0); break; )
            }
        } else if (op == LIR_qcmov) {
            NanoAssert(0);
        }
        /*const Register iftruereg =*/ findSpecificRegFor(iftrue, rr);
        asm_cmp(condval);
    }

    void Assembler::asm_qhi(LInsp ins)
    {
        Register rr = prepResultReg(ins, GpRegs);
        LIns *q = ins->oprnd1();
        int d = findMemFor(q);
        LD(rr, d+4, FP);
    }

    void Assembler::asm_param(LInsp ins)
    {
        uint32_t a = ins->paramArg();
        uint32_t kind = ins->paramKind();
        if (kind == 0) {
            // ordinary param
            AbiKind abi = _thisfrag->lirbuf->abi;
            uint32_t abi_regcount = max_abi_regs[abi];
            if (a < abi_regcount) {
                // incoming arg in register
                prepResultReg(ins, rmask(argRegs[a]));
            } else {
                // incoming arg is on stack, and EBP points nearby (see genPrologue)
                Register r = prepResultReg(ins, GpRegs);
                int d = (a - abi_regcount) * sizeof(intptr_t) + 8;
                LD(r, d, FP);
            }
        }
        else {
            // saved param
            prepResultReg(ins, rmask(savedRegs[a]));
        }
    }

    void Assembler::asm_int(LInsp ins)
    {
        Register rr = prepResultReg(ins, GpRegs);
        int32_t val = ins->imm32();
        if (val == 0)
            XOR(rr,rr);
        else
            LDi(rr, val);
    }

    void Assembler::asm_quad(LInsp ins)
    {
        Reservation *rR = getresv(ins);
        Register rr = rR->reg;
        if (rr != UnknownReg)
        {
            // @todo -- add special-cases for 0 and 1
            _allocator.retire(rr);
            rR->reg = UnknownReg;
            NanoAssert((rmask(rr) & FpRegs) != 0);

            const double d = ins->imm64f();
            const uint64_t q = ins->imm64();
            if (rmask(rr) & XmmRegs) {
                if (q == 0.0) {
                    // test (int64)0 since -0.0 == 0.0
                    SSE_XORPDr(rr, rr);
                } else if (d == 1.0) {
                    // 1.0 is extremely frequent and worth special-casing!
                    static const double k_ONE = 1.0;
                    LDSDm(rr, &k_ONE);
                } else {
                    findMemFor(ins);
                    const int d = disp(rR);
                    SSE_LDQ(rr, d, FP);
                }
            } else {
                if (q == 0.0) {
                    // test (int64)0 since -0.0 == 0.0
                    FLDZ();
                } else if (d == 1.0) {
                    FLD1();
                } else {
                    findMemFor(ins);
                    int d = disp(rR);
                    FLDQ(d,FP);
                }
            }
        }

        // @todo, if we used xor, ldsd, fldz, etc above, we don't need mem here
        int d = disp(rR);
        freeRsrcOf(ins, false);
        if (d)
        {
            STi(FP,d+4,ins->imm64_1());
            STi(FP,d,  ins->imm64_0());
        }
    }

    void Assembler::asm_qlo(LInsp ins)
    {
        LIns *q = ins->oprnd1();

        if (!config.sse2)
        {
            Register rr = prepResultReg(ins, GpRegs);
            int d = findMemFor(q);
            LD(rr, d, FP);
        }
        else
        {
            Reservation *resv = getresv(ins);
            Register rr = resv->reg;
            if (rr == UnknownReg) {
                // store quad in spill loc
                int d = disp(resv);
                freeRsrcOf(ins, false);
                Register qr = findRegFor(q, XmmRegs);
                SSE_MOVDm(d, FP, qr);
            } else {
                freeRsrcOf(ins, false);
                Register qr = findRegFor(q, XmmRegs);
                SSE_MOVD(rr,qr);
            }
        }
    }

    void Assembler::asm_fneg(LInsp ins)
    {
        if (config.sse2)
        {
            LIns *lhs = ins->oprnd1();

            Register rr = prepResultReg(ins, XmmRegs);
            Reservation *rA = getresv(lhs);
            Register ra;

            // if this is last use of lhs in reg, we can re-use result reg
            if (rA == 0 || (ra = rA->reg) == UnknownReg) {
                ra = findSpecificRegFor(lhs, rr);
            } else if ((rmask(ra) & XmmRegs) == 0) {
                /* We need this case on AMD64, because it's possible that
                 * an earlier instruction has done a quadword load and reserved a
                 * GPR.  If so, ask for a new register.
                 */
                ra = findRegFor(lhs, XmmRegs);
            }
            // else, rA already has a register assigned.

#if defined __SUNPRO_CC
            // from Sun Studio C++ Readme: #pragma align inside namespace requires mangled names
            static uint32_t temp[] = {0, 0, 0, 0, 0, 0, 0};
            static uint32_t *negateMask = (uint32_t *)alignUp(temp, 16);
            negateMask[1] = 0x80000000;
#else
            static const AVMPLUS_ALIGN16(uint32_t) negateMask[] = {0,0x80000000,0,0};
#endif
            SSE_XORPD(rr, negateMask);

            if (rr != ra)
                SSE_MOVSD(rr, ra);
        }
        else
        {
            Register rr = prepResultReg(ins, FpRegs);

            LIns* lhs = ins->oprnd1();

            // lhs into reg, prefer same reg as result
            Reservation* rA = getresv(lhs);
            // if this is last use of lhs in reg, we can re-use result reg
            if (rA == 0 || rA->reg == UnknownReg)
                findSpecificRegFor(lhs, rr);
            // else, rA already has a different reg assigned

            NanoAssert(getresv(lhs)!=0 && getresv(lhs)->reg==FST0);
            // assume that the lhs is in ST(0) and rhs is on stack
            FCHS();

            // if we had more than one fpu reg, this is where
            // we would move ra into rr if rr != ra.
        }
    }

    void Assembler::asm_arg(ArgSize sz, LInsp p, Register r)
    {
        if (sz == ARGSIZE_Q)
        {
            // ref arg - use lea
            if (r != UnknownReg)
            {
                // arg in specific reg
                int da = findMemFor(p);
                LEA(r, da, FP);
            }
            else
            {
                NanoAssert(0); // not supported
            }
        }
        else if (sz == ARGSIZE_LO)
        {
            if (r != UnknownReg) {
                // arg goes in specific register
                if (p->isconst()) {
                    LDi(r, p->imm32());
                } else {
                    Reservation* rA = getresv(p);
                    if (rA) {
                        if (rA->reg == UnknownReg) {
                            // load it into the arg reg
                            int d = findMemFor(p);
                            if (p->isop(LIR_ialloc)) {
                                LEA(r, d, FP);
                            } else {
                                LD(r, d, FP);
                            }
                        } else {
                            // it must be in a saved reg
                            MR(r, rA->reg);
                        }
                    }
                    else {
                        // this is the last use, so fine to assign it
                        // to the scratch reg, it's dead after this point.
                        findSpecificRegFor(p, r);
                    }
                }
            }
            else {
                asm_pusharg(p);
            }
        }
        else
        {
            NanoAssert(sz == ARGSIZE_F);
            asm_farg(p);
        }
    }

    void Assembler::asm_pusharg(LInsp p)
    {
        // arg goes on stack
        Reservation* rA = getresv(p);
        if (rA == 0 && p->isconst())
        {
            // small const we push directly
            PUSHi(p->imm32());
        }
        else if (rA == 0 || p->isop(LIR_ialloc))
        {
            Register ra = findRegFor(p, GpRegs);
            PUSHr(ra);
        }
        else if (rA->reg == UnknownReg)
        {
            PUSHm(disp(rA), FP);
        }
        else
        {
            PUSHr(rA->reg);
        }
    }

    void Assembler::asm_farg(LInsp p)
    {
        NanoAssert(p->isQuad());
        Register r = findRegFor(p, FpRegs);
        if (rmask(r) & XmmRegs) {
            SSE_STQ(0, SP, r);
        } else {
            FSTPQ(0, SP);
            /* It's possible that the same LIns* with r=FST0 will appear in the argument list more
             * than once.  In this case FST0 will not have been evicted and the multiple pop
             * actions will unbalance the FPU stack.  A quick fix is to always evict FST0 manually.
             */
            evict(FST0);
        }
        SUBi(ESP,8);
    }

    void Assembler::asm_fop(LInsp ins)
    {
        LOpcode op = ins->opcode();
        if (config.sse2)
        {
            LIns *lhs = ins->oprnd1();
            LIns *rhs = ins->oprnd2();

            RegisterMask allow = XmmRegs;
            Register rb = UnknownReg;
            if (lhs != rhs) {
                rb = findRegFor(rhs,allow);
                allow &= ~rmask(rb);
            }

            Register rr = prepResultReg(ins, allow);
            Reservation *rA = getresv(lhs);
            Register ra;

            // if this is last use of lhs in reg, we can re-use result reg
            if (rA == 0 || (ra = rA->reg) == UnknownReg) {
                ra = findSpecificRegFor(lhs, rr);
            } else if ((rmask(ra) & XmmRegs) == 0) {
                /* We need this case on AMD64, because it's possible that
                 * an earlier instruction has done a quadword load and reserved a
                 * GPR.  If so, ask for a new register.
                 */
                ra = findRegFor(lhs, XmmRegs);
            }
            else {
                // rA already has a register assigned but maybe not from the allow set
                ra = findRegFor(lhs, allow);
            }

            if (lhs == rhs)
                rb = ra;

            if (op == LIR_fadd)
                SSE_ADDSD(rr, rb);
            else if (op == LIR_fsub)
                SSE_SUBSD(rr, rb);
            else if (op == LIR_fmul)
                SSE_MULSD(rr, rb);
            else //if (op == LIR_fdiv)
                SSE_DIVSD(rr, rb);

            if (rr != ra)
                SSE_MOVSD(rr, ra);
        }
        else
        {
            // we swap lhs/rhs on purpose here, works out better
            // if you only have one fpu reg.  use divr/subr.
            LIns* rhs = ins->oprnd1();
            LIns* lhs = ins->oprnd2();
            Register rr = prepResultReg(ins, rmask(FST0));

            // make sure rhs is in memory
            int db = findMemFor(rhs);

            // lhs into reg, prefer same reg as result
            Reservation* rA = getresv(lhs);
            // last use of lhs in reg, can reuse rr
            if (rA == 0 || rA->reg == UnknownReg)
                findSpecificRegFor(lhs, rr);
            // else, rA already has a different reg assigned

            NanoAssert(getresv(lhs)!=0 && getresv(lhs)->reg==FST0);
            // assume that the lhs is in ST(0) and rhs is on stack
            if (op == LIR_fadd)
                { FADD(db, FP); }
            else if (op == LIR_fsub)
                { FSUBR(db, FP); }
            else if (op == LIR_fmul)
                { FMUL(db, FP); }
            else if (op == LIR_fdiv)
                { FDIVR(db, FP); }
        }
    }

    void Assembler::asm_i2f(LInsp ins)
    {
        // where our result goes
        Register rr = prepResultReg(ins, FpRegs);
        if (rmask(rr) & XmmRegs)
        {
            // todo support int value in memory
            Register gr = findRegFor(ins->oprnd1(), GpRegs);
            SSE_CVTSI2SD(rr, gr);
        }
        else
        {
            int d = findMemFor(ins->oprnd1());
            FILD(d, FP);
        }
    }

    Register Assembler::asm_prep_fcall(Reservation *rR, LInsp ins)
    {
        if (rR) {
            Register rr;
            if ((rr=rR->reg) != UnknownReg && (rmask(rr) & XmmRegs))
                evict(rr);
        }
        return prepResultReg(ins, rmask(FST0));
    }

    void Assembler::asm_u2f(LInsp ins)
    {
        // where our result goes
        Register rr = prepResultReg(ins, FpRegs);
        if (rmask(rr) & XmmRegs)
        {
            // don't call findRegFor, we want a reg we can stomp on for a very short time,
            // not a reg that will continue to be associated with the LIns
            Register gr = registerAlloc(GpRegs);

            // technique inspired by gcc disassembly
            // Edwin explains it:
            //
            // gr is 0..2^32-1
            //
            //     sub gr,0x80000000
            //
            // now gr is -2^31..2^31-1, i.e. the range of int, but not the same value
            // as before
            //
            //     cvtsi2sd rr,gr
            //
            // rr is now a double with the int value range
            //
            //     addsd rr, 2147483648.0
            //
            // adding back double(0x80000000) makes the range 0..2^32-1.

            static const double k_NEGONE = 2147483648.0;
            SSE_ADDSDm(rr, &k_NEGONE);

            SSE_CVTSI2SD(rr, gr);

            Reservation* resv = getresv(ins->oprnd1());
            Register xr;
            if (resv && (xr = resv->reg) != UnknownReg && (rmask(xr) & GpRegs))
            {
                LEA(gr, 0x80000000, xr);
            }
            else
            {
                const int d = findMemFor(ins->oprnd1());
                SUBi(gr, 0x80000000);
                LD(gr, d, FP);
            }

            // ok, we're done with it
            _allocator.addFree(gr);
        }
        else
        {
            const int disp = -8;
            const Register base = SP;
            Register gr = findRegFor(ins->oprnd1(), GpRegs);
            NanoAssert(rr == FST0);
            FILDQ(disp, base);
            STi(base, disp+4, 0);   // high 32 bits = 0
            ST(base, disp, gr);     // low 32 bits = unsigned value
        }
    }

    void Assembler::asm_nongp_copy(Register r, Register s)
    {
        if ((rmask(r) & XmmRegs) && (rmask(s) & XmmRegs)) {
            SSE_MOVSD(r, s);
        } else if ((rmask(r) & GpRegs) && (rmask(s) & XmmRegs)) {
            SSE_MOVD(r, s);
        } else {
            if (rmask(r) & XmmRegs) {
                // x87 -> xmm
                NanoAssertMsg(false, "Should not move data from GPR to XMM");
            } else {
                // xmm -> x87
                NanoAssertMsg(false, "Should not move data from GPR/XMM to x87 FPU");
            }
        }
    }

    NIns * Assembler::asm_jmpcc(bool branchOnFalse, LIns *cond, NIns *targ)
    {
        LOpcode c = cond->opcode();
        if (config.sse2 && c != LIR_feq) {
            LIns *lhs = cond->oprnd1();
            LIns *rhs = cond->oprnd2();
            if (c == LIR_flt) {
                LIns *t = lhs; lhs = rhs; rhs = t;
                c = LIR_fgt;
            }
            else if (c == LIR_fle) {
                LIns *t = lhs; lhs = rhs; rhs = t;
                c = LIR_fge;
            }

            if (c == LIR_fgt) {
                if (branchOnFalse) { JNA(targ); } else { JA(targ); }
            }
            else { // if (c == LIR_fge)
                if (branchOnFalse) { JNAE(targ); } else { JAE(targ); }
            }
            NIns *at = _nIns;
            Reservation *rA, *rB;
            findRegFor2(XmmRegs, lhs, rA, rhs, rB);
            SSE_UCOMISD(rA->reg, rB->reg);
            return at;
        }

        if (branchOnFalse)
            JP(targ);
        else
            JNP(targ);
        NIns *at = _nIns;
        asm_fcmp(cond);
        return at;
    }

    void Assembler::asm_setcc(Register r, LIns *cond)
    {
        LOpcode c = cond->opcode();
        if (config.sse2 && c != LIR_feq) {
            MOVZX8(r,r);
            LIns *lhs = cond->oprnd1();
            LIns *rhs = cond->oprnd2();
            if (c == LIR_flt) {
                LIns *t = lhs; lhs = rhs; rhs = t;
                SETA(r);
            }
            else if (c == LIR_fle) {
                LIns *t = lhs; lhs = rhs; rhs = t;
                SETAE(r);
            }
            else if (c == LIR_fgt) {
                SETA(r);
            }
            else { // if (c == LIR_fge)
                SETAE(r);
            }
            Reservation *rA, *rB;
            findRegFor2(XmmRegs, lhs, rA, rhs, rB);
            SSE_UCOMISD(rA->reg, rB->reg);
            return;
        }
        // SETcc only sets low 8 bits, so extend
        MOVZX8(r,r);
        SETNP(r);
        asm_fcmp(cond);
    }

    void Assembler::asm_fcmp(LIns *cond)
    {
        LOpcode condop = cond->opcode();
        NanoAssert(condop >= LIR_feq && condop <= LIR_fge);
        LIns* lhs = cond->oprnd1();
        LIns* rhs = cond->oprnd2();

        int mask;
        if (condop == LIR_feq)
            mask = 0x44;
        else if (condop == LIR_fle)
            mask = 0x41;
        else if (condop == LIR_flt)
            mask = 0x05;
        else if (condop == LIR_fge) {
            // swap, use le
            condop = LIR_fle;
            LIns* t = lhs; lhs = rhs; rhs = t;
            mask = 0x41;
        } else { // if (condop == LIR_fgt)
            // swap, use lt
            condop = LIR_flt;
            LIns* t = lhs; lhs = rhs; rhs = t;
            mask = 0x05;
        }

        if (config.sse2)
        {
            // UNORDERED:    ZF,PF,CF <- 111;
            // GREATER_THAN: ZF,PF,CF <- 000;
            // LESS_THAN:    ZF,PF,CF <- 001;
            // EQUAL:        ZF,PF,CF <- 100;

            if (condop == LIR_feq && lhs == rhs) {
                // nan check
                Register r = findRegFor(lhs, XmmRegs);
                SSE_UCOMISD(r, r);
            }
            else {
                evict(EAX);
                TEST_AH(mask);
                LAHF();
                Reservation *rA, *rB;
                findRegFor2(XmmRegs, lhs, rA, rhs, rB);
                SSE_UCOMISD(rA->reg, rB->reg);
            }
        }
        else
        {
            evict(EAX);
            TEST_AH(mask);
            FNSTSW_AX();
            NanoAssert(lhs->isQuad() && rhs->isQuad());
            Reservation *rA;
            if (lhs != rhs)
            {
                // compare two different numbers
                int d = findMemFor(rhs);
                rA = getresv(lhs);
                int pop = !rA || rA->reg == UnknownReg;
                findSpecificRegFor(lhs, FST0);
                // lhs is in ST(0) and rhs is on stack
                FCOM(pop, d, FP);
            }
            else
            {
                // compare n to itself, this is a NaN test.
                rA = getresv(lhs);
                int pop = !rA || rA->reg == UnknownReg;
                findSpecificRegFor(lhs, FST0);
                // value in ST(0)
                if (pop)
                    FCOMPP();
                else
                    FCOMP();
                FLDr(FST0); // DUP
            }
        }
    }

    void Assembler::nativePageReset()
    {
    }

    void Assembler::nativePageSetup()
    {
        if (!_nIns)         _nIns       = pageAlloc();
        if (!_nExitIns)  _nExitIns = pageAlloc(true);
    }

    // Reset the _nIns pointer to the starting value. This can be used to roll
    // back the instruction pointer in case an error occurred during the code
    // generation.
    void Assembler::resetInstructionPointer()
    {
        _nIns = _startingIns;
    }

    // Store the starting _nIns value so that it can be reset later.
    void Assembler::recordStartingInstructionPointer()
    {
        _startingIns = _nIns;
    }

    // enough room for n bytes
    void Assembler::underrunProtect(int n)
    {
        NanoAssertMsg(n<=LARGEST_UNDERRUN_PROT, "constant LARGEST_UNDERRUN_PROT is too small");
        NIns *eip = this->_nIns;
        Page *p = (Page*)pageTop(eip-1);
        NIns *top = (NIns*) &p->code[0];
        if (eip - n < top) {
            // We are done with the current page.  Tell Valgrind that new code
            // has been generated.
            VALGRIND_DISCARD_TRANSLATIONS(pageTop(p), NJ_PAGE_SIZE);
            _nIns = pageAlloc(_inExit);
            JMP(eip);
        }
    }

    #endif /* FEATURE_NANOJIT */
}