b=449526, TM: fix up ARM code generation / softfloat

This commit is contained in:
Vladimir Vukicevic 2008-09-02 22:29:23 -07:00
parent e3916b316d
commit 420e72ed85
9 changed files with 519 additions and 175 deletions

View File

@ -98,6 +98,20 @@ BUILTIN3(Object_p_propertyIsEnumerable,
BUILTIN2(BooleanToNumber, LO, LO, F, jsdouble, JSContext*, jsint, 1, 1) BUILTIN2(BooleanToNumber, LO, LO, F, jsdouble, JSContext*, jsint, 1, 1)
BUILTIN2(ObjectToString, LO, LO, P, JSString*, JSContext*, JSObject*, 0, 0) BUILTIN2(ObjectToString, LO, LO, P, JSString*, JSContext*, JSObject*, 0, 0)
BUILTIN3(Array_1int, LO, LO, LO, P, JSObject*, JSContext*, JSObject*, jsint, 0, 0) BUILTIN3(Array_1int, LO, LO, LO, P, JSObject*, JSContext*, JSObject*, jsint, 0, 0)
// soft float
// ARM softfloat builtins: one entry per helper (implemented as js_fneg,
// js_i2f, etc. in the corresponding .cpp).  The LO and F codes are the
// ARGSIZE_LO / ARGSIZE_F argument-size classes (see the #defines below).
BUILTIN1(fneg, F, F, jsdouble, jsdouble, 1, 1)
BUILTIN1(i2f, LO, F, jsdouble, jsint, 1, 1)
BUILTIN1(u2f, LO, F, jsdouble, jsuint, 1, 1)
BUILTIN2(fcmpeq, F, F, LO, jsint, jsdouble, jsdouble, 1, 1)
BUILTIN2(fcmplt, F, F, LO, jsint, jsdouble, jsdouble, 1, 1)
BUILTIN2(fcmple, F, F, LO, jsint, jsdouble, jsdouble, 1, 1)
BUILTIN2(fcmpgt, F, F, LO, jsint, jsdouble, jsdouble, 1, 1)
BUILTIN2(fcmpge, F, F, LO, jsint, jsdouble, jsdouble, 1, 1)
BUILTIN2(fmul, F, F, F, jsdouble, jsdouble, jsdouble, 1, 1)
BUILTIN2(fadd, F, F, F, jsdouble, jsdouble, jsdouble, 1, 1)
BUILTIN2(fdiv, F, F, F, jsdouble, jsdouble, jsdouble, 1, 1)
BUILTIN2(fsub, F, F, F, jsdouble, jsdouble, jsdouble, 1, 1)
BUILTIN3(Array_1str, LO, LO, LO, P, JSObject*, JSContext*, JSObject*, JSString*, 0, 0) BUILTIN3(Array_1str, LO, LO, LO, P, JSObject*, JSContext*, JSObject*, JSString*, 0, 0)
BUILTIN4(Array_2obj, LO, LO, LO, LO, P, JSObject*, JSContext*, JSObject*, JSObject*, JSObject**, 0, 0) BUILTIN4(Array_2obj, LO, LO, LO, LO, P, JSObject*, JSContext*, JSObject*, JSObject*, JSObject**, 0, 0)
BUILTIN5(Array_3num, LO, LO, F, F, F, P, JSObject*, JSContext*, JSObject*, jsdouble, jsdouble, jsdouble, 0, 0) BUILTIN5(Array_3num, LO, LO, F, F, F, P, JSObject*, JSContext*, JSObject*, jsdouble, jsdouble, jsdouble, 0, 0)

View File

@ -719,6 +719,79 @@ js_Array_3num(JSContext* cx, JSObject* proto, jsdouble n1, jsdouble n2, jsdouble
return NULL;) return NULL;)
} }
/* soft float -- trivial C helpers the JIT calls on ARM in place of
   hardware floating-point instructions. */

// Negation; backs the fneg builtin.
jsdouble FASTCALL
js_fneg(jsdouble x)
{
    return -x;
}

// Signed int -> double conversion.
jsdouble FASTCALL
js_i2f(jsint i)
{
    return i;
}

// Unsigned int -> double conversion.
jsdouble FASTCALL
js_u2f(jsuint u)
{
    return u;
}

// Double comparisons.  Each returns 0 or 1; per the C operators' IEEE
// semantics, every one of these is false when either operand is NaN.
jsint FASTCALL
js_fcmpeq(jsdouble x, jsdouble y)
{
    return x==y;
}

jsint FASTCALL
js_fcmplt(jsdouble x, jsdouble y)
{
    return x < y;
}

jsint FASTCALL
js_fcmple(jsdouble x, jsdouble y)
{
    return x <= y;
}

jsint FASTCALL
js_fcmpgt(jsdouble x, jsdouble y)
{
    return x > y;
}

jsint FASTCALL
js_fcmpge(jsdouble x, jsdouble y)
{
    return x >= y;
}

// Double arithmetic.
jsdouble FASTCALL
js_fmul(jsdouble x, jsdouble y)
{
    return x * y;
}

jsdouble FASTCALL
js_fadd(jsdouble x, jsdouble y)
{
    return x + y;
}

jsdouble FASTCALL
js_fdiv(jsdouble x, jsdouble y)
{
    return x / y;
}

jsdouble FASTCALL
js_fsub(jsdouble x, jsdouble y)
{
    return x - y;
}
#define LO ARGSIZE_LO #define LO ARGSIZE_LO
#define F ARGSIZE_F #define F ARGSIZE_F
#define Q ARGSIZE_Q #define Q ARGSIZE_Q

View File

@ -1,4 +1,4 @@
/* -*- Mode: C; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 4 -*- /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 4 -*-
* vim: set ts=4 sw=4 et tw=99: * vim: set ts=4 sw=4 et tw=99:
* *
* ***** BEGIN LICENSE BLOCK ***** * ***** BEGIN LICENSE BLOCK *****
@ -277,12 +277,59 @@ Oracle::clear()
_dontDemote.reset(); _dontDemote.reset();
} }
// True if instruction i converts a signed int to a double: either the
// native LIR_i2f opcode, or (ARM softfloat) a qjoin of the F_i2f call
// with its LIR_callh high-half companion.
static bool isi2f(LInsp i)
{
    if (i->isop(LIR_i2f))
        return true;
#ifdef NANOJIT_ARM
    if (i->isop(LIR_qjoin)
        && i->oprnd1()->isop(LIR_call)
        && i->oprnd2()->isop(LIR_callh)
        && i->oprnd1()->imm8() == F_i2f)
        return true;
#endif
    return false;
}
// True if instruction i converts an unsigned int to a double: either the
// native LIR_u2f opcode, or (ARM softfloat) a qjoin of the F_u2f call
// with its LIR_callh high-half companion.
static bool isu2f(LInsp i)
{
    if (i->isop(LIR_u2f))
        return true;
#ifdef NANOJIT_ARM
    if (i->isop(LIR_qjoin)
        && i->oprnd1()->isop(LIR_call)
        && i->oprnd2()->isop(LIR_callh)
        && i->oprnd1()->imm8() == F_u2f)
        return true;
#endif
    return false;
}
// Return the integer operand feeding an i2f/u2f conversion, for either
// form recognized by isi2f()/isu2f().
static LInsp iu2fArg(LInsp i)
{
#ifdef NANOJIT_ARM
    // Softfloat form: the original integer is argument 0 of the
    // low-half conversion call inside the qjoin.
    if (i->isop(LIR_qjoin))
        return i->oprnd1()->arg(0);
#endif
    return i->oprnd1();
}
static LIns* demote(LirWriter *out, LInsp i) static LIns* demote(LirWriter *out, LInsp i)
{ {
if (i->isCall()) if (i->isCall())
return callArgN(i, 0); return callArgN(i, 0);
if (i->isop(LIR_i2f) || i->isop(LIR_u2f)) if (isi2f(i) || isu2f(i))
return i->oprnd1(); return iu2fArg(i);
if (i->isconst()) if (i->isconst())
return i; return i;
AvmAssert(i->isconstq()); AvmAssert(i->isconstq());
@ -294,14 +341,14 @@ static LIns* demote(LirWriter *out, LInsp i)
static bool isPromoteInt(LIns* i) static bool isPromoteInt(LIns* i)
{ {
jsdouble d; jsdouble d;
return i->isop(LIR_i2f) || i->isconst() || return isi2f(i) || i->isconst() ||
(i->isconstq() && ((d = i->constvalf()) == (jsdouble)(jsint)d) && !JSDOUBLE_IS_NEGZERO(d)); (i->isconstq() && ((d = i->constvalf()) == (jsdouble)(jsint)d) && !JSDOUBLE_IS_NEGZERO(d));
} }
static bool isPromoteUint(LIns* i) static bool isPromoteUint(LIns* i)
{ {
jsdouble d; jsdouble d;
return i->isop(LIR_u2f) || i->isconst() || return isu2f(i) || i->isconst() ||
(i->isconstq() && ((d = i->constvalf()) == (jsdouble)(jsuint)d)); (i->isconstq() && ((d = i->constvalf()) == (jsdouble)(jsuint)d));
} }
@ -324,6 +371,92 @@ static bool overflowSafe(LIns* i)
((c->constval() > 0))); ((c->constval() > 0)));
} }
#ifdef NANOJIT_ARM
// LIR writer filter for ARM softfloat: rewrites floating-point LIR
// operations into calls to the softfloat builtins (F_fadd, F_fcmpeq, ...),
// with each 64-bit double result represented as qjoin(call, callh).
// Also masks shift amounts to 0..31 (see ins2), which is not a softfloat
// concern but is needed on ARM.
class SoftFloatFilter: public LirWriter
{
public:
    SoftFloatFilter(LirWriter* out):
        LirWriter(out)
    {
    }

    // Emit a call returning a 64-bit (double) value: the call produces the
    // low half, LIR_callh names the high half, and qjoin pairs them.
    LInsp quadCall(uint32_t fid, LInsp args[]) {
        LInsp qlo, qhi;

        qlo = out->insCall(fid, args);
        qhi = out->ins1(LIR_callh, qlo);
        return out->qjoin(qlo, qhi);
    }

    // Unary ops: negate and int->double conversions become builtin calls.
    LInsp ins1(LOpcode v, LInsp s0)
    {
        if (v == LIR_fneg)
            return quadCall(F_fneg, &s0);

        if (v == LIR_i2f)
            return quadCall(F_i2f, &s0);

        if (v == LIR_u2f)
            return quadCall(F_u2f, &s0);

        return out->ins1(v, s0);
    }

    LInsp ins2(LOpcode v, LInsp s0, LInsp s1)
    {
        LInsp args[2];
        LInsp bv;

        // The fmap tables below index by (v - first opcode):
        // change the numeric value and order of these LIR opcodes and die
        if (LIR_fadd <= v && v <= LIR_fdiv) {
            static uint32_t fmap[] = { F_fadd, F_fsub, F_fmul, F_fdiv };

            // args are pushed in reverse order: args[0] is the last argument
            args[0] = s1;
            args[1] = s0;

            return quadCall(fmap[v - LIR_fadd], args);
        }

        if (LIR_feq <= v && v <= LIR_fge) {
            static uint32_t fmap[] = { F_fcmpeq, F_fcmplt, F_fcmpgt, F_fcmple, F_fcmpge };

            args[0] = s1;
            args[1] = s0;

            // Compare builtins return 0/1; normalize to a LIR condition by
            // comparing the result against 1.
            bv = out->insCall(fmap[v - LIR_feq], args);
            return out->ins2(LIR_eq, bv, out->insImm(1));
        }

        // not really a softfloat filter, but needed on ARM --
        // arm doesn't mask shifts to 31 like x86 does
        if (v == LIR_lsh ||
            v == LIR_rsh ||
            v == LIR_ush)
        {
            // NOTE(review): setimm16 mutates the existing constant
            // instruction in place -- if that LIns is shared (CSE'd) with
            // another use, this changes it there too; confirm intended.
            if (s1->isconst())
                s1->setimm16(s1->constval() & 31);
            else
                s1 = out->ins2(LIR_and, s1, out->insImm(31));
            return out->ins2(v, s0, s1);
        }

        return out->ins2(v, s0, s1);
    }

    LInsp insCall(uint32_t fid, LInsp args[])
    {
        // if the return type is ARGSIZE_F, we have
        // to do a quadCall ( qjoin(call,callh) )
        if ((builtins[fid]._argtypes & 3) == ARGSIZE_F)
            return quadCall(fid, args);

        return out->insCall(fid, args);
    }
};
#endif
class FuncFilter: public LirWriter class FuncFilter: public LirWriter
{ {
TraceRecorder& recorder; TraceRecorder& recorder;
@ -427,9 +560,8 @@ public:
case F_DoubleToUint32: case F_DoubleToUint32:
if (s0->isconstq()) if (s0->isconstq())
return out->insImm(js_DoubleToECMAUint32(s0->constvalf())); return out->insImm(js_DoubleToECMAUint32(s0->constvalf()));
if (s0->isop(LIR_i2f) || s0->isop(LIR_u2f)) { if (isi2f(s0) || isu2f(s0))
return s0->oprnd1(); return iu2fArg(s0);
}
break; break;
case F_DoubleToInt32: case F_DoubleToInt32:
if (s0->isconstq()) if (s0->isconstq())
@ -442,9 +574,9 @@ public:
return out->ins2(op, demote(out, lhs), demote(out, rhs)); return out->ins2(op, demote(out, lhs), demote(out, rhs));
} }
} }
if (s0->isop(LIR_i2f) || s0->isop(LIR_u2f)) { if (isi2f(s0) || isu2f(s0))
return s0->oprnd1(); return iu2fArg(s0);
} // XXX ARM -- check for qjoin(call(F_UnboxDouble),call(F_UnboxDouble))
if (s0->isCall() && s0->fid() == F_UnboxDouble) { if (s0->isCall() && s0->fid() == F_UnboxDouble) {
LIns* args2[] = { callArgN(s0, 0) }; LIns* args2[] = { callArgN(s0, 0) };
return out->insCall(F_UnboxInt32, args2); return out->insCall(F_UnboxInt32, args2);
@ -688,6 +820,9 @@ TraceRecorder::TraceRecorder(JSContext* cx, GuardRecord* _anchor, Fragment* _fra
#ifdef DEBUG #ifdef DEBUG
if (verbose_debug) if (verbose_debug)
lir = verbose_filter = new (&gc) VerboseWriter(&gc, lir, lirbuf->names); lir = verbose_filter = new (&gc) VerboseWriter(&gc, lir, lirbuf->names);
#endif
#ifdef NANOJIT_ARM
lir = float_filter = new (&gc) SoftFloatFilter(lir);
#endif #endif
lir = cse_filter = new (&gc) CseFilter(lir, &gc); lir = cse_filter = new (&gc) CseFilter(lir, &gc);
lir = expr_filter = new (&gc) ExprFilter(lir); lir = expr_filter = new (&gc) ExprFilter(lir);
@ -732,6 +867,9 @@ TraceRecorder::~TraceRecorder()
delete cse_filter; delete cse_filter;
delete expr_filter; delete expr_filter;
delete func_filter; delete func_filter;
#ifdef NANOJIT_ARM
delete float_filter;
#endif
delete lir_buf_writer; delete lir_buf_writer;
} }
@ -1435,7 +1573,7 @@ TraceRecorder::checkType(jsval& v, uint8 t, bool& unstable)
if (!isNumber(v)) if (!isNumber(v))
return false; /* not a number? type mismatch */ return false; /* not a number? type mismatch */
LIns* i = get(&v); LIns* i = get(&v);
if (!i->isop(LIR_i2f)) { if (!isi2f(i)) {
debug_only_v(printf("int slot is !isInt32, slot #%d, triggering re-compilation\n", debug_only_v(printf("int slot is !isInt32, slot #%d, triggering re-compilation\n",
!isGlobal(&v) !isGlobal(&v)
? nativeStackOffset(&v) ? nativeStackOffset(&v)
@ -1445,11 +1583,11 @@ TraceRecorder::checkType(jsval& v, uint8 t, bool& unstable)
return true; /* keep checking types, but request re-compilation */ return true; /* keep checking types, but request re-compilation */
} }
/* Looks good, slot is an int32, the last instruction should be i2f. */ /* Looks good, slot is an int32, the last instruction should be i2f. */
JS_ASSERT(isInt32(v) && i->isop(LIR_i2f)); JS_ASSERT(isInt32(v) && (i->isop(LIR_i2f) || i->isop(LIR_qjoin)));
/* We got the final LIR_i2f as we expected. Overwrite the value in that /* We got the final LIR_i2f as we expected. Overwrite the value in that
slot with the argument of i2f since we want the integer store to flow along slot with the argument of i2f since we want the integer store to flow along
the loop edge, not the casted value. */ the loop edge, not the casted value. */
set(&v, i->oprnd1()); set(&v, iu2fArg(i));
return true; return true;
} }
if (t == JSVAL_DOUBLE) { if (t == JSVAL_DOUBLE) {
@ -2080,10 +2218,11 @@ js_ExecuteTree(JSContext* cx, Fragment** treep, uintN& inlineCallCount,
/* execute previously recorded trace */ /* execute previously recorded trace */
TreeInfo* ti = (TreeInfo*)f->vmprivate; TreeInfo* ti = (TreeInfo*)f->vmprivate;
debug_only_v(printf("entering trace at %s:%u@%u, native stack slots: %u\n", debug_only_v(printf("entering trace at %s:%u@%u, native stack slots: %u code: %p\n",
cx->fp->script->filename, cx->fp->script->filename,
js_PCToLineNumber(cx, cx->fp->script, cx->fp->regs->pc), js_PCToLineNumber(cx, cx->fp->script, cx->fp->regs->pc),
cx->fp->regs->pc - cx->fp->script->code, ti->maxNativeStackSlots);); cx->fp->regs->pc - cx->fp->script->code, ti->maxNativeStackSlots,
f->code()););
JSTraceMonitor* tm = &JS_TRACE_MONITOR(cx); JSTraceMonitor* tm = &JS_TRACE_MONITOR(cx);
unsigned ngslots = tm->globalSlots->length(); unsigned ngslots = tm->globalSlots->length();

View File

@ -221,6 +221,9 @@ class TraceRecorder {
nanojit::LirWriter* cse_filter; nanojit::LirWriter* cse_filter;
nanojit::LirWriter* expr_filter; nanojit::LirWriter* expr_filter;
nanojit::LirWriter* func_filter; nanojit::LirWriter* func_filter;
#ifdef NANOJIT_ARM
nanojit::LirWriter* float_filter;
#endif
nanojit::LIns* cx_ins; nanojit::LIns* cx_ins;
nanojit::LIns* gp_ins; nanojit::LIns* gp_ins;
nanojit::LIns* eos_ins; nanojit::LIns* eos_ins;

View File

@ -788,38 +788,29 @@ namespace nanojit
internalReset(); // clear the reservation tables and regalloc internalReset(); // clear the reservation tables and regalloc
NanoAssert(_branchStateMap->isEmpty()); NanoAssert(_branchStateMap->isEmpty());
_branchStateMap = 0; _branchStateMap = 0;
#if defined(UNDER_CE) #ifdef AVMPLUS_ARM
// If we've modified the code, we need to flush so we don't end up trying // If we've modified the code, we need to flush so we don't end up trying
// to execute junk // to execute junk
# if defined(UNDER_CE)
FlushInstructionCache(GetCurrentProcess(), NULL, NULL); FlushInstructionCache(GetCurrentProcess(), NULL, NULL);
#elif defined(AVMPLUS_LINUX) && defined(AVMPLUS_ARM) # elif defined(AVMPLUS_LINUX)
// N A S T Y - obviously have to fix this // XXX fixme flush adjacent pages together
// determine our page range for (int i = 0; i < 2; i++) {
Page *p = (i == 0) ? _nativePages : _nativeExitPages;
Page *page=0, *first=0, *last=0; while (p) {
for (int i=2;i!=0;i--) { flushCache((NIns*)p, (NIns*)((intptr_t)(p) + NJ_PAGE_SIZE));
page = first = last = (i==2 ? _nativePages : _nativeExitPages); p = p->next;
while (page)
{
if (page<first)
first = page;
if (page>last)
last = page;
page = page->next;
} }
register unsigned long _beg __asm("a1") = (unsigned long)(first);
register unsigned long _end __asm("a2") = (unsigned long)(last+NJ_PAGE_SIZE);
register unsigned long _flg __asm("a3") = 0;
register unsigned long _swi __asm("r7") = 0xF0002;
__asm __volatile ("swi 0 @ sys_cacheflush" : "=r" (_beg) : "0" (_beg), "r" (_end), "r" (_flg), "r" (_swi));
} }
#endif # endif
#ifdef AVMPLUS_PORTING_API #endif
# ifdef AVMPLUS_PORTING_API
NanoJIT_PortAPI_FlushInstructionCache(_nIns, _endJit1Addr); NanoJIT_PortAPI_FlushInstructionCache(_nIns, _endJit1Addr);
NanoJIT_PortAPI_FlushInstructionCache(_nExitIns, _endJit2Addr); NanoJIT_PortAPI_FlushInstructionCache(_nExitIns, _endJit2Addr);
#endif # endif
} }
void Assembler::copyRegisters(RegAlloc* copyTo) void Assembler::copyRegisters(RegAlloc* copyTo)
@ -861,7 +852,7 @@ namespace nanojit
switch(op) switch(op)
{ {
default: default:
NanoAssertMsg(false, "unsupported LIR instruction"); NanoAssertMsgf(false, ("unsupported LIR instruction: %d (~0x40: %d)\n",op, op&~LIR64));
break; break;
case LIR_short: case LIR_short:
@ -1063,7 +1054,20 @@ namespace nanojit
Register rb = UnknownReg; Register rb = UnknownReg;
RegisterMask allow = GpRegs; RegisterMask allow = GpRegs;
if (lhs != rhs && (op == LIR_mul || !rhs->isconst())) bool forceReg = (op == LIR_mul || !rhs->isconst());
#ifdef NANOJIT_ARM
// ARM can't do an immediate op with immediates
// outside of +/-255 (for AND) or outside of
// 0..255 for others.
if (!forceReg)
{
if (rhs->isconst() && !isU8(rhs->constval()))
forceReg = true;
}
#endif
if (lhs != rhs && forceReg)
{ {
if ((rb = asm_binop_rhs_reg(ins)) == UnknownReg) { if ((rb = asm_binop_rhs_reg(ins)) == UnknownReg) {
rb = findRegFor(rhs, allow); rb = findRegFor(rhs, allow);
@ -1079,7 +1083,7 @@ namespace nanojit
ra = findSpecificRegFor(lhs, rr); ra = findSpecificRegFor(lhs, rr);
// else, rA already has a register assigned. // else, rA already has a register assigned.
if (!rhs->isconst() || op == LIR_mul) if (forceReg)
{ {
if (lhs == rhs) if (lhs == rhs)
rb = ra; rb = ra;

View File

@ -1546,7 +1546,12 @@ namespace nanojit
} }
else { else {
if (ref->isCall()) { if (ref->isCall()) {
copyName(ref, _functions[ref->fid()]._name, funccounts.add(ref->fid())); if (ref->isop(LIR_callh)) {
// we've presumably seen the other half already
ref = ref->oprnd1();
} else {
copyName(ref, _functions[ref->fid()]._name, funccounts.add(ref->fid()));
}
} else { } else {
NanoAssert(ref->opcode() < sizeof(lirNames) / sizeof(lirNames[0])); NanoAssert(ref->opcode() < sizeof(lirNames) / sizeof(lirNames[0]));
copyName(ref, lirNames[ref->opcode()], lircounts.add(ref->opcode())); copyName(ref, lirNames[ref->opcode()], lircounts.add(ref->opcode()));
@ -1652,7 +1657,6 @@ namespace nanojit
case LIR_fle: case LIR_fle:
case LIR_fgt: case LIR_fgt:
case LIR_fge: case LIR_fge:
case LIR_qjoin:
case LIR_qiadd: case LIR_qiadd:
case LIR_qiand: case LIR_qiand:
case LIR_qilsh: case LIR_qilsh:
@ -1662,6 +1666,12 @@ namespace nanojit
formatRef(i->oprnd2())); formatRef(i->oprnd2()));
break; break;
case LIR_qjoin:
sprintf(s, "%s (%s), %s", lirNames[op],
formatIns(i->oprnd1()),
formatRef(i->oprnd2()));
break;
case LIR_qcmov: case LIR_qcmov:
case LIR_cmov: case LIR_cmov:
sprintf(s, "%s ? %s : %s", sprintf(s, "%s ? %s : %s",

View File

@ -149,11 +149,14 @@ namespace nanojit
#define DECLARE_PLATFORM_ASSEMBLER()\ #define DECLARE_PLATFORM_ASSEMBLER()\
const static Register argRegs[4], retRegs[2];\ const static Register argRegs[4], retRegs[2];\
void LD32_nochk(Register r, int32_t imm);\ void LD32_nochk(Register r, int32_t imm);\
void BL(NIns*);\
void BL_far(NIns*);\
void CALL(const CallInfo*);\ void CALL(const CallInfo*);\
void underrunProtect(int bytes);\ void underrunProtect(int bytes);\
bool has_cmov;\ bool has_cmov;\
void nativePageReset();\ void nativePageReset();\
void nativePageSetup();\ void nativePageSetup();\
void flushCache(NIns*,NIns*);\
int* _nSlot;\ int* _nSlot;\
int* _nExitSlot; int* _nExitSlot;
@ -232,6 +235,7 @@ ShiftOperator;
*(--_nIns) = (NIns)( COND_AL | OP_IMM | ((_r)<<16) | ((_r)<<12) | ((_imm)&0xFF) );\ *(--_nIns) = (NIns)( COND_AL | OP_IMM | ((_r)<<16) | ((_r)<<12) | ((_imm)&0xFF) );\
asm_output2("and %s,%d",gpn(_r),(_imm));}\ asm_output2("and %s,%d",gpn(_r),(_imm));}\
else if ((_imm)<0 && (_imm)>-256) {\ else if ((_imm)<0 && (_imm)>-256) {\
underrunProtect(8);\
*(--_nIns) = (NIns)( COND_AL | ((_r)<<16) | ((_r)<<12) | (Scratch) );\ *(--_nIns) = (NIns)( COND_AL | ((_r)<<16) | ((_r)<<12) | (Scratch) );\
asm_output2("and %s,%s",gpn(_r),gpn(Scratch));\ asm_output2("and %s,%s",gpn(_r),gpn(Scratch));\
*(--_nIns) = (NIns)( COND_AL | (0x3E<<20) | ((Scratch)<<12) | (((_imm)^0xFFFFFFFF)&0xFF) );\ *(--_nIns) = (NIns)( COND_AL | (0x3E<<20) | ((Scratch)<<12) | (((_imm)^0xFFFFFFFF)&0xFF) );\
@ -532,6 +536,7 @@ ShiftOperator;
//#define INT3() underrunProtect(1); *(--_nIns) = 0xcc; asm_output("int3") //#define INT3() underrunProtect(1); *(--_nIns) = 0xcc; asm_output("int3")
//#define RET() INT3() //#define RET() INT3()
#define BKPT_nochk() do { *(--_nIns) = (NIns)( (0xE<<24) | (0x12<<20) | (0x7<<4) ); } while (0);
// this is pushing a reg // this is pushing a reg
#define PUSHr(_r) do {\ #define PUSHr(_r) do {\
@ -564,49 +569,66 @@ ShiftOperator;
*(--_nIns) = (NIns)( COND_AL | (0x8B<<20) | (SP<<16) | (_mask) );\ *(--_nIns) = (NIns)( COND_AL | (0x8B<<20) | (SP<<16) | (_mask) );\
asm_output1("pop %x", (_mask));} while (0) asm_output1("pop %x", (_mask));} while (0)
// takes an offset (right?) #define PC_OFFSET_FROM(target,frompc) ((intptr_t)(target) - ((intptr_t)(frompc) + 8))
#define JMP_long_nochk_offset(_off) do {\ #define JMP_S24_OFFSET_OK(offs) ((-(1<<24)) <= (offs) && (offs) < (1<<24))
*(--_nIns) = (NIns)( COND_AL | (0xA<<24) | (((_off)>>2) & 0xFFFFFF) ); \
asm_output1("jmp_l_n 0x%08x",(_off));} while (0)
// take an address, not an offset // (XXX This ought to be a function instead of a macro)
#define JMP(t) do {\ //
underrunProtect(4);\ // Branch to target address _t with condition _c, doing underrun
intptr_t tt = (intptr_t)(t) - ((intptr_t)_nIns + 4);\ // checks (_chk == 1) or skipping them (_chk == 0).
*(--_nIns) = (NIns)( COND_AL | (0xA<<24) | (((tt)>>2) & 0xFFFFFF) ); \ //
asm_output1("JMP 0x%08x\n",(unsigned int)(t)); } while (0) // If the jump fits in a relative jump (+/-32MB), emit that.
// If the jump is unconditional, emit the dest address inline in
// the instruction stream and load it into pc.
// If the jump has a condition, but no one's mucked with _nIns and our _nSlot
// pointer is valid, stick the constant in the slot and emit a conditional
// load into pc.
// Otherwise, emit the conditional load into pc from a nearby constant,
// and emit a jump to jump over it in case the condition fails.
//
// NB: JMP_nochk depends on this not calling samepage() when _c == AL
#define B_cond_chk(_c,_t,_chk) do { \
int32 offs = PC_OFFSET_FROM(_t,(intptr_t)(_nIns)-4); \
if (JMP_S24_OFFSET_OK(offs)) { \
if(_chk) underrunProtect(4); \
*(--_nIns) = (NIns)( ((_c)<<28) | (0xA<<24) | (((offs)>>2) & 0xFFFFFF) ); \
} else if (_c == AL) { \
if(_chk) underrunProtect(8); \
*(--_nIns) = (NIns)(_t); \
*(--_nIns) = (NIns)( COND_AL | (0x51<<20) | (PC<<16) | (PC<<12) | 0x4 ); \
} else if (samepage(_nIns,_nSlot)) { \
if(_chk) underrunProtect(8); \
*(++_nSlot) = (NIns)(_t); \
offs = PC_OFFSET_FROM(_nSlot,(intptr_t)(_nIns)-4); \
NanoAssert(offs < 0); \
*(--_nIns) = (NIns)( ((_c)<<28) | (0x51<<20) | (PC<<16) | (PC<<12) | ((-offs) & 0xFFFFFF) ); \
} else { \
if(_chk) underrunProtect(24); \
*(--_nIns) = (NIns)(_t); \
*(--_nIns) = (NIns)( COND_AL | (0xA<<24) | ((-4)>>2) & 0xFFFFFF ); \
*(--_nIns) = (NIns)( ((_c)<<28) | (0x51<<20) | (PC<<16) | (PC<<12) | 0x0 ); \
} \
asm_output2("%s %p\n", _c == AL ? "jmp" : "b(cnd)", (void*)(_t)); \
} while(0)
#define JMP_nochk(t) do {\ #define B_cond(_c,_t) \
intptr_t tt = (intptr_t)(t) - ((intptr_t)_nIns + 4);\ B_cond_chk(_c,_t,1)
*(--_nIns) = (NIns)( COND_AL | (0xA<<24) | (((tt)>>2) & 0xFFFFFF) ); \
asm_output1("JMP 0x%08x\n",(unsigned int)(t)); } while (0)
#define JMP_long_placeholder() do {JMP_long(0xffffffff); } while(0) // NB: don't use COND_AL here, we shift the condition into place!
#define JMP(_t) \
B_cond_chk(AL,_t,1)
#define JMP_long(_t) do {\ #define JMP_nochk(_t) \
underrunProtect(4);\ B_cond_chk(AL,_t,0)
*(--_nIns) = (NIns)( COND_AL | (0xA<<24) | (((_t)>>2) & 0xFFFFFF) ); \
asm_output1("JMP_long 0x%08x\n", (unsigned int)(_t) ); } while (0)
#define BL(_t) do {\
underrunProtect(4);\
intptr_t _tt = (intptr_t)(_t) - ((intptr_t)_nIns + 4);\
*(--_nIns) = (NIns)( COND_AL | (0xB<<24) | (((_tt)>>2) & 0xFFFFFF) ); \
asm_output2("BL 0x%08x offset=%d",(intptr_t)(_nIns) + (_tt),(_tt)) } while (0)
#define JMP_long_nochk(_t) do {\
intptr_t tt = (intptr_t)(_t) - ((intptr_t)_nIns + 4);\
*(--_nIns) = (NIns)( COND_AL | (0xA<<24) | (((tt)>>2) & 0xFFFFFF) ); \
asm_output1("JMP_l_n 0x%08x\n", (unsigned int)(_t)) } while (0)
#define B_cond(_c,_t)\
underrunProtect(4);\
intptr_t tt = (intptr_t)(_t) - ((intptr_t)_nIns + 4);\
*(--_nIns) = (NIns)( ((_c)<<28) | (0xA<<24) | ((tt >>2)& 0xFFFFFF) ); \
asm_output2("b(cond) 0x%08x (%tX)",(unsigned int)(_t), tt);
// emit a placeholder that will be filled in later by nPatchBranch;
// emit two breakpoint instructions in case something goes wrong with
// the patching.
#define JMP_long_placeholder() do { \
underrunProtect(8); \
BKPT_nochk(); \
BKPT_nochk(); \
} while(0)
#define JA(t) do {B_cond(HI,t); asm_output1("ja 0x%08x",(unsigned int)t); } while(0) #define JA(t) do {B_cond(HI,t); asm_output1("ja 0x%08x",(unsigned int)t); } while(0)
#define JNA(t) do {B_cond(LS,t); asm_output1("jna 0x%08x",(unsigned int)t); } while(0) #define JNA(t) do {B_cond(LS,t); asm_output1("jna 0x%08x",(unsigned int)t); } while(0)

View File

@ -148,7 +148,14 @@ namespace nanojit
{ {
// target doesn't exit yet. emit jump to epilog, and set up to patch later. // target doesn't exit yet. emit jump to epilog, and set up to patch later.
lr = placeGuardRecord(guard); lr = placeGuardRecord(guard);
BL(_epilogue);
#ifdef NJ_THUMB_JIT
BL(_epilogue);
#else
// we need to know that there's an extra immediate value available
// for us; always force a far jump here.
BL_far(_epilogue);
#endif
lr->jmp = _nIns; lr->jmp = _nIns;
} }
@ -196,6 +203,25 @@ namespace nanojit
void Assembler::asm_call(LInsp ins) void Assembler::asm_call(LInsp ins)
{ {
const CallInfo* call = callInfoFor(ins->fid()); const CallInfo* call = callInfoFor(ins->fid());
uint32_t atypes = call->_argtypes;
uint32_t roffset = 0;
// we need to detect if we have arg0 as LO followed by arg1 as F;
// in that case, we need to skip using r1 -- the F needs to be
// loaded in r2/r3, at least according to the ARM EABI and gcc 4.2's
// generated code.
bool arg0IsInt32FollowedByFloat = false;
while ((atypes & 3) != ARGSIZE_NONE) {
if (((atypes >> 4) & 3) == ARGSIZE_LO &&
((atypes >> 2) & 3) == ARGSIZE_F &&
((atypes >> 6) & 3) == ARGSIZE_NONE)
{
arg0IsInt32FollowedByFloat = true;
break;
}
atypes >>= 2;
}
CALL(call); CALL(call);
ArgSize sizes[10]; ArgSize sizes[10];
uint32_t argc = call->get_sizes(sizes); uint32_t argc = call->get_sizes(sizes);
@ -205,8 +231,11 @@ namespace nanojit
ArgSize sz = sizes[j]; ArgSize sz = sizes[j];
NanoAssert(sz == ARGSIZE_LO || sz == ARGSIZE_Q); NanoAssert(sz == ARGSIZE_LO || sz == ARGSIZE_Q);
// pre-assign registers R0-R3 for arguments (if they fit) // pre-assign registers R0-R3 for arguments (if they fit)
Register r = i < 4 ? argRegs[i] : UnknownReg; Register r = (i+roffset) < 4 ? argRegs[i+roffset] : UnknownReg;
asm_arg(sz, ins->arg(j), r); asm_arg(sz, ins->arg(j), r);
if (i == 0 && arg0IsInt32FollowedByFloat)
roffset = 1;
} }
} }
@ -277,19 +306,28 @@ namespace nanojit
// This is ALWAYS going to be a long branch (using the BL instruction) // This is ALWAYS going to be a long branch (using the BL instruction)
// Which is really 2 instructions, so we need to modify both // Which is really 2 instructions, so we need to modify both
// XXX -- this is B, not BL, at least on non-Thumb..
// branch+2 because PC is always 2 instructions ahead on ARM/Thumb // branch+2 because PC is always 2 instructions ahead on ARM/Thumb
int32_t offset = int(target) - int(branch+2); int32_t offset = int(target) - int(branch+2);
//printf("---patching branch at %X to location %X (%d)\n", branch, target, offset); //printf("---patching branch at 0x%08x to location 0x%08x (%d-0x%08x)\n", branch, target, offset, offset);
#ifdef NJ_THUMB_JIT #ifdef NJ_THUMB_JIT
NanoAssert(-(1<<21) <= offset && offset < (1<<21)); NanoAssert(-(1<<21) <= offset && offset < (1<<21));
*branch++ = (NIns)(0xF000 | (offset>>12)&0x7FF); *branch++ = (NIns)(0xF000 | (offset>>12)&0x7FF);
*branch = (NIns)(0xF800 | (offset>>1)&0x7FF); *branch = (NIns)(0xF800 | (offset>>1)&0x7FF);
#else #else
// ARM goodness, using unconditional B // We have 2 words to work with here -- if offset is in range of a 24-bit
*branch = (NIns)( COND_AL | (0xA<<24) | ((offset >>2)& 0xFFFFFF) ); // relative jump, emit that; otherwise, we do a pc-relative load into pc.
if (-(1<<24) <= offset & offset < (1<<24)) {
// ARM goodness, using unconditional B
*branch = (NIns)( COND_AL | (0xA<<24) | ((offset >>2) & 0xFFFFFF) );
} else {
// LDR pc,[pc]
*branch++ = (NIns)( COND_AL | (0x51<<20) | (PC<<16) | (PC<<12) | ( 0x004 ) );
*branch = (NIns)target;
}
#endif #endif
} }
@ -451,37 +489,6 @@ namespace nanojit
} }
} }
NIns* Assembler::asm_adjustBranch(NIns* at, NIns* target)
{
NIns* save = _nIns;
#ifdef NJ_THUMB_JIT
NIns* was = (NIns*) (((((*(at+2))&0x7ff)<<12) | (((*(at+1))&0x7ff)<<1)) + (at-2+2));
_nIns = at + 2;
#else
NIns* was = (NIns*) (((*at&0xFFFFFF)<<2));
_nIns = at + 1;
#endif
BL(target);
#ifdef AVMPLUS_PORTING_API
NanoJIT_PortAPI_FlushInstructionCache(save, _nIns);
#endif
#if defined(UNDER_CE)
// we changed the code, so we need to do this (sadly)
FlushInstructionCache(GetCurrentProcess(), NULL, NULL);
#elif defined(AVMPLUS_LINUX)
// Just need to clear this one page (not even the whole page really)
//Page *page = (Page*)pageTop(_nIns);
register unsigned long _beg __asm("a1") = (unsigned long)(_nIns);
register unsigned long _end __asm("a2") = (unsigned long)(_nIns+2);
register unsigned long _flg __asm("a3") = 0;
register unsigned long _swi __asm("r7") = 0xF0002;
__asm __volatile ("swi 0 @ sys_cacheflush" : "=r" (_beg) : "0" (_beg), "r" (_end), "r" (_flg), "r" (_swi));
#endif
_nIns = save;
return was;
}
void Assembler::nativePageReset() void Assembler::nativePageReset()
{ {
#ifdef NJ_THUMB_JIT #ifdef NJ_THUMB_JIT
@ -521,21 +528,55 @@ namespace nanojit
#else #else
if (!_nSlot) if (!_nSlot)
{ {
// This needs to be done or the samepage macro gets confused // This needs to be done or the samepage macro gets confused; pageAlloc
// gives us a pointer to just past the end of the page.
_nIns--; _nIns--;
_nExitIns--; _nExitIns--;
// constpool starts at top of page and goes down, // constpool starts at top of page and goes down,
// code starts at bottom of page and moves up // code starts at bottom of page and moves up
_nSlot = (int*)(pageTop(_nIns)+1); _nSlot = pageDataStart(_nIns); //(int*)(&((Page*)pageTop(_nIns))->lir[0]);
} }
#endif #endif
} }
// Flush the CPU instruction cache for the generated-code range [n1, n2).
// Must be called after writing or patching code so the CPU doesn't
// execute stale cached instructions.
void Assembler::flushCache(NIns* n1, NIns* n2) {
#if defined(UNDER_CE)
    // we changed the code, so we need to do this (sadly)
    // (NULL,NULL asks WinCE to flush the entire instruction cache;
    // the n1/n2 range is ignored here.)
    FlushInstructionCache(GetCurrentProcess(), NULL, NULL);
#elif defined(AVMPLUS_LINUX)
    // Just need to clear this one page (not even the whole page really)
    //Page *page = (Page*)pageTop(_nIns);
    // Invoke the ARM Linux cacheflush syscall directly: args in a1-a3,
    // syscall number 0xF0002 in r7.
    register unsigned long _beg __asm("a1") = (unsigned long)(n1);
    register unsigned long _end __asm("a2") = (unsigned long)(n2);
    register unsigned long _flg __asm("a3") = 0;
    register unsigned long _swi __asm("r7") = 0xF0002;
    __asm __volatile ("swi 0 @ sys_cacheflush" : "=r" (_beg) : "0" (_beg), "r" (_end), "r" (_flg), "r" (_swi));
#endif
}
#ifdef NJ_THUMB_JIT #ifdef NJ_THUMB_JIT
// Retarget the two-halfword Thumb BL pair at 'at' to branch to 'target',
// returning the address it previously branched to.
NIns* Assembler::asm_adjustBranch(NIns* at, NIns* target)
{
    NIns* save = _nIns;
    // Reconstruct the previous destination from the offset bits packed
    // into the existing BL instruction pair at at[1]/at[2].
    NIns* was = (NIns*) (((((*(at+2))&0x7ff)<<12) | (((*(at+1))&0x7ff)<<1)) + (at-2+2));
    // Temporarily point the emission cursor just past the pair and
    // re-emit the BL over it (the assembler emits backwards).
    _nIns = at + 2;
    BL(target);
    flushCache(_nIns, _nIns+2);
#ifdef AVMPLUS_PORTING_API
    // XXX save.._nIns+2? really?
    NanoJIT_PortAPI_FlushInstructionCache(save, _nIns+2);
#endif
    // Restore the real emission cursor.
    _nIns = save;
    return was;
}
void Assembler::STi(Register b, int32_t d, int32_t v) void Assembler::STi(Register b, int32_t d, int32_t v)
{ {
ST(b, d, Scratch); ST(b, d, Scratch);
@ -551,6 +592,7 @@ namespace nanojit
void Assembler::underrunProtect(int bytes) void Assembler::underrunProtect(int bytes)
{ {
// perhaps bytes + sizeof(PageHeader)/sizeof(NIns) + 4 ?
intptr_t u = bytes + 4; intptr_t u = bytes + 4;
if (!samepage(_nIns-u, _nIns-1)) { if (!samepage(_nIns-u, _nIns-1)) {
NIns* target = _nIns; NIns* target = _nIns;
@ -855,45 +897,94 @@ namespace nanojit
} }
#else // ARM_JIT #else // ARM_JIT
void Assembler::underrunProtect(int bytes) NIns* Assembler::asm_adjustBranch(NIns* at, NIns* target)
{
intptr_t u = (bytes) + 4;
if ( (samepage(_nIns,_nSlot) && (((intptr_t)_nIns-u) <= intptr_t(_nSlot+1))) ||
(!samepage((intptr_t)_nIns-u,_nIns)) )
{
NIns* target = _nIns;
_nIns = pageAlloc(_inExit);
JMP_nochk(target);
_nSlot = pageTop(_nIns);
}
}
bool isB24(NIns *target, NIns *cur)
{ {
int offset = int(target)-int(cur-2+2); // This always got emitted as a BL_far sequence; at points
return (-(1<<24) <= offset && offset < (1<<24)); // to the first of 4 instructions. Ensure that we're where
// we think we were..
NanoAssert(at[1] == (NIns)( COND_AL | OP_IMM | (1<<23) | (PC<<16) | (LR<<12) | (4) ));
NanoAssert(at[2] == (NIns)( COND_AL | (0x9<<21) | (0xFFF<<8) | (1<<4) | (IP) ));
NIns* was = (NIns*) at[3];
at[3] = (NIns)target;
flushCache(at, at+4);
#ifdef AVMPLUS_PORTING_API
NanoJIT_PortAPI_FlushInstructionCache(at, at+4);
#endif
return was;
}
// Ensure at least 'bytes' of emission room remain on the current native
// page (with slack for the page header and constant-pool growth).  If the
// cursor would collide with the constant pool or run off the page, grab a
// fresh page and emit a jump from it to the code already generated.
void Assembler::underrunProtect(int bytes)
{
    intptr_t u = bytes + sizeof(PageHeader)/sizeof(NIns) + 8;
    // Out of room if emitting u bytes would reach the constant pool on
    // this page, or would cross onto a different page.
    if ( (samepage(_nIns,_nSlot) && (((intptr_t)_nIns-u) <= intptr_t(_nSlot+1))) ||
         (!samepage((intptr_t)_nIns-u,_nIns)) )
    {
        NIns* target = _nIns;
        _nIns = pageAlloc(_inExit);
        // XXX _nIns at this point points to one past the end of
        // the page, intended to be written into using *(--_nIns).
        // However, (guess) something seems to be storing the value
        // of _nIns as is, and then later generating a jump to a bogus
        // address. So pre-decrement to ensure that it's always
        // valid; we end up skipping using the last instruction this
        // way.
        _nIns--;
        // Update slot, either to _nIns (if decremented above), or
        // _nIns-1 once the above bug is fixed/found.
        _nSlot = pageDataStart(_nIns);
        // If samepage() is used on _nIns and _nSlot, it'll fail, since _nIns
        // points to one past the end of the page right now. Assume that
        // JMP_nochk won't ever try to write to _nSlot, and so won't ever
        // check samepage(). See B_cond_chk macro.
        JMP_nochk(target);
    } else if (!_nSlot) {
        // make sure that there's always a slot pointer
        _nSlot = pageDataStart(_nIns);
    }
}
// Emit a call to an arbitrary 32-bit address using a fixed 4-word
// sequence (emitted backwards, so listed here in execution order):
//   ldr ip, [pc+4]   ; load the target address from the stream
//   add lr, pc, #4   ; point lr past the inline address word
//   bx  ip           ; branch to the target
//   .word addr       ; the target address itself
void Assembler::BL_far(NIns* addr) {
    // we have to stick an immediate into the stream and make lr
    // point to the right spot before branching
    underrunProtect(16);

    // the address
    *(--_nIns) = (NIns)((addr));
    // bx ip // branch to the address we loaded earlier
    *(--_nIns) = (NIns)( COND_AL | (0x9<<21) | (0xFFF<<8) | (1<<4) | (IP) );
    // add lr, [pc + #4] // set lr to be past the address that we wrote
    *(--_nIns) = (NIns)( COND_AL | OP_IMM | (1<<23) | (PC<<16) | (LR<<12) | (4) );
    // ldr ip, [pc + #4] // load the address into ip, reading it from [pc+4]
    *(--_nIns) = (NIns)( COND_AL | (0x59<<20) | (PC<<16) | (IP<<12) | (4));
    asm_output1("bl %p (32-bit)", addr);
}
// Emit a branch-with-link to addr.  If addr is reachable by the 24-bit
// PC-relative BL encoding, emit the single instruction; otherwise fall
// back to the 4-word BL_far sequence.
// (Fix: removed a stray trailing '\' line-continuation left over from the
// old macro form of this code; it spliced the debug-output line into the
// emit statement.)
void Assembler::BL(NIns* addr) {
    intptr_t offs = PC_OFFSET_FROM(addr,(intptr_t)_nIns-4);
    if (JMP_S24_OFFSET_OK(offs)) {
        // we can do this with a single BL call
        underrunProtect(4);
        *(--_nIns) = (NIns)( COND_AL | (0xB<<24) | (((offs)>>2) & 0xFFFFFF) );
        asm_output1("bl %p", addr);
    } else {
        BL_far(addr);
    }
}
void Assembler::CALL(const CallInfo *ci) void Assembler::CALL(const CallInfo *ci)
{ {
intptr_t addr = ci->_address; intptr_t addr = ci->_address;
if (isB24((NIns*)addr, _nIns)) BL((NIns*)addr);
{ asm_output1(" (call %s)", ci->_name);
// we can do this with a single BL call
underrunProtect(4);
BL(addr);
asm_output2("call %08X:%s", addr, ci->_name);
}
else
{
underrunProtect(16);
*(--_nIns) = (NIns)((addr));
*(--_nIns) = (NIns)( COND_AL | (0x9<<21) | (0xFFF<<8) | (1<<4) | (IP) );
*(--_nIns) = (NIns)( COND_AL | OP_IMM | (1<<23) | (PC<<16) | (LR<<12) | (4) );
*(--_nIns) = (NIns)( COND_AL | (0x59<<20) | (PC<<16) | (IP<<12) | (4));
asm_output2("call %08X:%s", addr, ci->_name);
}
} }
#endif // NJ_THUMB_JIT #endif // NJ_THUMB_JIT
@ -937,31 +1028,18 @@ namespace nanojit
#else #else
// We can always reach the const pool, since it's on the same page (<4096) // We can always reach the const pool, since it's on the same page (<4096)
underrunProtect(8);
if (!_nSlot)
_nSlot = pageTop(_nIns);
if ( (_nSlot+1) >= (_nIns-1) )
{
// This would overrun the code, so we need a new page
// and a jump to that page
NIns* target = _nIns;
_nIns = pageAlloc(_inExit);
JMP_nochk(target);
// reset the slot
_nSlot = pageTop(_nIns);
}
*(++_nSlot) = (int)imm; *(++_nSlot) = (int)imm;
int offset = (int)(_nSlot) - (int)(_nIns+1); //fprintf (stderr, "wrote slot(2) %p with %08x, jmp @ %p\n", _nSlot, (intptr_t)imm, _nIns-1);
*(--_nIns) = (NIns)( COND_AL | (0x51<<20) | (PC<<16) | ((r)<<12) | -(offset)); int offset = PC_OFFSET_FROM(_nSlot,(intptr_t)(_nIns)-4);
NanoAssert(JMP_S24_OFFSET_OK(offset) && (offset < 0));
*(--_nIns) = (NIns)( COND_AL | (0x51<<20) | (PC<<16) | ((r)<<12) | ((-offset) & 0xFFFFFF) );
asm_output2("ld %s,%d",gpn(r),imm); asm_output2("ld %s,%d",gpn(r),imm);
#endif #endif
} }

View File

@ -156,6 +156,7 @@ namespace nanojit
#define alignUp(x,s) ((((uintptr_t)(x))+(((uintptr_t)s)-1))&~(((uintptr_t)s)-1)) #define alignUp(x,s) ((((uintptr_t)(x))+(((uintptr_t)s)-1))&~(((uintptr_t)s)-1))
#define pageTop(x) ( (int*)alignTo(x,NJ_PAGE_SIZE) ) #define pageTop(x) ( (int*)alignTo(x,NJ_PAGE_SIZE) )
#define pageDataStart(x) ( (int*)(alignTo(x,NJ_PAGE_SIZE) + sizeof(PageHeader)) )
#define pageBottom(x) ( (int*)(alignTo(x,NJ_PAGE_SIZE)+NJ_PAGE_SIZE)-1 ) #define pageBottom(x) ( (int*)(alignTo(x,NJ_PAGE_SIZE)+NJ_PAGE_SIZE)-1 )
#define samepage(x,y) (pageTop(x) == pageTop(y)) #define samepage(x,y) (pageTop(x) == pageTop(y))