Merge tracemonkey to mozilla-central.

Robert Sayre 2009-12-17 23:18:40 -05:00
commit cc4ca115a3
55 changed files with 2723 additions and 1776 deletions


@ -86,7 +86,7 @@ MSG_DEF(JSMSG_MORE_ARGS_NEEDED, 3, 3, JSEXN_TYPEERR, "{0} requires more
MSG_DEF(JSMSG_BAD_CHAR, 4, 1, JSEXN_INTERNALERR, "invalid format character {0}")
MSG_DEF(JSMSG_BAD_TYPE, 5, 1, JSEXN_TYPEERR, "unknown type {0}")
MSG_DEF(JSMSG_ALLOC_OVERFLOW, 6, 0, JSEXN_INTERNALERR, "allocation size overflow")
MSG_DEF(JSMSG_CANT_UNLOCK, 7, 0, JSEXN_INTERNALERR, "can't unlock memory")
MSG_DEF(JSMSG_UNUSED7, 7, 0, JSEXN_NONE, "")
MSG_DEF(JSMSG_INCOMPATIBLE_PROTO, 8, 3, JSEXN_TYPEERR, "{0}.prototype.{1} called on incompatible {2}")
MSG_DEF(JSMSG_NO_CONSTRUCTOR, 9, 1, JSEXN_TYPEERR, "{0} has no constructor")
MSG_DEF(JSMSG_CANT_ALIAS, 10, 3, JSEXN_TYPEERR, "can't alias {0} to {1} in class {2}")


@ -119,6 +119,7 @@ BEGIN_TEST(testXDR_bug525481)
JSXDRState *r = JS_XDRNewMem(cx, JSXDR_DECODE);
JS_XDRMemSetData(r, frozen, nbytes);
CHECK(JS_XDRScript(r, &script));
JS_DestroyScript(cx, script);
JS_XDRDestroy(r); // this frees `frozen`
return true;
}
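For context, the decode path above consumes a byte image produced by an earlier encode pass. Below is a sketch of that encode side, assuming the JSXDR memory API of this era (JS_XDRNewMem with JSXDR_ENCODE, JS_XDRMemGetData, declared in jsxdrapi.h); the helper name EncodeScript is hypothetical and not taken from the test file.

#include "jsapi.h"
#include "jsxdrapi.h"
#include <string.h>

/* Sketch only: produce the `frozen` byte image that the decode test consumes. */
static void *
EncodeScript(JSContext *cx, JSScript *script, uint32 *nbytesp)
{
    JSXDRState *w = JS_XDRNewMem(cx, JSXDR_ENCODE);
    if (!w)
        return NULL;
    void *frozen = NULL;
    if (JS_XDRScript(w, &script)) {
        uint32 nbytes;
        void *data = JS_XDRMemGetData(w, &nbytes);
        frozen = JS_malloc(cx, nbytes);
        if (frozen) {
            memcpy(frozen, data, nbytes);
            *nbytesp = nbytes;
        }
    }
    JS_XDRDestroy(w);   /* frees the encoder's internal buffer, not `frozen` */
    return frozen;
}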


@ -2008,19 +2008,16 @@ JS_LockGCThingRT(JSRuntime *rt, void *thing)
JS_PUBLIC_API(JSBool)
JS_UnlockGCThing(JSContext *cx, void *thing)
{
JSBool ok;
CHECK_REQUEST(cx);
ok = js_UnlockGCThingRT(cx->runtime, thing);
if (!ok)
JS_ReportErrorNumber(cx, js_GetErrorMessage, NULL, JSMSG_CANT_UNLOCK);
return ok;
js_UnlockGCThingRT(cx->runtime, thing);
return true;
}
JS_PUBLIC_API(JSBool)
JS_UnlockGCThingRT(JSRuntime *rt, void *thing)
{
return js_UnlockGCThingRT(rt, thing);
js_UnlockGCThingRT(rt, thing);
return true;
}
JS_PUBLIC_API(void)
@ -2543,7 +2540,7 @@ JS_PUBLIC_API(JSBool)
JS_IsAboutToBeFinalized(JSContext *cx, void *thing)
{
JS_ASSERT(thing);
return js_IsAboutToBeFinalized(cx, thing);
return js_IsAboutToBeFinalized(thing);
}
JS_PUBLIC_API(void)


@ -822,22 +822,6 @@ slowarray_addProperty(JSContext *cx, JSObject *obj, jsval id, jsval *vp)
return JS_TRUE;
}
static void
slowarray_trace(JSTracer *trc, JSObject *obj)
{
uint32 length = obj->fslots[JSSLOT_ARRAY_LENGTH];
JS_ASSERT(STOBJ_GET_CLASS(obj) == &js_SlowArrayClass);
/*
* Move JSSLOT_ARRAY_LENGTH aside to prevent the GC from treating
* untagged integer values as objects or strings.
*/
obj->fslots[JSSLOT_ARRAY_LENGTH] = JSVAL_VOID;
js_TraceObject(trc, obj);
obj->fslots[JSSLOT_ARRAY_LENGTH] = length;
}
static JSObjectOps js_SlowArrayObjectOps;
static JSObjectOps *
@ -1271,7 +1255,7 @@ JSClass js_ArrayClass = {
JSClass js_SlowArrayClass = {
"Array",
JSCLASS_HAS_RESERVED_SLOTS(1) |
JSCLASS_HAS_PRIVATE |
JSCLASS_HAS_CACHED_PROTO(JSProto_Array),
slowarray_addProperty, JS_PropertyStub, JS_PropertyStub, JS_PropertyStub,
JS_EnumerateStub, JS_ResolveStub, js_TryValueOf, NULL,
@ -1338,9 +1322,14 @@ js_MakeArraySlow(JSContext *cx, JSObject *obj)
* a jsval, set our slow/sparse COUNT to the current length as a jsval, so
* we can tell when only named properties have been added to a dense array
* to make it slow-but-not-sparse.
*
* We do not need to make the length slot GC-safe, as this is the private
* slot, in which the implementation can store an arbitrary value.
*/
{
uint32 length = obj->fslots[JSSLOT_ARRAY_LENGTH];
JS_STATIC_ASSERT(JSSLOT_ARRAY_LENGTH == JSSLOT_PRIVATE);
JS_ASSERT(js_SlowArrayClass.flags & JSCLASS_HAS_PRIVATE);
uint32 length = uint32(obj->fslots[JSSLOT_ARRAY_LENGTH]);
obj->fslots[JSSLOT_ARRAY_COUNT] = INT_FITS_IN_JSVAL(length)
? INT_TO_JSVAL(length)
: JSVAL_VOID;
@ -3464,7 +3453,6 @@ js_InitArrayClass(JSContext *cx, JSObject *obj)
/* Initialize the ops structure used by slow arrays */
memcpy(&js_SlowArrayObjectOps, &js_ObjectOps, sizeof(JSObjectOps));
js_SlowArrayObjectOps.trace = slowarray_trace;
js_SlowArrayObjectOps.enumerate = slowarray_enumerate;
js_SlowArrayObjectOps.call = NULL;


@ -581,7 +581,6 @@ js_atom_sweeper(JSDHashTable *table, JSDHashEntryHdr *hdr,
uint32 number, void *arg)
{
JSAtomHashEntry *entry = TO_ATOM_ENTRY(hdr);
JSContext *cx = (JSContext *)arg;
/* Remove uninitialized entries. */
if (entry->keyAndFlags == 0)
@ -589,8 +588,8 @@ js_atom_sweeper(JSDHashTable *table, JSDHashEntryHdr *hdr,
if (ATOM_ENTRY_FLAGS(entry) & (ATOM_PINNED | ATOM_INTERNED)) {
/* Pinned or interned key cannot be finalized. */
JS_ASSERT(!js_IsAboutToBeFinalized(cx, ATOM_ENTRY_KEY(entry)));
} else if (js_IsAboutToBeFinalized(cx, ATOM_ENTRY_KEY(entry))) {
JS_ASSERT(!js_IsAboutToBeFinalized(ATOM_ENTRY_KEY(entry)));
} else if (js_IsAboutToBeFinalized(ATOM_ENTRY_KEY(entry))) {
/* Remove entries with things about to be GC'ed. */
return JS_DHASH_REMOVE;
}
@ -602,8 +601,8 @@ js_SweepAtomState(JSContext *cx)
{
JSAtomState *state = &cx->runtime->atomState;
JS_DHashTableEnumerate(&state->doubleAtoms, js_atom_sweeper, cx);
JS_DHashTableEnumerate(&state->stringAtoms, js_atom_sweeper, cx);
JS_DHashTableEnumerate(&state->doubleAtoms, js_atom_sweeper, NULL);
JS_DHashTableEnumerate(&state->stringAtoms, js_atom_sweeper, NULL);
/*
* Optimize for simplicity and mutate table generation numbers even if the


@ -235,7 +235,8 @@ js_AddProperty(JSContext* cx, JSObject* obj, JSScopeProperty* sprop)
uint32 slot = sprop->slot;
JSScope* scope = OBJ_SCOPE(obj);
JS_ASSERT(slot == scope->freeslot);
if (slot != scope->freeslot)
goto exit_trace;
JS_ASSERT(sprop->parent == scope->lastProperty());
if (scope->owned()) {


@ -95,10 +95,6 @@ namespace nanojit
class Assembler;
class CodeAlloc;
class Fragment;
class LirBuffer;
#ifdef DEBUG
class LabelMap;
#endif
template<typename K> struct DefaultHash;
template<typename K, typename V, typename H> class HashMap;
template<typename T> class Seq;
@ -113,7 +109,6 @@ static const size_t MAX_GLOBAL_SLOTS = 4096;
static const size_t GLOBAL_SLOTS_BUFFER_SIZE = MAX_GLOBAL_SLOTS + 1;
/* Forward declarations of tracer types. */
class TreeInfo;
class VMAllocator;
class TraceRecorder;
class FrameInfoCache;
@ -149,7 +144,7 @@ struct InterpState
// call exit guard mismatched
void* rpAtLastTreeCall; // value of rp at innermost tree call guard
VMSideExit* outermostTreeExitGuard; // the last side exit returned by js_CallTree
TreeInfo* outermostTree; // the outermost tree we initially invoked
TreeFragment* outermostTree; // the outermost tree we initially invoked
uintN* inlineCallCountp; // inline call count counter
VMSideExit** innermostNestedGuardp;
VMSideExit* innermost;
@ -168,7 +163,7 @@ struct InterpState
uintN nativeVpLen;
jsval* nativeVp;
InterpState(JSContext *cx, JSTraceMonitor *tm, TreeInfo *ti,
InterpState(JSContext *cx, JSTraceMonitor *tm, TreeFragment *ti,
uintN &inlineCallCountp, VMSideExit** innermostNestedGuardp);
~InterpState();
};
@ -222,32 +217,40 @@ struct JSTraceMonitor {
TraceNativeStorage storage;
/*
* There are 3 allocators here. This might seem like overkill, but they
* There are 5 allocators here. This might seem like overkill, but they
* have different lifecycles, and by keeping them separate we keep the
* amount of retained memory down significantly.
* amount of retained memory down significantly. They are flushed (ie.
* all the allocated memory is freed) periodically.
*
* The dataAlloc has the lifecycle of the monitor. It's flushed only
* when the monitor is flushed.
* - dataAlloc has the lifecycle of the monitor. It's flushed only when
* the monitor is flushed. It's used for fragments.
*
* The traceAlloc has the same flush lifecycle as the dataAlloc, but
* it is also *marked* when a recording starts and rewinds to the mark
* point if recording aborts. So you can put things in it that are only
* reachable on a successful record/compile cycle.
* - traceAlloc has the same flush lifecycle as the dataAlloc, but it is
* also *marked* when a recording starts and rewinds to the mark point
* if recording aborts. So you can put things in it that are only
* reachable on a successful record/compile cycle like GuardRecords and
* SideExits.
*
* The tempAlloc is flushed after each recording, successful or not.
* - tempAlloc is flushed after each recording, successful or not. It's
* used to store LIR code and for all other elements in the LIR
* pipeline.
*
* - reTempAlloc is just like tempAlloc, but is used for regexp
* compilation in RegExpNativeCompiler rather than normal compilation in
* TraceRecorder.
*
* - codeAlloc has the same lifetime as dataAlloc, but its API is
* different (CodeAlloc vs. VMAllocator). It's used for native code.
* It's also a good idea to keep code and data separate to avoid I-cache
* vs. D-cache issues.
*/
VMAllocator* dataAlloc; /* A chunk allocator for fragments. */
VMAllocator* traceAlloc; /* An allocator for trace metadata. */
VMAllocator* tempAlloc; /* A temporary chunk allocator. */
nanojit::CodeAlloc* codeAlloc; /* An allocator for native code. */
VMAllocator* dataAlloc;
VMAllocator* traceAlloc;
VMAllocator* tempAlloc;
VMAllocator* reTempAlloc;
nanojit::CodeAlloc* codeAlloc;
nanojit::Assembler* assembler;
nanojit::LirBuffer* lirbuf;
nanojit::LirBuffer* reLirBuf;
FrameInfoCache* frameCache;
#ifdef DEBUG
nanojit::LabelMap* labels;
#endif
TraceRecorder* recorder;
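The comment above distinguishes the allocators mainly by when their memory is released; traceAlloc additionally supports a mark taken when a recording starts and a rewind if the recording aborts. A minimal sketch of that mark/rewind idea using a plain bump allocator; the class and method names are illustrative, not the real VMAllocator interface:

#include <cstddef>
#include <cstdlib>

class BumpAlloc {
    char *base, *cur, *end;
  public:
    explicit BumpAlloc(size_t cap)
      : base((char *) malloc(cap)), cur(base), end(base + cap) {}
    ~BumpAlloc() { free(base); }

    void *alloc(size_t n) {             /* bump-allocate; no per-object free */
        if (size_t(end - cur) < n)
            return NULL;
        void *p = cur;
        cur += n;
        return p;
    }
    typedef char *Mark;
    Mark mark() const { return cur; }   /* taken when a recording starts */
    void rewind(Mark m) { cur = m; }    /* undo everything since the mark */
    void flush() { cur = base; }        /* periodic wholesale release */
};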
@ -280,11 +283,6 @@ struct JSTraceMonitor {
*/
REHashMap* reFragments;
/*
* A temporary allocator for RE recording.
*/
VMAllocator* reTempAlloc;
#ifdef DEBUG
/* Fields needed for fragment/guard profiling. */
nanojit::Seq<nanojit::Fragment*>* branches;
@ -319,7 +317,7 @@ typedef struct InterpStruct InterpStruct;
# define JS_ON_TRACE(cx) JS_FALSE
#endif
#ifdef DEBUG
#ifdef DEBUG_brendan
# define JS_EVAL_CACHE_METERING 1
# define JS_FUNCTION_METERING 1
#endif
@ -608,14 +606,10 @@ struct JSRuntime {
*/
ptrdiff_t gcMallocBytes;
/*
* Stack of GC arenas containing things that the GC marked, where children
* reached from those things have not yet been marked. This helps avoid
* using too much native stack during recursive GC marking.
*/
JSGCArenaInfo *gcUntracedArenaStackTop;
/* See comments before DelayMarkingChildren in jsgc.cpp. */
JSGCArenaInfo *gcUnmarkedArenaStackTop;
#ifdef DEBUG
size_t gcTraceLaterCount;
size_t gcMarkLaterCount;
#endif
/*


@ -526,7 +526,7 @@ js_SweepWatchPoints(JSContext *cx)
&wp->links != &rt->watchPointList;
wp = next) {
next = (JSWatchPoint *)wp->links.next;
if (js_IsAboutToBeFinalized(cx, wp->object)) {
if (js_IsAboutToBeFinalized(wp->object)) {
sample = rt->debuggerMutations;
/* Ignore failures. */


@ -4384,7 +4384,7 @@ js_EmitTree(JSContext *cx, JSCodeGenerator *cg, JSParseNode *pn)
cg2->staticLevel = cg->staticLevel + 1;
/* We measured the max scope depth when we parsed the function. */
JS_SCOPE_DEPTH_METERING(cg2->maxScopeDepth = (uintN) -1);
JS_SCOPE_DEPTH_METERING(cg2->maxScopeDepth = uint16(-1));
if (!js_EmitFunctionScript(cx, cg2, pn->pn_body))
pn = NULL;


@ -157,8 +157,10 @@ struct JSStmtInfo {
#ifdef JS_SCOPE_DEPTH_METER
# define JS_SCOPE_DEPTH_METERING(code) ((void) (code))
# define JS_SCOPE_DEPTH_METERING_IF(cond, code) ((cond) ? (void) (code) : (void) 0)
#else
# define JS_SCOPE_DEPTH_METERING(code) ((void) 0)
# define JS_SCOPE_DEPTH_METERING_IF(code, x) ((void) 0)
#endif
struct JSTreeContext { /* tree context for semantic checks */
@ -208,15 +210,15 @@ struct JSTreeContext { /* tree context for semantic checks */
/*
* For functions the tree context is constructed and destructed a second
* time during code generation. To avoid a redundant stats update in such
* cases, we store (uintN) -1 in maxScopeDepth.
* cases, we store uint16(-1) in maxScopeDepth.
*/
~JSTreeContext() {
JS_SCOPE_DEPTH_METERING(maxScopeDepth == (uintN) -1 ||
JS_BASIC_STATS_ACCUM(&compiler
->context
->runtime
->lexicalScopeDepthStats,
maxScopeDepth));
JS_SCOPE_DEPTH_METERING_IF((maxScopeDepth != uint16(-1)),
JS_BASIC_STATS_ACCUM(&compiler
->context
->runtime
->lexicalScopeDepthStats,
maxScopeDepth));
}
uintN blockid() { return topStmt ? topStmt->blockid : bodyid; }


@ -209,11 +209,13 @@ struct JSGCArenaInfo {
JSGCArenaInfo *prev;
/*
* A link field for the list of arenas with marked but not yet traced
* things. The field is encoded as arena's page to share the space with
* firstArena and arenaIndex fields.
* A link field for the list of arenas with marked things that haven't yet
* been scanned for live children. The field is encoded as arena's page to
* hold only the high-order arena-counting bits to share the space with
* firstArena and arenaIndex fields. For details see comments before
* DelayMarkingChildren.
*/
jsuword prevUntracedPage : JS_BITS_PER_WORD - GC_ARENA_SHIFT;
jsuword prevUnmarkedPage : JS_BITS_PER_WORD - GC_ARENA_SHIFT;
/*
* When firstArena is false, the index of arena in the chunk. When
@ -228,22 +230,20 @@ struct JSGCArenaInfo {
/* Flag indicating if the arena is the first in the chunk. */
jsuword firstArena : 1;
union {
struct {
JSGCThing *freeList;
jsuword untracedThings; /* bitset for fast search of marked
but not yet traced things */
} finalizable;
JSGCThing *freeList;
bool hasMarkedDoubles; /* the arena has marked doubles */
union {
/* See comments before DelayMarkingChildren. */
jsuword unmarkedChildren;
/* The arena has marked doubles. */
bool hasMarkedDoubles;
};
};
/* GC flag definitions, must fit in 8 bits. */
const uint8 GCF_MARK = JS_BIT(0);
const uint8 GCF_LOCK = JS_BIT(1); /* lock request bit in API */
const uint8 GCF_CHILDREN = JS_BIT(2); /* GC things with children to be
marked later. */
/*
* The private JSGCThing struct, which describes a JSRuntime.gcFreeList element.
@ -693,7 +693,7 @@ NewGCArena(JSContext *cx)
}
rt->gcBytes += GC_ARENA_SIZE;
a->prevUntracedPage = 0;
a->prevUnmarkedPage = 0;
return a;
}
@ -894,8 +894,8 @@ js_GetGCStringRuntime(JSString *str)
offsetof(JSRuntime, gcArenaList));
}
JSBool
js_IsAboutToBeFinalized(JSContext *cx, void *thing)
bool
js_IsAboutToBeFinalized(void *thing)
{
JSGCArenaInfo *a;
uint32 index, flags;
@ -1095,9 +1095,9 @@ js_DumpGCStats(JSRuntime *rt, FILE *fp)
fprintf(fp, " maximum mark recursion: %lu\n", ULSTAT(maxdepth));
fprintf(fp, " mark C recursion depth: %lu\n", ULSTAT(cdepth));
fprintf(fp, " maximum mark C recursion: %lu\n", ULSTAT(maxcdepth));
fprintf(fp, " delayed tracing calls: %lu\n", ULSTAT(untraced));
fprintf(fp, " delayed tracing calls: %lu\n", ULSTAT(unmarked));
#ifdef DEBUG
fprintf(fp, " max trace later count: %lu\n", ULSTAT(maxuntraced));
fprintf(fp, " max trace later count: %lu\n", ULSTAT(maxunmarked));
#endif
fprintf(fp, " maximum GC nesting level: %lu\n", ULSTAT(maxlevel));
fprintf(fp, "potentially useful GC calls: %lu\n", ULSTAT(poke));
@ -1346,7 +1346,7 @@ CloseNativeIterators(JSContext *cx)
size_t newLength = 0;
for (size_t i = 0; i < length; ++i) {
JSObject *obj = array[i];
if (js_IsAboutToBeFinalized(cx, obj))
if (js_IsAboutToBeFinalized(obj))
js_CloseNativeIterator(cx, obj);
else
array[newLength++] = obj;
@ -1384,8 +1384,8 @@ JSGCFreeLists::purge()
JSGCThing *freeListHead = *p;
if (freeListHead) {
JSGCArenaInfo *a = THING_TO_ARENA(freeListHead);
JS_ASSERT(!a->finalizable.freeList);
a->finalizable.freeList = freeListHead;
JS_ASSERT(!a->freeList);
a->freeList = freeListHead;
*p = NULL;
}
}
@ -1473,9 +1473,9 @@ RefillFinalizableFreeList(JSContext *cx, unsigned thingKind)
while ((a = arenaList->cursor) != NULL) {
arenaList->cursor = a->prev;
JSGCThing *freeList = a->finalizable.freeList;
JSGCThing *freeList = a->freeList;
if (freeList) {
a->finalizable.freeList = NULL;
a->freeList = NULL;
JS_UNLOCK_GC(rt);
return freeList;
}
@ -1499,9 +1499,9 @@ RefillFinalizableFreeList(JSContext *cx, unsigned thingKind)
*/
a->list = arenaList;
a->prev = arenaList->head;
a->prevUntracedPage = 0;
a->finalizable.untracedThings = 0;
a->finalizable.freeList = NULL;
a->prevUnmarkedPage = 0;
a->freeList = NULL;
a->unmarkedChildren = 0;
arenaList->head = a;
JS_UNLOCK_GC(rt);
@ -1720,6 +1720,7 @@ RefillDoubleFreeList(JSContext *cx)
}
a->list = NULL;
a->freeList = NULL;
a->hasMarkedDoubles = false;
a->prev = rt->gcDoubleArenaList.head;
rt->gcDoubleArenaList.head = a;
@ -1876,11 +1877,11 @@ js_LockGCThingRT(JSRuntime *rt, void *thing)
return ok;
}
JSBool
void
js_UnlockGCThingRT(JSRuntime *rt, void *thing)
{
if (!thing)
return JS_TRUE;
return;
JS_LOCK_GC(rt);
@ -1908,7 +1909,6 @@ js_UnlockGCThingRT(JSRuntime *rt, void *thing)
METER(rt->gcStats.unlock++);
out:
JS_UNLOCK_GC(rt);
return JS_TRUE;
}
JS_PUBLIC_API(void)
@ -1940,110 +1940,146 @@ JS_TraceChildren(JSTracer *trc, void *thing, uint32 kind)
}
/*
* Number of things covered by a single bit of JSGCArenaInfo.untracedThings.
* When the native stack is low, the GC does not call JS_TraceChildren to mark
* the reachable "children" of the thing. Rather the thing is put aside and
* JS_TraceChildren is called later with more space on the C stack.
*
* To implement such delayed marking of the children with minimal overhead for
* the normal case of sufficient native stack, the code adds two fields to
* JSGCArenaInfo. The first field, JSGCArenaInfo::prevUnmarkedPage, links all
* arenas with delayed things into a stack list with the pointer to stack top
* in JSRuntime::gcUnmarkedArenaStackTop. DelayMarkingChildren adds arenas to
* the stack as necessary while MarkDelayedChildren pops the arenas from the
* stack until it empties.
*
* The second field, JSGCArenaInfo::unmarkedChildren, is a bitmap that tells
* for which things the GC should call JS_TraceChildren later. The bitmap is
* a single word. As such it does not pinpoint the delayed things in the arena
* but rather tells the intervals containing ThingsPerUnmarkedBit(thingSize)
* things. Later the code in MarkDelayedChildren discovers such intervals
* and calls JS_TraceChildren on any marked thing in the interval. This
* implies that JS_TraceChildren can be called many times for a single thing
* if the thing shares the same interval with some delayed things. This should
* be fine as any GC graph marking/traversing hooks must allow repeated calls
* during the same GC cycle. In particular, xpcom cycle collector relies on
* this.
*
* Note that such repeated scanning may slow down the GC. In particular, it is
* possible to construct an object graph where the GC calls JS_TraceChildren
* ThingsPerUnmarkedBit(thingSize) times for almost all things in the graph.
* We tolerate this as the max value for ThingsPerUnmarkedBit(thingSize) is 4.
* This is achieved for JSObject on 32-bit systems as it is exactly JSObject
* that has the smallest size among the GC things that can be delayed. On a
* 32-bit CPU we have less than 128 objects per 4K GC arena, so each bit in
* unmarkedChildren covers 4 objects.
*/
#define THINGS_PER_UNTRACED_BIT(thingSize) \
JS_HOWMANY(THINGS_PER_ARENA(thingSize), JS_BITS_PER_WORD)
static void
DelayTracingChildren(JSRuntime *rt, uint8 *flagp)
inline unsigned
ThingsPerUnmarkedBit(unsigned thingSize)
{
JSGCArenaInfo *a;
uint32 untracedBitIndex;
jsuword bit;
JS_ASSERT(!(*flagp & GCF_CHILDREN));
*flagp |= GCF_CHILDREN;
METER(rt->gcStats.untraced++);
#ifdef DEBUG
++rt->gcTraceLaterCount;
METER_UPDATE_MAX(rt->gcStats.maxuntraced, rt->gcTraceLaterCount);
#endif
a = FLAGP_TO_ARENA(flagp);
untracedBitIndex = FLAGP_TO_INDEX(flagp) /
THINGS_PER_UNTRACED_BIT(a->list->thingSize);
JS_ASSERT(untracedBitIndex < JS_BITS_PER_WORD);
bit = (jsuword)1 << untracedBitIndex;
if (a->finalizable.untracedThings != 0) {
JS_ASSERT(rt->gcUntracedArenaStackTop);
if (a->finalizable.untracedThings & bit) {
/* bit already covers things with children to trace later. */
return;
}
a->finalizable.untracedThings |= bit;
} else {
/*
* The thing is the first thing with not yet traced children in the
* whole arena, so push the arena on the stack of arenas with things
* to be traced later unless the arena has already been pushed. We
* detect that through checking prevUntracedPage as the field is 0
* only for not yet pushed arenas. To ensure that
* prevUntracedPage != 0
* even when the stack contains one element, we make prevUntracedPage
* for the arena at the bottom to point to itself.
*
* See comments in TraceDelayedChildren.
*/
a->finalizable.untracedThings = bit;
if (a->prevUntracedPage == 0) {
if (!rt->gcUntracedArenaStackTop) {
/* Stack was empty, mark the arena as the bottom element. */
a->prevUntracedPage = ARENA_INFO_TO_PAGE(a);
} else {
JS_ASSERT(rt->gcUntracedArenaStackTop->prevUntracedPage != 0);
a->prevUntracedPage =
ARENA_INFO_TO_PAGE(rt->gcUntracedArenaStackTop);
}
rt->gcUntracedArenaStackTop = a;
}
}
JS_ASSERT(rt->gcUntracedArenaStackTop);
return JS_HOWMANY(THINGS_PER_ARENA(thingSize), JS_BITS_PER_WORD);
}
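The comment before DelayMarkingChildren explains how a single word of unmarkedChildren covers a whole arena by letting each bit stand for an interval of things. A standalone sketch of that arithmetic follows; the arena payload and thing size are illustrative assumptions, chosen to reproduce the "each bit covers 4 objects" case mentioned above.

#include <cstdio>

int main()
{
    const unsigned bitsPerWord    = 32;                        /* JS_BITS_PER_WORD on 32-bit */
    const unsigned arenaPayload   = 4096 - 64;                 /* assumed usable bytes per arena */
    const unsigned thingSize      = 32;                        /* assumed per-thing size (JSObject-ish) */
    const unsigned thingsPerArena = arenaPayload / thingSize;  /* 126 things */

    /* ThingsPerUnmarkedBit: how many things one bit of unmarkedChildren covers. */
    const unsigned perBit = (thingsPerArena + bitsPerWord - 1) / bitsPerWord;   /* == 4 */

    const unsigned thingIndex = 57;                    /* some thing whose marking was delayed */
    const unsigned bitIndex   = thingIndex / perBit;   /* which bit DelayMarkingChildren sets */
    const unsigned lo = bitIndex * perBit, hi = lo + perBit;

    printf("bit %u covers things [%u, %u); MarkDelayedChildren rescans every "
           "marked thing in that interval\n", bitIndex, lo, hi);
    return 0;
}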
static void
TraceDelayedChildren(JSTracer *trc)
DelayMarkingChildren(JSRuntime *rt, uint8 *flagp)
{
JSGCArenaInfo *a;
uint32 unmarkedBitIndex;
jsuword bit;
JS_ASSERT(*flagp & GCF_MARK);
METER(rt->gcStats.unmarked++);
a = FLAGP_TO_ARENA(flagp);
unmarkedBitIndex = FLAGP_TO_INDEX(flagp) /
ThingsPerUnmarkedBit(a->list->thingSize);
JS_ASSERT(unmarkedBitIndex < JS_BITS_PER_WORD);
bit = (jsuword)1 << unmarkedBitIndex;
if (a->unmarkedChildren != 0) {
JS_ASSERT(rt->gcUnmarkedArenaStackTop);
if (a->unmarkedChildren & bit) {
/* bit already covers things with children to mark later. */
return;
}
a->unmarkedChildren |= bit;
} else {
/*
* The thing is the first thing with not yet marked children in the
* whole arena, so push the arena on the stack of arenas with things
* to be marked later unless the arena has already been pushed. We
* detect that through checking prevUnmarkedPage as the field is 0
* only for not yet pushed arenas. To ensure that
* prevUnmarkedPage != 0
* even when the stack contains one element, we make prevUnmarkedPage
* for the arena at the bottom to point to itself.
*
* See comments in MarkDelayedChildren.
*/
a->unmarkedChildren = bit;
if (a->prevUnmarkedPage == 0) {
if (!rt->gcUnmarkedArenaStackTop) {
/* Stack was empty, mark the arena as the bottom element. */
a->prevUnmarkedPage = ARENA_INFO_TO_PAGE(a);
} else {
JS_ASSERT(rt->gcUnmarkedArenaStackTop->prevUnmarkedPage != 0);
a->prevUnmarkedPage =
ARENA_INFO_TO_PAGE(rt->gcUnmarkedArenaStackTop);
}
rt->gcUnmarkedArenaStackTop = a;
}
JS_ASSERT(rt->gcUnmarkedArenaStackTop);
}
#ifdef DEBUG
rt->gcMarkLaterCount += ThingsPerUnmarkedBit(a->list->thingSize);
METER_UPDATE_MAX(rt->gcStats.maxunmarked, rt->gcMarkLaterCount);
#endif
}
static void
MarkDelayedChildren(JSTracer *trc)
{
JSRuntime *rt;
JSGCArenaInfo *a, *aprev;
uint32 thingSize, traceKind;
uint32 thingsPerUntracedBit;
uint32 untracedBitIndex, thingIndex, indexLimit, endIndex;
uint32 thingsPerUnmarkedBit;
uint32 unmarkedBitIndex, thingIndex, indexLimit, endIndex;
JSGCThing *thing;
uint8 *flagp;
rt = trc->context->runtime;
a = rt->gcUntracedArenaStackTop;
a = rt->gcUnmarkedArenaStackTop;
if (!a) {
JS_ASSERT(rt->gcTraceLaterCount == 0);
JS_ASSERT(rt->gcMarkLaterCount == 0);
return;
}
for (;;) {
/*
* The following assert verifies that the current arena belongs to the
* untraced stack, since DelayTracingChildren ensures that even for
* stack's bottom prevUntracedPage != 0 but rather points to itself.
* unmarked stack, since DelayMarkingChildren ensures that even for
* the stack's bottom, prevUnmarkedPage != 0 but rather points to
* itself.
*/
JS_ASSERT(a->prevUntracedPage != 0);
JS_ASSERT(rt->gcUntracedArenaStackTop->prevUntracedPage != 0);
JS_ASSERT(a->prevUnmarkedPage != 0);
JS_ASSERT(rt->gcUnmarkedArenaStackTop->prevUnmarkedPage != 0);
thingSize = a->list->thingSize;
traceKind = GetFinalizableArenaTraceKind(a);
indexLimit = THINGS_PER_ARENA(thingSize);
thingsPerUntracedBit = THINGS_PER_UNTRACED_BIT(thingSize);
thingsPerUnmarkedBit = ThingsPerUnmarkedBit(thingSize);
/*
* We cannot use do-while loop here as a->untracedThings can be zero
* We cannot use do-while loop here as a->unmarkedChildren can be zero
* before the loop as a leftover from the previous iterations. See
* comments after the loop.
*/
while (a->finalizable.untracedThings != 0) {
untracedBitIndex = JS_FLOOR_LOG2W(a->finalizable.untracedThings);
a->finalizable.untracedThings &=
~((jsuword)1 << untracedBitIndex);
thingIndex = untracedBitIndex * thingsPerUntracedBit;
endIndex = thingIndex + thingsPerUntracedBit;
while (a->unmarkedChildren != 0) {
unmarkedBitIndex = JS_FLOOR_LOG2W(a->unmarkedChildren);
a->unmarkedChildren &= ~((jsuword)1 << unmarkedBitIndex);
#ifdef DEBUG
JS_ASSERT(rt->gcMarkLaterCount >= thingsPerUnmarkedBit);
rt->gcMarkLaterCount -= thingsPerUnmarkedBit;
#endif
thingIndex = unmarkedBitIndex * thingsPerUnmarkedBit;
endIndex = thingIndex + thingsPerUnmarkedBit;
/*
* endIndex can go beyond the last allocated thing as the real
@ -2052,22 +2088,12 @@ TraceDelayedChildren(JSTracer *trc)
if (endIndex > indexLimit)
endIndex = indexLimit;
JS_ASSERT(thingIndex < indexLimit);
do {
/*
* Skip free or already traced things that share the bit
* with untraced ones.
*/
flagp = THING_FLAGP(a, thingIndex);
if (!(*flagp & GCF_CHILDREN))
continue;
*flagp &= ~GCF_CHILDREN;
#ifdef DEBUG
JS_ASSERT(rt->gcTraceLaterCount != 0);
--rt->gcTraceLaterCount;
#endif
thing = FLAGP_TO_THING(flagp, thingSize);
JS_TraceChildren(trc, thing, traceKind);
if (*flagp & GCF_MARK) {
thing = FLAGP_TO_THING(flagp, thingSize);
JS_TraceChildren(trc, thing, traceKind);
}
} while (++thingIndex != endIndex);
}
@ -2076,29 +2102,29 @@ TraceDelayedChildren(JSTracer *trc)
* pop it from the stack if the arena is the stack's top.
*
* When JS_TraceChildren from the above calls JS_CallTracer that in
* turn on low C stack calls DelayTracingChildren and the latter
* pushes new arenas to the untraced stack, we have to skip popping
* turn on low C stack calls DelayMarkingChildren and the latter
* pushes new arenas to the unmarked stack, we have to skip popping
* of this arena until it becomes the top of the stack again.
*/
if (a == rt->gcUntracedArenaStackTop) {
aprev = ARENA_PAGE_TO_INFO(a->prevUntracedPage);
a->prevUntracedPage = 0;
if (a == rt->gcUnmarkedArenaStackTop) {
aprev = ARENA_PAGE_TO_INFO(a->prevUnmarkedPage);
a->prevUnmarkedPage = 0;
if (a == aprev) {
/*
* prevUntracedPage points to itself and we reached the
* prevUnmarkedPage points to itself and we reached the
* bottom of the stack.
*/
break;
}
rt->gcUntracedArenaStackTop = a = aprev;
rt->gcUnmarkedArenaStackTop = a = aprev;
} else {
a = rt->gcUntracedArenaStackTop;
a = rt->gcUnmarkedArenaStackTop;
}
}
JS_ASSERT(rt->gcUntracedArenaStackTop);
JS_ASSERT(rt->gcUntracedArenaStackTop->prevUntracedPage == 0);
rt->gcUntracedArenaStackTop = NULL;
JS_ASSERT(rt->gcTraceLaterCount == 0);
JS_ASSERT(rt->gcUnmarkedArenaStackTop);
JS_ASSERT(rt->gcUnmarkedArenaStackTop->prevUnmarkedPage == 0);
rt->gcUnmarkedArenaStackTop = NULL;
JS_ASSERT(rt->gcMarkLaterCount == 0);
}
JS_PUBLIC_API(void)
@ -2178,7 +2204,7 @@ JS_CallTracer(JSTracer *trc, void *thing, uint32 kind)
# define RECURSION_TOO_DEEP() (!JS_CHECK_STACK_SIZE(cx, stackDummy))
#endif
if (RECURSION_TOO_DEEP())
DelayTracingChildren(rt, flagp);
DelayMarkingChildren(rt, flagp);
else
JS_TraceChildren(trc, thing, kind);
} else {
@ -2190,16 +2216,16 @@ JS_CallTracer(JSTracer *trc, void *thing, uint32 kind)
*
* Since we do not know which call from inside the callback is the
* last, we ensure that children of all marked things are traced and
* call TraceDelayedChildren(trc) after tracing the thing.
* call MarkDelayedChildren(trc) after tracing the thing.
*
* As TraceDelayedChildren unconditionally invokes JS_TraceChildren
* for the things with untraced children, calling DelayTracingChildren
* As MarkDelayedChildren unconditionally invokes JS_TraceChildren
* for the things with unmarked children, calling DelayMarkingChildren
* is useless here. Hence we always trace thing's children even with a
* low native stack.
*/
cx->insideGCMarkCallback = JS_FALSE;
JS_TraceChildren(trc, thing, kind);
TraceDelayedChildren(trc);
MarkDelayedChildren(trc);
cx->insideGCMarkCallback = JS_TRUE;
}
@ -2220,7 +2246,7 @@ js_CallValueTracerIfGCThing(JSTracer *trc, jsval v)
if (JSVAL_IS_DOUBLE(v) || JSVAL_IS_STRING(v)) {
thing = JSVAL_TO_TRACEABLE(v);
kind = JSVAL_TRACE_KIND(v);
JS_ASSERT(kind == js_GetGCThingTraceKind(JSVAL_TO_GCTHING(v)));
JS_ASSERT(kind == js_GetGCThingTraceKind(thing));
} else if (JSVAL_IS_OBJECT(v) && v != JSVAL_NULL) {
/* v can be an arbitrary GC thing reinterpreted as an object. */
thing = JSVAL_TO_OBJECT(v);
@ -2241,41 +2267,43 @@ gc_root_traversal(JSDHashTable *table, JSDHashEntryHdr *hdr, uint32 num,
jsval v = *rp;
/* Ignore null reference, scalar values, and static strings. */
if (!JSVAL_IS_NULL(v) &&
JSVAL_IS_GCTHING(v) &&
!JSString::isStatic(JSVAL_TO_GCTHING(v))) {
if (JSVAL_IS_TRACEABLE(v)) {
#ifdef DEBUG
bool root_points_to_gcArenaList = false;
jsuword thing = (jsuword) JSVAL_TO_GCTHING(v);
JSRuntime *rt = trc->context->runtime;
for (unsigned i = 0; i != FINALIZE_LIMIT; i++) {
JSGCArenaList *arenaList = &rt->gcArenaList[i];
size_t thingSize = arenaList->thingSize;
size_t limit = THINGS_PER_ARENA(thingSize) * thingSize;
for (JSGCArenaInfo *a = arenaList->head; a; a = a->prev) {
if (thing - ARENA_INFO_TO_START(a) < limit) {
root_points_to_gcArenaList = true;
break;
if (!JSString::isStatic(JSVAL_TO_GCTHING(v))) {
bool root_points_to_gcArenaList = false;
jsuword thing = (jsuword) JSVAL_TO_GCTHING(v);
JSRuntime *rt = trc->context->runtime;
for (unsigned i = 0; i != FINALIZE_LIMIT; i++) {
JSGCArenaList *arenaList = &rt->gcArenaList[i];
size_t thingSize = arenaList->thingSize;
size_t limit = THINGS_PER_ARENA(thingSize) * thingSize;
for (JSGCArenaInfo *a = arenaList->head; a; a = a->prev) {
if (thing - ARENA_INFO_TO_START(a) < limit) {
root_points_to_gcArenaList = true;
break;
}
}
}
}
if (!root_points_to_gcArenaList) {
for (JSGCArenaInfo *a = rt->gcDoubleArenaList.head; a; a = a->prev) {
if (thing - ARENA_INFO_TO_START(a) <
DOUBLES_PER_ARENA * sizeof(jsdouble)) {
root_points_to_gcArenaList = true;
break;
if (!root_points_to_gcArenaList) {
for (JSGCArenaInfo *a = rt->gcDoubleArenaList.head;
a;
a = a->prev) {
if (thing - ARENA_INFO_TO_START(a) <
DOUBLES_PER_ARENA * sizeof(jsdouble)) {
root_points_to_gcArenaList = true;
break;
}
}
}
}
if (!root_points_to_gcArenaList && rhe->name) {
fprintf(stderr,
if (!root_points_to_gcArenaList && rhe->name) {
fprintf(stderr,
"JS API usage error: the address passed to JS_AddNamedRoot currently holds an\n"
"invalid jsval. This is usually caused by a missing call to JS_RemoveRoot.\n"
"The root's name is \"%s\".\n",
rhe->name);
rhe->name);
}
JS_ASSERT(root_points_to_gcArenaList);
}
JS_ASSERT(root_points_to_gcArenaList);
#endif
JS_SET_TRACING_NAME(trc, rhe->name ? rhe->name : "root");
js_CallValueTracerIfGCThing(trc, v);
@ -2765,8 +2793,8 @@ FinalizeArenaList(JSContext *cx, unsigned thingKind,
#endif
for (;;) {
JS_ASSERT(a->list == arenaList);
JS_ASSERT(a->prevUntracedPage == 0);
JS_ASSERT(a->finalizable.untracedThings == 0);
JS_ASSERT(a->prevUnmarkedPage == 0);
JS_ASSERT(a->unmarkedChildren == 0);
JSGCThing *freeList = NULL;
JSGCThing **tailp = &freeList;
@ -2778,9 +2806,7 @@ FinalizeArenaList(JSContext *cx, unsigned thingKind,
reinterpret_cast<JSGCThing *>(ARENA_INFO_TO_START(a) +
THINGS_PER_ARENA(sizeof(T)) *
sizeof(T));
JSGCThing* nextFree = a->finalizable.freeList
? a->finalizable.freeList
: thingsEnd;
JSGCThing* nextFree = a->freeList ? a->freeList : thingsEnd;
for (;; thing = NextThing(thing, sizeof(T)), --flagp) {
if (thing == nextFree) {
if (thing == thingsEnd)
@ -2844,7 +2870,7 @@ FinalizeArenaList(JSContext *cx, unsigned thingKind,
} else {
JS_ASSERT(nfree < THINGS_PER_ARENA(sizeof(T)));
*tailp = NULL;
a->finalizable.freeList = freeList;
a->freeList = freeList;
ap = &a->prev;
METER(nlivearenas++);
}
@ -3072,8 +3098,8 @@ js_GC(JSContext *cx, JSGCInvocationKind gckind)
restart:
rt->gcNumber++;
JS_ASSERT(!rt->gcUntracedArenaStackTop);
JS_ASSERT(rt->gcTraceLaterCount == 0);
JS_ASSERT(!rt->gcUnmarkedArenaStackTop);
JS_ASSERT(rt->gcMarkLaterCount == 0);
/*
* Reset the property cache's type id generator so we can compress ids.
@ -3118,7 +3144,7 @@ js_GC(JSContext *cx, JSGCInvocationKind gckind)
* Mark children of things that caused too deep recursion during the above
* tracing.
*/
TraceDelayedChildren(&trc);
MarkDelayedChildren(&trc);
JS_ASSERT(!cx->insideGCMarkCallback);
if (rt->gcCallback) {
@ -3127,7 +3153,7 @@ js_GC(JSContext *cx, JSGCInvocationKind gckind)
JS_ASSERT(cx->insideGCMarkCallback);
cx->insideGCMarkCallback = JS_FALSE;
}
JS_ASSERT(rt->gcTraceLaterCount == 0);
JS_ASSERT(rt->gcMarkLaterCount == 0);
rt->gcMarkingTracer = NULL;
@ -3198,7 +3224,7 @@ js_GC(JSContext *cx, JSGCInvocationKind gckind)
ap = &a->prev;
#ifdef JS_GCMETER
for (size_t i = 0; i != DOUBLES_PER_ARENA; ++i) {
if (IsMarkedDouble(a, index))
if (IsMarkedDouble(a, i))
METER(nthings++);
}
METER(nlivearenas++);


@ -148,11 +148,11 @@ js_ReserveObjects(JSContext *cx, size_t nobjects);
extern JSBool
js_LockGCThingRT(JSRuntime *rt, void *thing);
extern JSBool
extern void
js_UnlockGCThingRT(JSRuntime *rt, void *thing);
extern JSBool
js_IsAboutToBeFinalized(JSContext *cx, void *thing);
extern bool
js_IsAboutToBeFinalized(void *thing);
/*
* Macro to test if a traversal is the marking phase of GC to avoid exposing
@ -414,10 +414,10 @@ typedef struct JSGCStats {
uint32 maxdepth; /* maximum mark tail recursion depth */
uint32 cdepth; /* mark recursion depth of C functions */
uint32 maxcdepth; /* maximum mark recursion depth of C functions */
uint32 untraced; /* number of times tracing of GC thing's children were
uint32 unmarked; /* number of times marking of GC thing's children were
delayed due to a low C stack */
#ifdef DEBUG
uint32 maxuntraced;/* maximum number of things with children to trace
uint32 maxunmarked;/* maximum number of things with children to mark
later */
#endif
uint32 maxlevel; /* maximum GC nesting (indirect recursion) level */


@ -535,7 +535,6 @@ js_PurgePropertyCache(JSContext *cx, JSPropertyCache *cache)
P(addpchits);
P(setpchits);
P(setpcmisses);
P(slotchanges);
P(setmisses);
P(idmisses);
P(komisses);


@ -313,8 +313,6 @@ typedef struct JSPropertyCache {
uint32 addpchits; /* adding next property pchit case */
uint32 setpchits; /* setting existing property pchit */
uint32 setpcmisses; /* setting/adding property pc misses */
uint32 slotchanges; /* clasp->reserveSlots result variance-
induced slot changes */
uint32 setmisses; /* JSOP_SET{NAME,PROP} total misses */
uint32 idmisses; /* slow-path key id == atom misses */
uint32 komisses; /* slow-path key object misses */


@ -2206,13 +2206,24 @@ InitScopeForObject(JSContext* cx, JSObject* obj, JSObject* proto, JSObjectOps* o
/* Share proto's emptyScope only if obj is similar to proto. */
JSClass *clasp = OBJ_GET_CLASS(cx, obj);
JSScope *scope;
if (proto && OBJ_IS_NATIVE(proto) &&
(scope = OBJ_SCOPE(proto))->canProvideEmptyScope(ops, clasp)) {
scope = scope->getEmptyScope(cx, clasp);
if (!scope)
goto bad;
} else {
JSScope *scope = NULL;
if (proto && OBJ_IS_NATIVE(proto)) {
JS_LOCK_OBJ(cx, proto);
scope = OBJ_SCOPE(proto);
if (scope->canProvideEmptyScope(ops, clasp)) {
JSScope *emptyScope = scope->getEmptyScope(cx, clasp);
JS_UNLOCK_SCOPE(cx, scope);
if (!emptyScope)
goto bad;
scope = emptyScope;
} else {
JS_UNLOCK_SCOPE(cx, scope);
scope = NULL;
}
}
if (!scope) {
scope = JSScope::create(cx, ops, clasp, obj, js_GenerateShape(cx, false));
if (!scope)
goto bad;
@ -2225,6 +2236,7 @@ InitScopeForObject(JSContext* cx, JSObject* obj, JSObject* proto, JSObjectOps* o
goto bad;
}
}
obj->map = scope;
return true;
@ -3549,14 +3561,16 @@ js_ConstructObject(JSContext *cx, JSClass *clasp, JSObject *proto,
return obj;
}
/* XXXbe if one adds props, deletes earlier props, adds more, the last added
won't recycle the deleted props' slots. */
/*
* FIXME bug 535629: If one adds props, deletes earlier props, adds more, the
* last added won't recycle the deleted props' slots.
*/
JSBool
js_AllocSlot(JSContext *cx, JSObject *obj, uint32 *slotp)
{
JS_ASSERT(OBJ_IS_NATIVE(obj));
JSScope *scope = OBJ_SCOPE(obj);
JS_ASSERT(scope->object == obj);
JSClass *clasp = obj->getClass();
if (scope->freeslot == JSSLOT_FREE(clasp) && clasp->reserveSlots) {
/* Adjust scope->freeslot to include computed reserved slots, if any. */
@ -3577,9 +3591,8 @@ js_AllocSlot(JSContext *cx, JSObject *obj, uint32 *slotp)
void
js_FreeSlot(JSContext *cx, JSObject *obj, uint32 slot)
{
JS_ASSERT(OBJ_IS_NATIVE(obj));
JSScope *scope = OBJ_SCOPE(obj);
JS_ASSERT(scope->object == obj);
LOCKED_OBJ_SET_SLOT(obj, slot, JSVAL_VOID);
if (scope->freeslot == slot + 1)
scope->freeslot = slot;
@ -4175,7 +4188,7 @@ js_FindPropertyHelper(JSContext *cx, jsid id, JSBool cacheResult,
scopeIndex, protoIndex, pobj,
(JSScopeProperty *) prop, false);
}
SCOPE_DEPTH_ACCUM(&rt->scopeSearchDepthStats, scopeIndex);
SCOPE_DEPTH_ACCUM(&cx->runtime->scopeSearchDepthStats, scopeIndex);
goto out;
}


@ -1815,10 +1815,10 @@ BEGIN_CASE(JSOP_SETMETHOD)
* if something created a hash table for scope, we must
* pay the price of JSScope::putProperty.
*
* If slot does not match the cached sprop's slot,
* update the cache entry in the hope that obj and
* other instances with the same number of reserved
* slots are now "hot".
* (A reserveSlots hook can cause scopes of the same
* shape to have different freeslot values. This is
* what causes the slot != sprop->slot case. See
* js_GetMutableScope.)
*/
if (slot != sprop->slot || scope->table) {
JSScopeProperty *sprop2 =
@ -1831,13 +1831,6 @@ BEGIN_CASE(JSOP_SETMETHOD)
JS_UNLOCK_SCOPE(cx, scope);
goto error;
}
if (sprop2 != sprop) {
PCMETER(cache->slotchanges++);
JS_ASSERT(slot != sprop->slot &&
slot == sprop2->slot &&
sprop2->id == sprop->id);
entry->vword = SPROP_TO_PCVAL(sprop2);
}
sprop = sprop2;
} else {
scope->extend(cx, sprop);
@ -3566,7 +3559,8 @@ BEGIN_CASE(JSOP_INITMETHOD)
scope->shape == scope->lastProperty()->shape);
if (scope->table) {
JSScopeProperty *sprop2 =
scope->addDataProperty(cx, sprop->id, slot, sprop->attrs);
scope->addProperty(cx, sprop->id, sprop->getter, sprop->setter, slot,
sprop->attrs, sprop->flags, sprop->shortid);
if (!sprop2) {
js_FreeSlot(cx, obj, slot);
JS_UNLOCK_SCOPE(cx, scope);


@ -70,7 +70,7 @@ class RecursiveSlotMap : public SlotMap
* Store at exit->sp_adj - sizeof(double)
*/
ptrdiff_t retOffset = downPostSlots * sizeof(double) -
mRecorder.treeInfo->nativeStackBase;
mRecorder.tree->nativeStackBase;
mRecorder.lir->insStorei(mRecorder.addName(rval_ins, "rval_ins"),
mRecorder.lirbuf->sp, retOffset);
}
@ -93,7 +93,7 @@ class UpRecursiveSlotMap : public RecursiveSlotMap
/*
* The native stack offset of the return value once this frame has
* returned, is:
* -treeInfo->nativeStackBase + downPostSlots * sizeof(double)
* -tree->nativeStackBase + downPostSlots * sizeof(double)
*
* Note, not +1, since the offset is 0-based.
*
@ -101,15 +101,15 @@ class UpRecursiveSlotMap : public RecursiveSlotMap
* be the amount down recursion added, which was just guarded as
* |downPostSlots|. So the offset is:
*
* -treeInfo->nativeStackBase + downPostSlots * sizeof(double) -
* -tree->nativeStackBase + downPostSlots * sizeof(double) -
* downPostSlots * sizeof(double)
* Or:
* -treeInfo->nativeStackBase
* -tree->nativeStackBase
*
* This makes sense because this slot is just above the highest sp for
* the down frame.
*/
lir->insStorei(rval_ins, lirbuf->sp, -mRecorder.treeInfo->nativeStackBase);
lir->insStorei(rval_ins, lirbuf->sp, -mRecorder.tree->nativeStackBase);
lirbuf->sp = lir->ins2(LIR_piadd, lirbuf->sp,
lir->insImmWord(-int(downPostSlots) * sizeof(double)));
@ -152,7 +152,7 @@ TraceRecorder::downSnapshot(FrameInfo* downFrame)
/* Build the typemap the exit will have. Note extra stack slot for return value. */
unsigned downPostSlots = downFrame->callerHeight;
unsigned ngslots = treeInfo->globalSlots->length();
unsigned ngslots = tree->globalSlots->length();
unsigned exitTypeMapLen = downPostSlots + 1 + ngslots;
JSTraceType* exitTypeMap = (JSTraceType*)alloca(sizeof(JSTraceType) * exitTypeMapLen);
JSTraceType* typeMap = downFrame->get_typemap();
@ -176,7 +176,7 @@ TraceRecorder::downSnapshot(FrameInfo* downFrame)
exit->block = cx->fp->down->blockChain;
exit->pc = downFrame->pc + JSOP_CALL_LENGTH;
exit->imacpc = NULL;
exit->sp_adj = ((downPostSlots + 1) * sizeof(double)) - treeInfo->nativeStackBase;
exit->sp_adj = ((downPostSlots + 1) * sizeof(double)) - tree->nativeStackBase;
exit->rp_adj = exit->calldepth * sizeof(FrameInfo*);
exit->nativeCalleeWord = 0;
exit->lookupFlags = js_InferFlags(cx, 0);
@ -257,11 +257,11 @@ TraceRecorder::upRecursion()
*/
js_CaptureStackTypes(cx, 1, fi->get_typemap());
} else {
/* Case 2: Guess that up-recursion is backing out, infer types from our TreeInfo. */
JS_ASSERT(treeInfo->nStackTypes == downPostSlots + 1);
/* Case 2: Guess that up-recursion is backing out, infer types from our Tree. */
JS_ASSERT(tree->nStackTypes == downPostSlots + 1);
JSTraceType* typeMap = fi->get_typemap();
for (unsigned i = 0; i < downPostSlots; i++)
typeMap[i] = treeInfo->typeMap[i];
typeMap[i] = tree->typeMap[i];
}
fi = traceMonitor->frameCache->memoize(fi);
@ -311,7 +311,7 @@ TraceRecorder::upRecursion()
for (unsigned i = 0; i < downPostSlots; i++)
slotMap.addSlot(exit->stackType(i));
slotMap.addSlot(&stackval(-1));
VisitGlobalSlots(slotMap, cx, *treeInfo->globalSlots);
VisitGlobalSlots(slotMap, cx, *tree->globalSlots);
if (recursive_pc == (jsbytecode*)fragment->root->ip) {
debug_only_print0(LC_TMTracer, "Compiling up-recursive loop...\n");
} else {
@ -319,9 +319,9 @@ TraceRecorder::upRecursion()
exit->exitType = RECURSIVE_UNLINKED_EXIT;
exit->recursive_pc = recursive_pc;
}
JS_ASSERT(treeInfo->recursion != Recursion_Disallowed);
if (treeInfo->recursion != Recursion_Detected)
treeInfo->recursion = Recursion_Unwinds;
JS_ASSERT(tree->recursion != Recursion_Disallowed);
if (tree->recursion != Recursion_Detected)
tree->recursion = Recursion_Unwinds;
return closeLoop(slotMap, exit);
}
@ -424,7 +424,7 @@ TraceRecorder::slurpDownFrames(jsbytecode* return_pc)
* value. The slurpSlot variable keeps track of the last slot that has been
* unboxed, as to avoid re-unboxing when taking a SLURP_FAIL exit.
*/
unsigned numGlobalSlots = treeInfo->globalSlots->length();
unsigned numGlobalSlots = tree->globalSlots->length();
unsigned safeSlots = NativeStackSlots(cx, frameDepth) + 1 + numGlobalSlots;
jsbytecode* recursive_pc = return_pc + JSOP_CALL_LENGTH;
VMSideExit* exit = (VMSideExit*)
@ -435,7 +435,7 @@ TraceRecorder::slurpDownFrames(jsbytecode* return_pc)
exit->exitType = RECURSIVE_SLURP_FAIL_EXIT;
exit->numStackSlots = downPostSlots + 1;
exit->numGlobalSlots = numGlobalSlots;
exit->sp_adj = ((downPostSlots + 1) * sizeof(double)) - treeInfo->nativeStackBase;
exit->sp_adj = ((downPostSlots + 1) * sizeof(double)) - tree->nativeStackBase;
exit->recursive_pc = recursive_pc;
/*
@ -557,7 +557,7 @@ TraceRecorder::slurpDownFrames(jsbytecode* return_pc)
for (unsigned i = 0; i < downPostSlots; i++)
slotMap.addSlot(typeMap[i]);
slotMap.addSlot(&stackval(-1), typeMap[downPostSlots]);
VisitGlobalSlots(slotMap, cx, *treeInfo->globalSlots);
VisitGlobalSlots(slotMap, cx, *tree->globalSlots);
debug_only_print0(LC_TMTracer, "Compiling up-recursive slurp...\n");
exit = copy(exit);
if (exit->recursive_pc == fragment->root->ip)
@ -566,10 +566,25 @@ TraceRecorder::slurpDownFrames(jsbytecode* return_pc)
exit->exitType = RECURSIVE_UNLINKED_EXIT;
debug_only_printf(LC_TMTreeVis, "TREEVIS CHANGEEXIT EXIT=%p TYPE=%s\n", (void*)exit,
getExitName(exit->exitType));
JS_ASSERT(treeInfo->recursion >= Recursion_Unwinds);
JS_ASSERT(tree->recursion >= Recursion_Unwinds);
return closeLoop(slotMap, exit);
}
class ImportFrameSlotsVisitor : public SlotVisitorBase
{
TraceRecorder &mRecorder;
public:
ImportFrameSlotsVisitor(TraceRecorder &recorder) : mRecorder(recorder)
{}
JS_REQUIRES_STACK JS_ALWAYS_INLINE bool
visitStackSlots(jsval *vp, size_t count, JSStackFrame* fp) {
for (size_t i = 0; i < count; ++i)
mRecorder.get(vp++);
return true;
}
};
JS_REQUIRES_STACK AbortableRecordingStatus
TraceRecorder::downRecursion()
{
@ -584,9 +599,9 @@ TraceRecorder::downRecursion()
JS_ASSERT(unsigned(slots) == NativeStackSlots(cx, 1) - fp->argc - 2 - fp->script->nfixed - 1);
/* Guard that there is enough stack space. */
JS_ASSERT(treeInfo->maxNativeStackSlots >= treeInfo->nativeStackBase / sizeof(double));
int guardSlots = slots + treeInfo->maxNativeStackSlots -
treeInfo->nativeStackBase / sizeof(double);
JS_ASSERT(tree->maxNativeStackSlots >= tree->nativeStackBase / sizeof(double));
int guardSlots = slots + tree->maxNativeStackSlots -
tree->nativeStackBase / sizeof(double);
LIns* sp_top = lir->ins2(LIR_piadd, lirbuf->sp, lir->insImmWord(guardSlots * sizeof(double)));
guard(true, lir->ins2(LIR_plt, sp_top, eos_ins), OOM_EXIT);
@ -594,13 +609,23 @@ TraceRecorder::downRecursion()
LIns* rp_top = lir->ins2(LIR_piadd, lirbuf->rp, lir->insImmWord(sizeof(FrameInfo*)));
guard(true, lir->ins2(LIR_plt, rp_top, eor_ins), OOM_EXIT);
/*
* For every slot in the new frame that is not in the tracker, create a load
* in the tracker. This is necessary because otherwise snapshot() will see
* missing imports and use the down frame, rather than the new frame.
* This won't affect performance because the loads will be killed if not
* used.
*/
ImportFrameSlotsVisitor visitor(*this);
VisitStackSlots(visitor, cx, callDepth);
/* Add space for a new JIT frame. */
lirbuf->sp = lir->ins2(LIR_piadd, lirbuf->sp, lir->insImmWord(slots * sizeof(double)));
lir->insStorei(lirbuf->sp, lirbuf->state, offsetof(InterpState, sp));
lirbuf->rp = lir->ins2(LIR_piadd, lirbuf->rp, lir->insImmWord(sizeof(FrameInfo*)));
lir->insStorei(lirbuf->rp, lirbuf->state, offsetof(InterpState, rp));
--callDepth;
clearFrameSlotsFromCache();
clearFrameSlotsFromTracker(nativeFrameTracker);
/*
* If the callee and caller have identical call sites, this is a down-
@ -618,8 +643,8 @@ TraceRecorder::downRecursion()
exit = snapshot(RECURSIVE_UNLINKED_EXIT);
exit->recursive_pc = fp->script->code;
debug_only_print0(LC_TMTracer, "Compiling down-recursive function call.\n");
JS_ASSERT(treeInfo->recursion != Recursion_Disallowed);
treeInfo->recursion = Recursion_Detected;
JS_ASSERT(tree->recursion != Recursion_Disallowed);
tree->recursion = Recursion_Detected;
return closeLoop(exit);
}
@ -783,7 +808,7 @@ TraceRecorder::slurpSlot(LIns* val_ins, jsval* vp, SlurpInfo* info)
LIns* val = slurpSlot(val_ins, vp, exit);
lir->insStorei(val,
lirbuf->sp,
-treeInfo->nativeStackBase + ptrdiff_t(info->curSlot) * sizeof(double));
-tree->nativeStackBase + ptrdiff_t(info->curSlot) * sizeof(double));
info->curSlot++;
}
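Several hunks in this file compute stack-pointer adjustments from tree->nativeStackBase, downPostSlots, and the current slot index. A tiny worked example of those two formulas with made-up numbers (the values are illustrative, not taken from a real trace):

#include <cstdio>
#include <cstddef>

int main()
{
    const ptrdiff_t slotSize        = sizeof(double);   /* one native stack slot */
    const ptrdiff_t nativeStackBase = 6 * slotSize;     /* assumed, not from a real tree */
    const ptrdiff_t downPostSlots   = 3;                /* slots owned by the down frame */
    const ptrdiff_t curSlot         = 2;                /* slot currently being slurped */

    /* exit->sp_adj as computed in downSnapshot() and slurpDownFrames(): */
    ptrdiff_t sp_adj  = (downPostSlots + 1) * slotSize - nativeStackBase;
    /* store offset for a slurped slot, as in slurpSlot(): */
    ptrdiff_t slotOff = -nativeStackBase + curSlot * slotSize;

    printf("sp_adj = %td bytes, slot %td stored at sp%+td bytes\n",
           sp_adj, curSlot, slotOff);
    return 0;
}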


@ -2026,7 +2026,6 @@ LookupNativeRegExp(JSContext* cx, uint16 re_flags,
? (++(tm->lastFragID)) : 0;
)
frag = new (alloc) REFragment(0 verbose_only(, profFragID));
frag->lirbuf = tm->reLirBuf;
/*
* Copy the re_chars portion of the hash key into the Allocator, so
* its lifecycle is disconnected from the lifecycle of the
@ -2304,6 +2303,8 @@ class RegExpNativeCompiler {
LIns* start;
LIns* cpend;
LirBuffer* const lirbuf;
bool outOfMemory() {
return tempAlloc.outOfMemory() || JS_TRACE_MONITOR(cx).dataAlloc->outOfMemory();
}
@ -3134,12 +3135,19 @@ class RegExpNativeCompiler {
public:
RegExpNativeCompiler(JSContext* cx, JSRegExp* re, CompilerState* cs, Fragment* fragment)
: tempAlloc(*JS_TRACE_MONITOR(cx).reTempAlloc), cx(cx),
re(re), cs(cs), fragment(fragment), lir(NULL), lirBufWriter(NULL) { }
re(re), cs(cs), fragment(fragment), lir(NULL), lirBufWriter(NULL),
lirbuf(new (tempAlloc) LirBuffer(tempAlloc))
{
fragment->lirbuf = lirbuf;
#ifdef DEBUG
LabelMap* labels = new (tempAlloc) LabelMap(tempAlloc, &js_LogController);
lirbuf->names = new (tempAlloc) LirNameMap(tempAlloc, labels);
#endif
}
~RegExpNativeCompiler() {
/* Purge the tempAlloc used during recording. */
tempAlloc.reset();
JS_TRACE_MONITOR(cx).reLirBuf->clear();
}
JSBool compile()
@ -3235,7 +3243,7 @@ class RegExpNativeCompiler {
*/
JS_ASSERT(!lirbuf->sp && !lirbuf->rp);
::compile(assm, fragment, tempAlloc verbose_only(, tm->labels));
::compile(assm, fragment, tempAlloc verbose_only(, lirbuf->names->labels));
if (assm->error() != nanojit::None)
goto fail;


@ -146,7 +146,7 @@ js_CheckKeyword(const jschar *str, size_t length)
}
JS_FRIEND_API(void)
js_MapKeywords(void (*mapfun)(const char *))
js_MapKeywords(JSMapKeywordFun mapfun)
{
size_t i;


@ -369,8 +369,10 @@ js_CheckKeyword(const jschar *chars, size_t length);
* Friend-exported API entry point to call a mapping function on each reserved
* identifier in the scanner's keyword table.
*/
typedef void (*JSMapKeywordFun)(const char *);
extern JS_FRIEND_API(void)
js_MapKeywords(void (*mapfun)(const char *));
js_MapKeywords(JSMapKeywordFun mapfun);
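A minimal caller for the friend API above, just to show the new typedef in use; PrintKeyword and DumpKeywords are hypothetical names, and the header is assumed to be jsscan.h.

#include "jsscan.h"   /* declares JSMapKeywordFun and js_MapKeywords */
#include <stdio.h>

static void
PrintKeyword(const char *name)      /* matches the JSMapKeywordFun signature */
{
    printf("keyword: %s\n", name);
}

static void
DumpKeywords(void)
{
    js_MapKeywords(PrintKeyword);   /* invoked once per reserved identifier */
}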
/*
* Check that str forms a valid JS identifier name. The function does not


@ -112,6 +112,12 @@ js_GetMutableScope(JSContext *cx, JSObject *obj)
JS_ASSERT(newscope->freeslot == JSSLOT_FREE(STOBJ_GET_CLASS(obj)));
clasp = STOBJ_GET_CLASS(obj);
if (clasp->reserveSlots) {
/*
* FIXME: Here we change OBJ_SCOPE(obj)->freeslot without changing
* OBJ_SHAPE(obj). If we strengthen the shape guarantees to cover
* freeslot, we can eliminate a check in JSOP_SETPROP and in
* js_AddProperty. See bug 535416.
*/
freeslot = JSSLOT_FREE(clasp) + clasp->reserveSlots(cx, obj);
if (freeslot > STOBJ_NSLOTS(obj))
freeslot = STOBJ_NSLOTS(obj);
@ -1594,7 +1600,13 @@ JSScope::removeProperty(JSContext *cx, jsid id)
if (table) {
*spp = NULL;
#ifdef DEBUG
for (JSScopeProperty *aprop = lastProp; aprop; aprop = aprop->parent)
/*
* Check the consistency of the table, but limit the number of checks
* so as not to significantly alter the complexity of the delete in
* debug builds; see bug 534493.
*/
JSScopeProperty *aprop = lastProp;
for (unsigned n = 50; aprop && n != 0; aprop = aprop->parent, --n)
JS_ASSERT_IF(aprop != sprop, hasProperty(aprop));
#endif
}


@ -67,34 +67,26 @@
#include "jsscriptinlines.h"
const uint32 JSSLOT_EXEC_DEPTH = JSSLOT_PRIVATE + 1;
const uint32 JSSCRIPT_RESERVED_SLOTS = 1;
#if JS_HAS_SCRIPT_OBJECT
static const char js_script_exec_str[] = "Script.prototype.exec";
static const char js_script_compile_str[] = "Script.prototype.compile";
/*
* This routine requires that obj has been locked previously.
*/
static jsint
GetScriptExecDepth(JSContext *cx, JSObject *obj)
GetScriptExecDepth(JSObject *obj)
{
jsval v;
JS_ASSERT(JS_IS_OBJ_LOCKED(cx, obj));
v = LOCKED_OBJ_GET_SLOT(obj, JSSLOT_START(&js_ScriptClass));
jsval v = obj->fslots[JSSLOT_EXEC_DEPTH];
return JSVAL_IS_VOID(v) ? 0 : JSVAL_TO_INT(v);
}
static void
AdjustScriptExecDepth(JSContext *cx, JSObject *obj, jsint delta)
AdjustScriptExecDepth(JSObject *obj, jsint delta)
{
jsint execDepth;
JS_LOCK_OBJ(cx, obj);
execDepth = GetScriptExecDepth(cx, obj);
LOCKED_OBJ_SET_SLOT(obj, JSSLOT_START(&js_ScriptClass),
INT_TO_JSVAL(execDepth + delta));
JS_UNLOCK_OBJ(cx, obj);
jsint execDepth = GetScriptExecDepth(obj);
obj->fslots[JSSLOT_EXEC_DEPTH] = INT_TO_JSVAL(execDepth + delta);
}
#if JS_HAS_TOSOURCE
@ -272,7 +264,7 @@ script_compile_sub(JSContext *cx, JSObject *obj, uintN argc, jsval *argv,
return JS_FALSE;
JS_LOCK_OBJ(cx, obj);
execDepth = GetScriptExecDepth(cx, obj);
execDepth = GetScriptExecDepth(obj);
/*
* execDepth must be 0 to allow compilation here, otherwise the JSScript
@ -378,7 +370,7 @@ script_exec_sub(JSContext *cx, JSObject *obj, uintN argc, jsval *argv,
return JS_FALSE;
/* Keep track of nesting depth for the script. */
AdjustScriptExecDepth(cx, obj, 1);
AdjustScriptExecDepth(obj, 1);
/* Must get to out label after this */
script = (JSScript *) obj->getPrivate();
@ -397,7 +389,7 @@ script_exec_sub(JSContext *cx, JSObject *obj, uintN argc, jsval *argv,
ok = js_Execute(cx, scopeobj, script, caller, JSFRAME_EVAL, rval);
out:
AdjustScriptExecDepth(cx, obj, -1);
AdjustScriptExecDepth(obj, -1);
return ok;
}
@ -855,7 +847,7 @@ script_thaw(JSContext *cx, uintN argc, jsval *vp)
}
JS_LOCK_OBJ(cx, obj);
execDepth = GetScriptExecDepth(cx, obj);
execDepth = GetScriptExecDepth(obj);
/*
* execDepth must be 0 to allow compilation here, otherwise the JSScript
@ -948,7 +940,7 @@ script_trace(JSTracer *trc, JSObject *obj)
JS_FRIEND_DATA(JSClass) js_ScriptClass = {
js_Script_str,
JSCLASS_HAS_PRIVATE | JSCLASS_HAS_RESERVED_SLOTS(1) |
JSCLASS_HAS_PRIVATE | JSCLASS_HAS_RESERVED_SLOTS(JSSCRIPT_RESERVED_SLOTS) |
JSCLASS_MARK_IS_TRACE | JSCLASS_HAS_CACHED_PROTO(JSProto_Script),
JS_PropertyStub, JS_PropertyStub, JS_PropertyStub, JS_PropertyStub,
JS_EnumerateStub, JS_ResolveStub, JS_ConvertStub, script_finalize,


@ -128,61 +128,64 @@ struct JSString {
static const size_t ATOMIZED = JSSTRING_BIT(3);
static const size_t DEFLATED = JSSTRING_BIT(4);
bool hasFlag(size_t flag) const {
inline bool hasFlag(size_t flag) const {
return (mFlags & flag) != 0;
}
public:
/* Generous but sane length bound. */
static const size_t MAX_LENGTH = (1 << 28);
/*
* Generous but sane length bound; the "-1" is there for compatibility with
* OOM tests.
*/
static const size_t MAX_LENGTH = (1 << 28) - 1;
bool isDependent() const {
inline bool isDependent() const {
return hasFlag(DEPENDENT);
}
bool isFlat() const {
inline bool isFlat() const {
return !isDependent();
}
bool isDeflated() const {
inline bool isDeflated() const {
return hasFlag(DEFLATED);
}
void setDeflated() {
inline void setDeflated() {
JS_ATOMIC_SET_MASK(&mFlags, DEFLATED);
}
bool isMutable() const {
inline bool isMutable() const {
return !isDependent() && hasFlag(MUTABLE);
}
bool isAtomized() const {
inline bool isAtomized() const {
return !isDependent() && hasFlag(ATOMIZED);
}
JS_ALWAYS_INLINE jschar *chars() {
inline jschar *chars() {
return isDependent() ? dependentChars() : flatChars();
}
JS_ALWAYS_INLINE size_t length() const {
inline size_t length() const {
return mLength;
}
JS_ALWAYS_INLINE bool empty() const {
inline bool empty() const {
return length() == 0;
}
JS_ALWAYS_INLINE void getCharsAndLength(const jschar *&chars, size_t &length) {
inline void getCharsAndLength(const jschar *&chars, size_t &length) {
chars = this->chars();
length = this->length();
}
JS_ALWAYS_INLINE void getCharsAndEnd(const jschar *&chars, const jschar *&end) {
inline void getCharsAndEnd(const jschar *&chars, const jschar *&end) {
end = length() + (chars = this->chars());
}
/* Specific flat string initializer and accessor methods. */
void initFlat(jschar *chars, size_t length) {
inline void initFlat(jschar *chars, size_t length) {
JS_ASSERT(length <= MAX_LENGTH);
mLength = length;
mOffset = 0;
@ -190,12 +193,12 @@ struct JSString {
mChars = chars;
}
jschar *flatChars() const {
inline jschar *flatChars() const {
JS_ASSERT(isFlat());
return mChars;
}
JS_ALWAYS_INLINE size_t flatLength() const {
inline size_t flatLength() const {
JS_ASSERT(isFlat());
return length();
}
@ -239,23 +242,23 @@ struct JSString {
* js_AtomizeString. This function would find that the string was already
* hashed and return it with the atomized bit set.
*/
void flatSetAtomized() {
inline void flatSetAtomized() {
JS_ASSERT(isFlat() && !isMutable());
JS_ATOMIC_SET_MASK(&mFlags, ATOMIZED);
}
void flatSetMutable() {
inline void flatSetMutable() {
JS_ASSERT(isFlat() && !isAtomized());
mFlags |= MUTABLE;
}
void flatClearMutable() {
inline void flatClearMutable() {
JS_ASSERT(isFlat());
if (hasFlag(MUTABLE))
mFlags &= ~MUTABLE;
}
void initDependent(JSString *bstr, size_t off, size_t len) {
inline void initDependent(JSString *bstr, size_t off, size_t len) {
JS_ASSERT(len <= MAX_LENGTH);
mLength = len;
mOffset = off;
@ -264,7 +267,7 @@ struct JSString {
}
/* See JSString::reinitFlat. */
void reinitDependent(JSString *bstr, size_t off, size_t len) {
inline void reinitDependent(JSString *bstr, size_t off, size_t len) {
JS_ASSERT(len <= MAX_LENGTH);
mLength = len;
mOffset = off;
@ -272,22 +275,22 @@ struct JSString {
mBase = bstr;
}
JSString *dependentBase() const {
inline JSString *dependentBase() const {
JS_ASSERT(isDependent());
return mBase;
}
JS_ALWAYS_INLINE jschar *dependentChars() {
inline jschar *dependentChars() {
return dependentBase()->isDependent()
? js_GetDependentStringChars(this)
: dependentBase()->flatChars() + dependentStart();
}
JS_ALWAYS_INLINE size_t dependentStart() const {
inline size_t dependentStart() const {
return mOffset;
}
JS_ALWAYS_INLINE size_t dependentLength() const {
inline size_t dependentLength() const {
JS_ASSERT(isDependent());
return length();
}
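The accessors above treat a dependent string as an (offset, length) view onto a base string's characters, falling back to js_GetDependentStringChars when the base is itself dependent. A toy standalone illustration of that layout (not the real JSString, which packs these fields and its flags far more tightly):

#include <stddef.h>
#include <stdint.h>

typedef uint16_t Char16;    /* stands in for jschar */

struct ToyString {
    bool        dependent;
    size_t      length;
    Char16     *ownChars;   /* flat strings own their characters */
    ToyString  *base;       /* dependent strings borrow a slice... */
    size_t      offset;     /* ...starting at this offset in base */

    Char16 *chars() {
        /* Recursing through base handles a dependent-on-dependent chain. */
        return dependent ? base->chars() + offset : ownChars;
    }
};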

File diff suppressed because it is too large.


@ -66,10 +66,11 @@ class Queue {
public:
void ensure(unsigned size) {
if (_max > size)
return;
if (!_max)
_max = 16;
while (_max < size)
_max <<= 1;
_max = 8;
_max = JS_MAX(_max * 2, size);
if (alloc) {
T* tmp = new (*alloc) T[_max];
memcpy(tmp, _data, _len * sizeof(T));
@ -155,6 +156,16 @@ public:
T* data() const {
return _data;
}
int offsetOf(T slot) {
T* p = _data;
unsigned n = 0;
for (n = 0; n < _len; ++n)
if (*p++ == slot)
return n;
return -1;
}
};
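The ensure() change above replaces pure doubling (seeded at 16) with growth to at least the requested size (seeded at 8). The new policy in isolation, as a small free function; the name GrowCapacity is illustrative:

#include <algorithm>
#include <cstddef>

static size_t
GrowCapacity(size_t max, size_t size)
{
    if (max > size)
        return max;                  /* already large enough */
    if (max == 0)
        max = 8;                     /* new starting capacity */
    return std::max(max * 2, size);  /* at least double, and at least `size` */
}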
/*
@ -217,54 +228,6 @@ public:
TreeFragment* toTreeFragment();
};
struct LinkableFragment : public VMFragment
{
LinkableFragment(const void* _ip verbose_only(, uint32_t profFragID))
: VMFragment(_ip verbose_only(, profFragID))
{ }
uint32 branchCount;
};
/*
* argc is cx->fp->argc at the trace loop header, i.e., the number of arguments
* pushed for the innermost JS frame. This is required as part of the fragment
* key because the fragment will write those arguments back to the interpreter
* stack when it exits, using its typemap, which implicitly incorporates a
* given value of argc. Without this feature, a fragment could be called as an
* inner tree with two different values of argc, and entry type checking or
* exit frame synthesis could crash.
*/
struct TreeFragment : public LinkableFragment
{
TreeFragment(const void* _ip, JSObject* _globalObj, uint32 _globalShape, uint32 _argc
verbose_only(, uint32_t profFragID)) :
LinkableFragment(_ip verbose_only(, profFragID)),
treeInfo(NULL),
first(NULL),
next(NULL),
peer(NULL),
globalObj(_globalObj),
globalShape(_globalShape),
argc(_argc)
{ }
TreeInfo *treeInfo;
TreeFragment* first;
TreeFragment* next;
TreeFragment* peer;
JSObject* globalObj;
uint32 globalShape;
uint32 argc;
};
inline TreeFragment*
VMFragment::toTreeFragment()
{
JS_ASSERT(root == this);
return static_cast<TreeFragment*>(this);
}
#if defined(JS_JIT_SPEW) || defined(NJ_NO_VARIADIC_MACROS)
enum LC_TMBits {
@ -384,7 +347,8 @@ enum JSTraceType_
TT_STRING = 4, /* pointer to JSString */
TT_NULL = 5, /* null */
TT_PSEUDOBOOLEAN = 6, /* true, false, or undefined (0, 1, or 2) */
TT_FUNCTION = 7 /* pointer to JSObject whose class is js_FunctionClass */
TT_FUNCTION = 7, /* pointer to JSObject whose class is js_FunctionClass */
TT_IGNORE = 8
}
#if defined(__GNUC__) && defined(USE_TRACE_TYPE_ENUM)
__attribute__((packed))
@ -409,6 +373,8 @@ typedef Queue<uint16> SlotList;
class TypeMap : public Queue<JSTraceType> {
public:
TypeMap(nanojit::Allocator* alloc) : Queue<JSTraceType>(alloc) {}
void set(unsigned stackSlots, unsigned ngslots,
const JSTraceType* stackTypeMap, const JSTraceType* globalTypeMap);
JS_REQUIRES_STACK void captureTypes(JSContext* cx, JSObject* globalObj, SlotList& slots, unsigned callDepth);
JS_REQUIRES_STACK void captureMissingGlobalTypes(JSContext* cx, JSObject* globalObj, SlotList& slots,
unsigned stackSlots);
@ -633,8 +599,6 @@ struct REHashFn {
}
};
class TreeInfo;
struct FrameInfo {
JSObject* block; // caller block chain head
jsbytecode* pc; // caller fp->regs->pc
@ -695,51 +659,71 @@ enum RecursionStatus
Recursion_Detected /* Tree has down recursion and maybe up recursion. */
};
class TreeInfo {
public:
TreeFragment* const rootFragment;
JSScript* script;
unsigned maxNativeStackSlots;
ptrdiff_t nativeStackBase;
unsigned maxCallDepth;
struct LinkableFragment : public VMFragment
{
LinkableFragment(const void* _ip, nanojit::Allocator* alloc
verbose_only(, uint32_t profFragID))
: VMFragment(_ip verbose_only(, profFragID)), typeMap(alloc), nStackTypes(0)
{ }
uint32 branchCount;
TypeMap typeMap;
unsigned nStackTypes;
SlotList* globalSlots;
};
/*
* argc is cx->fp->argc at the trace loop header, i.e., the number of arguments
* pushed for the innermost JS frame. This is required as part of the fragment
* key because the fragment will write those arguments back to the interpreter
* stack when it exits, using its typemap, which implicitly incorporates a
* given value of argc. Without this feature, a fragment could be called as an
* inner tree with two different values of argc, and entry type checking or
* exit frame synthesis could crash.
*/
struct TreeFragment : public LinkableFragment
{
TreeFragment(const void* _ip, nanojit::Allocator* alloc, JSObject* _globalObj,
uint32 _globalShape, uint32 _argc verbose_only(, uint32_t profFragID)):
LinkableFragment(_ip, alloc verbose_only(, profFragID)),
first(NULL),
next(NULL),
peer(NULL),
globalObj(_globalObj),
globalShape(_globalShape),
argc(_argc),
dependentTrees(alloc),
linkedTrees(alloc),
sideExits(alloc),
gcthings(alloc),
sprops(alloc)
{ }
TreeFragment* first;
TreeFragment* next;
TreeFragment* peer;
JSObject* globalObj;
uint32 globalShape;
uint32 argc;
/* Dependent trees must be trashed if this tree dies, and updated on missing global types */
Queue<TreeFragment*> dependentTrees;
Queue<TreeFragment*> dependentTrees;
/* Linked trees must be updated on missing global types, but are not dependent */
Queue<TreeFragment*> linkedTrees;
Queue<VMSideExit*> sideExits;
UnstableExit* unstableExits;
/* All embedded GC things are registered here so the GC can scan them. */
Queue<jsval> gcthings;
Queue<JSScopeProperty*> sprops;
Queue<TreeFragment*> linkedTrees;
#ifdef DEBUG
const char* treeFileName;
uintN treeLineNumber;
uintN treePCOffset;
#endif
JSScript* script;
RecursionStatus recursion;
TreeInfo(nanojit::Allocator* alloc,
TreeFragment* fragment,
SlotList* globalSlots)
: rootFragment(fragment),
script(NULL),
maxNativeStackSlots(0),
nativeStackBase(0),
maxCallDepth(0),
typeMap(alloc),
nStackTypes(0),
globalSlots(globalSlots),
dependentTrees(alloc),
linkedTrees(alloc),
sideExits(alloc),
unstableExits(NULL),
gcthings(alloc),
sprops(alloc),
recursion(Recursion_None)
{}
UnstableExit* unstableExits;
Queue<VMSideExit*> sideExits;
ptrdiff_t nativeStackBase;
unsigned maxCallDepth;
/* All embedded GC things are registered here so the GC can scan them. */
Queue<jsval> gcthings;
Queue<JSScopeProperty*> sprops;
unsigned maxNativeStackSlots;
inline unsigned nGlobalTypes() {
return typeMap.length() - nStackTypes;
@ -750,13 +734,18 @@ public:
inline JSTraceType* stackTypeMap() {
return typeMap.data();
}
inline JSObject* globalObj() {
return rootFragment->globalObj;
}
JS_REQUIRES_STACK void initialize(JSContext* cx, SlotList *globalSlots);
UnstableExit* removeUnstableExit(VMSideExit* exit);
};
inline TreeFragment*
VMFragment::toTreeFragment()
{
JS_ASSERT(root == this);
return static_cast<TreeFragment*>(this);
}
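To make the role of the TreeFragment key fields concrete, here is a hypothetical peer-lookup sketch (editorial, not part of the patch; the real lookup lives in jstracer.cpp and its exact shape may differ):

    // Sketch: argc is part of the key alongside globalObj/globalShape, so a
    // loop header recorded with one argc is never reused for another
    // (see the comment above TreeFragment).
    static TreeFragment*
    lookupPeer(TreeFragment* first, JSObject* globalObj, uint32 globalShape, uint32 argc)
    {
        for (TreeFragment* f = first; f; f = f->peer) {
            if (f->globalObj == globalObj &&
                f->globalShape == globalShape &&
                f->argc == argc) {
                return f;
            }
        }
        return NULL;
    }
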
typedef enum JSBuiltinStatus {
JSBUILTIN_BAILED = 1,
JSBUILTIN_ERROR = 2
@ -935,8 +924,8 @@ class TraceRecorder
/* The Fragment being recorded by this recording session. */
VMFragment* const fragment;
/* The tree to which this |fragment| will belong when finished. */
TreeInfo* const treeInfo;
/* The root fragment representing the tree. */
TreeFragment* const tree;
/* The reason we started recording. */
RecordReason const recordReason;
@ -965,6 +954,11 @@ class TraceRecorder
nanojit::LIns* const eor_ins;
nanojit::LIns* const loopLabel;
/* Lazy slot import state. */
unsigned importStackSlots;
unsigned importGlobalSlots;
TypeMap importTypeMap;
/*
* The LirBuffer used to supply memory to our LirWriter pipeline. Also contains the most recent
* instruction for {sp, rp, state}. Also contains names for debug JIT spew. Should be split.
@ -1064,17 +1058,20 @@ class TraceRecorder
JS_REQUIRES_STACK nanojit::GuardRecord* createGuardRecord(VMSideExit* exit);
bool isGlobal(jsval* p) const;
ptrdiff_t nativeGlobalSlot(jsval *p) const;
ptrdiff_t nativeGlobalOffset(jsval* p) const;
JS_REQUIRES_STACK ptrdiff_t nativeStackOffset(jsval* p) const;
JS_REQUIRES_STACK ptrdiff_t nativeStackSlot(jsval* p) const;
JS_REQUIRES_STACK ptrdiff_t nativespOffset(jsval* p) const;
JS_REQUIRES_STACK void import(nanojit::LIns* base, ptrdiff_t offset, jsval* p, JSTraceType t,
const char *prefix, uintN index, JSStackFrame *fp);
JS_REQUIRES_STACK void import(TreeInfo* treeInfo, nanojit::LIns* sp, unsigned stackSlots,
JS_REQUIRES_STACK void import(TreeFragment* tree, nanojit::LIns* sp, unsigned stackSlots,
unsigned callDepth, unsigned ngslots, JSTraceType* typeMap);
void trackNativeStackUse(unsigned slots);
JS_REQUIRES_STACK bool isValidSlot(JSScope* scope, JSScopeProperty* sprop);
JS_REQUIRES_STACK bool lazilyImportGlobalSlot(unsigned slot);
JS_REQUIRES_STACK void importGlobalSlot(unsigned slot);
JS_REQUIRES_STACK void guard(bool expected, nanojit::LIns* cond, ExitType exitType);
JS_REQUIRES_STACK void guard(bool expected, nanojit::LIns* cond, VMSideExit* exit);
@ -1148,10 +1145,11 @@ class TraceRecorder
JS_REQUIRES_STACK nanojit::LIns* alu(nanojit::LOpcode op, jsdouble v0, jsdouble v1,
nanojit::LIns* s0, nanojit::LIns* s1);
nanojit::LIns* f2i(nanojit::LIns* f);
nanojit::LIns* f2u(nanojit::LIns* f);
JS_REQUIRES_STACK nanojit::LIns* makeNumberInt32(nanojit::LIns* f);
JS_REQUIRES_STACK nanojit::LIns* stringify(jsval& v);
JS_REQUIRES_STACK nanojit::LIns* newArguments();
JS_REQUIRES_STACK nanojit::LIns* newArguments(nanojit::LIns* callee_ins);
JS_REQUIRES_STACK RecordingStatus call_imacro(jsbytecode* imacro);
@ -1290,7 +1288,7 @@ class TraceRecorder
ExitType exitType);
JS_REQUIRES_STACK RecordingStatus guardNotGlobalObject(JSObject* obj,
nanojit::LIns* obj_ins);
void clearFrameSlotsFromCache();
void clearFrameSlotsFromTracker(Tracker& which);
JS_REQUIRES_STACK void putArguments();
JS_REQUIRES_STACK RecordingStatus guardCallee(jsval& callee);
JS_REQUIRES_STACK JSStackFrame *guardArguments(JSObject *obj, nanojit::LIns* obj_ins,
@ -1365,7 +1363,7 @@ class TraceRecorder
inline void operator delete(void *p) { free(p); }
JS_REQUIRES_STACK
TraceRecorder(JSContext* cx, VMSideExit*, VMFragment*, TreeInfo*,
TraceRecorder(JSContext* cx, VMSideExit*, VMFragment*,
unsigned stackSlots, unsigned ngslots, JSTraceType* typeMap,
VMSideExit* expectedInnerExit, jsbytecode* outerTree,
uint32 outerArgc, RecordReason reason);
@ -1381,6 +1379,7 @@ class TraceRecorder
friend class AdjustCallerGlobalTypesVisitor;
friend class AdjustCallerStackTypesVisitor;
friend class TypeCompatibilityVisitor;
friend class ImportFrameSlotsVisitor;
friend class SlotMap;
friend class DefaultSlotMap;
friend class DetermineTypesVisitor;
@ -1392,14 +1391,14 @@ class TraceRecorder
public:
static bool JS_REQUIRES_STACK
startRecorder(JSContext*, VMSideExit*, VMFragment*, TreeInfo*,
startRecorder(JSContext*, VMSideExit*, VMFragment*,
unsigned stackSlots, unsigned ngslots, JSTraceType* typeMap,
VMSideExit* expectedInnerExit, jsbytecode* outerTree,
uint32 outerArgc, RecordReason reason);
/* Accessors. */
VMFragment* getFragment() const { return fragment; }
TreeInfo* getTreeInfo() const { return treeInfo; }
TreeFragment* getTree() const { return tree; }
bool outOfMemory() const { return traceMonitor->outOfMemory(); }
/* Entry points / callbacks from the interpreter. */

View File

@ -108,13 +108,12 @@ JS_Assert(const char *s, const char *file, JSIntn ln);
*/
extern JS_PUBLIC_API(void) JS_Abort(void);
#if 0
#ifdef DEBUG
# define JS_BASIC_STATS 1
# define JS_SCOPE_DEPTH_METER 1
#endif
#if defined DEBUG && !defined JS_BASIC_STATS
# define JS_BASIC_STATS 1
#ifdef DEBUG_brendan
# define JS_SCOPE_DEPTH_METER 1
#endif
#ifdef JS_BASIC_STATS

View File

@ -380,7 +380,7 @@ imm(const string &s)
}
uint64_t
quad(const string &s)
lquad(const string &s)
{
stringstream tmp(s);
uint64_t ret;
@ -571,7 +571,11 @@ FragmentAssembler::assemble_jump(bool isCond)
return mLir->insBranch(mOpcode, condition, target);
} else {
LIns *ins = mLir->insBranch(mOpcode, condition, NULL);
#ifdef __SUNPRO_CC
mFwdJumps.insert(make_pair<const string, LIns *>(name, ins));
#else
mFwdJumps.insert(make_pair(name, ins));
#endif
return ins;
}
}
@ -842,7 +846,11 @@ FragmentAssembler::assembleFragment(LirTokenStream &in, bool implicitBegin, cons
if (!lab.empty()) {
ins = mLir->ins0(LIR_label);
typedef multimap<string, LIns *> mulmap;
#ifdef __SUNPRO_CC
typedef mulmap::iterator ci;
#else
typedef mulmap::const_iterator ci;
#endif
pair<ci, ci> range = mFwdJumps.equal_range(lab);
for (ci i = range.first; i != range.second; ++i) {
i->second->setTarget(ins);
@ -968,7 +976,7 @@ FragmentAssembler::assembleFragment(LirTokenStream &in, bool implicitBegin, cons
case LIR_quad:
need(1);
ins = mLir->insImmq(quad(mTokens[0]));
ins = mLir->insImmq(lquad(mTokens[0]));
break;
case LIR_float:
@ -976,14 +984,29 @@ FragmentAssembler::assembleFragment(LirTokenStream &in, bool implicitBegin, cons
ins = mLir->insImmf(immf(mTokens[0]));
break;
#if NJ_EXPANDED_LOADSTORE_SUPPORTED
case LIR_stb:
case LIR_sts:
case LIR_st32f:
#endif
case LIR_sti:
case LIR_stqi:
need(3);
ins = mLir->insStorei(ref(mTokens[0]),
ins = mLir->insStore(mOpcode, ref(mTokens[0]),
ref(mTokens[1]),
imm(mTokens[2]));
break;
#if NJ_EXPANDED_LOADSTORE_SUPPORTED
case LIR_ldzb:
case LIR_ldzs:
case LIR_ldsb:
case LIR_ldss:
case LIR_ldcsb:
case LIR_ldcss:
case LIR_ld32f:
case LIR_ldc32f:
#endif
case LIR_ld:
case LIR_ldc:
case LIR_ldq:
@ -1340,11 +1363,24 @@ FragmentAssembler::assembleRandomFragment(int nIns)
I_loads.push_back(LIR_ldc);
I_loads.push_back(LIR_ldcb);
I_loads.push_back(LIR_ldcs);
#if NJ_EXPANDED_LOADSTORE_SUPPORTED
I_loads.push_back(LIR_ldzb);
I_loads.push_back(LIR_ldzs);
I_loads.push_back(LIR_ldsb);
I_loads.push_back(LIR_ldss);
I_loads.push_back(LIR_ldcsb);
I_loads.push_back(LIR_ldcss);
#endif
vector<LOpcode> QorF_loads;
QorF_loads.push_back(LIR_ldq); // weight LIR_ldq the heaviest
QorF_loads.push_back(LIR_ldq);
QorF_loads.push_back(LIR_ldqc);
#if NJ_EXPANDED_LOADSTORE_SUPPORTED
// this loads a 32-bit float and expands to 64-bit float
QorF_loads.push_back(LIR_ld32f);
QorF_loads.push_back(LIR_ldc32f);
#endif
enum LInsClass {
#define CLASS(name, only64bit, relFreq) name,
@ -1748,13 +1784,10 @@ Lirasm::Lirasm(bool verbose) :
#endif
// Populate the mOpMap table.
#define OPDEF(op, number, repkind) \
mOpMap[#op] = LIR_##op;
#define OPD64(op, number, repkind) \
#define OPDEF(op, number, repKind, retType) \
mOpMap[#op] = LIR_##op;
#include "nanojit/LIRopcode.tbl"
#undef OPDEF
#undef OPD64
// TODO - These should alias to the appropriate platform-specific LIR opcode.
mOpMap["alloc"] = mOpMap["ialloc"];

View File

@ -1 +1 @@
23ed78f42df2b7b1a590fc7e986e6d446ef4d3d4
a6a96927117a1e462a04784e1b621a3d85f61099

View File

@ -83,7 +83,6 @@ namespace nanojit
verbose_only( _outputCache = 0; )
verbose_only( outline[0] = '\0'; )
verbose_only( outlineEOL[0] = '\0'; )
verbose_only( outputAddr = false; )
reset();
}
@ -777,7 +776,6 @@ namespace nanojit
}
NIns* fragEntry = genPrologue();
verbose_only( outputAddr=true; )
verbose_only( asm_output("[prologue]"); )
// check for resource leaks
@ -968,7 +966,7 @@ namespace nanojit
switch(op)
{
default:
NanoAssertMsgf(false, "unsupported LIR instruction: %d (~0x40: %d)\n", op, op&~LIR64);
NanoAssertMsgf(false, "unsupported LIR instruction: %d\n", op);
break;
case LIR_regfence:
@ -1063,15 +1061,24 @@ namespace nanojit
asm_cmov(ins);
break;
}
case LIR_ldzb:
case LIR_ldzs:
case LIR_ldsb:
case LIR_ldss:
case LIR_ldcsb:
case LIR_ldcss:
case LIR_ld:
case LIR_ldc:
case LIR_ldcb:
case LIR_ldcs:
{
countlir_ld();
asm_ld(ins);
asm_load32(ins);
break;
}
case LIR_ld32f:
case LIR_ldc32f:
case LIR_ldq:
case LIR_ldqc:
{
@ -1159,27 +1166,30 @@ namespace nanojit
asm_promote(ins);
break;
}
case LIR_stb:
case LIR_sts:
case LIR_sti:
{
countlir_st();
asm_store32(ins->oprnd1(), ins->disp(), ins->oprnd2());
asm_store32(op, ins->oprnd1(), ins->disp(), ins->oprnd2());
break;
}
case LIR_st32f:
case LIR_stqi:
{
countlir_stq();
LIns* value = ins->oprnd1();
LIns* base = ins->oprnd2();
int dr = ins->disp();
if (value->isop(LIR_qjoin))
if (value->isop(LIR_qjoin) && op != LIR_st32f)
{
// this is correct for little-endian only
asm_store32(value->oprnd1(), dr, base);
asm_store32(value->oprnd2(), dr+4, base);
asm_store32(LIR_sti, value->oprnd1(), dr, base);
asm_store32(LIR_sti, value->oprnd2(), dr+4, base);
}
else
{
asm_store64(value, dr, base);
asm_store64(op, value, dr, base);
}
break;
}
@ -1323,8 +1333,7 @@ namespace nanojit
label->addr = _nIns;
}
verbose_only( if (_logc->lcbits & LC_Assembly) {
outputAddr=true; asm_output("[%s]",
_thisfrag->lirbuf->names->formatRef(ins));
asm_output("[%s]", _thisfrag->lirbuf->names->formatRef(ins));
})
break;
}
@ -1407,23 +1416,6 @@ namespace nanojit
case LIR_icall:
{
countlir_call();
Register rr = UnknownReg;
if (ARM_VFP && op == LIR_fcall)
{
// fcall
rr = asm_prep_fcall(ins);
}
else
{
rr = retRegs[0];
prepResultReg(ins, rmask(rr));
}
// do this after we've handled the call result, so we dont
// force the call result to be spilled unnecessarily.
evictScratchRegs();
asm_call(ins);
break;
}
@ -1805,7 +1797,7 @@ namespace nanojit
}
}
}
/**
* Merge the current state of the registers with a previously stored version
* current == saved skip
@ -1825,9 +1817,13 @@ namespace nanojit
// of load/store multiple instructions. Hence iterate the loop the
// other way. The "r <= LastReg" guards against wraparound in
// the case where Register is treated as unsigned and FirstReg is zero.
for (Register r=LastReg; r >= FirstReg && r <= LastReg;
r = prevreg(r))
//
// Note, the loop var is deliberately typed as int (*not* Register)
// to outsmart compilers that will otherwise report
// "error: comparison is always true due to limited range of data type".
for (int ri=LastReg; ri >= FirstReg && ri <= LastReg; ri = int(prevreg(Register(ri))))
{
Register const r = Register(ri);
LIns * curins = _allocator.getActive(r);
LIns * savedins = saved.getActive(r);
if (curins == savedins)

View File

@ -207,9 +207,6 @@ namespace nanojit
// Buffer used to hold extra text to be printed at the end of some
// lines.
static char outlineEOL[512];
// If outputAddr=true the next asm instruction output will
// be prepended with its address.
bool outputAddr, vpad[3];
// Outputs 'outline' and 'outlineEOL', and resets them both.
// Output goes to '_outputCache' if it's non-NULL, or is printed
@ -341,10 +338,9 @@ namespace nanojit
NIns* asm_exit(LInsp guard);
NIns* asm_leave_trace(LInsp guard);
void asm_qjoin(LIns *ins);
void asm_store32(LIns *val, int d, LIns *base);
void asm_store64(LIns *val, int d, LIns *base);
void asm_store32(LOpcode op, LIns *val, int d, LIns *base);
void asm_store64(LOpcode op, LIns *val, int d, LIns *base);
void asm_restore(LInsp, Register);
void asm_load(int d, Register r);
void asm_spilli(LInsp i, bool pop);
void asm_spill(Register rr, int d, bool pop, bool quad);
void asm_load64(LInsp i);
@ -354,7 +350,7 @@ namespace nanojit
void asm_cond(LInsp i);
void asm_arith(LInsp i);
void asm_neg_not(LInsp i);
void asm_ld(LInsp i);
void asm_load32(LInsp i);
void asm_cmov(LInsp i);
void asm_param(LInsp i);
void asm_int(LInsp i);
@ -365,7 +361,6 @@ namespace nanojit
void asm_i2f(LInsp ins);
void asm_u2f(LInsp ins);
void asm_promote(LIns *ins);
Register asm_prep_fcall(LInsp ins);
void asm_nongp_copy(Register r, Register s);
void asm_call(LInsp);
Register asm_binop_rhs_reg(LInsp ins);

View File

@ -45,27 +45,29 @@ namespace nanojit
#ifdef FEATURE_NANOJIT
const uint8_t repKinds[] = {
#define OPDEF(op, number, repkind) \
LRK_##repkind,
#define OPD64(op, number, repkind) \
LRK_##repkind,
#define OPDEF(op, number, repKind, retType) \
LRK_##repKind,
#include "LIRopcode.tbl"
#undef OPDEF
#undef OPD64
0
};
const LTy retTypes[] = {
#define OPDEF(op, number, repKind, retType) \
LTy_##retType,
#include "LIRopcode.tbl"
#undef OPDEF
LTy_Void
};
// LIR verbose specific
#ifdef NJ_VERBOSE
const char* lirNames[] = {
#define OPDEF(op, number, repkind) \
#op,
#define OPD64(op, number, repkind) \
#define OPDEF(op, number, repKind, retType) \
#op,
#include "LIRopcode.tbl"
#undef OPDEF
#undef OPD64
NULL
};
@ -223,9 +225,8 @@ namespace nanojit
return startOfRoom;
}
LInsp LirBufWriter::insStorei(LInsp val, LInsp base, int32_t d)
LInsp LirBufWriter::insStore(LOpcode op, LInsp val, LInsp base, int32_t d)
{
LOpcode op = val->isQuad() ? LIR_stqi : LIR_sti;
base = insDisp(op, base, d);
LInsSti* insSti = (LInsSti*)_buf->makeRoom(sizeof(LInsSti));
LIns* ins = insSti->getLIns();
@ -328,22 +329,22 @@ namespace nanojit
LInsp LirBufWriter::insImmq(uint64_t imm)
{
LInsI64* insI64 = (LInsI64*)_buf->makeRoom(sizeof(LInsI64));
LIns* ins = insI64->getLIns();
ins->initLInsI64(LIR_quad, imm);
LInsN64* insN64 = (LInsN64*)_buf->makeRoom(sizeof(LInsN64));
LIns* ins = insN64->getLIns();
ins->initLInsN64(LIR_quad, imm);
return ins;
}
LInsp LirBufWriter::insImmf(double d)
{
LInsI64* insI64 = (LInsI64*)_buf->makeRoom(sizeof(LInsI64));
LIns* ins = insI64->getLIns();
LInsN64* insN64 = (LInsN64*)_buf->makeRoom(sizeof(LInsN64));
LIns* ins = insN64->getLIns();
union {
double d;
uint64_t q;
} u;
u.d = d;
ins->initLInsI64(LIR_float, u.q);
ins->initLInsN64(LIR_float, u.q);
return ins;
}
@ -352,13 +353,10 @@ namespace nanojit
{
static const uint8_t insSizes[] = {
// LIR_start is treated specially -- see below.
#define OPDEF(op, number, repkind) \
((number) == LIR_start ? 0 : sizeof(LIns##repkind)),
#define OPD64(op, number, repkind) \
OPDEF(op, number, repkind)
#define OPDEF(op, number, repKind, retType) \
((number) == LIR_start ? 0 : sizeof(LIns##repKind)),
#include "LIRopcode.tbl"
#undef OPDEF
#undef OPD64
0
};
@ -381,6 +379,33 @@ namespace nanojit
return ret;
}
LOpcode f64arith_to_i32arith(LOpcode op)
{
switch (op) {
case LIR_fneg: return LIR_neg;
case LIR_fadd: return LIR_add;
case LIR_fsub: return LIR_sub;
case LIR_fmul: return LIR_mul;
default: NanoAssert(0); return LIR_skip;
}
}
LOpcode i32cmp_to_i64cmp(LOpcode op)
{
switch (op) {
case LIR_eq: return LIR_qeq;
case LIR_lt: return LIR_qlt;
case LIR_gt: return LIR_qgt;
case LIR_le: return LIR_qle;
case LIR_ge: return LIR_qge;
case LIR_ult: return LIR_qult;
case LIR_ugt: return LIR_qugt;
case LIR_ule: return LIR_qule;
case LIR_uge: return LIR_quge;
default: NanoAssert(0); return LIR_skip;
}
}
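A brief usage sketch for the two helpers above (hypothetical caller, not from the patch):

    // Sketch: widen a 32-bit compare to its 64-bit counterpart when the
    // operands are pointer-sized on a 64-bit build.
    LOpcode cmp = LIR_lt;
    #ifdef NANOJIT_64BIT
    cmp = i32cmp_to_i64cmp(cmp);        // yields LIR_qlt
    #endif
    // f64arith_to_i32arith goes the other way for arithmetic, e.g.
    // f64arith_to_i32arith(LIR_fadd) == LIR_add; both helpers assert on
    // opcodes outside their expected ranges.
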
// This is never called, but that's ok because it contains only static
// assertions.
void LIns::staticSanityCheck()
@ -400,9 +425,9 @@ namespace nanojit
NanoStaticAssert(sizeof(LInsP) == 2*sizeof(void*));
NanoStaticAssert(sizeof(LInsI) == 2*sizeof(void*));
#if defined NANOJIT_64BIT
NanoStaticAssert(sizeof(LInsI64) == 2*sizeof(void*));
NanoStaticAssert(sizeof(LInsN64) == 2*sizeof(void*));
#else
NanoStaticAssert(sizeof(LInsI64) == 3*sizeof(void*));
NanoStaticAssert(sizeof(LInsN64) == 3*sizeof(void*));
#endif
// oprnd_1 must be in the same position in LIns{Op1,Op2,Op3,Ld,Sti}
@ -859,6 +884,12 @@ namespace nanojit
#endif
}
LIns* LirWriter::insStorei(LIns* value, LIns* base, int32_t d)
{
LOpcode op = value->isQuad() ? LIR_stqi : LIR_sti;
return insStore(op, value, base, d);
}
LIns* LirWriter::qjoin(LInsp lo, LInsp hi)
{
return ins2(LIR_qjoin, lo, hi);
@ -1483,8 +1514,16 @@ namespace nanojit
case LIR_ldc:
case LIR_ldq:
case LIR_ldqc:
case LIR_ldzb:
case LIR_ldzs:
case LIR_ldcb:
case LIR_ldcs:
case LIR_ldsb:
case LIR_ldss:
case LIR_ldcsb:
case LIR_ldcss:
case LIR_ld32f:
case LIR_ldc32f:
case LIR_ret:
case LIR_fret:
case LIR_live:
@ -1510,6 +1549,8 @@ namespace nanojit
case LIR_sti:
case LIR_stqi:
case LIR_stb:
case LIR_sts:
case LIR_eq:
case LIR_lt:
case LIR_gt:
@ -1884,8 +1925,16 @@ namespace nanojit
case LIR_ldc:
case LIR_ldq:
case LIR_ldqc:
case LIR_ldzb:
case LIR_ldzs:
case LIR_ldcb:
case LIR_ldcs:
case LIR_ldsb:
case LIR_ldss:
case LIR_ldcsb:
case LIR_ldcss:
case LIR_ld32f:
case LIR_ldc32f:
VMPI_sprintf(s, "%s = %s %s[%d]", formatRef(i), lirNames[op],
formatRef(i->oprnd1()),
i->disp());
@ -1893,6 +1942,9 @@ namespace nanojit
case LIR_sti:
case LIR_stqi:
case LIR_stb:
case LIR_sts:
case LIR_st32f:
VMPI_sprintf(s, "%s %s[%d] = %s", lirNames[op],
formatRef(i->oprnd2()),
i->disp(),
@ -2187,12 +2239,28 @@ namespace nanojit
LInsp LoadFilter::insLoad(LOpcode v, LInsp base, int32_t disp)
{
if (base != sp && base != rp && (v == LIR_ld || v == LIR_ldq)) {
uint32_t k;
LInsp ins = exprs->findLoad(v, base, disp, k);
if (ins)
return ins;
return exprs->add(LInsLoad, out->insLoad(v,base,disp), k);
if (base != sp && base != rp)
{
switch (v)
{
case LIR_ld:
case LIR_ldq:
case LIR_ld32f:
case LIR_ldsb:
case LIR_ldss:
case LIR_ldzb:
case LIR_ldzs:
{
uint32_t k;
LInsp ins = exprs->findLoad(v, base, disp, k);
if (ins)
return ins;
return exprs->add(LInsLoad, out->insLoad(v,base,disp), k);
}
default:
// fall thru
break;
}
}
return out->insLoad(v, base, disp);
}
@ -2203,10 +2271,10 @@ namespace nanojit
exprs->clear();
}
LInsp LoadFilter::insStorei(LInsp v, LInsp b, int32_t d)
LInsp LoadFilter::insStore(LOpcode op, LInsp v, LInsp b, int32_t d)
{
clear(b);
return out->insStorei(v, b, d);
return out->insStore(op, v, b, d);
}
LInsp LoadFilter::insCall(const CallInfo *ci, LInsp args[])
@ -2250,18 +2318,12 @@ namespace nanojit
const void *end = (const char*)start + e->size;
const char *name = e->name;
if (p == start) {
if (!(logc->lcbits & LC_NoCodeAddrs))
VMPI_sprintf(b,"%p %s",p,name);
else
VMPI_strcpy(b, name);
VMPI_sprintf(b,"%p %s",p,name);
return dup(b);
}
else if (p > start && p < end) {
int32_t d = int32_t(intptr_t(p)-intptr_t(start)) >> e->align;
if (!(logc->lcbits & LC_NoCodeAddrs))
VMPI_sprintf(b, "%p %s+%d", p, name, d);
else
VMPI_sprintf(b,"%s+%d", name, d);
VMPI_sprintf(b, "%p %s+%d", p, name, d);
return dup(b);
}
else {

View File

@ -120,16 +120,28 @@ namespace nanojit
};
inline bool isCseOpcode(LOpcode op) {
op = LOpcode(op & ~LIR64);
return op >= LIR_int && op <= LIR_uge;
return (op >= LIR_int && op <= LIR_uge) ||
(op >= LIR_quad && op <= LIR_quge);
}
inline bool isRetOpcode(LOpcode op) {
return (op & ~LIR64) == LIR_ret;
return op == LIR_ret || op == LIR_fret;
}
LOpcode f64arith_to_i32arith(LOpcode op);
LOpcode i32cmp_to_i64cmp(LOpcode op);
// Array holding the 'repkind' field from LIRopcode.tbl.
// Array holding the 'repKind' field from LIRopcode.tbl.
extern const uint8_t repKinds[];
enum LTy {
LTy_Void, // no value/no type
LTy_I32, // 32-bit integer
LTy_I64, // 64-bit integer
LTy_F64 // 64-bit float
};
// Array holding the 'retType' field from LIRopcode.tbl.
extern const LTy retTypes[];
//-----------------------------------------------------------------------
// Low-level instructions. This is a bit complicated, because we have a
// variable-width representation to minimise space usage.
@ -213,7 +225,7 @@ namespace nanojit
LRK_C,
LRK_P,
LRK_I,
LRK_I64,
LRK_N64,
LRK_Jtbl,
LRK_None // this one is used for unused opcode numbers
};
@ -228,7 +240,7 @@ namespace nanojit
class LInsC;
class LInsP;
class LInsI;
class LInsI64;
class LInsN64;
class LInsJtbl;
class LIns
@ -265,7 +277,7 @@ namespace nanojit
inline LInsC* toLInsC() const;
inline LInsP* toLInsP() const;
inline LInsI* toLInsI() const;
inline LInsI64* toLInsI64() const;
inline LInsN64* toLInsN64() const;
inline LInsJtbl*toLInsJtbl()const;
void staticSanityCheck();
@ -284,7 +296,7 @@ namespace nanojit
inline void initLInsC(LOpcode opcode, LIns** args, const CallInfo* ci);
inline void initLInsP(int32_t arg, int32_t kind);
inline void initLInsI(LOpcode opcode, int32_t imm32);
inline void initLInsI64(LOpcode opcode, int64_t imm64);
inline void initLInsN64(LOpcode opcode, int64_t imm64);
inline void initLInsJtbl(LIns* index, uint32_t size, LIns** table);
LOpcode opcode() const { return lastWord.opcode; }
@ -349,7 +361,7 @@ namespace nanojit
// For LInsI.
inline int32_t imm32() const;
// For LInsI64.
// For LInsN64.
inline int32_t imm64_0() const;
inline int32_t imm64_1() const;
inline uint64_t imm64() const;
@ -416,9 +428,9 @@ namespace nanojit
NanoAssert(LRK_None != repKinds[opcode()]);
return LRK_I == repKinds[opcode()];
}
bool isLInsI64() const {
bool isLInsN64() const {
NanoAssert(LRK_None != repKinds[opcode()]);
return LRK_I64 == repKinds[opcode()];
return LRK_N64 == repKinds[opcode()];
}
bool isLInsJtbl() const {
NanoAssert(LRK_None != repKinds[opcode()]);
@ -436,48 +448,35 @@ namespace nanojit
return opcode() == o;
}
bool isQuad() const {
LOpcode op = opcode();
#ifdef NANOJIT_64BIT
// callh in 64bit cpu's means a call that returns an int64 in a single register
return (!(op >= LIR_qeq && op <= LIR_quge) && (op & LIR64) != 0) ||
op == LIR_callh;
#else
// callh in 32bit cpu's means the 32bit MSW of an int64 result in 2 registers
return (op & LIR64) != 0;
#endif
LTy ty = retTypes[opcode()];
return ty == LTy_I64 || ty == LTy_F64;
}
bool isCond() const {
LOpcode op = opcode();
return (op == LIR_ov) || isCmp();
return (isop(LIR_ov)) || isCmp();
}
bool isFloat() const; // not inlined because it contains a switch
bool isCmp() const {
LOpcode op = opcode();
return (op >= LIR_eq && op <= LIR_uge) ||
return (op >= LIR_eq && op <= LIR_uge) ||
(op >= LIR_qeq && op <= LIR_quge) ||
(op >= LIR_feq && op <= LIR_fge);
}
bool isCall() const {
LOpcode op = opcode();
return (op & ~LIR64) == LIR_icall || op == LIR_qcall;
return isop(LIR_icall) || isop(LIR_fcall) || isop(LIR_qcall);
}
bool isStore() const {
LOpcode op = LOpcode(opcode() & ~LIR64);
return op == LIR_sti;
return isLInsSti();
}
bool isLoad() const {
LOpcode op = opcode();
return op == LIR_ldq || op == LIR_ld || op == LIR_ldc ||
op == LIR_ldqc || op == LIR_ldcs || op == LIR_ldcb;
return isLInsLd();
}
bool isGuard() const {
LOpcode op = opcode();
return op == LIR_x || op == LIR_xf || op == LIR_xt ||
op == LIR_xbarrier || op == LIR_xtbl;
return isop(LIR_x) || isop(LIR_xf) || isop(LIR_xt) ||
isop(LIR_xbarrier) || isop(LIR_xtbl);
}
// True if the instruction is a 32-bit or smaller constant integer.
bool isconst() const {
return opcode() == LIR_int;
return isop(LIR_int);
}
// True if the instruction is a 32-bit or smaller constant integer and
// has the value val when treated as a 32-bit signed integer.
@ -486,7 +485,7 @@ namespace nanojit
}
// True if the instruction is a constant quad value.
bool isconstq() const {
return opcode() == LIR_quad || opcode() == LIR_float;
return isop(LIR_quad) || isop(LIR_float);
}
// True if the instruction is a constant pointer value.
bool isconstp() const
@ -499,7 +498,7 @@ namespace nanojit
}
// True if the instruction is a constant float value.
bool isconstf() const {
return opcode() == LIR_float;
return isop(LIR_float);
}
bool isBranch() const {
@ -508,16 +507,16 @@ namespace nanojit
bool isPtr() {
#ifdef NANOJIT_64BIT
return isQuad();
return retTypes[opcode()] == LTy_I64;
#else
return !isQuad();
return retTypes[opcode()] == LTy_I32;
#endif
}
// Return true if removal of 'ins' from a LIR fragment could
// possibly change the behaviour of that fragment, even if any
// value computed by 'ins' is not used later in the fragment.
// In other words, can 'ins' possible alter control flow or memory?
// In other words, can 'ins' possibly alter control flow or memory?
// Note, this assumes that loads will never fault and hence cannot
// affect the control flow.
bool isStmt() {
@ -701,8 +700,8 @@ namespace nanojit
LIns* getLIns() { return &ins; };
};
// Used for LIR_quad.
class LInsI64
// Used for LIR_quad and LIR_float.
class LInsN64
{
private:
friend class LIns;
@ -750,7 +749,7 @@ namespace nanojit
LInsC* LIns::toLInsC() const { return (LInsC* )( uintptr_t(this+1) - sizeof(LInsC ) ); }
LInsP* LIns::toLInsP() const { return (LInsP* )( uintptr_t(this+1) - sizeof(LInsP ) ); }
LInsI* LIns::toLInsI() const { return (LInsI* )( uintptr_t(this+1) - sizeof(LInsI ) ); }
LInsI64* LIns::toLInsI64() const { return (LInsI64*)( uintptr_t(this+1) - sizeof(LInsI64) ); }
LInsN64* LIns::toLInsN64() const { return (LInsN64*)( uintptr_t(this+1) - sizeof(LInsN64) ); }
LInsJtbl*LIns::toLInsJtbl()const { return (LInsJtbl*)(uintptr_t(this+1) - sizeof(LInsJtbl)); }
void LIns::initLInsOp0(LOpcode opcode) {
@ -821,12 +820,12 @@ namespace nanojit
toLInsI()->imm32 = imm32;
NanoAssert(isLInsI());
}
void LIns::initLInsI64(LOpcode opcode, int64_t imm64) {
void LIns::initLInsN64(LOpcode opcode, int64_t imm64) {
markAsClear();
lastWord.opcode = opcode;
toLInsI64()->imm64_0 = int32_t(imm64);
toLInsI64()->imm64_1 = int32_t(imm64 >> 32);
NanoAssert(isLInsI64());
toLInsN64()->imm64_0 = int32_t(imm64);
toLInsN64()->imm64_1 = int32_t(imm64 >> 32);
NanoAssert(isLInsN64());
}
void LIns::initLInsJtbl(LIns* index, uint32_t size, LIns** table) {
markAsClear();
@ -898,11 +897,11 @@ namespace nanojit
inline int32_t LIns::imm32() const { NanoAssert(isconst()); return toLInsI()->imm32; }
inline int32_t LIns::imm64_0() const { NanoAssert(isconstq()); return toLInsI64()->imm64_0; }
inline int32_t LIns::imm64_1() const { NanoAssert(isconstq()); return toLInsI64()->imm64_1; }
inline int32_t LIns::imm64_0() const { NanoAssert(isconstq()); return toLInsN64()->imm64_0; }
inline int32_t LIns::imm64_1() const { NanoAssert(isconstq()); return toLInsN64()->imm64_1; }
uint64_t LIns::imm64() const {
NanoAssert(isconstq());
return (uint64_t(toLInsI64()->imm64_1) << 32) | uint32_t(toLInsI64()->imm64_0);
return (uint64_t(toLInsN64()->imm64_1) << 32) | uint32_t(toLInsN64()->imm64_0);
}
double LIns::imm64f() const {
union {
@ -1006,8 +1005,8 @@ namespace nanojit
virtual LInsp insLoad(LOpcode op, LIns* base, int32_t d) {
return out->insLoad(op, base, d);
}
virtual LInsp insStorei(LIns* value, LIns* base, int32_t d) {
return out->insStorei(value, base, d);
virtual LInsp insStore(LOpcode op, LIns* value, LIns* base, int32_t d) {
return out->insStore(op, value, base, d);
}
// args[] is in reverse order, ie. args[0] holds the rightmost arg.
virtual LInsp insCall(const CallInfo *call, LInsp args[]) {
@ -1039,6 +1038,8 @@ namespace nanojit
// Sign or zero extend integers to native integers. On 32-bit this is a no-op.
LIns* ins_i2p(LIns* intIns);
LIns* ins_u2p(LIns* uintIns);
// choose LIR_sti or LIR_stqi based on size of value
LIns* insStorei(LIns* value, LIns* base, int32_t d);
};
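A small hedged example of the reworked store interface (the writer and operand variables here are hypothetical):

    // Sketch: callers may now name the store width explicitly via insStore(),
    // or keep using insStorei(), which picks LIR_sti or LIR_stqi from the
    // operand width as before.
    lir->insStore(LIR_sts, val32, base, 0);   // store the low 16 bits of a 32-bit value
    lir->insStorei(valq, base, 8);            // quad operand -> LIR_stqi
    lir->insStorei(vali, base, 16);           // 32-bit operand -> LIR_sti
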
@ -1098,10 +1099,10 @@ namespace nanojit
char* name;
};
HashMap<LInsp, Entry*> names;
LabelMap *labels;
void formatImm(int32_t c, char *buf);
public:
public:
LabelMap *labels;
LirNameMap(Allocator& alloc, LabelMap *lm)
: alloc(alloc),
lircounts(alloc),
@ -1192,8 +1193,8 @@ namespace nanojit
LIns* insLoad(LOpcode v, LInsp base, int32_t disp) {
return add(out->insLoad(v, base, disp));
}
LIns* insStorei(LInsp v, LInsp b, int32_t d) {
return add(out->insStorei(v, b, d));
LIns* insStore(LOpcode op, LInsp v, LInsp b, int32_t d) {
return add(out->insStore(op, v, b, d));
}
LIns* insAlloc(int32_t size) {
return add(out->insAlloc(size));
@ -1374,7 +1375,7 @@ namespace nanojit
// LirWriter interface
LInsp insLoad(LOpcode op, LInsp base, int32_t disp);
LInsp insStorei(LInsp o1, LInsp o2, int32_t disp);
LInsp insStore(LOpcode op, LInsp o1, LInsp o2, int32_t disp);
LInsp ins0(LOpcode op);
LInsp ins1(LOpcode op, LInsp o1);
LInsp ins2(LOpcode op, LInsp o1, LInsp o2);
@ -1483,7 +1484,7 @@ namespace nanojit
LInsp ins0(LOpcode);
LInsp insLoad(LOpcode, LInsp base, int32_t disp);
LInsp insStorei(LInsp v, LInsp b, int32_t d);
LInsp insStore(LOpcode op, LInsp v, LInsp b, int32_t d);
LInsp insCall(const CallInfo *call, LInsp args[]);
};

View File

@ -42,16 +42,16 @@
* Definitions of LIR opcodes. If you need to allocate an opcode, look
* for a name beginning with "__" and claim it.
*
* Includers must define OPDEF and OPD64 macros of the following forms:
* Includers must define an OPDEF macro of the following form:
*
* #define OPDEF(op,val,repkind) ...
* #define OPD64(op,val,repkind) ...
* #define OPDEF(op, val, repKind, retType) ...
*
* Selected arguments can then be used within the macro expansions.
* - op Bytecode name, token-pasted after "LIR_" to form an LOpcode.
* - val Bytecode value, which is the LOpcode enumerator value.
* - repkind Indicates how the instruction is represented in memory; XYZ
* - repKind Indicates how the instruction is represented in memory; XYZ
* corresponds to LInsXYZ and LRK_XYZ.
* - retType Type (LTy) of the value returned by the instruction.
*
* This file is best viewed with 128 columns:
12345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678
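For illustration (an editorial sketch mirroring how LIR.cpp builds its lirNames/repKinds tables, with a made-up table name), an includer expands the table by defining the four-argument OPDEF around the #include:

    // Sketch: build a name table from the opcode table.
    #define OPDEF(op, number, repKind, retType) #op,
    static const char* opNames[] = {
    #include "LIRopcode.tbl"
        NULL
    };
    #undef OPDEF
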
@ -63,36 +63,36 @@
/* op val name operands */
/* special operations (must be 0..N) */
OPDEF(start, 0, Op0) // start of a fragment
OPDEF(regfence, 1, Op0) // register fence, no register allocation is allowed across this meta instruction
OPDEF(skip, 2, Sk) // holds blobs ("payloads") of data; also links pages
OPDEF(__3, 3, None)
OPDEF(__4, 4, None)
OPDEF(__5, 5, None)
OPDEF(__6, 6, None)
OPDEF(start, 0, Op0, Void) // start of a fragment
OPDEF(regfence, 1, Op0, Void) // register fence, no register allocation is allowed across this meta instruction
OPDEF(skip, 2, Sk, Void) // used to link code chunks
/* non-pure operations */
OPDEF(iaddp, 7, Op2) // integer addition for temporary pointer calculations (32bit only)
OPDEF(iparam, 8, P) // load a parameter (32bit register or stk location)
OPDEF(__9, 9, None)
OPDEF(ld, 10, Ld) // 32-bit load
OPDEF(ialloc, 11, I) // alloc some stack space (value is 32bit address)
OPDEF(sti, 12, Sti) // 32-bit store
OPDEF(ret, 13, Op1) // return a word-sized value
OPDEF(live, 14, Op1) // extend live range of reference
OPDEF(flive, 15, Op1) // extend live range of a floating point value reference
OPDEF(icall, 16, C) // subroutine call returning a 32-bit value
OPDEF(__17, 17, None)
OPDEF(ldsb, 3, Ld, I32) // 8-bit integer load, sign-extend to 32-bit
OPDEF(ldss, 4, Ld, I32) // 16-bit integer load, sign-extend to 32-bit
OPDEF(ldzb, 5, Ld, I32) // 8-bit integer load, zero extend to 32-bit
OPDEF(ldzs, 6, Ld, I32) // 16-bit integer load, zero extend to 32-bit
OPDEF(iaddp, 7, Op2, I32) // integer addition for temporary pointer calculations (32bit only)
OPDEF(iparam, 8, P, I32) // load a parameter (32bit register or stk location)
OPDEF(stb, 9, Sti, Void) // 8-bit integer store
OPDEF(ld, 10, Ld, I32) // 32-bit integer load
OPDEF(ialloc, 11, I, I32) // alloc some stack space (value is 32bit address)
OPDEF(sti, 12, Sti, Void) // 32-bit integer store
OPDEF(ret, 13, Op1, Void) // return a word-sized value
OPDEF(live, 14, Op1, Void) // extend live range of reference
OPDEF(flive, 15, Op1, Void) // extend live range of a floating point value reference
OPDEF(icall, 16, C, I32) // subroutine call returning a 32-bit value
OPDEF(sts, 17, Sti, Void) // 16-bit integer store
/* guards */
OPDEF(x, 18, Op2) // exit always
OPDEF(x, 18, Op2, Void) // exit always
/* branches */
OPDEF(j, 19, Op2) // jump always
OPDEF(jt, 20, Op2) // jump if true
OPDEF(jf, 21, Op2) // jump if false
OPDEF(label, 22, Op0) // a jump target (no machine code is emitted for this)
OPDEF(jtbl, 23, Jtbl) // jump to address in table
OPDEF(j, 19, Op2, Void) // jump always
OPDEF(jt, 20, Op2, Void) // jump if true
OPDEF(jf, 21, Op2, Void) // jump if false
OPDEF(label, 22, Op0, Void) // a jump target (no machine code is emitted for this)
OPDEF(jtbl, 23, Jtbl, Void) // jump to address in table
/* operators */
@ -101,158 +101,154 @@ OPDEF(jtbl, 23, Jtbl) // jump to address in table
* common-subexpression-elimination detection code.
*/
OPDEF(int, 24, I) // constant 32-bit integer
OPDEF(cmov, 25, Op3) // conditional move
OPDEF(callh, 26, Op1) // get the high 32 bits of a call returning a 64-bit value in two 32bit registers
OPDEF(int, 24, I, I32) // constant 32-bit integer
OPDEF(cmov, 25, Op3, I32) // conditional move
OPDEF(callh, 26, Op1, I32) // get the high 32 bits of a call returning a 64-bit value in two 32bit registers
/*
* feq though fge must only be used on float arguments. They return integers.
* For all except feq, (op ^ 1) is the op which flips the
* left and right sides of the comparison, so (lt ^ 1) == gt, or the operator
* "<" is xored with 1 to get ">". Similarly, (op ^ 3) is the complement of
* op, so (lt ^ 1) == ge, or the complement of the operator "<" is ">=" xored
* with 3. NB: These opcodes must remain continuous so that comparison-opcode
* detection works correctly.
*/
OPDEF(feq, 27, Op2) // floating-point equality
OPDEF(flt, 28, Op2) // floating-point less-than
OPDEF(fgt, 29, Op2) // floating-point greater-than
OPDEF(fle, 30, Op2) // floating-point less-than-or-equal
OPDEF(fge, 31, Op2) // floating-point greater-than-or-equal
// feq through fge must only be used on float arguments. They return integers.
// For all except feq, (op ^ 1) is the op which flips the
// left and right sides of the comparison, so (lt ^ 1) == gt, or the operator
// "<" is xored with 1 to get ">". Similarly, (op ^ 3) is the complement of
// op, so (lt ^ 3) == ge, or the complement of the operator "<" is ">=" xored
// with 3. NB: These opcodes must remain continuous so that comparison-opcode
// detection works correctly.
OPDEF(feq, 27, Op2, I32) // floating-point equality
OPDEF(flt, 28, Op2, I32) // floating-point less-than
OPDEF(fgt, 29, Op2, I32) // floating-point greater-than
OPDEF(fle, 30, Op2, I32) // floating-point less-than-or-equal
OPDEF(fge, 31, Op2, I32) // floating-point greater-than-or-equal
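As an editorial aside, the flip/complement identities the comment above relies on can be spot-checked with the existing NanoStaticAssert macro; these assertions are illustrative only and are not part of the table:

    // Sketch: op ^ 1 swaps operand order, op ^ 3 negates the comparison.
    NanoStaticAssert((LIR_flt ^ 1) == LIR_fgt);   // flt(a,b) == fgt(b,a)
    NanoStaticAssert((LIR_fle ^ 1) == LIR_fge);   // fle(a,b) == fge(b,a)
    NanoStaticAssert((LIR_flt ^ 3) == LIR_fge);   // !(a < b)  ==  a >= b
    NanoStaticAssert((LIR_fgt ^ 3) == LIR_fle);   // !(a > b)  ==  a <= b
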
OPDEF(ldcb, 32, Ld) // non-volatile 8-bit load
OPDEF(ldcs, 33, Ld) // non-volatile 16-bit load
OPDEF(ldc, 34, Ld) // non-volatile 32-bit load
OPDEF(ldcb, 32, Ld, I32) // non-volatile 8-bit integer load, zero-extended to 32-bit
OPDEF(ldcs, 33, Ld, I32) // non-volatile 16-bit integer load, zero-extended to 32-bit
OPDEF(ldc, 34, Ld, I32) // non-volatile 32-bit integer load, zero-extended to 32-bit
OPDEF(neg, 35, Op1) // integer negation
OPDEF(add, 36, Op2) // integer addition
OPDEF(sub, 37, Op2) // integer subtraction
OPDEF(mul, 38, Op2) // integer multiplication
OPDEF(div, 39, Op2) // integer division
OPDEF(mod, 40, Op1) // hack: get the modulus from a LIR_div result, for x86 only
OPDEF(neg, 35, Op1, I32) // integer negation
OPDEF(add, 36, Op2, I32) // integer addition
OPDEF(sub, 37, Op2, I32) // integer subtraction
OPDEF(mul, 38, Op2, I32) // integer multiplication
OPDEF(div, 39, Op2, I32) // integer division
OPDEF(mod, 40, Op1, I32) // hack: get the modulus from a LIR_div result, for x86 only
OPDEF(and, 41, Op2) // 32-bit bitwise AND
OPDEF(or, 42, Op2) // 32-bit bitwise OR
OPDEF(xor, 43, Op2) // 32-bit bitwise XOR
OPDEF(not, 44, Op1) // 32-bit bitwise NOT
OPDEF(lsh, 45, Op2) // 32-bit left shift
OPDEF(rsh, 46, Op2) // 32-bit right shift with sign-extend (>>)
OPDEF(ush, 47, Op2) // 32-bit unsigned right shift (>>>)
OPDEF(and, 41, Op2, I32) // 32-bit bitwise AND
OPDEF(or, 42, Op2, I32) // 32-bit bitwise OR
OPDEF(xor, 43, Op2, I32) // 32-bit bitwise XOR
OPDEF(not, 44, Op1, I32) // 32-bit bitwise NOT
OPDEF(lsh, 45, Op2, I32) // 32-bit left shift
OPDEF(rsh, 46, Op2, I32) // 32-bit right shift with sign-extend (>>)
OPDEF(ush, 47, Op2, I32) // 32-bit unsigned right shift (>>>)
// conditional guards, op^1 to complement. Only things that are
// Conditional guards, op^1 to complement. Only things that are
// isCond() can be passed to these.
OPDEF(xt, 48, Op2) // exit if true (0x30 0011 0000)
OPDEF(xf, 49, Op2) // exit if false (0x31 0011 0001)
OPDEF(xt, 48, Op2, Void) // exit if true (0x30 0011 0000)
OPDEF(xf, 49, Op2, Void) // exit if false (0x31 0011 0001)
OPDEF(qlo, 50, Op1) // get the low 32 bits of a 64-bit value
OPDEF(qhi, 51, Op1) // get the high 32 bits of a 64-bit value
OPDEF(qlo, 50, Op1, I32) // get the low 32 bits of a 64-bit value
OPDEF(qhi, 51, Op1, I32) // get the high 32 bits of a 64-bit value
OPDEF(__52, 52, None)
OPDEF(__53, 53, None)
OPDEF(ldcsb, 52, Ld, Void) // non-volatile 8-bit integer load, sign-extended to 32-bit
OPDEF(ldcss, 53, Ld, Void) // non-volatile 16-bit integer load, sign-extended to 32-bit
// This must be right before LIR_eq, so (op&~LIR64 - LIR_ov) can be indexed
// into a convenient table.
OPDEF(ov, 54, Op1) // test for overflow; value must have just been computed
OPDEF(ov, 54, Op1, I32) // test for overflow; value must have just been computed
// Integer (32 bit) relational operators. (op ^ 1) is the op which flips the
// Integer (32-bit) relational operators. (op ^ 1) is the op which flips the
// left and right sides of the comparison, so (lt ^ 1) == gt, or the operator
// "<" is xored with 1 to get ">". Similarly, (op ^ 3) is the complement of
// op, so (lt ^ 3) == ge, or the complement of the operator "<" is ">=" xored
// with 3. 'u' prefix indicates the unsigned integer variant.
// NB: These opcodes must remain continuous so that comparison-opcode detection
// works correctly.
OPDEF(eq, 55, Op2) // integer equality
OPDEF(lt, 56, Op2) // signed integer less-than (0x38 0011 1000)
OPDEF(gt, 57, Op2) // signed integer greater-than (0x39 0011 1001)
OPDEF(le, 58, Op2) // signed integer less-than-or-equal (0x3A 0011 1010)
OPDEF(ge, 59, Op2) // signed integer greater-than-or-equal (0x3B 0011 1011)
OPDEF(ult, 60, Op2) // unsigned integer less-than (0x3C 0011 1100)
OPDEF(ugt, 61, Op2) // unsigned integer greater-than (0x3D 0011 1101)
OPDEF(ule, 62, Op2) // unsigned integer less-than-or-equal (0x3E 0011 1110)
OPDEF(uge, 63, Op2) // unsigned integer greater-than-or-equal (0x3F 0011 1111)
OPDEF(eq, 55, Op2, I32) // integer equality
OPDEF(lt, 56, Op2, I32) // signed integer less-than (0x38 0011 1000)
OPDEF(gt, 57, Op2, I32) // signed integer greater-than (0x39 0011 1001)
OPDEF(le, 58, Op2, I32) // signed integer less-than-or-equal (0x3A 0011 1010)
OPDEF(ge, 59, Op2, I32) // signed integer greater-than-or-equal (0x3B 0011 1011)
OPDEF(ult, 60, Op2, I32) // unsigned integer less-than (0x3C 0011 1100)
OPDEF(ugt, 61, Op2, I32) // unsigned integer greater-than (0x3D 0011 1101)
OPDEF(ule, 62, Op2, I32) // unsigned integer less-than-or-equal (0x3E 0011 1110)
OPDEF(uge, 63, Op2, I32) // unsigned integer greater-than-or-equal (0x3F 0011 1111)
OPD64(__0_64, 0, None)
OPDEF(__64, 64, None, Void)
OPD64(file, 1, Op1) // source filename for debug symbols
OPD64(line, 2, Op1) // source line number for debug symbols
OPD64(xbarrier, 3, Op2) // memory barrier; doesn't exit, but flushes all values to the stack
OPD64(xtbl, 4, Op2) // exit via indirect jump
OPDEF(file, 65, Op1, Void) // source filename for debug symbols
OPDEF(line, 66, Op1, Void) // source line number for debug symbols
OPDEF(xbarrier, 67, Op2, Void) // memory barrier; doesn't exit, but flushes all values to the stack
OPDEF(xtbl, 68, Op2, Void) // exit via indirect jump
OPD64(__5_64, 5, None)
OPD64(__6_64, 6, None)
OPD64(qaddp, LIR_iaddp, Op2) // integer addition for temp pointer calculations (64bit only)
OPD64(qparam, LIR_iparam, P) // load a parameter (64bit register or stk location)
OPD64(__9_64, 9, None)
OPDEF(__69, 69, None, Void)
OPDEF(__70, 70, None, Void)
OPDEF(qaddp, 71, Op2, I64) // integer addition for temp pointer calculations (64bit only)
OPDEF(qparam, 72, P, I64) // load a parameter (64bit register or stk location)
OPDEF(__73, 73, None, Void)
OPD64(ldq, LIR_ld, Ld) // 64-bit (quad) load
OPDEF(ldq, 74, Ld, I64) // 64-bit (quad) load
OPD64(qalloc, LIR_ialloc, I) // allocate some stack space (value is 64bit address)
OPDEF(qalloc, 75, I, I64) // allocate some stack space (value is 64bit address)
OPD64(stqi, LIR_sti, Sti) // 64-bit (quad) store
OPD64(fret, LIR_ret, Op1)
OPDEF(stqi, 76, Sti, Void) // 64-bit (quad) store
OPDEF(fret, 77, Op1, Void)
OPD64(__14_64, 14, None)
OPD64(__15_64, 15, None)
OPDEF(st32f, 78, Sti, Void) // store 64-bit float as a 32-bit float (dropping precision)
OPDEF(ld32f, 79, Ld, F64) // load 32-bit float and widen to 64-bit float
OPD64(fcall, LIR_icall, C) // subroutine call returning 64-bit (quad) double value
OPD64(qcall, 17, C) // subroutine call returning 64-bit (quad) integer value
OPDEF(fcall, 80, C, F64) // subroutine call returning 64-bit (quad) double value
OPDEF(qcall, 81, C, I64) // subroutine call returning 64-bit (quad) integer value
OPD64(__18_64, 18, None)
OPD64(__19_64, 19, None)
OPD64(__20_64, 20, None)
OPD64(__21_64, 21, None)
OPD64(__22_64, 22, None)
OPD64(__23_64, 23, None)
OPDEF(__82, 82, None, Void)
OPDEF(__83, 83, None, Void)
OPDEF(__84, 84, None, Void)
OPDEF(__85, 85, None, Void)
OPDEF(__86, 86, None, Void)
OPDEF(__87, 87, None, Void)
// We strip off the 64 bit flag and compare that the opcode is between LIR_int
// and LIR_uge to decide whether we can CSE the opcode. All opcodes below
// this marker are subject to CSE.
// All opcodes below this marker are subject to CSE.
OPD64(quad, LIR_int, I64) // 64-bit (quad) constant value
OPD64(qcmov, LIR_cmov, Op3) // 64-bit conditional move
OPDEF(quad, 88, N64, I64) // 64-bit (quad) constant value
OPDEF(qcmov, 89, Op3, I64) // 64-bit conditional move
OPD64(i2q, 26, Op1) // sign-extend i32 to i64
OPD64(u2q, 27, Op1) // zero-extend u32 to u64
OPD64(i2f, 28, Op1) // convert a signed 32-bit integer to a float
OPD64(u2f, 29, Op1) // convert an unsigned 32-bit integer to a float
OPDEF(i2q, 90, Op1, I64) // sign-extend i32 to i64
OPDEF(u2q, 91, Op1, I64) // zero-extend u32 to u64
OPDEF(i2f, 92, Op1, F64) // convert a signed 32-bit integer to a float
OPDEF(u2f, 93, Op1, F64) // convert an unsigned 32-bit integer to a float
OPD64(__30_64, 30, None)
OPD64(__31_64, 31, None)
OPD64(__32_64, 32, None)
OPD64(__33_64, 33, None)
OPDEF(__94, 94, None, Void)
OPDEF(__95, 95, None, Void)
OPDEF(__96, 96, None, Void)
OPDEF(__97, 97, None, Void)
OPD64(ldqc, LIR_ldc, Ld) // non-volatile 64-bit load
OPDEF(ldqc, 98, Ld, I64) // non-volatile 64-bit load
OPD64(fneg, LIR_neg, Op1) // floating-point negation
OPD64(fadd, LIR_add, Op2) // floating-point addition
OPD64(fsub, LIR_sub, Op2) // floating-point subtraction
OPD64(fmul, LIR_mul, Op2) // floating-point multiplication
OPD64(fdiv, LIR_div, Op2) // floating-point division
OPD64(fmod, LIR_mod, Op2) // floating-point modulus(?)
OPDEF(fneg, 99, Op1, F64) // floating-point negation
OPDEF(fadd, 100, Op2, F64) // floating-point addition
OPDEF(fsub, 101, Op2, F64) // floating-point subtraction
OPDEF(fmul, 102, Op2, F64) // floating-point multiplication
OPDEF(fdiv, 103, Op2, F64) // floating-point division
OPDEF(fmod, 104, Op2, F64) // floating-point modulus(?)
OPD64(qiand, 41, Op2) // 64-bit bitwise AND
OPD64(qior, 42, Op2) // 64-bit bitwise OR
OPD64(qxor, 43, Op2) // 64-bit bitwise XOR
OPD64(__44_64, 44, None)
OPD64(qilsh, 45, Op2) // 64-bit left shift
OPD64(qirsh, 46, Op2) // 64-bit signed right shift
OPD64(qursh, 47, Op2) // 64-bit unsigned right shift
OPD64(qiadd, 48, Op2) // 64-bit bitwise ADD
OPDEF(qiand, 105, Op2, I64) // 64-bit bitwise AND
OPDEF(qior, 106, Op2, I64) // 64-bit bitwise OR
OPDEF(qxor, 107, Op2, I64) // 64-bit bitwise XOR
OPDEF(__108, 108, None, Void)
OPDEF(qilsh, 109, Op2, I64) // 64-bit left shift
OPDEF(qirsh, 110, Op2, I64) // 64-bit signed right shift
OPDEF(qursh, 111, Op2, I64) // 64-bit unsigned right shift
OPDEF(qiadd, 112, Op2, I64) // 64-bit bitwise ADD
OPD64(__49_64, 49, None)
OPD64(qjoin, 50, Op2) // join two 32-bit values (1st arg is low bits, 2nd is high)
OPD64(__51_64, 51, None)
OPD64(__52_64, 52, None)
OPD64(__53_64, 53, None)
OPD64(float, 54, I64)
OPDEF(ldc32f, 113, Ld, F64) // non-volatile load 32-bit float and widen to 64-bit float
OPDEF(qjoin, 114, Op2, F64) // join two 32-bit values (1st arg is low bits, 2nd is high)
OPDEF(__115, 115, None, Void)
OPDEF(__116, 116, None, Void)
OPDEF(__117, 117, None, Void)
OPDEF(float, 118, N64, F64)
// 64bit equivalents for integer comparisons
OPD64(qeq, LIR_eq, Op2) // integer equality
OPD64(qlt, LIR_lt, Op2) // signed integer less-than (0x78 0111 1000)
OPD64(qgt, LIR_gt, Op2) // signed integer greater-than (0x79 0111 1001)
OPD64(qle, LIR_le, Op2) // signed integer less-than-or-equal (0x7A 0111 1010)
OPD64(qge, LIR_ge, Op2) // signed integer greater-than-or-equal (0x7B 0111 1011)
OPD64(qult, LIR_ult, Op2) // unsigned integer less-than (0x7C 0111 1100)
OPD64(qugt, LIR_ugt, Op2) // unsigned integer greater-than (0x7D 0111 1101)
OPD64(qule, LIR_ule, Op2) // unsigned integer less-than-or-equal (0x7E 0111 1110)
OPD64(quge, LIR_uge, Op2) // unsigned integer greater-than-or-equal (0x7F 0111 1111)
// Integer (64-bit) relational operators.
// NB: These opcodes must remain continuous so that comparison-opcode detection
// works correctly.
OPDEF(qeq, 119, Op2, I32) // integer equality
OPDEF(qlt, 120, Op2, I32) // signed integer less-than (0x78 0111 1000)
OPDEF(qgt, 121, Op2, I32) // signed integer greater-than (0x79 0111 1001)
OPDEF(qle, 122, Op2, I32) // signed integer less-than-or-equal (0x7A 0111 1010)
OPDEF(qge, 123, Op2, I32) // signed integer greater-than-or-equal (0x7B 0111 1011)
OPDEF(qult, 124, Op2, I32) // unsigned integer less-than (0x7C 0111 1100)
OPDEF(qugt, 125, Op2, I32) // unsigned integer greater-than (0x7D 0111 1101)
OPDEF(qule, 126, Op2, I32) // unsigned integer less-than-or-equal (0x7E 0111 1110)
OPDEF(quge, 127, Op2, I32) // unsigned integer greater-than-or-equal (0x7F 0111 1111)

View File

@ -61,17 +61,11 @@ namespace nanojit {
: unsigned
#endif
{
// flags; upper bits reserved
LIR64 = 0x40, // result is double or quad
#define OPDEF(op, number, repkind) \
#define OPDEF(op, number, repKind, retType) \
LIR_##op = (number),
#define OPD64(op, number, repkind) \
LIR_##op = ((number) | LIR64),
#include "LIRopcode.tbl"
LIR_sentinel,
#undef OPDEF
#undef OPD64
#ifdef NANOJIT_64BIT
# define PTR_SIZE(a,b) b
@ -125,6 +119,10 @@ namespace nanojit {
# define NJ_JTBL_SUPPORTED 0
#endif
#ifndef NJ_EXPANDED_LOADSTORE_SUPPORTED
# define NJ_EXPANDED_LOADSTORE_SUPPORTED 0
#endif
namespace nanojit {
inline Register nextreg(Register r) {
@ -183,19 +181,14 @@ namespace nanojit {
#elif defined(NJ_VERBOSE)
// Used for printing native instructions. Like Assembler::outputf(),
// but only outputs if LC_Assembly is set. Also prepends the output
// with the address of the current native instruction if
// LC_NoCodeAddrs is not set.
// with the address of the current native instruction.
#define asm_output(...) do { \
counter_increment(native); \
if (_logc->lcbits & LC_Assembly) { \
outline[0]='\0'; \
if (outputAddr) \
VMPI_sprintf(outline, "%010lx ", (unsigned long)_nIns); \
else \
VMPI_memset(outline, (int)' ', 10+3); \
VMPI_sprintf(outline, "%010lx ", (unsigned long)_nIns); \
sprintf(&outline[13], ##__VA_ARGS__); \
output(); \
outputAddr=(_logc->lcbits & LC_NoCodeAddrs) ? false : true; \
} \
} while (0) /* no semi */
#define gpn(r) regNames[(r)]

View File

@ -824,6 +824,37 @@ Assembler::asm_stkarg(LInsp arg, int stkd)
void
Assembler::asm_call(LInsp ins)
{
if (ARM_VFP && ins->isop(LIR_fcall)) {
/* Because ARM actually returns the result in (R0,R1), and not in a
* floating point register, the code to move the result into a correct
* register is below. We do nothing here.
*
* The reason being that if we did something here, the final code
* sequence we'd get would be something like:
* MOV {R0-R3},params [from below]
* BL function [from below]
* MOV {R0-R3},spilled data [from evictScratchRegs()]
* MOV Dx,{R0,R1} [from here]
* which is clearly broken.
*
* This is not a problem for non-floating point calls, because the
* restoring of spilled data into R0 is done via a call to
* prepResultReg(R0) in the other branch of this if-then-else,
* meaning that evictScratchRegs() will not modify R0. However,
* prepResultReg is not aware of the concept of using a register pair
* (R0,R1) for the result of a single operation, so it can only be
* used here with the ultimate VFP register, and not R0/R1, which
* potentially allows for R0/R1 to get corrupted as described.
*/
} else {
prepResultReg(ins, rmask(retRegs[0]));
}
// Do this after we've handled the call result, so we don't
// force the call result to be spilled unnecessarily.
evictScratchRegs();
const CallInfo* call = ins->callInfo();
ArgSize sizes[MAXARGS];
uint32_t argc = call->get_sizes(sizes);
@ -835,8 +866,8 @@ Assembler::asm_call(LInsp ins)
// If we're using VFP, and the return type is a double, it'll come back in
// R0/R1. We need to either place it in the result fp reg, or store it.
// See comments in asm_prep_fcall() for more details as to why this is
// necessary here for floating point calls, but not for integer calls.
// See comments above for more details as to why this is necessary here
// for floating point calls, but not for integer calls.
if (ARM_VFP && ins->isUsed()) {
// Determine the size (and type) of the instruction result.
ArgSize rsize = (ArgSize)(call->_argtypes & ARGSIZE_MASK_ANY);
@ -1172,8 +1203,21 @@ Assembler::asm_qjoin(LIns *ins)
}
void
Assembler::asm_store32(LIns *value, int dr, LIns *base)
Assembler::asm_store32(LOpcode op, LIns *value, int dr, LIns *base)
{
switch (op) {
case LIR_sti:
// handled by mainline code below for now
break;
case LIR_stb:
case LIR_sts:
NanoAssertMsg(0, "NJ_EXPANDED_LOADSTORE_SUPPORTED not yet supported for this architecture");
return;
default:
NanoAssertMsg(0, "asm_store32 should never receive this LIR opcode");
return;
}
Register ra, rb;
if (base->isop(LIR_alloc)) {
rb = FP;
@ -1268,6 +1312,20 @@ Assembler::asm_load64(LInsp ins)
{
//asm_output("<<< load64");
switch (ins->opcode()) {
case LIR_ldq:
case LIR_ldqc:
// handled by mainline code below for now
break;
case LIR_ld32f:
case LIR_ldc32f:
NanoAssertMsg(0, "NJ_EXPANDED_LOADSTORE_SUPPORTED not yet supported for this architecture");
return;
default:
NanoAssertMsg(0, "asm_load64 should never receive this LIR opcode");
return;
}
NanoAssert(ins->isQuad());
LIns* base = ins->oprnd1();
@ -1310,10 +1368,22 @@ Assembler::asm_load64(LInsp ins)
}
void
Assembler::asm_store64(LInsp value, int dr, LInsp base)
Assembler::asm_store64(LOpcode op, LInsp value, int dr, LInsp base)
{
//asm_output("<<< store64 (dr: %d)", dr);
switch (op) {
case LIR_stqi:
// handled by mainline code below for now
break;
case LIR_st32f:
NanoAssertMsg(0, "NJ_EXPANDED_LOADSTORE_SUPPORTED not yet supported for this architecture");
return;
default:
NanoAssertMsg(0, "asm_store64 should never receive this LIR opcode");
return;
}
if (ARM_VFP) {
Register rb = findRegFor(base, GpRegs);
@ -2028,33 +2098,6 @@ Assembler::asm_fcmp(LInsp ins)
FCMPD(ra, rb, e_bit);
}
Register
Assembler::asm_prep_fcall(LInsp)
{
/* Because ARM actually returns the result in (R0,R1), and not in a
* floating point register, the code to move the result into a correct
* register is at the beginning of asm_call(). This function does
* nothing.
*
* The reason being that if this function did something, the final code
* sequence we'd get would be something like:
* MOV {R0-R3},params [from asm_call()]
* BL function [from asm_call()]
* MOV {R0-R3},spilled data [from evictScratchRegs()]
* MOV Dx,{R0,R1} [from this function]
* which is clearly broken.
*
* This is not a problem for non-floating point calls, because the
* restoring of spilled data into R0 is done via a call to prepResultReg(R0)
* at the same point in the sequence as this function is called, meaning that
* evictScratchRegs() will not modify R0. However, prepResultReg is not aware
* of the concept of using a register pair (R0,R1) for the result of a single
* operation, so it can only be used here with the ultimate VFP register, and
* not R0/R1, which potentially allows for R0/R1 to get corrupted as described.
*/
return UnknownReg;
}
/* Call this with targ set to 0 if the target is not yet known and the branch
* will be patched up later.
*/
@ -2440,7 +2483,7 @@ Assembler::asm_neg_not(LInsp ins)
}
void
Assembler::asm_ld(LInsp ins)
Assembler::asm_load32(LInsp ins)
{
LOpcode op = ins->opcode();
LIns* base = ins->oprnd1();
@ -2449,25 +2492,31 @@ Assembler::asm_ld(LInsp ins)
Register rr = prepResultReg(ins, GpRegs);
Register ra = getBaseReg(op, base, d, GpRegs);
// these will always be 4-byte aligned
if (op == LIR_ld || op == LIR_ldc) {
LDR(rr, ra, d);
return;
switch(op) {
case LIR_ldzb:
case LIR_ldcb:
LDRB(rr, ra, d);
return;
case LIR_ldzs:
case LIR_ldcs:
// these are expected to be 2 or 4-byte aligned
LDRH(rr, ra, d);
return;
case LIR_ld:
case LIR_ldc:
// these are expected to be 4-byte aligned
LDR(rr, ra, d);
return;
case LIR_ldsb:
case LIR_ldss:
case LIR_ldcsb:
case LIR_ldcss:
NanoAssertMsg(0, "NJ_EXPANDED_LOADSTORE_SUPPORTED not yet supported for this architecture");
return;
default:
NanoAssertMsg(0, "asm_load32 should never receive this LIR opcode");
return;
}
// these will be 2 or 4-byte aligned
if (op == LIR_ldcs) {
LDRH(rr, ra, d);
return;
}
// aaand this is just any byte.
if (op == LIR_ldcb) {
LDRB(rr, ra, d);
return;
}
NanoAssertMsg(0, "Unsupported instruction in asm_ld");
}
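// Aside (editor's gloss, not part of the patch): the expanded LIR load/store
// forms dispatched above follow a consistent naming scheme that can be read
// off the backends in this patch -- ldzb/ldzs are zero-extending 8/16-bit
// loads, ldsb/ldss their sign-extending counterparts, ld is the plain 32-bit
// load, and ld32f loads a single-precision float and widens it to double;
// the ldc* spellings appear to be the CSE-able ("const") variants of the same
// loads. Likewise stb/sts/sti/stqi are 8/16/32/64-bit stores and st32f
// narrows a double to float32 on store. Backends that define
// NJ_EXPANDED_LOADSTORE_SUPPORTED as 0 (ARM, PPC and SPARC here) stub the new
// sub-word and float32 forms with the NanoAssertMsg guards seen above.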
void

View File

@ -79,6 +79,7 @@ namespace nanojit
#define NJ_MAX_PARAMETERS 16
#define NJ_ALIGN_STACK 8
#define NJ_JTBL_SUPPORTED 1
#define NJ_EXPANDED_LOADSTORE_SUPPORTED 0
#define NJ_CONSTANT_POOLS
const int NJ_MAX_CPOOL_OFFSET = 4096;
@ -799,22 +800,6 @@ enum {
} \
} while(0)
#define STMIA(_b, _mask) do { \
underrunProtect(4); \
NanoAssert(IsGpReg(_b)); \
NanoAssert(((_mask)&rmask(_b))==0 && isU8(_mask)); \
*(--_nIns) = (NIns)(COND_AL | (0x8A<<20) | ((_b)<<16) | (_mask)&0xFF); \
asm_output("stmia %s!,{0x%x}", gpn(_b), _mask); \
} while (0)
#define LDMIA(_b, _mask) do { \
underrunProtect(4); \
NanoAssert(IsGpReg(_b)); \
NanoAssert(((_mask)&rmask(_b))==0 && isU8(_mask)); \
*(--_nIns) = (NIns)(COND_AL | (0x8B<<20) | ((_b)<<16) | (_mask)&0xFF); \
asm_output("ldmia %s!,{0x%x}", gpn(_b), (_mask)); \
} while (0)
/*
* VFP
*/

View File

@ -140,30 +140,69 @@ namespace nanojit
freeRsrcOf(ins, false); // if we had a reg in use, emit a ST to flush it to mem
}
void Assembler::asm_ld(LIns *ins) {
void Assembler::asm_load32(LIns *ins) {
LIns* base = ins->oprnd1();
int d = ins->disp();
Register rr = prepResultReg(ins, GpRegs);
Register ra = getBaseReg(ins->opcode(), base, d, GpRegs);
#if !PEDANTIC
if (isS16(d)) {
if (ins->isop(LIR_ldcb)) {
LBZ(rr, d, ra);
} else {
LWZ(rr, d, ra);
}
return;
switch(ins->opcode()) {
case LIR_ldzb:
case LIR_ldcb:
if (isS16(d)) {
LBZ(rr, d, ra);
} else {
LBZX(rr, ra, R0); // rr = [ra+R0]
asm_li(R0,d);
}
return;
case LIR_ldzs:
case LIR_ldcs:
// these are expected to be 2 or 4-byte aligned
if (isS16(d)) {
LHZ(rr, d, ra);
} else {
LHZX(rr, ra, R0); // rr = [ra+R0]
asm_li(R0,d);
}
return;
case LIR_ld:
case LIR_ldc:
// these are expected to be 4-byte aligned
if (isS16(d)) {
LWZ(rr, d, ra);
} else {
LWZX(rr, ra, R0); // rr = [ra+R0]
asm_li(R0,d);
}
return;
case LIR_ldsb:
case LIR_ldss:
case LIR_ldcsb:
case LIR_ldcss:
NanoAssertMsg(0, "NJ_EXPANDED_LOADSTORE_SUPPORTED not yet supported for this architecture");
return;
default:
NanoAssertMsg(0, "asm_load32 should never receive this LIR opcode");
return;
}
#endif
// general case
underrunProtect(12);
LWZX(rr, ra, R0); // rr = [ra+R0]
asm_li(R0,d);
}
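// Aside (not in the patch): the assembler emits machine code backwards, from
// the end of the buffer toward the start (note the *(--_nIns) stores in the
// emitter macros), so in the generated code the asm_li(R0,d) above actually
// executes before the indexed load that consumes R0 -- the source order in
// these asm_* routines is the reverse of the runtime order.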
void Assembler::asm_store32(LIns *value, int32_t dr, LIns *base) {
void Assembler::asm_store32(LOpcode op, LIns *value, int32_t dr, LIns *base) {
switch (op) {
case LIR_sti:
// handled by mainline code below for now
break;
case LIR_stb:
case LIR_sts:
NanoAssertMsg(0, "NJ_EXPANDED_LOADSTORE_SUPPORTED not yet supported for this architecture");
return;
default:
NanoAssertMsg(0, "asm_store32 should never receive this LIR opcode");
return;
}
Register rs = findRegFor(value, GpRegs);
Register ra = value == base ? rs : getBaseReg(LIR_sti, base, dr, GpRegs & ~rmask(rs));
@ -180,6 +219,21 @@ namespace nanojit
}
void Assembler::asm_load64(LIns *ins) {
switch (ins->opcode()) {
case LIR_ldq:
case LIR_ldqc:
// handled by mainline code below for now
break;
case LIR_ld32f:
case LIR_ldc32f:
NanoAssertMsg(0, "NJ_EXPANDED_LOADSTORE_SUPPORTED not yet supported for this architecture");
return;
default:
NanoAssertMsg(0, "asm_load64 should never receive this LIR opcode");
return;
}
LIns* base = ins->oprnd1();
#ifdef NANOJIT_64BIT
Register rr = ins->getReg();
@ -256,8 +310,21 @@ namespace nanojit
asm_li32(r, int32_t(imm>>32)); // r[0:31] = imm[32:63]
}
void Assembler::asm_store64(LIns *value, int32_t dr, LIns *base) {
void Assembler::asm_store64(LOpcode op, LIns *value, int32_t dr, LIns *base) {
NanoAssert(value->isQuad());
switch (op) {
case LIR_stqi:
// handled by mainline code below for now
break;
case LIR_st32f:
NanoAssertMsg(0, "NJ_EXPANDED_LOADSTORE_SUPPORTED not yet supported for this architecture");
return;
default:
NanoAssertMsg(0, "asm_store64 should never receive this LIR opcode");
return;
}
Register ra = getBaseReg(LIR_stqi, base, dr, GpRegs);
#if !PEDANTIC && !defined NANOJIT_64BIT
@ -595,10 +662,6 @@ namespace nanojit
}
}
Register Assembler::asm_prep_fcall(LIns *ins) {
return prepResultReg(ins, rmask(F1));
}
void Assembler::asm_int(LIns *ins) {
Register rr = prepResultReg(ins, GpRegs);
asm_li(rr, ins->imm32());
@ -632,6 +695,14 @@ namespace nanojit
}
void Assembler::asm_call(LIns *ins) {
Register retReg = ( ins->isop(LIR_fcall) ? F1 : retRegs[0] );
prepResultReg(ins, rmask(retReg));
// Do this after we've handled the call result, so we don't
// force the call result to be spilled unnecessarily.
evictScratchRegs();
const CallInfo* call = ins->callInfo();
ArgSize sizes[MAXARGS];
uint32_t argc = call->get_sizes(sizes);

View File

@ -57,6 +57,7 @@ namespace nanojit
#define NJ_MAX_STACK_ENTRY 256
#define NJ_ALIGN_STACK 16
#define NJ_JTBL_SUPPORTED 1
#define NJ_EXPANDED_LOADSTORE_SUPPORTED 0
enum ConditionRegister {
CR0 = 0,
@ -191,10 +192,13 @@ namespace nanojit
PPC_fneg = 0xFC000050, // floating negate
PPC_fsub = 0xFC000028, // floating subtract (double precision)
PPC_lbz = 0x88000000, // load byte and zero
PPC_lbzx = 0x7C0000AE, // load byte and zero indexed
PPC_ld = 0xE8000000, // load doubleword
PPC_ldx = 0x7C00002A, // load doubleword indexed
PPC_lfd = 0xC8000000, // load floating point double
PPC_lfdx = 0x7C0004AE, // load floating-point double indexed
PPC_lhz = 0xA0000000, // load halfword and zero
PPC_lhzx = 0x7C00022E, // load halfword and zero indexed
PPC_lwz = 0x80000000, // load word and zero
PPC_lwzx = 0x7C00002E, // load word and zero indexed
PPC_mfcr = 0x7C000026, // move from condition register
@ -448,8 +452,11 @@ namespace nanojit
"%s %s,%s,%s", #op, gpn(rs), gpn(ra), gpn(rb))
#define LBZ(r, d, b) MEMd(lbz, r, d, b)
#define LHZ(r, d, b) MEMd(lhz, r, d, b)
#define LWZ(r, d, b) MEMd(lwz, r, d, b)
#define LD(r, d, b) MEMd(ld, r, d, b)
#define LBZX(r, a, b) MEMx(lbzx, r, a, b)
#define LHZX(r, a, b) MEMx(lhzx, r, a, b)
#define LWZX(r, a, b) MEMx(lwzx, r, a, b)
#define LDX(r, a, b) MEMx(ldx, r, a, b)

View File

@ -153,6 +153,14 @@ namespace nanojit
void Assembler::asm_call(LInsp ins)
{
Register retReg = ( ins->isop(LIR_fcall) ? F0 : retRegs[0] );
prepResultReg(ins, rmask(retReg));
// Do this after we've handled the call result, so we don't
// force the call result to be spilled unnecessarily.
evictScratchRegs();
const CallInfo* call = ins->callInfo();
underrunProtect(8);
@ -299,8 +307,21 @@ namespace nanojit
}
}
void Assembler::asm_store32(LIns *value, int dr, LIns *base)
void Assembler::asm_store32(LOpcode op, LIns *value, int dr, LIns *base)
{
switch (op) {
case LIR_sti:
// handled by mainline code below for now
break;
case LIR_stb:
case LIR_sts:
NanoAssertMsg(0, "NJ_EXPANDED_LOADSTORE_SUPPORTED not yet supported for this architecture");
return;
default:
NanoAssertMsg(0, "asm_store32 should never receive this LIR opcode");
return;
}
underrunProtect(20);
if (value->isconst())
{
@ -344,6 +365,20 @@ namespace nanojit
void Assembler::asm_load64(LInsp ins)
{
switch (ins->opcode()) {
case LIR_ldq:
case LIR_ldqc:
// handled by mainline code below for now
break;
case LIR_ld32f:
case LIR_ldc32f:
NanoAssertMsg(0, "NJ_EXPANDED_LOADSTORE_SUPPORTED not yet supported for this architecture");
return;
default:
NanoAssertMsg(0, "asm_load64 should never receive this LIR opcode");
return;
}
underrunProtect(72);
LIns* base = ins->oprnd1();
int db = ins->disp();
@ -373,8 +408,20 @@ namespace nanojit
}
}
void Assembler::asm_store64(LInsp value, int dr, LInsp base)
void Assembler::asm_store64(LOpcode op, LInsp value, int dr, LInsp base)
{
switch (op) {
case LIR_stqi:
// handled by mainline code below for now
break;
case LIR_st32f:
NanoAssertMsg(0, "NJ_EXPANDED_LOADSTORE_SUPPORTED not yet supported for this architecture");
return;
default:
NanoAssertMsg(0, "asm_store64 should never receive this LIR opcode");
return;
}
underrunProtect(48);
if (value->isconstq())
{
@ -716,7 +763,7 @@ namespace nanojit
ORI(ra, 0, rr);
}
void Assembler::asm_ld(LInsp ins)
void Assembler::asm_load32(LInsp ins)
{
underrunProtect(12);
LOpcode op = ins->opcode();
@ -724,12 +771,28 @@ namespace nanojit
int d = ins->disp();
Register rr = prepResultReg(ins, GpRegs);
Register ra = getBaseReg(ins->opcode(), base, d, GpRegs);
if (op == LIR_ldcb) {
LDUB32(ra, d, rr);
} else if (op == LIR_ldcs) {
LDUH32(ra, d, rr);
} else {
LDSW32(ra, d, rr);
switch(op) {
case LIR_ldzb:
case LIR_ldcb:
LDUB32(ra, d, rr);
break;
case LIR_ldzs:
case LIR_ldcs:
LDUH32(ra, d, rr);
break;
case LIR_ld:
case LIR_ldc:
LDSW32(ra, d, rr);
break;
case LIR_ldsb:
case LIR_ldss:
case LIR_ldcsb:
case LIR_ldcss:
NanoAssertMsg(0, "NJ_EXPANDED_LOADSTORE_SUPPORTED not yet supported for this architecture");
return;
default:
NanoAssertMsg(0, "asm_load32 should never receive this LIR opcode");
return;
}
}
@ -879,11 +942,6 @@ namespace nanojit
LDDF32(FP, d, rr);
}
Register Assembler::asm_prep_fcall(LInsp ins)
{
return prepResultReg(ins, rmask(F0));
}
void Assembler::asm_u2f(LInsp ins)
{
underrunProtect(72);

View File

@ -71,8 +71,10 @@ namespace nanojit
const int LARGEST_UNDERRUN_PROT = 32; // largest value passed to underrunProtect
#define NJ_MAX_STACK_ENTRY 256
#define NJ_MAX_PARAMETERS 1
#define NJ_MAX_STACK_ENTRY 256
#define NJ_MAX_PARAMETERS 1
#define NJ_JTBL_SUPPORTED 0
#define NJ_EXPANDED_LOADSTORE_SUPPORTED 0
const int NJ_ALIGN_STACK = 16;

View File

@ -99,6 +99,11 @@ namespace nanojit
"ah", "ch", "dh", "bh"
};
const char *gpRegNames16[] = {
"ax", "cx", "dx", "bx", "spx", "bpx", "six", "dix",
"r8x", "r9x", "r10x", "r11x", "r12x", "r13x", "r14x", "r15x"
};
#ifdef _DEBUG
#define TODO(x) todo(#x)
static void todo(const char *s) {
@ -240,6 +245,11 @@ namespace nanojit
emit(rexprb(mod_rr(op, r, b), r, b));
}
// disp32 modrm8 form, when the disp fits in the instruction (opcode is 1-3 bytes)
void Assembler::emitrm8(uint64_t op, Register r, int32_t d, Register b) {
emit(rexrb8(mod_disp32(op, r, b, d), r, b));
}
// disp32 modrm form, when the disp fits in the instruction (opcode is 1-3 bytes)
void Assembler::emitrm(uint64_t op, Register r, int32_t d, Register b) {
emit(rexrb(mod_disp32(op, r, b, d), r, b));
@ -343,6 +353,7 @@ namespace nanojit
}
#define RB(r) gpRegNames8[(r)]
#define RS(r) gpRegNames16[(r)]
#define RBhi(r) gpRegNames8hi[(r)]
#define RL(r) gpRegNames32[(r)]
#define RQ(r) gpn(r)
@ -431,13 +442,16 @@ namespace nanojit
// XORPS is a 4x32f vector operation, we use it instead of the more obvious
// XORPD because it's one byte shorter. This is ok because it's only used for
// zeroing an XMM register; hence the single argument.
void Assembler::XORPS( R r) { emitprr(X64_xorps, r,r); asm_output("xorps %s, %s", RQ(r),RQ(r)); }
// Also note that (unlike most SSE2 instructions) XORPS has no mandatory prefix byte, so emitrr() must be used here rather than emitprr().
void Assembler::XORPS( R r) { emitrr(X64_xorps, r,r); asm_output("xorps %s, %s", RQ(r),RQ(r)); }
void Assembler::DIVSD( R l, R r) { emitprr(X64_divsd, l,r); asm_output("divsd %s, %s", RQ(l),RQ(r)); }
void Assembler::MULSD( R l, R r) { emitprr(X64_mulsd, l,r); asm_output("mulsd %s, %s", RQ(l),RQ(r)); }
void Assembler::ADDSD( R l, R r) { emitprr(X64_addsd, l,r); asm_output("addsd %s, %s", RQ(l),RQ(r)); }
void Assembler::SUBSD( R l, R r) { emitprr(X64_subsd, l,r); asm_output("subsd %s, %s", RQ(l),RQ(r)); }
void Assembler::CVTSQ2SD(R l, R r) { emitprr(X64_cvtsq2sd,l,r); asm_output("cvtsq2sd %s, %s",RQ(l),RQ(r)); }
void Assembler::CVTSI2SD(R l, R r) { emitprr(X64_cvtsi2sd,l,r); asm_output("cvtsi2sd %s, %s",RQ(l),RL(r)); }
void Assembler::CVTSS2SD(R l, R r) { emitprr(X64_cvtss2sd,l,r); asm_output("cvtss2sd %s, %s",RQ(l),RL(r)); }
void Assembler::CVTSD2SS(R l, R r) { emitprr(X64_cvtsd2ss,l,r); asm_output("cvtsd2ss %s, %s",RL(l),RQ(r)); }
void Assembler::UCOMISD( R l, R r) { emitprr(X64_ucomisd, l,r); asm_output("ucomisd %s, %s", RQ(l),RQ(r)); }
void Assembler::MOVQRX( R l, R r) { emitprr(X64_movqrx, r,l); asm_output("movq %s, %s", RQ(l),RQ(r)); } // Nb: r and l are deliberately reversed within the emitprr() call.
void Assembler::MOVQXR( R l, R r) { emitprr(X64_movqxr, l,r); asm_output("movq %s, %s", RQ(l),RQ(r)); }
@ -482,14 +496,21 @@ namespace nanojit
void Assembler::LEAQRM(R r1, I d, R r2) { emitrm(X64_leaqrm,r1,d,r2); asm_output("leaq %s, %d(%s)",RQ(r1),d,RQ(r2)); }
void Assembler::MOVLRM(R r1, I d, R r2) { emitrm(X64_movlrm,r1,d,r2); asm_output("movl %s, %d(%s)",RL(r1),d,RQ(r2)); }
void Assembler::MOVQRM(R r1, I d, R r2) { emitrm(X64_movqrm,r1,d,r2); asm_output("movq %s, %d(%s)",RQ(r1),d,RQ(r2)); }
void Assembler::MOVBMR(R r1, I d, R r2) { emitrm8(X64_movbmr,r1,d,r2); asm_output("movb %d(%s), %s",d,RQ(r1),RB(r2)); }
void Assembler::MOVSMR(R r1, I d, R r2) { emitprm(X64_movsmr,r1,d,r2); asm_output("movs %d(%s), %s",d,RQ(r1),RS(r2)); }
void Assembler::MOVLMR(R r1, I d, R r2) { emitrm(X64_movlmr,r1,d,r2); asm_output("movl %d(%s), %s",d,RQ(r1),RL(r2)); }
void Assembler::MOVQMR(R r1, I d, R r2) { emitrm(X64_movqmr,r1,d,r2); asm_output("movq %d(%s), %s",d,RQ(r1),RQ(r2)); }
void Assembler::MOVZX8M( R r1, I d, R r2) { emitrm_wide(X64_movzx8m, r1,d,r2); asm_output("movzxb %s, %d(%s)",RQ(r1),d,RQ(r2)); }
void Assembler::MOVZX16M(R r1, I d, R r2) { emitrm_wide(X64_movzx16m,r1,d,r2); asm_output("movzxs %s, %d(%s)",RQ(r1),d,RQ(r2)); }
void Assembler::MOVSX8M( R r1, I d, R r2) { emitrm_wide(X64_movsx8m, r1,d,r2); asm_output("movsxb %s, %d(%s)",RQ(r1),d,RQ(r2)); }
void Assembler::MOVSX16M(R r1, I d, R r2) { emitrm_wide(X64_movsx16m,r1,d,r2); asm_output("movsxs %s, %d(%s)",RQ(r1),d,RQ(r2)); }
void Assembler::MOVSDRM(R r1, I d, R r2) { emitprm(X64_movsdrm,r1,d,r2); asm_output("movsd %s, %d(%s)",RQ(r1),d,RQ(r2)); }
void Assembler::MOVSDMR(R r1, I d, R r2) { emitprm(X64_movsdmr,r1,d,r2); asm_output("movsd %d(%s), %s",d,RQ(r1),RQ(r2)); }
void Assembler::MOVSSRM(R r1, I d, R r2) { emitprm(X64_movssrm,r1,d,r2); asm_output("movss %s, %d(%s)",RQ(r1),d,RQ(r2)); }
void Assembler::MOVSSMR(R r1, I d, R r2) { emitprm(X64_movssmr,r1,d,r2); asm_output("movss %d(%s), %s",d,RQ(r1),RQ(r2)); }
void Assembler::JMP8( S n, NIns* t) { emit_target8(n, X64_jmp8,t); asm_output("jmp %p", t); }
@ -764,10 +785,10 @@ namespace nanojit
void Assembler::asm_arith(LIns *ins) {
Register rr, ra, rb;
switch (ins->opcode() & ~LIR64) {
case LIR_lsh:
case LIR_rsh:
case LIR_ush:
switch (ins->opcode()) {
case LIR_lsh: case LIR_qilsh:
case LIR_rsh: case LIR_qirsh:
case LIR_ush: case LIR_qursh:
asm_shift(ins);
return;
case LIR_mod:
@ -832,6 +853,14 @@ namespace nanojit
}
void Assembler::asm_call(LIns *ins) {
Register retReg = ( ins->isop(LIR_fcall) ? XMM0 : retRegs[0] );
prepResultReg(ins, rmask(retReg));
// Do this after we've handled the call result, so we don't
// force the call result to be spilled unnecessarily.
evictScratchRegs();
const CallInfo *call = ins->callInfo();
ArgSize sizes[MAXARGS];
int argc = call->get_sizes(sizes);
@ -991,32 +1020,32 @@ namespace nanojit
LOpcode condop = cond->opcode();
if (ins->opcode() == LIR_cmov) {
switch (condop & ~LIR64) {
case LIR_ov: CMOVNO( rr, rf); break;
case LIR_eq: CMOVNE( rr, rf); break;
case LIR_lt: CMOVNL( rr, rf); break;
case LIR_gt: CMOVNG( rr, rf); break;
case LIR_le: CMOVNLE(rr, rf); break;
case LIR_ge: CMOVNGE(rr, rf); break;
case LIR_ult: CMOVNB( rr, rf); break;
case LIR_ugt: CMOVNA( rr, rf); break;
case LIR_ule: CMOVNBE(rr, rf); break;
case LIR_uge: CMOVNAE(rr, rf); break;
default: NanoAssert(0); break;
switch (condop) {
case LIR_ov: CMOVNO( rr, rf); break;
case LIR_eq: case LIR_qeq: CMOVNE( rr, rf); break;
case LIR_lt: case LIR_qlt: CMOVNL( rr, rf); break;
case LIR_gt: case LIR_qgt: CMOVNG( rr, rf); break;
case LIR_le: case LIR_qle: CMOVNLE(rr, rf); break;
case LIR_ge: case LIR_qge: CMOVNGE(rr, rf); break;
case LIR_ult: case LIR_qult: CMOVNB( rr, rf); break;
case LIR_ugt: case LIR_qugt: CMOVNA( rr, rf); break;
case LIR_ule: case LIR_qule: CMOVNBE(rr, rf); break;
case LIR_uge: case LIR_quge: CMOVNAE(rr, rf); break;
default: NanoAssert(0); break;
}
} else {
switch (condop & ~LIR64) {
case LIR_ov: CMOVQNO( rr, rf); break;
case LIR_eq: CMOVQNE( rr, rf); break;
case LIR_lt: CMOVQNL( rr, rf); break;
case LIR_gt: CMOVQNG( rr, rf); break;
case LIR_le: CMOVQNLE(rr, rf); break;
case LIR_ge: CMOVQNGE(rr, rf); break;
case LIR_ult: CMOVQNB( rr, rf); break;
case LIR_ugt: CMOVQNA( rr, rf); break;
case LIR_ule: CMOVQNBE(rr, rf); break;
case LIR_uge: CMOVQNAE(rr, rf); break;
default: NanoAssert(0); break;
switch (condop) {
case LIR_ov: CMOVQNO( rr, rf); break;
case LIR_eq: case LIR_qeq: CMOVQNE( rr, rf); break;
case LIR_lt: case LIR_qlt: CMOVQNL( rr, rf); break;
case LIR_gt: case LIR_qgt: CMOVQNG( rr, rf); break;
case LIR_le: case LIR_qle: CMOVQNLE(rr, rf); break;
case LIR_ge: case LIR_qge: CMOVQNGE(rr, rf); break;
case LIR_ult: case LIR_qult: CMOVQNB( rr, rf); break;
case LIR_ugt: case LIR_qugt: CMOVQNA( rr, rf); break;
case LIR_ule: case LIR_qule: CMOVQNBE(rr, rf); break;
case LIR_uge: case LIR_quge: CMOVQNAE(rr, rf); break;
default: NanoAssert(0); break;
}
}
/*const Register rt =*/ findSpecificRegFor(iftrue, rr);
@ -1024,72 +1053,71 @@ namespace nanojit
}
NIns* Assembler::asm_branch(bool onFalse, LIns *cond, NIns *target) {
NanoAssert(cond->isCond());
LOpcode condop = cond->opcode();
if (condop >= LIR_feq && condop <= LIR_fge)
return asm_fbranch(onFalse, cond, target);
// we must ensure there's room for the instr before calculating
// the offset. and the offset, determines the opcode (8bit or 32bit)
NanoAssert((condop & ~LIR64) >= LIR_ov);
NanoAssert((condop & ~LIR64) <= LIR_uge);
// We must ensure there's room for the instr before calculating
// the offset. And the offset determines the opcode (8bit or 32bit).
if (target && isTargetWithinS8(target)) {
if (onFalse) {
switch (condop & ~LIR64) {
case LIR_ov: JNO8( 8, target); break;
case LIR_eq: JNE8( 8, target); break;
case LIR_lt: JNL8( 8, target); break;
case LIR_gt: JNG8( 8, target); break;
case LIR_le: JNLE8(8, target); break;
case LIR_ge: JNGE8(8, target); break;
case LIR_ult: JNB8( 8, target); break;
case LIR_ugt: JNA8( 8, target); break;
case LIR_ule: JNBE8(8, target); break;
case LIR_uge: JNAE8(8, target); break;
default: NanoAssert(0); break;
switch (condop) {
case LIR_ov: JNO8( 8, target); break;
case LIR_eq: case LIR_qeq: JNE8( 8, target); break;
case LIR_lt: case LIR_qlt: JNL8( 8, target); break;
case LIR_gt: case LIR_qgt: JNG8( 8, target); break;
case LIR_le: case LIR_qle: JNLE8(8, target); break;
case LIR_ge: case LIR_qge: JNGE8(8, target); break;
case LIR_ult: case LIR_qult: JNB8( 8, target); break;
case LIR_ugt: case LIR_qugt: JNA8( 8, target); break;
case LIR_ule: case LIR_qule: JNBE8(8, target); break;
case LIR_uge: case LIR_quge: JNAE8(8, target); break;
default: NanoAssert(0); break;
}
} else {
switch (condop & ~LIR64) {
case LIR_ov: JO8( 8, target); break;
case LIR_eq: JE8( 8, target); break;
case LIR_lt: JL8( 8, target); break;
case LIR_gt: JG8( 8, target); break;
case LIR_le: JLE8(8, target); break;
case LIR_ge: JGE8(8, target); break;
case LIR_ult: JB8( 8, target); break;
case LIR_ugt: JA8( 8, target); break;
case LIR_ule: JBE8(8, target); break;
case LIR_uge: JAE8(8, target); break;
default: NanoAssert(0); break;
switch (condop) {
case LIR_ov: JO8( 8, target); break;
case LIR_eq: case LIR_qeq: JE8( 8, target); break;
case LIR_lt: case LIR_qlt: JL8( 8, target); break;
case LIR_gt: case LIR_qgt: JG8( 8, target); break;
case LIR_le: case LIR_qle: JLE8(8, target); break;
case LIR_ge: case LIR_qge: JGE8(8, target); break;
case LIR_ult: case LIR_qult: JB8( 8, target); break;
case LIR_ugt: case LIR_qugt: JA8( 8, target); break;
case LIR_ule: case LIR_qule: JBE8(8, target); break;
case LIR_uge: case LIR_quge: JAE8(8, target); break;
default: NanoAssert(0); break;
}
}
} else {
if (onFalse) {
switch (condop & ~LIR64) {
case LIR_ov: JNO( 8, target); break;
case LIR_eq: JNE( 8, target); break;
case LIR_lt: JNL( 8, target); break;
case LIR_gt: JNG( 8, target); break;
case LIR_le: JNLE(8, target); break;
case LIR_ge: JNGE(8, target); break;
case LIR_ult: JNB( 8, target); break;
case LIR_ugt: JNA( 8, target); break;
case LIR_ule: JNBE(8, target); break;
case LIR_uge: JNAE(8, target); break;
default: NanoAssert(0); break;
switch (condop) {
case LIR_ov: JNO( 8, target); break;
case LIR_eq: case LIR_qeq: JNE( 8, target); break;
case LIR_lt: case LIR_qlt: JNL( 8, target); break;
case LIR_gt: case LIR_qgt: JNG( 8, target); break;
case LIR_le: case LIR_qle: JNLE(8, target); break;
case LIR_ge: case LIR_qge: JNGE(8, target); break;
case LIR_ult: case LIR_qult: JNB( 8, target); break;
case LIR_ugt: case LIR_qugt: JNA( 8, target); break;
case LIR_ule: case LIR_qule: JNBE(8, target); break;
case LIR_uge: case LIR_quge: JNAE(8, target); break;
default: NanoAssert(0); break;
}
} else {
switch (condop & ~LIR64) {
case LIR_ov: JO( 8, target); break;
case LIR_eq: JE( 8, target); break;
case LIR_lt: JL( 8, target); break;
case LIR_gt: JG( 8, target); break;
case LIR_le: JLE(8, target); break;
case LIR_ge: JGE(8, target); break;
case LIR_ult: JB( 8, target); break;
case LIR_ugt: JA( 8, target); break;
case LIR_ule: JBE(8, target); break;
case LIR_uge: JAE(8, target); break;
default: NanoAssert(0); break;
switch (condop) {
case LIR_ov: JO( 8, target); break;
case LIR_eq: case LIR_qeq: JE( 8, target); break;
case LIR_lt: case LIR_qlt: JL( 8, target); break;
case LIR_gt: case LIR_qgt: JG( 8, target); break;
case LIR_le: case LIR_qle: JLE(8, target); break;
case LIR_ge: case LIR_qge: JGE(8, target); break;
case LIR_ult: case LIR_qult: JB( 8, target); break;
case LIR_ugt: case LIR_qugt: JA( 8, target); break;
case LIR_ule: case LIR_qule: JBE(8, target); break;
case LIR_uge: case LIR_quge: JAE(8, target); break;
default: NanoAssert(0); break;
}
}
}
@ -1117,25 +1145,29 @@ namespace nanojit
}
LOpcode condop = cond->opcode();
if (condop & LIR64)
if (LIR_qeq <= condop && condop <= LIR_quge) {
CMPQR(ra, rb);
else
} else {
NanoAssert(LIR_eq <= condop && condop <= LIR_uge);
CMPLR(ra, rb);
}
}
void Assembler::asm_cmp_imm(LIns *cond) {
LOpcode condop = cond->opcode();
LIns *a = cond->oprnd1();
LIns *b = cond->oprnd2();
Register ra = findRegFor(a, GpRegs);
int32_t imm = getImm32(b);
if (isS8(imm)) {
if (cond->opcode() & LIR64)
if (LIR_qeq <= condop && condop <= LIR_quge) {
if (isS8(imm))
CMPQR8(ra, imm);
else
CMPLR8(ra, imm);
} else {
if (cond->opcode() & LIR64)
else
CMPQRI(ra, imm);
} else {
NanoAssert(LIR_eq <= condop && condop <= LIR_uge);
if (isS8(imm))
CMPLR8(ra, imm);
else
CMPLRI(ra, imm);
}
@ -1327,61 +1359,103 @@ namespace nanojit
// xmm <- xmm: use movaps. movsd r,r causes partial register stall
MOVAPSR(d, s);
} else {
NanoAssert(IsFpReg(d) && !IsFpReg(s));
// xmm <- gpr: use movq xmm, r/m64 (66 REX.W 0F 6E /r)
MOVQXR(d, s);
}
}
void Assembler::regalloc_load(LIns *ins, Register &rr, int32_t &dr, Register &rb) {
void Assembler::regalloc_load(LIns *ins, RegisterMask allow, Register &rr, int32_t &dr, Register &rb) {
dr = ins->disp();
LIns *base = ins->oprnd1();
rb = getBaseReg(ins->opcode(), base, dr, BaseRegs);
if (ins->isUnusedOrHasUnknownReg()) {
// use a gpr in case we're copying a non-double
rr = prepResultReg(ins, GpRegs & ~rmask(rb));
if (ins->isUnusedOrHasUnknownReg() || !(allow & rmask(ins->getReg()))) {
rr = prepResultReg(ins, allow & ~rmask(rb));
} else {
// keep already assigned register
rr = ins->getReg();
NanoAssert(allow & rmask(rr));
freeRsrcOf(ins, false);
}
}
void Assembler::asm_load64(LIns *ins) {
Register rr, rb;
int32_t dr;
regalloc_load(ins, rr, dr, rb);
if (IsGpReg(rr)) {
// general 64bit load, 32bit const displacement
MOVQRM(rr, dr, rb);
} else {
// load 64bits into XMM. don't know if double or int64, assume double.
MOVSDRM(rr, dr, rb);
switch (ins->opcode()) {
case LIR_ldq:
case LIR_ldqc:
regalloc_load(ins, GpRegs, rr, dr, rb);
if (IsGpReg(rr)) {
// general 64bit load, 32bit const displacement
MOVQRM(rr, dr, rb);
} else {
NanoAssert(IsFpReg(rr));
// load 64bits into XMM. don't know if double or int64, assume double.
MOVSDRM(rr, dr, rb);
}
break;
case LIR_ld32f:
case LIR_ldc32f:
regalloc_load(ins, FpRegs, rr, dr, rb);
NanoAssert(IsFpReg(rr));
CVTSS2SD(rr, rr);
MOVSSRM(rr, dr, rb);
break;
default:
NanoAssertMsg(0, "asm_load64 should never receive this LIR opcode");
break;
}
}
void Assembler::asm_ld(LIns *ins) {
void Assembler::asm_load32(LIns *ins) {
NanoAssert(!ins->isQuad());
Register r, b;
int32_t d;
regalloc_load(ins, r, d, b);
regalloc_load(ins, GpRegs, r, d, b);
LOpcode op = ins->opcode();
switch (op) {
case LIR_ldcb: MOVZX8M( r, d, b); break;
case LIR_ldcs: MOVZX16M(r, d, b); break;
default: MOVLRM( r, d, b); break;
switch(op) {
case LIR_ldzb:
case LIR_ldcb:
MOVZX8M( r, d, b);
break;
case LIR_ldzs:
case LIR_ldcs:
MOVZX16M(r, d, b);
break;
case LIR_ld:
case LIR_ldc:
MOVLRM( r, d, b);
break;
case LIR_ldsb:
case LIR_ldcsb:
MOVSX8M( r, d, b);
break;
case LIR_ldss:
case LIR_ldcss:
MOVSX16M( r, d, b);
break;
default:
NanoAssertMsg(0, "asm_load32 should never receive this LIR opcode");
break;
}
}
void Assembler::asm_store64(LIns *value, int d, LIns *base) {
void Assembler::asm_store64(LOpcode op, LIns *value, int d, LIns *base) {
NanoAssert(value->isQuad());
Register b = getBaseReg(LIR_stqi, base, d, BaseRegs);
Register r;
// if we have to choose a register, use a GPR, but not the base reg
Register r;
if (value->isUnusedOrHasUnknownReg()) {
RegisterMask allow;
// If op is LIR_st32f and we have no reg, prefer FPR over GPR: saves an instruction later,
// and the value is almost certainly going to be operated on as FP later anyway.
// XXX: isFloat doesn't cover float/fmod! see bug 520208.
if (value->isFloat() || value->isop(LIR_float) || value->isop(LIR_fmod)) {
if (op == LIR_st32f || value->isFloat() || value->isop(LIR_float) || value->isop(LIR_fmod)) {
allow = FpRegs;
} else {
allow = GpRegs;
@ -1391,23 +1465,76 @@ namespace nanojit
r = value->getReg();
}
if (IsGpReg(r)) {
// gpr store
MOVQMR(r, d, b);
}
else {
// xmm store
MOVSDMR(r, d, b);
switch (op) {
case LIR_stqi:
{
if (IsGpReg(r)) {
// gpr store
MOVQMR(r, d, b);
}
else {
// xmm store
MOVSDMR(r, d, b);
}
break;
}
case LIR_st32f:
{
// need a scratch FPR reg
Register t = registerAllocTmp(FpRegs & ~rmask(r));
// store
MOVSSMR(t, d, b);
// cvt to single-precision
if (IsGpReg(r))
{
CVTSD2SS(t, t);
MOVQXR(t, r); // xmm <- gpr: use movq xmm, r/m64 (66 REX.W 0F 6E /r)
}
else
{
NanoAssert(IsFpReg(r));
CVTSD2SS(t, r);
}
XORPS(t); // break dependency chains
break;
}
default:
NanoAssertMsg(0, "asm_store64 should never receive this LIR opcode");
break;
}
}
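// Aside (not in the patch): in the st32f path above, CVTSD2SS writes only the
// low 32 bits of its destination and leaves the upper lanes untouched, so the
// scratch register would otherwise carry a false dependency on whatever last
// wrote it. The XORPS(t) -- which, given backwards emission, is the first of
// these instructions to execute at runtime -- zeroes the register to break
// that dependency, matching the "zero dest to ensure no dependency stalls"
// comment in the 32-bit backend later in this patch.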
void Assembler::asm_store32(LIns *value, int d, LIns *base) {
void Assembler::asm_store32(LOpcode op, LIns *value, int d, LIns *base) {
// quirk of x86-64: reg cannot appear to be ah/bh/ch/dh
// for single-byte stores with REX prefix
const RegisterMask SrcRegs =
(op == LIR_stb) ?
(GpRegs & ~(1<<RSP | 1<<RBP | 1<<RSI | 1<<RDI)) :
GpRegs;
NanoAssert(!value->isQuad());
Register b = getBaseReg(LIR_sti, base, d, BaseRegs);
Register r = findRegFor(value, GpRegs & ~rmask(b));
Register r = findRegFor(value, SrcRegs & ~rmask(b));
switch (op) {
case LIR_stb:
MOVBMR(r, d, b);
break;
case LIR_sts:
MOVSMR(r, d, b);
break;
case LIR_sti:
MOVLMR(r, d, b);
break;
default:
NanoAssertMsg(0, "asm_store32 should never receive this LIR opcode");
break;
}
// store 32bits to 64bit addr. use rex so we can use all 16 regs
MOVLMR(r, d, b);
}
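// Aside (not in the patch): a worked encoding example of why SrcRegs excludes
// rsp/rbp/rsi/rdi for byte stores. In ModRM, byte-register codes 4-7 mean
// ah/ch/dh/bh when no REX prefix is present, but spl/bpl/sil/dil once any REX
// prefix is emitted (and REX is what makes r8-r15 reachable at all):
//     88 23        mov byte ptr [ebx], ah     (no REX: reg field 4 = ah)
//     40 88 23     mov byte ptr [rbx], spl    (any REX: reg field 4 = spl)
//     44 88 03     mov byte ptr [rbx], r8b    (REX.R selects r8-r15)
// Avoiding those four registers sidesteps the ambiguity; the 32-bit backend
// applies the analogous a/b/c/d-only restriction further below.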
// generate a 64bit constant, must not affect condition codes!
@ -1459,10 +1586,6 @@ namespace nanojit
TODO(asm_qjoin);
}
Register Assembler::asm_prep_fcall(LIns *ins) {
return prepResultReg(ins, rmask(XMM0));
}
void Assembler::asm_param(LIns *ins) {
uint32_t a = ins->paramArg();
uint32_t kind = ins->paramKind();
@ -1570,7 +1693,7 @@ namespace nanojit
SUBQRI(RSP, amt);
}
verbose_only( outputAddr=true; asm_output("[patch entry]"); )
verbose_only( asm_output("[patch entry]"); )
NIns *patchEntry = _nIns;
MR(FP, RSP); // Establish our own FP.
PUSHR(FP); // Save caller's FP.

View File

@ -61,6 +61,7 @@ namespace nanojit
#define NJ_MAX_STACK_ENTRY 256
#define NJ_ALIGN_STACK 16
#define NJ_JTBL_SUPPORTED 1
#define NJ_EXPANDED_LOADSTORE_SUPPORTED 1
enum Register {
RAX = 0, // 1st int return, # of sse varargs
@ -190,6 +191,8 @@ namespace nanojit
X64_cmpqr8 = 0x00F8834800000004LL, // 64bit compare r,int64(imm8)
X64_cvtsi2sd= 0xC02A0F40F2000005LL, // convert int32 to double r = (double) b
X64_cvtsq2sd= 0xC02A0F48F2000005LL, // convert int64 to double r = (double) b
X64_cvtss2sd= 0xC05A0F40F3000005LL, // convert float to double r = (double) b
X64_cvtsd2ss= 0xC05A0F40F2000005LL, // convert double to float r = (float) b
X64_divsd = 0xC05E0F40F2000005LL, // divide scalar double r /= b
X64_mulsd = 0xC0590F40F2000005LL, // multiply scalar double r *= b
X64_addsd = 0xC0580F40F2000005LL, // add scalar double r += b
@ -229,6 +232,8 @@ namespace nanojit
X64_learm = 0x00000000808D4007LL, // 32bit load effective addr reg <- disp32+base
X64_learip = 0x00000000058D4807LL, // 64bit RIP-relative lea. reg <- disp32+rip (modrm = 00rrr101 = 05)
X64_movlr = 0xC08B400000000003LL, // 32bit mov r <- b
X64_movbmr = 0x0000000080884007LL, // 8bit store r -> [b+d32]
X64_movsmr = 0x8089406600000004LL, // 16bit store r -> [b+d32]
X64_movlmr = 0x0000000080894007LL, // 32bit store r -> [b+d32]
X64_movlrm = 0x00000000808B4007LL, // 32bit load r <- [b+d32]
X64_movqmr = 0x0000000080894807LL, // 64bit store gpr -> [b+d32]
@ -244,10 +249,14 @@ namespace nanojit
X64_movsdrr = 0xC0100F40F2000005LL, // 64bit mov xmm-r <- xmm-b (upper 64bits unchanged)
X64_movsdrm = 0x80100F40F2000005LL, // 64bit load xmm-r <- [b+d32] (upper 64 cleared)
X64_movsdmr = 0x80110F40F2000005LL, // 64bit store xmm-r -> [b+d32]
X64_movssrm = 0x80100F40F3000005LL, // 32bit load xmm-r <- [b+d32] (upper 96 cleared)
X64_movssmr = 0x80110F40F3000005LL, // 32bit store xmm-r -> [b+d32]
X64_movsxdr = 0xC063480000000003LL, // sign extend i32 to i64 r = (int64)(int32) b
X64_movzx8 = 0xC0B60F4000000004LL, // zero extend i8 to i64 r = (uint64)(uint8) b
X64_movzx8m = 0x80B60F4000000004LL, // zero extend i8 load to i32 r <- [b+d32]
X64_movzx16m= 0x80B70F4000000004LL, // zero extend i16 load to i32 r <- [b+d32]
X64_movsx8m = 0x80BE0F4000000004LL, // sign extend i8 load to i32 r <- [b+d32]
X64_movsx16m= 0x80BF0F4000000004LL, // sign extend i16 load to i32 r <- [b+d32]
X64_neg = 0xD8F7400000000003LL, // 32bit two's complement b = -b
X64_nop1 = 0x9000000000000001LL, // one byte NOP
X64_nop2 = 0x9066000000000002LL, // two byte NOP
@ -358,6 +367,7 @@ namespace nanojit
void emitr(uint64_t op, Register b) { emitrr(op, (Register)0, b); }\
void emitr8(uint64_t op, Register b) { emitrr8(op, (Register)0, b); }\
void emitprr(uint64_t op, Register r, Register b);\
void emitrm8(uint64_t op, Register r, int32_t d, Register b);\
void emitrm(uint64_t op, Register r, int32_t d, Register b);\
void emitrm_wide(uint64_t op, Register r, int32_t d, Register b);\
uint64_t emit_disp32(uint64_t op, int32_t d);\
@ -379,7 +389,7 @@ namespace nanojit
void asm_arith_imm(LIns*);\
void regalloc_unary(LIns *ins, RegisterMask allow, Register &rr, Register &ra);\
void regalloc_binary(LIns *ins, RegisterMask allow, Register &rr, Register &ra, Register &rb);\
void regalloc_load(LIns *ins, Register &rr, int32_t &d, Register &rb);\
void regalloc_load(LIns *ins, RegisterMask allow, Register &rr, int32_t &d, Register &rb);\
void dis(NIns *p, int bytes);\
void asm_cmp(LIns*);\
void asm_cmp_imm(LIns*);\
@ -459,6 +469,8 @@ namespace nanojit
void SUBSD(Register l, Register r);\
void CVTSQ2SD(Register l, Register r);\
void CVTSI2SD(Register l, Register r);\
void CVTSS2SD(Register l, Register r);\
void CVTSD2SS(Register l, Register r);\
void UCOMISD(Register l, Register r);\
void MOVQRX(Register l, Register r);\
void MOVQXR(Register l, Register r);\
@ -494,12 +506,18 @@ namespace nanojit
void LEAQRM(Register r1, int d, Register r2);\
void MOVLRM(Register r1, int d, Register r2);\
void MOVQRM(Register r1, int d, Register r2);\
void MOVBMR(Register r1, int d, Register r2);\
void MOVSMR(Register r1, int d, Register r2);\
void MOVLMR(Register r1, int d, Register r2);\
void MOVQMR(Register r1, int d, Register r2);\
void MOVZX8M(Register r1, int d, Register r2);\
void MOVZX16M(Register r1, int d, Register r2);\
void MOVSX8M(Register r1, int d, Register r2);\
void MOVSX16M(Register r1, int d, Register r2);\
void MOVSDRM(Register r1, int d, Register r2);\
void MOVSDMR(Register r1, int d, Register r2);\
void MOVSSMR(Register r1, int d, Register r2);\
void MOVSSRM(Register r1, int d, Register r2);\
void JMP8(size_t n, NIns* t);\
void JMP32(size_t n, NIns* t);\
void JMPX(Register indexreg, NIns** table);\

View File

@ -70,11 +70,45 @@ namespace nanojit
0 /* ABI_CDECL */
};
static bool CheckForSSE2()
{
int features = 0;
#if defined _MSC_VER
__asm
{
pushad
mov eax, 1
cpuid
mov features, edx
popad
}
#elif defined __GNUC__
asm("xchg %%esi, %%ebx\n" /* we can't clobber ebx on gcc (PIC register) */
"mov $0x01, %%eax\n"
"cpuid\n"
"mov %%edx, %0\n"
"xchg %%esi, %%ebx\n"
: "=m" (features)
: /* We have no inputs */
: "%eax", "%esi", "%ecx", "%edx"
);
#elif defined __SUNPRO_C || defined __SUNPRO_CC
asm("push %%ebx\n"
"mov $0x01, %%eax\n"
"cpuid\n"
"pop %%ebx\n"
: "=d" (features)
: /* We have no inputs */
: "%eax", "%ecx"
);
#endif
return (features & (1<<26)) != 0;
}
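For reference, the same SSE2 probe can be written without hand-rolled inline
assembly by using the <cpuid.h> helper shipped with GCC/Clang; this is an
editorial sketch, not part of the patch, and it assumes a GCC-compatible
compiler targeting x86:

#include <cpuid.h>   /* GCC/Clang only; MSVC would use __cpuid from <intrin.h> */
#include <stdio.h>

/* CPUID leaf 1, EDX bit 26 reports SSE2 -- the same bit tested above. */
static int has_sse2(void)
{
    unsigned int eax, ebx, ecx, edx;
    if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx))
        return 0;                /* CPUID leaf 1 not available */
    return (edx & (1u << 26)) != 0;
}

int main(void)
{
    printf("SSE2 supported: %s\n", has_sse2() ? "yes" : "no");
    return 0;
}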
void Assembler::nInit(AvmCore* core)
{
(void) core;
VMPI_getDate();
config.sse2 = config.sse2 && CheckForSSE2();
}
void Assembler::nBeginAssembly() {
@ -102,7 +136,7 @@ namespace nanojit
SUBi(SP, amt);
}
verbose_only( outputAddr=true; asm_output("[frag entry]"); )
verbose_only( asm_output("[frag entry]"); )
NIns *fragEntry = _nIns;
MR(FP, SP); // Establish our own FP.
PUSHr(FP); // Save caller's FP.
@ -154,7 +188,7 @@ namespace nanojit
MR(SP,FP);
// return value is GuardRecord*
LDi(EAX, int(lr));
asm_int(EAX, int(lr), /*canClobberCCs*/true);
}
NIns *Assembler::genEpilogue()
@ -167,6 +201,14 @@ namespace nanojit
void Assembler::asm_call(LInsp ins)
{
Register retReg = ( ins->isop(LIR_fcall) ? FST0 : retRegs[0] );
prepResultReg(ins, rmask(retReg));
// Do this after we've handled the call result, so we don't
// force the call result to be spilled unnecessarily.
evictScratchRegs();
const CallInfo* call = ins->callInfo();
// must be signed, not unsigned
uint32_t iargs = call->count_iargs();
@ -407,7 +449,7 @@ namespace nanojit
if (!i->getArIndex()) {
i->markAsClear();
}
LDi(r, i->imm32());
asm_int(r, i->imm32(), /*canClobberCCs*/false);
}
else if (i->isop(LIR_param) && i->paramKind() == 0 &&
(arg = i->paramArg()) >= (abi_regcount = max_abi_regs[_thisfrag->lirbuf->abi])) {
@ -430,31 +472,62 @@ namespace nanojit
}
}
void Assembler::asm_store32(LIns *value, int dr, LIns *base)
void Assembler::asm_store32(LOpcode op, LIns* value, int dr, LIns* base)
{
if (value->isconst())
{
Register rb = getBaseReg(LIR_sti, base, dr, GpRegs);
int c = value->imm32();
STi(rb, dr, c);
switch(op) {
case LIR_stb:
ST8i(rb, dr, c);
break;
case LIR_sts:
ST16i(rb, dr, c);
break;
case LIR_sti:
STi(rb, dr, c);
break;
default:
NanoAssertMsg(0, "asm_store32 should never receive this LIR opcode");
break;
}
}
else
{
// quirk of x86-32: reg must be a/b/c/d for single-byte stores
const RegisterMask SrcRegs = (op == LIR_stb) ?
(1<<EAX | 1<<ECX | 1<<EDX | 1<<EBX) :
GpRegs;
// make sure the value is in a register
Register ra, rb;
if (base->isop(LIR_alloc)) {
rb = FP;
dr += findMemFor(base);
ra = findRegFor(value, GpRegs);
ra = findRegFor(value, SrcRegs);
} else if (base->isconst()) {
// absolute address
dr += base->imm32();
ra = findRegFor(value, GpRegs);
ra = findRegFor(value, SrcRegs);
rb = UnknownReg;
} else {
findRegFor2(GpRegs, value, ra, base, rb);
findRegFor2(SrcRegs, value, ra, base, rb);
}
switch(op) {
case LIR_stb:
ST8(rb, dr, ra);
break;
case LIR_sts:
ST16(rb, dr, ra);
break;
case LIR_sti:
ST(rb, dr, ra);
break;
default:
NanoAssertMsg(0, "asm_store32 should never receive this LIR opcode");
break;
}
ST(rb, dr, ra);
}
}
@ -494,10 +567,25 @@ namespace nanojit
{
freeRsrcOf(ins, false);
Register rb = getBaseReg(ins->opcode(), base, db, GpRegs);
SSE_LDQ(rr, db, rb);
switch (ins->opcode()) {
case LIR_ldq:
case LIR_ldqc:
SSE_LDQ(rr, db, rb);
break;
case LIR_ld32f:
case LIR_ldc32f:
SSE_CVTSS2SD(rr, rr);
SSE_LDSS(rr, db, rb);
SSE_XORPDr(rr,rr);
break;
default:
NanoAssertMsg(0, "asm_load64 should never receive this LIR opcode");
break;
}
}
else
{
int dr = disp(ins);
Register rb;
if (base->isop(LIR_alloc)) {
@ -508,23 +596,79 @@ namespace nanojit
}
ins->setReg(UnknownReg);
// don't use an fpu reg to simply load & store the value.
if (dr)
asm_mmq(FP, dr, rb, db);
freeRsrcOf(ins, false);
if (isKnownReg(rr))
{
NanoAssert(rmask(rr)&FpRegs);
_allocator.retire(rr);
FLDQ(db, rb);
switch (ins->opcode()) {
case LIR_ldq:
case LIR_ldqc:
// don't use an fpu reg to simply load & store the value.
if (dr)
asm_mmq(FP, dr, rb, db);
freeRsrcOf(ins, false);
if (isKnownReg(rr))
{
NanoAssert(rmask(rr)&x87Regs);
_allocator.retire(rr);
FLDQ(db, rb);
}
break;
case LIR_ld32f:
case LIR_ldc32f:
freeRsrcOf(ins, false);
if (isKnownReg(rr))
{
NanoAssert(rmask(rr)&x87Regs);
_allocator.retire(rr);
FLD32(db, rb);
}
else
{
// need to use fpu to expand 32->64
NanoAssert(dr != 0);
FSTPQ(dr, FP);
FLD32(db, rb);
}
break;
default:
NanoAssertMsg(0, "asm_load64 should never receive this LIR opcode");
break;
}
}
}
void Assembler::asm_store64(LInsp value, int dr, LInsp base)
void Assembler::asm_store64(LOpcode op, LInsp value, int dr, LInsp base)
{
if (op == LIR_st32f)
{
Register rb;
if (base->isop(LIR_alloc)) {
rb = FP;
dr += findMemFor(base);
} else {
rb = findRegFor(base, GpRegs);
}
// if the value is already in a reg, use that; otherwise
// try to get it into XMM regs before FPU regs.
bool pop = value->isUnusedOrHasUnknownReg();
Register rv = findRegFor(value, config.sse2 ? XmmRegs : FpRegs);
if (rmask(rv) & XmmRegs)
{
// need a scratch reg
Register t = registerAllocTmp(XmmRegs);
// cvt to single-precision and store
SSE_STSS(dr, rb, t);
SSE_CVTSD2SS(t, rv);
SSE_XORPDr(t,t); // zero dest to ensure no dependency stalls
}
else
{
FST32(pop?1:0, dr, rb);
}
return;
}
NanoAssertMsg(op == LIR_stqi, "asm_store64 should never receive this LIR opcode");
if (value->isconstq())
{
// if a constant 64-bit value just store it now rather than
@ -592,9 +736,12 @@ namespace nanojit
? findRegFor(value, config.sse2 ? XmmRegs : FpRegs)
: value->getReg() );
if (rmask(rv) & XmmRegs) {
if (rmask(rv) & XmmRegs)
{
SSE_STQ(dr, rb, rv);
} else {
}
else
{
FSTQ(pop?1:0, dr, rb);
}
}
@ -1010,7 +1157,7 @@ namespace nanojit
MR(rr,ra);
}
void Assembler::asm_ld(LInsp ins)
void Assembler::asm_load32(LInsp ins)
{
LOpcode op = ins->opcode();
LIns* base = ins->oprnd1();
@ -1020,13 +1167,31 @@ namespace nanojit
if (base->isconst()) {
intptr_t addr = base->imm32();
addr += d;
if (op == LIR_ldcb)
LD8Zdm(rr, addr);
else if (op == LIR_ldcs)
LD16Zdm(rr, addr);
else
LDdm(rr, addr);
return;
switch(op) {
case LIR_ldzb:
case LIR_ldcb:
LD8Zdm(rr, addr);
return;
case LIR_ldsb:
case LIR_ldcsb:
LD8Sdm(rr, addr);
return;
case LIR_ldzs:
case LIR_ldcs:
LD16Zdm(rr, addr);
return;
case LIR_ldss:
case LIR_ldcss:
LD16Sdm(rr, addr);
return;
case LIR_ld:
case LIR_ldc:
LDdm(rr, addr);
return;
default:
NanoAssertMsg(0, "asm_load32 should never receive this LIR opcode");
return;
}
}
/* Search for add(X,Y) */
@ -1058,23 +1223,59 @@ namespace nanojit
? findSpecificRegForUnallocated(rhs, rr)
: findRegFor(rhs, GpRegs & ~(rmask(rleft))) );
if (op == LIR_ldcb)
LD8Zsib(rr, d, rleft, rright, scale);
else if (op == LIR_ldcs)
LD16Zsib(rr, d, rleft, rright, scale);
else
LDsib(rr, d, rleft, rright, scale);
return;
switch(op) {
case LIR_ldzb:
case LIR_ldcb:
LD8Zsib(rr, d, rleft, rright, scale);
return;
case LIR_ldsb:
case LIR_ldcsb:
LD8Ssib(rr, d, rleft, rright, scale);
return;
case LIR_ldzs:
case LIR_ldcs:
LD16Zsib(rr, d, rleft, rright, scale);
return;
case LIR_ldss:
case LIR_ldcss:
LD16Ssib(rr, d, rleft, rright, scale);
return;
case LIR_ld:
case LIR_ldc:
LDsib(rr, d, rleft, rright, scale);
return;
default:
NanoAssertMsg(0, "asm_load32 should never receive this LIR opcode");
return;
}
}
Register ra = getBaseReg(op, base, d, GpRegs);
if (op == LIR_ldcb)
LD8Z(rr, d, ra);
else if (op == LIR_ldcs)
LD16Z(rr, d, ra);
else
LD(rr, d, ra);
switch(op) {
case LIR_ldzb:
case LIR_ldcb:
LD8Z(rr, d, ra);
return;
case LIR_ldsb:
case LIR_ldcsb:
LD8S(rr, d, ra);
return;
case LIR_ldzs:
case LIR_ldcs:
LD16Z(rr, d, ra);
return;
case LIR_ldss:
case LIR_ldcss:
LD16S(rr, d, ra);
return;
case LIR_ld:
case LIR_ldc:
LD(rr, d, ra);
return;
default:
NanoAssertMsg(0, "asm_load32 should never receive this LIR opcode");
return;
}
}
void Assembler::asm_cmov(LInsp ins)
@ -1150,11 +1351,15 @@ namespace nanojit
void Assembler::asm_int(LInsp ins)
{
Register rr = prepResultReg(ins, GpRegs);
int32_t val = ins->imm32();
if (val == 0)
XOR(rr,rr);
asm_int(rr, ins->imm32(), /*canClobberCCs*/true);
}
void Assembler::asm_int(Register r, int32_t val, bool canClobberCCs)
{
if (val == 0 && canClobberCCs)
XOR(r, r);
else
LDi(rr, val);
LDi(r, val);
}
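// Aside (not in the patch): the canClobberCCs flag exists because "xor r,r"
// is the shortest way to load zero but, unlike "mov r,0", it rewrites EFLAGS.
// The zero idiom is therefore only used when the caller guarantees the
// condition codes are dead at this point, e.g. not between a compare and the
// branch that consumes it.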
void Assembler::asm_quad(LInsp ins)
@ -1182,7 +1387,7 @@ namespace nanojit
Register gr = registerAllocTmp(GpRegs);
SSE_CVTSI2SD(rr, gr);
SSE_XORPDr(rr,rr); // zero rr to ensure no dependency stalls
LDi(gr, (int)d);
asm_int(gr, (int)d, /*canClobberCCs*/true);
} else {
findMemFor(ins);
const int d = disp(ins);
@ -1329,7 +1534,7 @@ namespace nanojit
if (isKnownReg(r)) {
// arg goes in specific register
if (p->isconst()) {
LDi(r, p->imm32());
asm_int(r, p->imm32(), /*canClobberCCs*/true);
} else {
if (p->isUsed()) {
if (!p->hasKnownReg()) {
@ -1533,11 +1738,6 @@ namespace nanojit
}
}
Register Assembler::asm_prep_fcall(LInsp ins)
{
return prepResultReg(ins, rmask(FST0));
}
void Assembler::asm_u2f(LInsp ins)
{
// where our result goes
@ -1876,6 +2076,6 @@ namespace nanojit
SWAP(NIns*, codeEnd, exitEnd);
verbose_only( SWAP(size_t, codeBytes, exitBytes); )
}
#endif /* FEATURE_NANOJIT */
}

View File

@ -96,6 +96,7 @@ namespace nanojit
#define NJ_MAX_STACK_ENTRY 256
#define NJ_MAX_PARAMETERS 1
#define NJ_JTBL_SUPPORTED 1
#define NJ_EXPANDED_LOADSTORE_SUPPORTED 1
// Preserve a 16-byte stack alignment, to support the use of
// SSE instructions like MOVDQA (if not by Tamarin itself,
@ -178,6 +179,7 @@ namespace nanojit
void nativePageReset();\
void nativePageSetup();\
void underrunProtect(int);\
void asm_int(Register r, int32_t val, bool canClobberCCs);\
void asm_stkarg(LInsp p, int32_t& stkd);\
void asm_farg(LInsp, int32_t& stkd);\
void asm_arg(ArgSize sz, LInsp p, Register r, int32_t& stkd);\
@ -185,14 +187,24 @@ namespace nanojit
void asm_fcmp(LIns *cond);\
NIns* asm_fbranch(bool, LIns*, NIns*);\
void asm_cmp(LIns *cond); \
void asm_div_mod(LIns *cond);
void asm_div_mod(LIns *cond); \
void asm_load(int d, Register r);
#define IMM8(i) \
_nIns -= 1; \
*((int8_t*)_nIns) = (int8_t)(i)
#define IMM16(i) \
_nIns -= 2; \
*((int16_t*)_nIns) = (int16_t)(i)
#define IMM32(i) \
_nIns -= 4; \
*((int32_t*)_nIns) = (int32_t)(i)
// XXX rearrange NanoAssert() expression to work around apparent gcc 4.3 bug:
// XXX "error: logical && with non-zero constant will always evaluate as true"
// underrunProtect(6) is necessary for worst-case
#define MODRMs(r,d,b,l,i) \
NanoAssert(unsigned(i)<8 && unsigned(b)<8 && unsigned(r)<8); \
if ((d) == 0 && (b) != EBP) { \
@ -210,6 +222,7 @@ namespace nanojit
*(--_nIns) = (uint8_t) ( 2<<6 | (r)<<3 | 4 ); \
}
// underrunProtect(6) is necessary for worst-case
#define MODRMm(r,d,b) \
NanoAssert(unsigned(r)<8 && ((b)==UnknownReg || unsigned(b)<8)); \
if ((b) == UnknownReg) {\
@ -440,37 +453,62 @@ namespace nanojit
asm_output("mov %s,%d(%s+%s*%c)",gpn(reg),disp,gpn(base),gpn(index),SIBIDX(scale)); \
} while (0)
// note: movzx/movsx are being output with an 8/16 suffix to indicate the size
// being loaded. this doesn't really match standard intel format (though is arguably
// terser and more obvious in this case) and would probably be nice to fix.
// (likewise, the 8/16 bit stores being output as "mov8" and "mov16" respectively.)
// load 16-bit, sign extend
#define LD16S(r,d,b) do { count_ld(); ALU2m(0x0fbf,r,d,b); asm_output("movsx %s,%d(%s)", gpn(r),d,gpn(b)); } while(0)
#define LD16S(r,d,b) do { count_ld(); ALU2m(0x0fbf,r,d,b); asm_output("movsx16 %s,%d(%s)", gpn(r),d,gpn(b)); } while(0)
#define LD16Sdm(r,addr) do { count_ld(); ALU2dm(0x0fbf,r,addr); asm_output("movsx16 %s,0(%lx)", gpn(r),(unsigned long)addr); } while (0)
#define LD16Ssib(r,disp,base,index,scale) do { \
count_ld(); \
ALU2sib(0x0fbf,r,base,index,scale,disp); \
asm_output("movsx16 %s,%d(%s+%s*%c)",gpn(r),disp,gpn(base),gpn(index),SIBIDX(scale)); \
} while (0)
// load 16-bit, zero extend
#define LD16Z(r,d,b) do { count_ld(); ALU2m(0x0fb7,r,d,b); asm_output("movsz %s,%d(%s)", gpn(r),d,gpn(b)); } while(0)
#define LD16Z(r,d,b) do { count_ld(); ALU2m(0x0fb7,r,d,b); asm_output("movzx16 %s,%d(%s)", gpn(r),d,gpn(b)); } while(0)
#define LD16Zdm(r,addr) do { count_ld(); ALU2dm(0x0fb7,r,addr); asm_output("movsz %s,0(%lx)", gpn(r),(unsigned long)addr); } while (0)
#define LD16Zdm(r,addr) do { count_ld(); ALU2dm(0x0fb7,r,addr); asm_output("movzx16 %s,0(%lx)", gpn(r),(unsigned long)addr); } while (0)
#define LD16Zsib(r,disp,base,index,scale) do { \
count_ld(); \
ALU2sib(0x0fb7,r,base,index,scale,disp); \
asm_output("movsz %s,%d(%s+%s*%c)",gpn(r),disp,gpn(base),gpn(index),SIBIDX(scale)); \
asm_output("movzx16 %s,%d(%s+%s*%c)",gpn(r),disp,gpn(base),gpn(index),SIBIDX(scale)); \
} while (0)
// load 8-bit, zero extend
#define LD8Z(r,d,b) do { count_ld(); ALU2m(0x0fb6,r,d,b); asm_output("movzx %s,%d(%s)", gpn(r),d,gpn(b)); } while(0)
#define LD8Z(r,d,b) do { count_ld(); ALU2m(0x0fb6,r,d,b); asm_output("movzx8 %s,%d(%s)", gpn(r),d,gpn(b)); } while(0)
#define LD8Zdm(r,addr) do { \
count_ld(); \
NanoAssert((d)>=0&&(d)<=31); \
ALU2dm(0x0fb6,r,addr); \
asm_output("movzx %s,0(%lx)", gpn(r),(long unsigned)addr); \
asm_output("movzx8 %s,0(%lx)", gpn(r),(long unsigned)addr); \
} while(0)
#define LD8Zsib(r,disp,base,index,scale) do { \
count_ld(); \
NanoAssert((d)>=0&&(d)<=31); \
ALU2sib(0x0fb6,r,base,index,scale,disp); \
asm_output("movzx %s,%d(%s+%s*%c)",gpn(r),disp,gpn(base),gpn(index),SIBIDX(scale)); \
asm_output("movzx8 %s,%d(%s+%s*%c)",gpn(r),disp,gpn(base),gpn(index),SIBIDX(scale)); \
} while(0)
// load 8-bit, sign extend
#define LD8S(r,d,b) do { count_ld(); ALU2m(0x0fbe,r,d,b); asm_output("movsx8 %s,%d(%s)", gpn(r),d,gpn(b)); } while(0)
#define LD8Sdm(r,addr) do { \
count_ld(); \
ALU2dm(0x0fbe,r,addr); \
asm_output("movsx8 %s,0(%lx)", gpn(r),(long unsigned)addr); \
} while(0)
#define LD8Ssib(r,disp,base,index,scale) do { \
count_ld(); \
ALU2sib(0x0fbe,r,base,index,scale,disp); \
asm_output("movsx8 %s,%d(%s+%s*%c)",gpn(r),disp,gpn(base),gpn(index),SIBIDX(scale)); \
} while(0)
#define LDi(r,i) do { \
count_ld();\
@ -480,14 +518,43 @@ namespace nanojit
*(--_nIns) = (uint8_t) (0xb8 | (r) ); \
asm_output("mov %s,%d",gpn(r),i); } while(0)
// quirk of x86-32: reg must be a/b/c/d for byte stores here
#define ST8(base,disp,reg) do { \
count_st();\
NanoAssert(((unsigned)reg)<4); \
ALUm(0x88,reg,disp,base); \
asm_output("mov8 %d(%s),%s",disp,base==UnknownReg?"0":gpn(base),gpn(reg)); } while(0)
#define ST16(base,disp,reg) do { \
count_st();\
ALUm16(0x89,reg,disp,base); \
asm_output("mov16 %d(%s),%s",disp,base==UnknownReg?"0":gpn(base),gpn(reg)); } while(0)
#define ST(base,disp,reg) do { \
count_st();\
ALUm(0x89,reg,disp,base); \
asm_output("mov %d(%s),%s",disp,base==UnknownReg?"0":gpn(base),gpn(reg)); } while(0)
#define ST8i(base,disp,imm) do { \
count_st();\
underrunProtect(8); \
IMM8(imm); \
MODRMm(0, disp, base); \
*(--_nIns) = 0xc6; \
asm_output("mov8 %d(%s),%d",disp,gpn(base),imm); } while(0)
#define ST16i(base,disp,imm) do { \
count_st();\
underrunProtect(10); \
IMM16(imm); \
MODRMm(0, disp, base); \
*(--_nIns) = 0xc7; \
*(--_nIns) = 0x66; \
asm_output("mov16 %d(%s),%d",disp,gpn(base),imm); } while(0)
#define STi(base,disp,imm) do { \
count_st();\
underrunProtect(12); \
underrunProtect(11); \
IMM32(imm); \
MODRMm(0, disp, base); \
*(--_nIns) = 0xc7; \
@ -680,12 +747,36 @@ namespace nanojit
asm_output("movq %d(%s),%s",(d),gpn(b),gpn(r)); \
} while(0)
#define SSE_LDSS(r,d,b)do { \
count_ld();\
SSEm(0xf30f10, (r)&7, (d), (b)); \
asm_output("movss %s,%d(%s)",gpn(r),d,gpn(b)); \
} while(0)
#define SSE_STSS(d,b,r)do { \
count_st();\
SSEm(0xf30f11, (r)&7, (d), (b)); \
asm_output("movss %d(%s),%s",(d),gpn(b),gpn(r)); \
} while(0)
#define SSE_CVTSI2SD(xr,gr) do{ \
count_fpu();\
SSE(0xf20f2a, (xr)&7, (gr)&7); \
asm_output("cvtsi2sd %s,%s",gpn(xr),gpn(gr)); \
} while(0)
#define SSE_CVTSD2SS(xr,gr) do{ \
count_fpu();\
SSE(0xf20f5a, (xr)&7, (gr)&7); \
asm_output("cvtsd2ss %s,%s",gpn(xr),gpn(gr)); \
} while(0)
#define SSE_CVTSS2SD(xr,gr) do{ \
count_fpu();\
SSE(0xf30f5a, (xr)&7, (gr)&7); \
asm_output("cvtss2sd %s,%s",gpn(xr),gpn(gr)); \
} while(0)
#define CVTDQ2PD(dstr,srcr) do{ \
count_fpu();\
SSE(0xf30fe6, (dstr)&7, (srcr)&7); \
@ -828,9 +919,11 @@ namespace nanojit
#define FLD1() do { count_fpu(); FPUc(0xd9e8); asm_output("fld1"); fpu_push(); } while(0)
#define FLDZ() do { count_fpu(); FPUc(0xd9ee); asm_output("fldz"); fpu_push(); } while(0)
#define FFREE(r) do { count_fpu(); FPU(0xddc0, r); asm_output("ffree %s",fpn(r)); } while(0)
#define FST32(p,d,b) do { count_stq(); FPUm(0xd902|(p), d, b); asm_output("fst%s32 %d(%s)",((p)?"p":""),d,gpn(b)); if (p) fpu_pop(); } while(0)
#define FSTQ(p,d,b) do { count_stq(); FPUm(0xdd02|(p), d, b); asm_output("fst%sq %d(%s)",((p)?"p":""),d,gpn(b)); if (p) fpu_pop(); } while(0)
#define FSTPQ(d,b) FSTQ(1,d,b)
#define FCOM(p,d,b) do { count_fpuld(); FPUm(0xdc02|(p), d, b); asm_output("fcom%s %d(%s)",((p)?"p":""),d,gpn(b)); if (p) fpu_pop(); } while(0)
#define FLD32(d,b) do { count_ldq(); FPUm(0xd900, d, b); asm_output("fld32 %d(%s)",d,gpn(b)); fpu_push();} while(0)
#define FLDQ(d,b) do { count_ldq(); FPUm(0xdd00, d, b); asm_output("fldq %d(%s)",d,gpn(b)); fpu_push();} while(0)
#define FILDQ(d,b) do { count_fpuld(); FPUm(0xdf05, d, b); asm_output("fildq %d(%s)",d,gpn(b)); fpu_push(); } while(0)
#define FILD(d,b) do { count_fpuld(); FPUm(0xdb00, d, b); asm_output("fild %d(%s)",d,gpn(b)); fpu_push(); } while(0)

View File

@ -35,11 +35,6 @@
#include "nanojit.h"
#ifdef SOLARIS
#include <ucontext.h>
#include <dlfcn.h>
#include <procfs.h>
#include <sys/stat.h>
extern "C" caddr_t _getfp(void);
typedef caddr_t maddr_ptr;
#else
typedef void *maddr_ptr;

View File

@ -35,11 +35,6 @@
#include "nanojit.h"
#ifdef SOLARIS
#include <ucontext.h>
#include <dlfcn.h>
#include <procfs.h>
#include <sys/stat.h>
extern "C" caddr_t _getfp(void);
typedef caddr_t maddr_ptr;
#else
typedef void *maddr_ptr;

View File

@ -249,14 +249,13 @@ namespace nanojit {
and below, so that callers can use bits 16 and above for
themselves. */
// TODO: add entries for the writer pipeline
LC_FragProfile = 1<<7, // collect per-frag usage counts
LC_Activation = 1<<6, // enable printActivationState
LC_Liveness = 1<<5, // (show LIR liveness analysis)
LC_ReadLIR = 1<<4, // As read from LirBuffer
LC_AfterSF = 1<<3, // After StackFilter
LC_RegAlloc = 1<<2, // stuff to do with reg alloc
LC_Assembly = 1<<1, // final assembly
LC_NoCodeAddrs = 1<<0 // (don't show code addresses on asm output)
LC_FragProfile = 1<<6, // collect per-frag usage counts
LC_Activation = 1<<5, // enable printActivationState
LC_Liveness = 1<<4, // (show LIR liveness analysis)
LC_ReadLIR = 1<<3, // As read from LirBuffer
LC_AfterSF = 1<<2, // After StackFilter
LC_RegAlloc = 1<<1, // stuff to do with reg alloc
LC_Assembly = 1<<0 // final assembly
};
class LogControl

View File

@ -52,7 +52,7 @@ script regress-354145-03.js
script regress-354145-04.js
script regress-354145-05.js
script regress-354145-07.js
skip-if(!xulRuntime.shell&&isDebugBuild) script regress-354998.js # very slow; test needs revising
script regress-354998.js
script regress-355474-02.js
script regress-355478.js
script regress-355569.js

View File

@ -5,3 +5,4 @@ script fe-002.js
script regress-518103.js
script regress-524826.js
script regress-528082.js
script regress-533254.js

View File

@ -0,0 +1,29 @@
/*
* Any copyright is dedicated to the Public Domain.
* http://creativecommons.org/licenses/publicdomain/
*/
var gTestfile = 'regress-533254.js';
var BUGNUMBER = 533254;
var summary = 'init-method late in table-big initialiser screwup';
printBugNumber(BUGNUMBER);
printStatus(summary);
function f() {
var proto = {p8:8};
var obj = {
p0:0, p1:1, p2:2, p3:3, p4:4, p5:5, p6:6, p7:7, p8:8, p9:9,
p10:0, p11:1, p12:2, p13:3, p14:4, p15:5, p16:6, p17:7, p18:8, p19:9,
m: function() { return 42; }
};
return obj;
}
var expect = f(),
actual = f();
expect += '';
actual += '';
reportCompare(expect, actual, summary);
printStatus("All tests passed!");

View File

@ -2,3 +2,5 @@ url-prefix ../../jsreftest.html?test=js1_4/Eval/
script eval-001.js
script eval-002.js
script eval-003.js
script regress-531037.js
script regress-531682.js

View File

@ -82,3 +82,4 @@ script regress-479740.js
script regress-481800.js
script regress-483749.js
script regress-499524.js
script regress-532491.js

View File

@ -0,0 +1,26 @@
/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
function mk() {
return (function () {});
}
function f() {
var j = 55;
var f = function () {
return j;
};
var g = function() {};
var a = [ mk(), f, g, mk(), mk() ];
for (var i = 0; i < 5; ++i) {
a[i].p = 99;
}
}
f();
for (var i = 0; i < 9; i++)
({__parent__: []} = []);

View File

@ -0,0 +1,27 @@
function f() {
var _76 = {};
for (var i = 0; i < arguments.length; i++) {
var typ = arguments[i];
_76[typ] = typ;
}
return function () {
for (var i = 0; i < arguments.length; i++) {
if (!(typeof (arguments[i]) in _76)) {
return false;
}
}
return true;
}
}
g = f("number", "boolean", "object");
g("a", "b", "c", "d", "e", "f", 2);
g(2, "a", "b", "c", "d", "e", "f", 2);
/*
* Don't assert --
* Assertion failed: frame entry -4 wasn't freed
* : _activation.entry[i] == 0 (../nanojit/Assembler.cpp:786)
*/

View File

@ -0,0 +1,2 @@
for (var i = 0; i < 9; i++)
({__parent__: []} = []);