From e82972a6ac5c6cebb1419cfe98105c7cc9f09f6d Mon Sep 17 00:00:00 2001 From: Nicholas Nethercote Date: Sun, 25 Jul 2010 19:17:39 -0700 Subject: [PATCH] Bug 552812 - nanojit: factor out AccSet differences into TM and TR (take 2). r=edwsmith. --HG-- extra : convert_revision : 7e11df137a4d30bd8411cf3b35cb26a003188f86 --- js/src/lirasm/lirasm.cpp | 50 ++-- js/src/nanojit/LIR.cpp | 477 +++++++++++++++++++-------------------- js/src/nanojit/LIR.h | 376 ++++++++++++++++++------------ 3 files changed, 499 insertions(+), 404 deletions(-) diff --git a/js/src/lirasm/lirasm.cpp b/js/src/lirasm/lirasm.cpp index b211002406f..067066d955a 100644 --- a/js/src/lirasm/lirasm.cpp +++ b/js/src/lirasm/lirasm.cpp @@ -95,6 +95,10 @@ nanojit::StackFilter::getTop(LIns*) return 0; } +// We lump everything into a single access region for lirasm. +static const AccSet ACCSET_OTHER = (1 << 0); +static const uint8_t LIRASM_NUM_USED_ACCS = 1; + #if defined NJ_VERBOSE void nanojit::LInsPrinter::formatGuard(InsBuf *buf, LIns *ins) @@ -124,6 +128,22 @@ nanojit::LInsPrinter::formatGuardXov(InsBuf *buf, LIns *ins) (long)x->line, ins->record()->profGuardID); } + +const char* +nanojit::LInsPrinter::accNames[] = { + "o", // (1 << 0) == ACCSET_OTHER + "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", // 1..10 (unused) + "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", // 11..20 (unused) + "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", // 21..30 (unused) + "?" // 31 (unused) +}; +#endif + +#ifdef DEBUG +void ValidateWriter::checkAccSet(LOpcode op, LIns* base, AccSet accSet) +{ + NanoAssert(accSet == ACCSET_OTHER); +} #endif typedef int32_t (FASTCALL *RetInt)(); @@ -148,7 +168,7 @@ enum ReturnType { #endif #define CI(name, args) \ - {(uintptr_t) (&name), args, nanojit::ABI_CDECL, /*isPure*/0, ACC_STORE_ANY \ + {(uintptr_t) (&name), args, nanojit::ABI_CDECL, /*isPure*/0, ACCSET_STORE_ANY \ DEBUG_ONLY_NAME(name)} #define FN(name, args) \ @@ -504,7 +524,7 @@ FragmentAssembler::FragmentAssembler(Lirasm &parent, const string &fragmentName, } #endif if (optimize) { - mLir = mCseFilter = new CseFilter(mLir, mParent.mAlloc); + mLir = mCseFilter = new CseFilter(mLir, LIRASM_NUM_USED_ACCS, mParent.mAlloc); } #if NJ_SOFTFLOAT_SUPPORTED if (avmplus::AvmCore::config.soft_float) { @@ -610,7 +630,7 @@ FragmentAssembler::assemble_load() mTokens[1].find_first_of("0123456789") == 0) { return mLir->insLoad(mOpcode, ref(mTokens[0]), - immI(mTokens[1]), ACC_LOAD_ANY); + immI(mTokens[1]), ACCSET_OTHER); } bad("immediate offset required for load"); return NULL; // not reached @@ -1061,7 +1081,7 @@ FragmentAssembler::assembleFragment(LirTokenStream &in, bool implicitBegin, cons need(3); ins = mLir->insStore(mOpcode, ref(mTokens[0]), ref(mTokens[1]), - immI(mTokens[2]), ACC_STORE_ANY); + immI(mTokens[2]), ACCSET_OTHER); break; #if NJ_EXPANDED_LOADSTORE_SUPPORTED @@ -1313,8 +1333,8 @@ const CallInfo ci_V_IQF = CI(f_V_IQF, CallInfo::typeSig3(ARGTYPE_V, ARGTYPE_I, A // - LIR_modd (not implemented in NJ backends) // // Other limitations: -// - Loads always use accSet==ACC_LOAD_ANY -// - Stores always use accSet==ACC_STORE_ANY +// - Loads always use accSet==ACCSET_OTHER +// - Stores always use accSet==ACCSET_OTHER // void FragmentAssembler::assembleRandomFragment(int nIns) @@ -1817,7 +1837,7 @@ FragmentAssembler::assembleRandomFragment(int nIns) vector Ms = rnd(2) ? 
M4s : M8ps; if (!Ms.empty()) { LIns* base = rndPick(Ms); - ins = mLir->insLoad(rndPick(I_loads), base, rndOffset32(base->size()), ACC_LOAD_ANY); + ins = mLir->insLoad(rndPick(I_loads), base, rndOffset32(base->size()), ACCSET_OTHER); addOrReplace(Is, ins); n++; } @@ -1828,7 +1848,7 @@ FragmentAssembler::assembleRandomFragment(int nIns) case LLD_Q: if (!M8ps.empty()) { LIns* base = rndPick(M8ps); - ins = mLir->insLoad(rndPick(Q_loads), base, rndOffset64(base->size()), ACC_LOAD_ANY); + ins = mLir->insLoad(rndPick(Q_loads), base, rndOffset64(base->size()), ACCSET_OTHER); addOrReplace(Qs, ins); n++; } @@ -1838,7 +1858,7 @@ FragmentAssembler::assembleRandomFragment(int nIns) case LLD_D: if (!M8ps.empty()) { LIns* base = rndPick(M8ps); - ins = mLir->insLoad(rndPick(D_loads), base, rndOffset64(base->size()), ACC_LOAD_ANY); + ins = mLir->insLoad(rndPick(D_loads), base, rndOffset64(base->size()), ACCSET_OTHER); addOrReplace(Ds, ins); n++; } @@ -1848,7 +1868,7 @@ FragmentAssembler::assembleRandomFragment(int nIns) vector Ms = rnd(2) ? M4s : M8ps; if (!Ms.empty() && !Is.empty()) { LIns* base = rndPick(Ms); - mLir->insStore(rndPick(Is), base, rndOffset32(base->size()), ACC_STORE_ANY); + mLir->insStore(rndPick(Is), base, rndOffset32(base->size()), ACCSET_OTHER); n++; } break; @@ -1858,7 +1878,7 @@ FragmentAssembler::assembleRandomFragment(int nIns) case LST_Q: if (!M8ps.empty() && !Qs.empty()) { LIns* base = rndPick(M8ps); - mLir->insStore(rndPick(Qs), base, rndOffset64(base->size()), ACC_STORE_ANY); + mLir->insStore(rndPick(Qs), base, rndOffset64(base->size()), ACCSET_OTHER); n++; } break; @@ -1867,7 +1887,7 @@ FragmentAssembler::assembleRandomFragment(int nIns) case LST_D: if (!M8ps.empty() && !Ds.empty()) { LIns* base = rndPick(M8ps); - mLir->insStore(rndPick(Ds), base, rndOffset64(base->size()), ACC_STORE_ANY); + mLir->insStore(rndPick(Ds), base, rndOffset64(base->size()), ACCSET_OTHER); n++; } break; @@ -1977,7 +1997,7 @@ Lirasm::Lirasm(bool verbose) : #ifdef DEBUG if (mVerbose) { mLogc.lcbits = LC_ReadLIR | LC_AfterDCE | LC_Native | LC_RegAlloc | LC_Activation; - mLirbuf->printer = new (mAlloc) LInsPrinter(mAlloc); + mLirbuf->printer = new (mAlloc) LInsPrinter(mAlloc, LIRASM_NUM_USED_ACCS); } #endif @@ -2016,13 +2036,13 @@ Lirasm::lookupFunction(const string &name, CallInfo *&ci) // The ABI, arg types and ret type will be overridden by the caller. if (func->second.mReturnType == RT_FLOAT) { CallInfo target = {(uintptr_t) func->second.rfloat, - 0, ABI_FASTCALL, /*isPure*/0, ACC_STORE_ANY + 0, ABI_FASTCALL, /*isPure*/0, ACCSET_STORE_ANY verbose_only(, func->first.c_str()) }; *ci = target; } else { CallInfo target = {(uintptr_t) func->second.rint, - 0, ABI_FASTCALL, /*isPure*/0, ACC_STORE_ANY + 0, ABI_FASTCALL, /*isPure*/0, ACCSET_STORE_ANY verbose_only(, func->first.c_str()) }; *ci = target; } diff --git a/js/src/nanojit/LIR.cpp b/js/src/nanojit/LIR.cpp index 7cf057882be..4ae1c1e461e 100644 --- a/js/src/nanojit/LIR.cpp +++ b/js/src/nanojit/LIR.cpp @@ -294,18 +294,18 @@ namespace nanojit return ins; } - LIns* LirBufWriter::insLoad(LOpcode op, LIns* base, int32_t d, AccSet accSet) + LIns* LirBufWriter::insLoad(LOpcode op, LIns* base, int32_t d, AccSet accSet, LoadQual loadQual) { if (isS16(d)) { LInsLd* insLd = (LInsLd*)_buf->makeRoom(sizeof(LInsLd)); LIns* ins = insLd->getLIns(); - ins->initLInsLd(op, base, d, accSet); + ins->initLInsLd(op, base, d, accSet, loadQual); return ins; } else { // If the displacement is more than 16 bits, put it in a separate instruction. 
// Note that CseFilter::insLoad() also does this, so this will // only occur if CseFilter has been removed from the pipeline. - return insLoad(op, ins2(LIR_addp, base, insImmWord(d)), 0, accSet); + return insLoad(op, ins2(LIR_addp, base, insImmWord(d)), 0, accSet, loadQual); } } @@ -1102,7 +1102,7 @@ namespace nanojit return out->insBranchJov(op, oprnd1, oprnd2, target); } - LIns* ExprFilter::insLoad(LOpcode op, LIns* base, int32_t off, AccSet accSet) { + LIns* ExprFilter::insLoad(LOpcode op, LIns* base, int32_t off, AccSet accSet, LoadQual loadQual) { if (base->isImmP() && !isS8(off)) { // if the effective address is constant, then transform: // ld const[bigconst] => ld (const+bigconst)[0] @@ -1110,9 +1110,9 @@ namespace nanojit // under the assumption that we're more likely to CSE-match the // constant base address if we dont const-fold small offsets. uintptr_t p = (uintptr_t)base->immP() + off; - return out->insLoad(op, insImmP((void*)p), 0, accSet); + return out->insLoad(op, insImmP((void*)p), 0, accSet, loadQual); } - return out->insLoad(op, base, off, accSet); + return out->insLoad(op, base, off, accSet, loadQual); } LIns* LirWriter::insStore(LIns* value, LIns* base, int32_t d, AccSet accSet) @@ -1602,19 +1602,27 @@ namespace nanojit return e ? e->name : NULL; } - char* LInsPrinter::formatAccSet(RefBuf* buf, AccSet accSet) { - int i = 0; - // 'c' is short for "const", because 'r' is used for RSTACK. - if (accSet & ACC_READONLY) { buf->buf[i++] = 'c'; accSet &= ~ACC_READONLY; } - if (accSet & ACC_STACK) { buf->buf[i++] = 's'; accSet &= ~ACC_STACK; } - if (accSet & ACC_RSTACK) { buf->buf[i++] = 'r'; accSet &= ~ACC_RSTACK; } - if (accSet & ACC_OTHER) { buf->buf[i++] = 'o'; accSet &= ~ACC_OTHER; } - // This assertion will fail if we add a new accSet value but - // forget to handle it here. - NanoAssert(accSet == 0); - buf->buf[i] = 0; - NanoAssert(size_t(i) < buf->len); + if (accSet == ACCSET_NONE) { + VMPI_sprintf(buf->buf, ".none"); + } else if (accSet == ACCSET_ALL) { + VMPI_sprintf(buf->buf, ".all"); + } else { + char* b = buf->buf; + b[0] = 0; + // The AccSet may contain bits set for regions not used by the + // embedding, if any have been specified via + // (ACCSET_ALL & ~ACCSET_XYZ). So only print those that are + // relevant. 
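+            // e.g. with two used regions named "a" and "b" (hypothetical names),
+            // an AccSet containing both bits prints as ".a.b", and a
+            // single-region AccSet prints as ".a".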
+ for (int i = 0; i < EMB_NUM_USED_ACCS; i++) { + if (accSet & (1 << i)) { + VMPI_strcat(b, "."); + VMPI_strcat(b, accNames[i]); + accSet &= ~(1 << i); + } + } + NanoAssert(VMPI_strlen(b) < buf->len); + } return buf->buf; } @@ -1740,11 +1748,11 @@ namespace nanojit int32_t argc = i->argc(); int32_t m = int32_t(n); // Windows doesn't have 'ssize_t' if (call->isIndirect()) - m -= VMPI_snprintf(s, m, "%s = %s.%s [%s] ( ", formatRef(&b1, i), lirNames[op], + m -= VMPI_snprintf(s, m, "%s = %s%s [%s] ( ", formatRef(&b1, i), lirNames[op], formatAccSet(&b2, call->_storeAccSet), formatRef(&b3, i->arg(--argc))); else - m -= VMPI_snprintf(s, m, "%s = %s.%s #%s ( ", formatRef(&b1, i), lirNames[op], + m -= VMPI_snprintf(s, m, "%s = %s%s #%s ( ", formatRef(&b1, i), lirNames[op], formatAccSet(&b2, call->_storeAccSet), call->_name); if (m < 0) break; for (int32_t j = argc - 1; j >= 0; j--) { @@ -1907,12 +1915,19 @@ namespace nanojit case LIR_ldus2ui: case LIR_ldc2i: case LIR_lds2i: - case LIR_ldf2d: - VMPI_snprintf(s, n, "%s = %s.%s %s[%d]", formatRef(&b1, i), lirNames[op], - formatAccSet(&b2, i->accSet()), - formatRef(&b3, i->oprnd1()), + case LIR_ldf2d: { + const char* qualStr; + switch (i->loadQual()) { + case LOAD_CONST: qualStr = "/c"; break; + case LOAD_NORMAL: qualStr = ""; break; + case LOAD_VOLATILE: qualStr = "/v"; break; + default: NanoAssert(0); qualStr = "/?"; break; + } + VMPI_snprintf(s, n, "%s = %s%s%s %s[%d]", formatRef(&b1, i), lirNames[op], + formatAccSet(&b2, i->accSet()), qualStr, formatRef(&b3, i->oprnd1()), i->disp()); break; + } case LIR_sti: CASE64(LIR_stq:) @@ -1920,7 +1935,7 @@ namespace nanojit case LIR_sti2c: case LIR_sti2s: case LIR_std2f: - VMPI_snprintf(s, n, "%s.%s %s[%d] = %s", lirNames[op], + VMPI_snprintf(s, n, "%s%s %s[%d] = %s", lirNames[op], formatAccSet(&b1, i->accSet()), formatRef(&b2, i->oprnd2()), i->disp(), @@ -1935,40 +1950,42 @@ namespace nanojit } #endif - - CseFilter::CseFilter(LirWriter *out, Allocator& alloc) - : LirWriter(out), storesSinceLastLoad(ACC_NONE), alloc(alloc) + CseFilter::CseFilter(LirWriter *out, uint8_t embNumUsedAccs, Allocator& alloc) + : LirWriter(out), + EMB_NUM_USED_ACCS(embNumUsedAccs), + CSE_NUM_USED_ACCS(EMB_NUM_USED_ACCS + 2), + CSE_ACC_CONST( EMB_NUM_USED_ACCS + 0), + CSE_ACC_MULTIPLE( EMB_NUM_USED_ACCS + 1), + storesSinceLastLoad(ACCSET_NONE), + alloc(alloc) { - m_find[LInsImmI] = &CseFilter::findImmI; - m_find[LInsImmQ] = PTR_SIZE(NULL, &CseFilter::findImmQ); - m_find[LInsImmD] = &CseFilter::findImmD; - m_find[LIns1] = &CseFilter::find1; - m_find[LIns2] = &CseFilter::find2; - m_find[LIns3] = &CseFilter::find3; - m_find[LInsCall] = &CseFilter::findCall; - m_find[LInsLoadReadOnly] = &CseFilter::findLoadReadOnly; - m_find[LInsLoadStack] = &CseFilter::findLoadStack; - m_find[LInsLoadRStack] = &CseFilter::findLoadRStack; - m_find[LInsLoadOther] = &CseFilter::findLoadOther; - m_find[LInsLoadMultiple] = &CseFilter::findLoadMultiple; - m_cap[LInsImmI] = 128; - m_cap[LInsImmQ] = PTR_SIZE(0, 16); - m_cap[LInsImmD] = 16; - m_cap[LIns1] = 256; - m_cap[LIns2] = 512; - m_cap[LIns3] = 16; - m_cap[LInsCall] = 64; - m_cap[LInsLoadReadOnly] = 16; - m_cap[LInsLoadStack] = 16; - m_cap[LInsLoadRStack] = 16; - m_cap[LInsLoadOther] = 16; - m_cap[LInsLoadMultiple] = 16; + m_findNL[LInsImmI] = &CseFilter::findImmI; + m_findNL[LInsImmQ] = PTR_SIZE(NULL, &CseFilter::findImmQ); + m_findNL[LInsImmD] = &CseFilter::findImmD; + m_findNL[LIns1] = &CseFilter::find1; + m_findNL[LIns2] = &CseFilter::find2; + m_findNL[LIns3] = &CseFilter::find3; + m_findNL[LInsCall] 
= &CseFilter::findCall; - for (LInsHashKind kind = LInsFirst; kind <= LInsLast; kind = nextKind(kind)) { - m_list[kind] = new (alloc) LIns*[m_cap[kind]]; + m_capNL[LInsImmI] = 128; + m_capNL[LInsImmQ] = PTR_SIZE(0, 16); + m_capNL[LInsImmD] = 16; + m_capNL[LIns1] = 256; + m_capNL[LIns2] = 512; + m_capNL[LIns3] = 16; + m_capNL[LInsCall] = 64; + + for (NLKind nlkind = LInsFirst; nlkind <= LInsLast; nlkind = nextNLKind(nlkind)) + m_listNL[nlkind] = new (alloc) LIns*[m_capNL[nlkind]]; + + // Note that this allocates the CONST and MULTIPLE tables as well. + for (CseAcc a = 0; a < CSE_NUM_USED_ACCS; a++) { + m_capL[a] = 16; + m_listL[a] = new (alloc) LIns*[m_capL[a]]; } - clear(); + + clearAll(); } // Inlined/separated version of SuperFastHash. @@ -2017,15 +2034,23 @@ namespace nanojit return hash; } - void CseFilter::clear(LInsHashKind kind) { - VMPI_memset(m_list[kind], 0, sizeof(LIns*)*m_cap[kind]); - m_used[kind] = 0; + void CseFilter::clearNL(NLKind nlkind) { + VMPI_memset(m_listNL[nlkind], 0, sizeof(LIns*)*m_capNL[nlkind]); + m_usedNL[nlkind] = 0; } - void CseFilter::clear() { - for (LInsHashKind kind = LInsFirst; kind <= LInsLast; kind = nextKind(kind)) { - clear(kind); - } + void CseFilter::clearL(CseAcc a) { + VMPI_memset(m_listL[a], 0, sizeof(LIns*)*m_capL[a]); + m_usedL[a] = 0; + } + + void CseFilter::clearAll() { + for (NLKind nlkind = LInsFirst; nlkind <= LInsLast; nlkind = nextNLKind(nlkind)) + clearNL(nlkind); + + // Note that this clears the CONST and MULTIPLE load tables as well. + for (CseAcc a = 0; a < CSE_NUM_USED_ACCS; a++) + clearL(a); } inline uint32_t CseFilter::hashImmI(int32_t a) { @@ -2055,15 +2080,12 @@ namespace nanojit return hashfinish(hashptr(hash, c)); } - NanoStaticAssert(sizeof(AccSet) == 1); // required for hashLoad to work properly - - // Nb: no need to hash the load's AccSet because each region's loads go in - // a different hash table. - inline uint32_t CseFilter::hashLoad(LOpcode op, LIns* a, int32_t d, AccSet accSet) { - uint32_t hash = hash8(0,uint8_t(op)); + // Nb: no need to hash the load's MiniAccSet because each every load goes + // into a table where all the loads have the same MiniAccSet. 
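+    // (findLoad() picks the per-CseAcc table via miniAccSetToCseAcc() before
+    // probing, so only the op, base and displacement need to feed the hash.)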
+ inline uint32_t CseFilter::hashLoad(LOpcode op, LIns* a, int32_t d) { + uint32_t hash = hash8(0, uint8_t(op)); hash = hashptr(hash, a); - hash = hash32(hash, d); - return hashfinish(hash8(hash, accSet)); + return hashfinish(hash32(hash, d)); } inline uint32_t CseFilter::hashCall(const CallInfo *ci, uint32_t argc, LIns* args[]) { @@ -2073,41 +2095,69 @@ namespace nanojit return hashfinish(hash); } - void CseFilter::grow(LInsHashKind kind) + void CseFilter::growNL(NLKind nlkind) { - const uint32_t oldcap = m_cap[kind]; - m_cap[kind] <<= 1; - LIns** oldlist = m_list[kind]; - m_list[kind] = new (alloc) LIns*[m_cap[kind]]; - VMPI_memset(m_list[kind], 0, m_cap[kind] * sizeof(LIns*)); - find_t find = m_find[kind]; + const uint32_t oldcap = m_capNL[nlkind]; + m_capNL[nlkind] <<= 1; + LIns** oldlist = m_listNL[nlkind]; + m_listNL[nlkind] = new (alloc) LIns*[m_capNL[nlkind]]; + VMPI_memset(m_listNL[nlkind], 0, m_capNL[nlkind] * sizeof(LIns*)); + find_t find = m_findNL[nlkind]; for (uint32_t i = 0; i < oldcap; i++) { LIns* ins = oldlist[i]; if (!ins) continue; uint32_t j = (this->*find)(ins); - NanoAssert(!m_list[kind][j]); - m_list[kind][j] = ins; + NanoAssert(!m_listNL[nlkind][j]); + m_listNL[nlkind][j] = ins; } } - void CseFilter::add(LInsHashKind kind, LIns* ins, uint32_t k) + void CseFilter::growL(CseAcc cseAcc) { - NanoAssert(!m_list[kind][k]); - m_used[kind]++; - m_list[kind][k] = ins; - if ((m_used[kind] * 4) >= (m_cap[kind] * 3)) { // load factor of 0.75 - grow(kind); + const uint32_t oldcap = m_capL[cseAcc]; + m_capL[cseAcc] <<= 1; + LIns** oldlist = m_listL[cseAcc]; + m_listL[cseAcc] = new (alloc) LIns*[m_capL[cseAcc]]; + VMPI_memset(m_listL[cseAcc], 0, m_capL[cseAcc] * sizeof(LIns*)); + find_t find = &CseFilter::findLoad; + for (uint32_t i = 0; i < oldcap; i++) { + LIns* ins = oldlist[i]; + if (!ins) continue; + uint32_t j = (this->*find)(ins); + NanoAssert(!m_listL[cseAcc][j]); + m_listL[cseAcc][j] = ins; + } + } + + void CseFilter::addNL(NLKind nlkind, LIns* ins, uint32_t k) + { + NanoAssert(!m_listNL[nlkind][k]); + m_usedNL[nlkind]++; + m_listNL[nlkind][k] = ins; + if ((m_usedNL[nlkind] * 4) >= (m_capNL[nlkind] * 3)) { // load factor of 0.75 + growNL(nlkind); + } + } + + void CseFilter::addL(LIns* ins, uint32_t k) + { + CseAcc cseAcc = miniAccSetToCseAcc(ins->miniAccSet(), ins->loadQual()); + NanoAssert(!m_listL[cseAcc][k]); + m_usedL[cseAcc]++; + m_listL[cseAcc][k] = ins; + if ((m_usedL[cseAcc] * 4) >= (m_capL[cseAcc] * 3)) { // load factor of 0.75 + growL(cseAcc); } } inline LIns* CseFilter::findImmI(int32_t a, uint32_t &k) { - LInsHashKind kind = LInsImmI; - const uint32_t bitmask = m_cap[kind] - 1; + NLKind nlkind = LInsImmI; + const uint32_t bitmask = m_capNL[nlkind] - 1; k = hashImmI(a) & bitmask; uint32_t n = 1; while (true) { - LIns* ins = m_list[kind][k]; + LIns* ins = m_listNL[nlkind][k]; if (!ins) return NULL; NanoAssert(ins->isImmI()); @@ -2135,12 +2185,12 @@ namespace nanojit #ifdef NANOJIT_64BIT inline LIns* CseFilter::findImmQ(uint64_t a, uint32_t &k) { - LInsHashKind kind = LInsImmQ; - const uint32_t bitmask = m_cap[kind] - 1; + NLKind nlkind = LInsImmQ; + const uint32_t bitmask = m_capNL[nlkind] - 1; k = hashImmQorD(a) & bitmask; uint32_t n = 1; while (true) { - LIns* ins = m_list[kind][k]; + LIns* ins = m_listNL[nlkind][k]; if (!ins) return NULL; NanoAssert(ins->isImmQ()); @@ -2161,12 +2211,12 @@ namespace nanojit inline LIns* CseFilter::findImmD(uint64_t a, uint32_t &k) { - LInsHashKind kind = LInsImmD; - const uint32_t bitmask = m_cap[kind] - 1; + NLKind nlkind = 
LInsImmD; + const uint32_t bitmask = m_capNL[nlkind] - 1; k = hashImmQorD(a) & bitmask; uint32_t n = 1; while (true) { - LIns* ins = m_list[kind][k]; + LIns* ins = m_listNL[nlkind][k]; if (!ins) return NULL; NanoAssert(ins->isImmD()); @@ -2186,12 +2236,12 @@ namespace nanojit inline LIns* CseFilter::find1(LOpcode op, LIns* a, uint32_t &k) { - LInsHashKind kind = LIns1; - const uint32_t bitmask = m_cap[kind] - 1; + NLKind nlkind = LIns1; + const uint32_t bitmask = m_capNL[nlkind] - 1; k = hash1(op, a) & bitmask; uint32_t n = 1; while (true) { - LIns* ins = m_list[kind][k]; + LIns* ins = m_listNL[nlkind][k]; if (!ins) return NULL; if (ins->isop(op) && ins->oprnd1() == a) @@ -2210,12 +2260,12 @@ namespace nanojit inline LIns* CseFilter::find2(LOpcode op, LIns* a, LIns* b, uint32_t &k) { - LInsHashKind kind = LIns2; - const uint32_t bitmask = m_cap[kind] - 1; + NLKind nlkind = LIns2; + const uint32_t bitmask = m_capNL[nlkind] - 1; k = hash2(op, a, b) & bitmask; uint32_t n = 1; while (true) { - LIns* ins = m_list[kind][k]; + LIns* ins = m_listNL[nlkind][k]; if (!ins) return NULL; if (ins->isop(op) && ins->oprnd1() == a && ins->oprnd2() == b) @@ -2234,12 +2284,12 @@ namespace nanojit inline LIns* CseFilter::find3(LOpcode op, LIns* a, LIns* b, LIns* c, uint32_t &k) { - LInsHashKind kind = LIns3; - const uint32_t bitmask = m_cap[kind] - 1; + NLKind nlkind = LIns3; + const uint32_t bitmask = m_capNL[nlkind] - 1; k = hash3(op, a, b, c) & bitmask; uint32_t n = 1; while (true) { - LIns* ins = m_list[kind][k]; + LIns* ins = m_listNL[nlkind][k]; if (!ins) return NULL; if (ins->isop(op) && ins->oprnd1() == a && ins->oprnd2() == b && ins->oprnd3() == c) @@ -2256,18 +2306,17 @@ namespace nanojit return k; } - inline LIns* CseFilter::findLoad(LOpcode op, LIns* a, int32_t d, AccSet accSet, - LInsHashKind kind, uint32_t &k) + inline LIns* CseFilter::findLoad(LOpcode op, LIns* a, int32_t d, MiniAccSet miniAccSet, + LoadQual loadQual, uint32_t &k) { - (void)accSet; - const uint32_t bitmask = m_cap[kind] - 1; - k = hashLoad(op, a, d, accSet) & bitmask; + CseAcc cseAcc = miniAccSetToCseAcc(miniAccSet, loadQual); + const uint32_t bitmask = m_capL[cseAcc] - 1; + k = hashLoad(op, a, d) & bitmask; uint32_t n = 1; while (true) { - LIns* ins = m_list[kind][k]; + LIns* ins = m_listL[cseAcc][k]; if (!ins) return NULL; - NanoAssert(ins->accSet() == accSet); if (ins->isop(op) && ins->oprnd1() == a && ins->disp() == d) return ins; k = (k + n) & bitmask; @@ -2275,38 +2324,10 @@ namespace nanojit } } - uint32_t CseFilter::findLoadReadOnly(LIns* ins) + uint32_t CseFilter::findLoad(LIns* ins) { uint32_t k; - findLoad(ins->opcode(), ins->oprnd1(), ins->disp(), ins->accSet(), LInsLoadReadOnly, k); - return k; - } - - uint32_t CseFilter::findLoadStack(LIns* ins) - { - uint32_t k; - findLoad(ins->opcode(), ins->oprnd1(), ins->disp(), ins->accSet(), LInsLoadStack, k); - return k; - } - - uint32_t CseFilter::findLoadRStack(LIns* ins) - { - uint32_t k; - findLoad(ins->opcode(), ins->oprnd1(), ins->disp(), ins->accSet(), LInsLoadRStack, k); - return k; - } - - uint32_t CseFilter::findLoadOther(LIns* ins) - { - uint32_t k; - findLoad(ins->opcode(), ins->oprnd1(), ins->disp(), ins->accSet(), LInsLoadOther, k); - return k; - } - - uint32_t CseFilter::findLoadMultiple(LIns* ins) - { - uint32_t k; - findLoad(ins->opcode(), ins->oprnd1(), ins->disp(), ins->accSet(), LInsLoadMultiple, k); + findLoad(ins->opcode(), ins->oprnd1(), ins->disp(), ins->miniAccSet(), ins->loadQual(), k); return k; } @@ -2320,12 +2341,12 @@ namespace nanojit inline 
LIns* CseFilter::findCall(const CallInfo *ci, uint32_t argc, LIns* args[], uint32_t &k) { - LInsHashKind kind = LInsCall; - const uint32_t bitmask = m_cap[kind] - 1; + NLKind nlkind = LInsCall; + const uint32_t bitmask = m_capNL[nlkind] - 1; k = hashCall(ci, argc, args) & bitmask; uint32_t n = 1; while (true) { - LIns* ins = m_list[kind][k]; + LIns* ins = m_listNL[nlkind][k]; if (!ins) return NULL; if (ins->isCall() && ins->callInfo() == ci && argsmatch(ins, argc, args)) @@ -2353,7 +2374,7 @@ namespace nanojit LIns* ins = findImmI(imm, k); if (!ins) { ins = out->insImmI(imm); - add(LInsImmI, ins, k); + addNL(LInsImmI, ins, k); } // We assume that downstream stages do not modify the instruction, so // that we can insert 'ins' into slot 'k'. Check this. @@ -2368,7 +2389,7 @@ namespace nanojit LIns* ins = findImmQ(q, k); if (!ins) { ins = out->insImmQ(q); - add(LInsImmQ, ins, k); + addNL(LInsImmQ, ins, k); } NanoAssert(ins->isop(LIR_immq) && ins->immQ() == q); return ins; @@ -2388,7 +2409,7 @@ namespace nanojit LIns* ins = findImmD(u.u64, k); if (!ins) { ins = out->insImmD(d); - add(LInsImmD, ins, k); + addNL(LInsImmD, ins, k); } NanoAssert(ins->isop(LIR_immd) && ins->immDasQ() == u.u64); return ins; @@ -2397,7 +2418,7 @@ namespace nanojit LIns* CseFilter::ins0(LOpcode op) { if (op == LIR_label) - clear(); + clearAll(); return out->ins0(op); } @@ -2409,7 +2430,7 @@ namespace nanojit ins = find1(op, a, k); if (!ins) { ins = out->ins1(op, a); - add(LIns1, ins, k); + addNL(LIns1, ins, k); } } else { ins = out->ins1(op, a); @@ -2426,7 +2447,7 @@ namespace nanojit ins = find2(op, a, b, k); if (!ins) { ins = out->ins2(op, a, b); - add(LIns2, ins, k); + addNL(LIns2, ins, k); } NanoAssert(ins->isop(op) && ins->oprnd1() == a && ins->oprnd2() == b); return ins; @@ -2439,51 +2460,56 @@ namespace nanojit LIns* ins = find3(op, a, b, c, k); if (!ins) { ins = out->ins3(op, a, b, c); - add(LIns3, ins, k); + addNL(LIns3, ins, k); } NanoAssert(ins->isop(op) && ins->oprnd1() == a && ins->oprnd2() == b && ins->oprnd3() == c); return ins; } - LIns* CseFilter::insLoad(LOpcode op, LIns* base, int32_t disp, AccSet loadAccSet) + LIns* CseFilter::insLoad(LOpcode op, LIns* base, int32_t disp, AccSet accSet, LoadQual loadQual) { LIns* ins; if (isS16(disp)) { - // Clear all loads aliased by stores and calls since the last time - // we were in this function. - if (storesSinceLastLoad != ACC_NONE) { - NanoAssert(!(storesSinceLastLoad & ACC_READONLY)); // can't store to READONLY - if (storesSinceLastLoad & ACC_STACK) { clear(LInsLoadStack); } - if (storesSinceLastLoad & ACC_RSTACK) { clear(LInsLoadRStack); } - if (storesSinceLastLoad & ACC_OTHER) { clear(LInsLoadOther); } - // Loads marked with multiple access regions must be treated - // conservatively -- we always clear all of them. - clear(LInsLoadMultiple); - storesSinceLastLoad = ACC_NONE; + if (storesSinceLastLoad != ACCSET_NONE) { + // Clear all normal (excludes CONST and MULTIPLE) loads + // aliased by stores and calls since the last time we were in + // this function. + for (CseAcc a = 0; a < EMB_NUM_USED_ACCS; a++) + if (storesSinceLastLoad & (1 << a)) + clearL(a); + + // No need to clear CONST loads (those in the CSE_ACC_CONST table). + + // Multi-region loads must be treated conservatively -- we + // always clear all of them. 
+ clearL(CSE_ACC_MULTIPLE); + + storesSinceLastLoad = ACCSET_NONE; } - LInsHashKind kind; - switch (loadAccSet) { - case ACC_READONLY: kind = LInsLoadReadOnly; break; - case ACC_STACK: kind = LInsLoadStack; break; - case ACC_RSTACK: kind = LInsLoadRStack; break; - case ACC_OTHER: kind = LInsLoadOther; break; - default: kind = LInsLoadMultiple; break; + if (loadQual == LOAD_VOLATILE) { + // Volatile loads are never CSE'd, don't bother looking for + // them or inserting them in the table. + ins = out->insLoad(op, base, disp, accSet, loadQual); + } else { + uint32_t k; + ins = findLoad(op, base, disp, compressAccSet(accSet), loadQual, k); + if (!ins) { + ins = out->insLoad(op, base, disp, accSet, loadQual); + addL(ins, k); + } } - - uint32_t k; - ins = findLoad(op, base, disp, loadAccSet, kind, k); - if (!ins) { - ins = out->insLoad(op, base, disp, loadAccSet); - add(kind, ins, k); - } - NanoAssert(ins->isop(op) && ins->oprnd1() == base && ins->disp() == disp); + // Nb: must compare miniAccSets, not AccSets, because the AccSet + // stored in the load may have lost info if it's multi-region. + NanoAssert(ins->isop(op) && ins->oprnd1() == base && ins->disp() == disp && + ins->miniAccSet().val == compressAccSet(accSet).val && + ins->loadQual() == loadQual); } else { // If the displacement is more than 16 bits, put it in a separate // instruction. Nb: LirBufWriter also does this, we do it here // too because CseFilter relies on LirBufWriter not changing code. - ins = insLoad(op, ins2(LIR_addp, base, insImmWord(disp)), 0, loadAccSet); + ins = insLoad(op, ins2(LIR_addp, base, insImmWord(disp)), 0, accSet, loadQual); } return ins; } @@ -2531,7 +2557,7 @@ namespace nanojit ins = find1(op, c, k); if (!ins) { ins = out->insGuard(op, c, gr); - add(LIns1, ins, k); + addNL(LIns1, ins, k); } } else { ins = out->insGuard(op, c, gr); @@ -2549,7 +2575,7 @@ namespace nanojit LIns* ins = find2(op, a, b, k); if (!ins) { ins = out->insGuardXov(op, a, b, gr); - add(LIns2, ins, k); + addNL(LIns2, ins, k); } NanoAssert(ins->isop(op) && ins->oprnd1() == a && ins->oprnd2() == b); return ins; @@ -2562,12 +2588,12 @@ namespace nanojit LIns* ins; uint32_t argc = ci->count_args(); if (ci->_isPure) { - NanoAssert(ci->_storeAccSet == ACC_NONE); + NanoAssert(ci->_storeAccSet == ACCSET_NONE); uint32_t k; ins = findCall(ci, argc, args, k); if (!ins) { ins = out->insCall(ci, args); - add(LInsCall, ins, k); + addNL(LInsCall, ins, k); } } else { // We only need to worry about aliasing if !ci->_isPure. 
@@ -2601,7 +2627,7 @@ namespace nanojit #define SF_CALLINFO(name, typesig) \ static const CallInfo name##_ci = \ - { (intptr_t)&name, typesig, ABI_FASTCALL, /*isPure*/1, ACC_NONE verbose_only(, #name) } + { (intptr_t)&name, typesig, ABI_FASTCALL, /*isPure*/1, ACCSET_NONE verbose_only(, #name) } SF_CALLINFO(i2d, SIG_D_I); SF_CALLINFO(ui2d, SIG_D_UI); @@ -2817,6 +2843,13 @@ namespace nanojit whereInPipeline, what, printer->formatAccSet(&b, accSet), shouldDesc); } + void ValidateWriter::errorLoadQual(const char* what, LoadQual loadQual) + { + NanoAssertMsgf(0, + "LIR LoadQual error (%s): '%s' loadQual is '%d'", + whereInPipeline, what, loadQual); + } + void ValidateWriter::checkLInsIsACondOrConst(LOpcode op, int argN, LIns* ins) { // We could introduce a LTy_B32 type in the type system but that's a @@ -2839,60 +2872,26 @@ namespace nanojit errorStructureShouldBe(op, "argument", argN, ins, lirNames[op2]); } - void ValidateWriter::checkAccSet(LOpcode op, LIns* base, AccSet accSet, AccSet maxAccSet) - { - if (accSet == ACC_NONE) - errorAccSet(lirNames[op], accSet, "it should not equal ACC_NONE"); - - if (accSet & ~maxAccSet) - errorAccSet(lirNames[op], accSet, - "it should not contain bits that aren't in ACC_LOAD_ANY/ACC_STORE_ANY"); - - // Some sanity checking, which is based on the following assumptions: - // - STACK ones should use 'sp' or 'sp+k' as the base. (We could look - // for more complex patterns, but that feels dangerous. Better to - // keep it really simple.) - // - RSTACK ones should use 'rp' as the base. - // - READONLY/OTHER ones should not use 'sp'/'sp+k' or 'rp' as the base. - // - // Things that aren't checked: - // - There's no easy way to check if READONLY ones really are read-only. - - bool isStack = base == sp || - (base->isop(LIR_addp) && base->oprnd1() == sp && base->oprnd2()->isImmP()); - bool isRStack = base == rp; - - switch (accSet) { - case ACC_STACK: - if (!isStack) - errorAccSet(lirNames[op], accSet, "but it's not a stack access"); - break; - - case ACC_RSTACK: - if (!isRStack) - errorAccSet(lirNames[op], accSet, "but it's not an rstack access"); - break; - - case ACC_READONLY: - case ACC_OTHER: - if (isStack) - errorAccSet(lirNames[op], accSet, "but it's a stack access"); - if (isRStack) - errorAccSet(lirNames[op], accSet, "but it's an rstack access"); - break; - - default: - break; - } - } - ValidateWriter::ValidateWriter(LirWriter *out, LInsPrinter* printer, const char* where) - : LirWriter(out), printer(printer), whereInPipeline(where), sp(0), rp(0) + : LirWriter(out), printer(printer), whereInPipeline(where), + checkAccSetIns1(0), checkAccSetIns2(0) {} - LIns* ValidateWriter::insLoad(LOpcode op, LIns* base, int32_t d, AccSet accSet) + LIns* ValidateWriter::insLoad(LOpcode op, LIns* base, int32_t d, AccSet accSet, + LoadQual loadQual) { - checkAccSet(op, base, accSet, ACC_LOAD_ANY); + checkAccSet(op, base, accSet); + + switch (loadQual) { + case LOAD_CONST: + case LOAD_NORMAL: + case LOAD_VOLATILE: + break; + default: + errorLoadQual(lirNames[op], loadQual); + break; + } + int nArgs = 1; LTy formals[1] = { LTy_P }; @@ -2914,12 +2913,12 @@ namespace nanojit typeCheckArgs(op, nArgs, formals, args); - return out->insLoad(op, base, d, accSet); + return out->insLoad(op, base, d, accSet, loadQual); } LIns* ValidateWriter::insStore(LOpcode op, LIns* value, LIns* base, int32_t d, AccSet accSet) { - checkAccSet(op, base, accSet, ACC_STORE_ANY); + checkAccSet(op, base, accSet); int nArgs = 2; LTy formals[2] = { LTy_V, LTy_P }; // LTy_V is overwritten shortly @@ -3189,12 
+3188,8 @@ namespace nanojit LOpcode op = getCallOpcode(ci); - if (ci->_isPure && ci->_storeAccSet != ACC_NONE) - errorAccSet(ci->_name, ci->_storeAccSet, "it should be ACC_NONE for pure functions"); - - if (ci->_storeAccSet & ~ACC_STORE_ANY) - errorAccSet(lirNames[op], ci->_storeAccSet, - "it should not contain bits that aren't in ACC_STORE_ANY"); + if (ci->_isPure && ci->_storeAccSet != ACCSET_NONE) + errorAccSet(ci->_name, ci->_storeAccSet, "it should be ACCSET_NONE for pure functions"); // This loop iterates over the args from right-to-left (because arg() // and getArgTypes() use right-to-left order), but puts the results diff --git a/js/src/nanojit/LIR.h b/js/src/nanojit/LIR.h index cc6ff89dbf2..f5c66d8ff19 100644 --- a/js/src/nanojit/LIR.h +++ b/js/src/nanojit/LIR.h @@ -199,11 +199,12 @@ namespace nanojit // Access regions // -------------- // Doing alias analysis precisely is difficult. But it turns out that - // keeping track of aliasing at a very coarse level is enough to help with - // many optimisations. So we conceptually divide the memory that is - // accessible from LIR into a small number of "access regions". An access - // region may be non-contiguous. No two access regions can overlap. The - // union of all access regions covers all memory accessible from LIR. + // keeping track of aliasing at a coarse level is enough to help with many + // optimisations. So we conceptually divide the memory that is accessible + // from LIR into a small number of "access regions" (aka. "Acc"). An + // access region may be non-contiguous. No two access regions can + // overlap. The union of all access regions covers all memory accessible + // from LIR. // // In general a (static) load or store may be executed more than once, and // thus may access multiple regions; however, in practice almost all @@ -214,64 +215,28 @@ namespace nanojit // If two loads/stores/calls are known to not access the same region(s), // then they do not alias. // - // The access regions used are as follows: - // - // - READONLY: all memory that is read-only, ie. never stored to. - // A load from a READONLY region will never alias with any stores. - // - // - STACK: the stack. Stack loads/stores can usually be easily - // identified because they use SP as the base pointer. - // - // - RSTACK: the return stack. Return stack loads/stores can usually be - // easily identified because they use RP as the base pointer. - // - // - OTHER: all other regions of memory. - // - // It makes sense to add new access regions when doing so will help with - // one or more optimisations. - // - // One subtlety is that the meanings of the access region markings only - // apply to the LIR fragment that they are in. For example, if a memory - // location M is read-only in a particular LIR fragment, all loads - // involving M in that fragment can be safely marked READONLY, even if M - // is modified elsewhere. This is safe because the a LIR fragment is the - // unit of analysis in which the markings are used. In other words alias - // region markings are only used for intra-fragment optimisations. + // All regions are defined by the embedding. It makes sense to add new + // embedding-specific access regions when doing so will help with one or + // more optimisations. // // Access region sets and instruction markings // ------------------------------------------- - // The LIR generator must mark each load/store with an "access region - // set", which is a set of one or more access regions. 
This indicates - // which parts of LIR-accessible memory the load/store may touch. + // Each load/store is marked with an "access region set" (aka. "AccSet"), + // which is a set of one or more access regions. This indicates which + // parts of LIR-accessible memory the load/store may touch. // - // The LIR generator must also mark each function called from LIR with an - // access region set for memory stored to by the function. (We could also - // have a marking for memory loads, but there's no need at the moment.) - // These markings apply to the function itself, not the call site (ie. - // they're not context-sensitive). + // Each function called from LIR is also marked with an access region set + // for memory stored to by the function. (We could also have a marking + // for memory loads done by the function, but there's no need at the + // moment.) These markings apply to the function itself, not the call + // site, ie. they're not context-sensitive. // - // These load/store/call markings MUST BE ACCURATE -- if they are wrong - // then invalid optimisations might occur that change the meaning of the - // code. However, they can safely be imprecise (ie. conservative), in the - // following ways: - // - // - A load that accesses a READONLY region can be safely marked instead - // as loading from OTHER. In other words, it's safe to underestimate - // the size of the READONLY region. (This would also apply to the load - // set of a function, if we recorded that.) - // - // - A load/store can safely be marked as accessing regions that it - // doesn't, so long as the regions it does access are also included (one - // exception: marking a store with READONLY is nonsense and will cause - // assertions). - // - // In other words, a load/store can be marked with an access region set - // that is a superset of its actual access region set. Taking this to - // its logical conclusion, any load can be safely marked with LOAD_ANY and - // any store can be safely marked with with STORE_ANY (and the latter is - // true for the store set of a function.) - // - // Such imprecision is safe but may reduce optimisation opportunities. + // These load/store/call markings MUST BE ACCURATE -- if not then invalid + // optimisations might occur that change the meaning of the code. + // However, they can safely be imprecise (ie. conservative), ie. a + // load/store/call can be marked with an access region set that is a + // superset of the actual access region set. Such imprecision is safe but + // may reduce optimisation opportunities. // // Optimisations that use access region info // ----------------------------------------- @@ -282,35 +247,100 @@ namespace nanojit // load with a single access region, you might as well use ACC_LOAD_ANY. //----------------------------------------------------------------------- - // An access region set is represented as a bitset. Nb: this restricts us - // to at most eight alias regions for the moment. - typedef uint8_t AccSet; + // An access region set is represented as a bitset. Using a uint32_t + // restricts us to at most 32 alias regions for the moment. This could be + // expanded to a uint64_t easily if needed. + typedef uint32_t AccSet; + static const int NUM_ACCS = sizeof(AccSet) * 8; - // The access regions. Note that because of the bitset representation - // these constants are also valid (singleton) AccSet values. If you add - // new ones please update ACC_ALL_STORABLE and formatAccSet() and - // CseFilter. 
- // - static const AccSet ACC_READONLY = 1 << 0; // 0000_0001b - static const AccSet ACC_STACK = 1 << 1; // 0000_0010b - static const AccSet ACC_RSTACK = 1 << 2; // 0000_0100b - static const AccSet ACC_OTHER = 1 << 3; // 0000_1000b - - // Some common (non-singleton) access region sets. ACC_NONE does not make + // Some common (non-singleton) access region sets. ACCSET_NONE does not make // sense for loads or stores (which must access at least one region), it // only makes sense for calls. // - // A convention that's worth using: use ACC_LOAD_ANY/ACC_STORE_ANY for - // cases that you're unsure about or haven't considered carefully. Use - // ACC_ALL/ACC_ALL_STORABLE for cases that you have considered carefully. - // That way it's easy to tell which ones have been considered and which - // haven't. - static const AccSet ACC_NONE = 0x0; - static const AccSet ACC_ALL_STORABLE = ACC_STACK | ACC_RSTACK | ACC_OTHER; - static const AccSet ACC_ALL = ACC_READONLY | ACC_ALL_STORABLE; - static const AccSet ACC_LOAD_ANY = ACC_ALL; // synonym - static const AccSet ACC_STORE_ANY = ACC_ALL_STORABLE; // synonym + static const AccSet ACCSET_NONE = 0x0; + static const AccSet ACCSET_ALL = 0xffffffff; + static const AccSet ACCSET_LOAD_ANY = ACCSET_ALL; // synonym + static const AccSet ACCSET_STORE_ANY = ACCSET_ALL; // synonym + // Full AccSets don't fit into load and store instructions. But + // load/store AccSets almost always contain a single access region. We + // take advantage of this to create a compressed AccSet, MiniAccSet, that + // does fit. + // + // The 32 single-region AccSets get compressed into a number in the range + // 0..31 (according to the position of the set bit), and all other + // (multi-region) AccSets get converted into MINI_ACCSET_MULTIPLE. So the + // representation is lossy in the latter case, but that case is rare for + // loads/stores. We use a full AccSet for the storeAccSets of calls, for + // which multi-region AccSets are common. + // + // We wrap the uint8_t inside a struct to avoid the possiblity of subtle + // bugs caused by mixing up AccSet and MiniAccSet, which is easy to do. + // However, the struct gets padded inside LInsLd in an inconsistent way on + // Windows, so we actually store a MiniAccSetVal inside LInsLd. Sigh. + // But we use MiniAccSet everywhere else. + // + typedef uint8_t MiniAccSetVal; + struct MiniAccSet { MiniAccSetVal val; }; + static const MiniAccSet MINI_ACCSET_MULTIPLE = { 255 }; + + static MiniAccSet compressAccSet(AccSet accSet) { + // As the number of regions increase, this may become a bottleneck. + // If it does we can first count the number of bits using Kernighan's + // technique + // (http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetKernighan) + // and if it's a single-region set, use a bit-scanning instruction to + // work out which single-region set it is. That would require + // factoring out the bit-scanning code currently in + // nRegisterAllocFromSet(). + // + // Try all the single-region AccSets first. + for (int i = 0; i < NUM_ACCS; i++) { + if (accSet == (1U << i)) { + MiniAccSet ret = { uint8_t(i) }; + return ret; + } + } + // If we got here, it must be a multi-region AccSet. + return MINI_ACCSET_MULTIPLE; + } + + static AccSet decompressMiniAccSet(MiniAccSet miniAccSet) { + return (miniAccSet.val == MINI_ACCSET_MULTIPLE.val) ? 
ACCSET_ALL : (1 << miniAccSet.val); + } + + // The LoadQual affects how a load can be optimised: + // + // - CONST: These loads are guaranteed to always return the same value + // during a single execution of a fragment (but the value is allowed to + // change between executions of the fragment). This means that the + // location is never stored to by the LIR, and is never modified by an + // external entity while the fragment is running. + // + // - NORMAL: These loads may be stored to by the LIR, but are never + // modified by an external entity while the fragment is running. + // + // - VOLATILE: These loads may be stored to by the LIR, and may be + // modified by an external entity while the fragment is running. + // + // This gives a lattice with the ordering: CONST < NORMAL < VOLATILE. + // As usual, it's safe to mark a load with a value higher (less precise) + // that actual, but it may result in fewer optimisations occurring. + // + // Generally CONST loads are highly amenable to optimisation (eg. CSE), + // VOLATILE loads are entirely unoptimisable, and NORMAL loads are in + // between and require some alias analysis to optimise. + // + // Note that CONST has a stronger meaning to "const" in C and C++; in C + // and C++ a "const" variable may be modified by an external entity, such + // as hardware. Hence "const volatile" makes sense in C and C++, but + // CONST+VOLATILE doesn't make sense in LIR. + // + enum LoadQual { + LOAD_CONST, + LOAD_NORMAL, + LOAD_VOLATILE + }; struct CallInfo { @@ -687,7 +717,7 @@ namespace nanojit inline void initLInsOp1(LOpcode opcode, LIns* oprnd1); inline void initLInsOp2(LOpcode opcode, LIns* oprnd1, LIns* oprnd2); inline void initLInsOp3(LOpcode opcode, LIns* oprnd1, LIns* oprnd2, LIns* oprnd3); - inline void initLInsLd(LOpcode opcode, LIns* val, int32_t d, AccSet accSet); + inline void initLInsLd(LOpcode opcode, LIns* val, int32_t d, AccSet accSet, LoadQual loadQual); inline void initLInsSt(LOpcode opcode, LIns* val, LIns* base, int32_t d, AccSet accSet); inline void initLInsSk(LIns* prevLIns); // Nb: args[] must be allocated and initialised before being passed in; @@ -790,8 +820,12 @@ namespace nanojit // For guards. inline GuardRecord* record() const; + // For loads. + inline LoadQual loadQual() const; + // For loads/stores. inline int32_t disp() const; + inline MiniAccSet miniAccSet() const; inline AccSet accSet() const; // For LInsSk. @@ -1085,7 +1119,8 @@ namespace nanojit // could go to 24 bits but then it would happen so rarely that the // handler code would be difficult to test and thus untrustworthy. 
int16_t disp; - AccSet accSet; + MiniAccSetVal miniAccSetVal; // not 'MiniAccSet' due to Windows padding; see above + LoadQual loadQual:2; LIns* oprnd_1; @@ -1102,7 +1137,7 @@ namespace nanojit friend class LIns; int16_t disp; - AccSet accSet; + MiniAccSetVal miniAccSetVal; LIns* oprnd_2; @@ -1251,12 +1286,13 @@ namespace nanojit toLInsOp3()->oprnd_3 = oprnd3; NanoAssert(isLInsOp3()); } - void LIns::initLInsLd(LOpcode opcode, LIns* val, int32_t d, AccSet accSet) { + void LIns::initLInsLd(LOpcode opcode, LIns* val, int32_t d, AccSet accSet, LoadQual loadQual) { initSharedFields(opcode); toLInsLd()->oprnd_1 = val; NanoAssert(d == int16_t(d)); toLInsLd()->disp = int16_t(d); - toLInsLd()->accSet = accSet; + toLInsLd()->miniAccSetVal = compressAccSet(accSet).val; + toLInsLd()->loadQual = loadQual; NanoAssert(isLInsLd()); } void LIns::initLInsSt(LOpcode opcode, LIns* val, LIns* base, int32_t d, AccSet accSet) { @@ -1265,7 +1301,7 @@ namespace nanojit toLInsSt()->oprnd_2 = base; NanoAssert(d == int16_t(d)); toLInsSt()->disp = int16_t(d); - toLInsSt()->accSet = accSet; + toLInsSt()->miniAccSetVal = compressAccSet(accSet).val; NanoAssert(isLInsSt()); } void LIns::initLInsSk(LIns* prevLIns) { @@ -1369,6 +1405,11 @@ namespace nanojit } } + LoadQual LIns::loadQual() const { + NanoAssert(isLInsLd()); + return toLInsLd()->loadQual; + } + int32_t LIns::disp() const { if (isLInsSt()) { return toLInsSt()->disp; @@ -1378,13 +1419,19 @@ namespace nanojit } } - AccSet LIns::accSet() const { + MiniAccSet LIns::miniAccSet() const { + MiniAccSet miniAccSet; if (isLInsSt()) { - return toLInsSt()->accSet; + miniAccSet.val = toLInsSt()->miniAccSetVal; } else { NanoAssert(isLInsLd()); - return toLInsLd()->accSet; + miniAccSet.val = toLInsLd()->miniAccSetVal; } + return miniAccSet; + } + + AccSet LIns::accSet() const { + return decompressMiniAccSet(miniAccSet()); } LIns* LIns::prevLIns() const { @@ -1510,8 +1557,8 @@ namespace nanojit virtual LIns* insImmD(double d) { return out->insImmD(d); } - virtual LIns* insLoad(LOpcode op, LIns* base, int32_t d, AccSet accSet) { - return out->insLoad(op, base, d, accSet); + virtual LIns* insLoad(LOpcode op, LIns* base, int32_t d, AccSet accSet, LoadQual loadQual) { + return out->insLoad(op, base, d, accSet, loadQual); } virtual LIns* insStore(LOpcode op, LIns* value, LIns* base, int32_t d, AccSet accSet) { return out->insStore(op, value, base, d, accSet); @@ -1584,6 +1631,11 @@ namespace nanojit #endif } + // Do a load with LoadQual==LOAD_NORMAL. + LIns* insLoad(LOpcode op, LIns* base, int32_t d, AccSet accSet) { + return insLoad(op, base, d, accSet, LOAD_NORMAL); + } + // Chooses LIR_sti, LIR_stq or LIR_std according to the type of 'value'. 
LIns* insStore(LIns* value, LIns* base, int32_t d, AccSet accSet); }; @@ -1680,16 +1732,19 @@ namespace nanojit { private: Allocator& alloc; + const int EMB_NUM_USED_ACCS; char *formatImmI(RefBuf* buf, int32_t c); char *formatImmQ(RefBuf* buf, uint64_t c); char *formatImmD(RefBuf* buf, double c); - void formatGuard(InsBuf* buf, LIns* ins); - void formatGuardXov(InsBuf* buf, LIns* ins); + void formatGuard(InsBuf* buf, LIns* ins); // defined by the embedder + void formatGuardXov(InsBuf* buf, LIns* ins); // defined by the embedder + static const char* accNames[]; // defined by the embedder public: - LInsPrinter(Allocator& alloc) - : alloc(alloc) + + LInsPrinter(Allocator& alloc, int embNumUsedAccs) + : alloc(alloc), EMB_NUM_USED_ACCS(embNumUsedAccs) { addrNameMap = new (alloc) AddrNameMap(alloc); lirNameMap = new (alloc) LirNameMap(alloc); @@ -1790,8 +1845,8 @@ namespace nanojit LIns* insParam(int32_t i, int32_t kind) { return add(out->insParam(i, kind)); } - LIns* insLoad(LOpcode v, LIns* base, int32_t disp, AccSet accSet) { - return add(out->insLoad(v, base, disp, accSet)); + LIns* insLoad(LOpcode v, LIns* base, int32_t disp, AccSet accSet, LoadQual loadQual) { + return add(out->insLoad(v, base, disp, accSet, loadQual)); } LIns* insStore(LOpcode op, LIns* v, LIns* b, int32_t d, AccSet accSet) { return add(out->insStore(op, v, b, d, accSet)); @@ -1825,16 +1880,17 @@ namespace nanojit LIns* insGuardXov(LOpcode, LIns* a, LIns* b, GuardRecord *); LIns* insBranch(LOpcode, LIns* cond, LIns* target); LIns* insBranchJov(LOpcode, LIns* a, LIns* b, LIns* target); - LIns* insLoad(LOpcode op, LIns* base, int32_t off, AccSet accSet); + LIns* insLoad(LOpcode op, LIns* base, int32_t off, AccSet accSet, LoadQual loadQual); private: LIns* simplifyOverflowArith(LOpcode op, LIns** opnd1, LIns** opnd2); }; class CseFilter: public LirWriter { - enum LInsHashKind { + enum NLKind { // We divide instruction kinds into groups. LIns0 isn't present - // because we don't need to record any 0-ary instructions. + // because we don't need to record any 0-ary instructions. Loads + // aren't here, they're handled separately. LInsImmI = 0, LInsImmQ = 1, // only occurs on 64-bit platforms LInsImmD = 2, @@ -1843,46 +1899,69 @@ namespace nanojit LIns3 = 5, LInsCall = 6, - // Loads are special. We group them by access region: one table for - // each region, and then a catch-all table for any loads marked with - // multiple regions. This arrangement makes the removal of - // invalidated loads fast -- eg. we can invalidate all STACK loads by - // just clearing the LInsLoadStack table. The disadvantage is that - // loads marked with multiple regions must be invalidated - // conservatively, eg. if any intervening stores occur. But loads - // marked with multiple regions should be rare. - LInsLoadReadOnly = 7, - LInsLoadStack = 8, - LInsLoadRStack = 9, - LInsLoadOther = 10, - LInsLoadMultiple = 11, - LInsFirst = 0, - LInsLast = 11, + LInsLast = 6, // Need a value after "last" to outsmart compilers that insist last+1 is impossible. - LInsInvalid = 12 + LInsInvalid = 7 }; - #define nextKind(kind) LInsHashKind(kind+1) + #define nextNLKind(kind) NLKind(kind+1) - // There is one list for each instruction kind. This lets us size the - // lists appropriately (some instructions are more common than others). - // It also lets us have kind-specific find/add/grow functions, which + // There is one table for each NLKind. This lets us size the lists + // appropriately (some instruction kinds are more common than others). 
+ // It also lets us have NLKind-specific find/add/grow functions, which // are faster than generic versions. // - // Nb: Size must be a power of 2. - // Don't start too small, or we'll waste time growing and rehashing. - // Don't start too large, will waste memory. + // Nb: m_listNL and m_capNL sizes must be a power of 2. + // Don't start m_capNL too small, or we'll waste time growing and rehashing. + // Don't start m_capNL too large, will waste memory. // - LIns** m_list[LInsLast + 1]; - uint32_t m_cap[LInsLast + 1]; - uint32_t m_used[LInsLast + 1]; + LIns** m_listNL[LInsLast + 1]; + uint32_t m_capNL[ LInsLast + 1]; + uint32_t m_usedNL[LInsLast + 1]; typedef uint32_t (CseFilter::*find_t)(LIns*); - find_t m_find[LInsLast + 1]; + find_t m_findNL[LInsLast + 1]; + + // Similarly, for loads, there is one table for each CseAcc. A CseAcc + // is like a normal access region, but there are two extra possible + // values: CSE_ACC_CONST, which is where we put all CONST-qualified + // loads, and CSE_ACC_MULTIPLE, where we put all multi-region loads. + // All remaining loads are single-region and go in the table entry for + // their region. + // + // This arrangement makes the removal of invalidated loads fast -- we + // can invalidate all loads from a single region by clearing that + // region's table. + // + typedef uint8_t CseAcc; // same type as MiniAccSet + + static const uint8_t CSE_NUM_ACCS = NUM_ACCS + 2; + + // These values would be 'static const' except they are defined in + // terms of EMB_NUM_USED_ACCS which is itself not 'static const' + // because it's passed in by the embedding. + const uint8_t EMB_NUM_USED_ACCS; // number of access regions used by the embedding + const uint8_t CSE_NUM_USED_ACCS; // EMB_NUM_USED_ACCS + 2 + const CseAcc CSE_ACC_CONST; // EMB_NUM_USED_ACCS + 0 + const CseAcc CSE_ACC_MULTIPLE; // EMB_NUM_USED_ACCS + 1 + + // We will only use CSE_NUM_USED_ACCS of these entries, ie. the + // number of lists allocated depends on the number of access regions + // in use by the embedding. + LIns** m_listL[CSE_NUM_ACCS]; + uint32_t m_capL[ CSE_NUM_ACCS]; + uint32_t m_usedL[CSE_NUM_ACCS]; AccSet storesSinceLastLoad; // regions stored to since the last load Allocator& alloc; + CseAcc miniAccSetToCseAcc(MiniAccSet miniAccSet, LoadQual loadQual) { + NanoAssert(miniAccSet.val < NUM_ACCS || miniAccSet.val == MINI_ACCSET_MULTIPLE.val); + return (loadQual == LOAD_CONST) ? CSE_ACC_CONST : + (miniAccSet.val == MINI_ACCSET_MULTIPLE.val) ? CSE_ACC_MULTIPLE : + miniAccSet.val; + } + static uint32_t hash8(uint32_t hash, const uint8_t data); static uint32_t hash32(uint32_t hash, const uint32_t data); static uint32_t hashptr(uint32_t hash, const void* data); @@ -1893,7 +1972,7 @@ namespace nanojit static uint32_t hash1(LOpcode op, LIns*); static uint32_t hash2(LOpcode op, LIns*, LIns*); static uint32_t hash3(LOpcode op, LIns*, LIns*, LIns*); - static uint32_t hashLoad(LOpcode op, LIns*, int32_t, AccSet); + static uint32_t hashLoad(LOpcode op, LIns*, int32_t); static uint32_t hashCall(const CallInfo *call, uint32_t argc, LIns* args[]); // These versions are used before an LIns has been created. 
@@ -1905,7 +1984,7 @@ namespace nanojit LIns* find1(LOpcode v, LIns* a, uint32_t &k); LIns* find2(LOpcode v, LIns* a, LIns* b, uint32_t &k); LIns* find3(LOpcode v, LIns* a, LIns* b, LIns* c, uint32_t &k); - LIns* findLoad(LOpcode v, LIns* a, int32_t b, AccSet accSet, LInsHashKind kind, + LIns* findLoad(LOpcode v, LIns* a, int32_t b, MiniAccSet miniAccSet, LoadQual loadQual, uint32_t &k); LIns* findCall(const CallInfo *call, uint32_t argc, LIns* args[], uint32_t &k); @@ -1921,22 +2000,21 @@ namespace nanojit uint32_t find2(LIns* ins); uint32_t find3(LIns* ins); uint32_t findCall(LIns* ins); - uint32_t findLoadReadOnly(LIns* ins); - uint32_t findLoadStack(LIns* ins); - uint32_t findLoadRStack(LIns* ins); - uint32_t findLoadOther(LIns* ins); - uint32_t findLoadMultiple(LIns* ins); + uint32_t findLoad(LIns* ins); - void grow(LInsHashKind kind); + void growNL(NLKind kind); + void growL(CseAcc cseAcc); // 'k' is the index found by findXYZ(). - void add(LInsHashKind kind, LIns* ins, uint32_t k); + void addNL(NLKind kind, LIns* ins, uint32_t k); + void addL(LIns* ins, uint32_t k); - void clear(); // clears all tables - void clear(LInsHashKind); // clears one table + void clearAll(); // clears all tables + void clearNL(NLKind); // clears one non-load table + void clearL(CseAcc); // clears one load table public: - CseFilter(LirWriter *out, Allocator&); + CseFilter(LirWriter *out, uint8_t embNumUsedAccs, Allocator&); LIns* insImmI(int32_t imm); #ifdef NANOJIT_64BIT @@ -1947,7 +2025,7 @@ namespace nanojit LIns* ins1(LOpcode v, LIns*); LIns* ins2(LOpcode v, LIns*, LIns*); LIns* ins3(LOpcode v, LIns*, LIns*, LIns*); - LIns* insLoad(LOpcode op, LIns* base, int32_t d, AccSet accSet); + LIns* insLoad(LOpcode op, LIns* base, int32_t d, AccSet accSet, LoadQual loadQual); LIns* insStore(LOpcode op, LIns* value, LIns* base, int32_t d, AccSet accSet); LIns* insCall(const CallInfo *call, LIns* args[]); LIns* insGuard(LOpcode op, LIns* cond, GuardRecord *gr); @@ -2006,7 +2084,7 @@ namespace nanojit } // LirWriter interface - LIns* insLoad(LOpcode op, LIns* base, int32_t disp, AccSet accSet); + LIns* insLoad(LOpcode op, LIns* base, int32_t disp, AccSet accSet, LoadQual loadQual); LIns* insStore(LOpcode op, LIns* o1, LIns* o2, int32_t disp, AccSet accSet); LIns* ins0(LOpcode op); LIns* ins1(LOpcode op, LIns* o1); @@ -2142,19 +2220,21 @@ namespace nanojit void errorStructureShouldBe(LOpcode op, const char* argDesc, int argN, LIns* arg, const char* shouldBeDesc); void errorAccSet(const char* what, AccSet accSet, const char* shouldDesc); + void errorLoadQual(const char* what, LoadQual loadQual); void checkLInsHasOpcode(LOpcode op, int argN, LIns* ins, LOpcode op2); void checkLInsIsACondOrConst(LOpcode op, int argN, LIns* ins); void checkLInsIsNull(LOpcode op, int argN, LIns* ins); - void checkAccSet(LOpcode op, LIns* base, AccSet accSet, AccSet maxAccSet); + void checkAccSet(LOpcode op, LIns* base, AccSet accSet); // defined by the embedder - LIns *sp, *rp; + // These can be set by the embedder and used in checkAccSet(). 
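+        // (e.g. an embedder that distinguishes stack and return-stack regions
+        // can stash its stack-pointer and return-stack-pointer instructions
+        // here, much as the old 'sp'/'rp' fields were used.)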
+ LIns *checkAccSetIns1, *checkAccSetIns2; public: ValidateWriter(LirWriter* out, LInsPrinter* printer, const char* where); - void setSp(LIns* ins) { sp = ins; } - void setRp(LIns* ins) { rp = ins; } + void setCheckAccSetIns1(LIns* ins) { checkAccSetIns1 = ins; } + void setCheckAccSetIns2(LIns* ins) { checkAccSetIns2 = ins; } - LIns* insLoad(LOpcode op, LIns* base, int32_t d, AccSet accSet); + LIns* insLoad(LOpcode op, LIns* base, int32_t d, AccSet accSet, LoadQual loadQual); LIns* insStore(LOpcode op, LIns* value, LIns* base, int32_t d, AccSet accSet); LIns* ins0(LOpcode v); LIns* ins1(LOpcode v, LIns* a);
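
For reference, below is a rough sketch of what a minimal embedding supplies after this change, modelled on the lirasm hunks above. The two-region split, the ACCSET_STK/ACCSET_OTHER/NUM_USED_ACCS names, emitExample() and its operands are all hypothetical; only the pieces it exercises -- the CseFilter constructor taking the number of used access regions, the embedder-defined ValidateWriter::checkAccSet(), and insLoad() taking an AccSet plus LoadQual (with a four-argument overload defaulting to LOAD_NORMAL) -- come from this patch, and the load/store opcodes are assumed to be the usual LIR_ldi/LIR_sti.

    #include "nanojit.h"   // or "nanojit/nanojit.h", depending on the embedding's include paths

    using namespace nanojit;

    // A hypothetical embedding with two access regions: its own stack area and
    // everything else.  Region bits must be allocated contiguously from bit 0,
    // because CseFilter (and LInsPrinter) are only told how many low bits are used.
    static const AccSet ACCSET_STK   = (1 << 0);
    static const AccSet ACCSET_OTHER = (1 << 1);
    static const uint8_t NUM_USED_ACCS = 2;

    #ifdef DEBUG
    void nanojit::ValidateWriter::checkAccSet(LOpcode op, LIns* base, AccSet accSet)
    {
        // Embedding-specific sanity check: every load/store must name exactly
        // one of the two regions defined above.
        (void)op; (void)base;
        NanoAssert(accSet == ACCSET_STK || accSet == ACCSET_OTHER);
    }
    #endif

    // Emit a few loads/stores through a CseFilter.  'out' is the rest of the
    // writer pipeline (eg. a LirBufWriter); 'stk' and 'obj' are pointer-typed LIns.
    static void emitExample(LirWriter* out, Allocator& alloc, LIns* stk, LIns* obj)
    {
        CseFilter cse(out, NUM_USED_ACCS, alloc);   // new: pass the number of used regions
        LirWriter* lir = &cse;

        // Loads now carry a LoadQual in addition to an AccSet; the four-argument
        // overload defaults to LOAD_NORMAL.
        LIns* slot  = lir->insLoad(LIR_ldi, stk, 8, ACCSET_STK);
        LIns* shape = lir->insLoad(LIR_ldi, obj, 0, ACCSET_OTHER, LOAD_CONST);    // CSE-able across stores
        LIns* flag  = lir->insLoad(LIR_ldi, obj, 4, ACCSET_OTHER, LOAD_VOLATILE); // never CSE'd

        // Stores (and calls) still take a full AccSet, with no LoadQual.
        lir->insStore(LIR_sti, slot, stk, 16, ACCSET_STK);
        (void)shape; (void)flag;

        // In verbose builds an LInsPrinter is constructed analogously, as
        // LInsPrinter(alloc, NUM_USED_ACCS), with LInsPrinter::accNames[]
        // defined as in the lirasm hunk above.
    }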