From 1b27c99df0590f1333bb5a22c671a05395a0788d Mon Sep 17 00:00:00 2001
From: Nicholas Nethercote <nnethercote@mozilla.com>
Date: Tue, 23 Mar 2010 15:05:47 -0700
Subject: [PATCH] Bug 517910 - NJ: add more alias-set annotations to LIR so as
 to improve CSEing of loads.  r=edwsmith.

--HG--
extra : convert_revision : 26cbea5a2acdcc6156b4a72b0c40c0d675f69571
---
 js/src/lirasm/lirasm.cpp |  22 +-
 js/src/nanojit/LIR.cpp   | 653 ++++++++++++++++++++++-----------------
 js/src/nanojit/LIR.h     | 148 +++++----
 3 files changed, 448 insertions(+), 375 deletions(-)

diff --git a/js/src/lirasm/lirasm.cpp b/js/src/lirasm/lirasm.cpp
index 565fa0d11ed..a74ec5475d4 100644
--- a/js/src/lirasm/lirasm.cpp
+++ b/js/src/lirasm/lirasm.cpp
@@ -518,7 +518,8 @@ FragmentAssembler::FragmentAssembler(Lirasm &parent, const string &fragmentName,
     mLir = mBufWriter  = new LirBufWriter(mParent.mLirbuf, nanojit::AvmCore::config);
 #ifdef DEBUG
     if (optimize) {     // don't re-validate if no optimization has taken place
-        mLir = mValidateWriter2 = new ValidateWriter(mLir, "end of writer pipeline");
+        mLir = mValidateWriter2 =
+            new ValidateWriter(mLir, mFragment->lirbuf->printer, "end of writer pipeline");
     }
 #endif
 #ifdef DEBUG
@@ -540,7 +541,8 @@ FragmentAssembler::FragmentAssembler(Lirasm &parent, const string &fragmentName,
         mLir = mExprFilter = new ExprFilter(mLir);
     }
 #ifdef DEBUG
-    mLir = mValidateWriter1 = new ValidateWriter(mLir, "start of writer pipeline");
+    mLir = mValidateWriter1 =
+            new ValidateWriter(mLir, mFragment->lirbuf->printer, "start of writer pipeline");
 #endif
 
     mReturnTypeBits = 0;
@@ -634,7 +636,7 @@ FragmentAssembler::assemble_load()
         mTokens[1].find_first_of("0123456789") == 0) {
         return mLir->insLoad(mOpcode,
                              ref(mTokens[0]),
-                             imm(mTokens[1]));
+                             imm(mTokens[1]), ACC_LOAD_ANY);
     }
     bad("immediate offset required for load");
     return NULL;  // not reached
@@ -1059,7 +1061,7 @@ FragmentAssembler::assembleFragment(LirTokenStream &in, bool implicitBegin, cons
             need(3);
             ins = mLir->insStore(mOpcode, ref(mTokens[0]),
                                   ref(mTokens[1]),
-                                  imm(mTokens[2]));
+                                  imm(mTokens[2]), ACC_STORE_ANY);
             break;
 
 #if NJ_EXPANDED_LOADSTORE_SUPPORTED 
@@ -1808,7 +1810,7 @@ FragmentAssembler::assembleRandomFragment(int nIns)
             vector<LIns*> Ms = rnd(2) ? M4s : M8ps;
             if (!Ms.empty()) {
                 LIns* base = rndPick(Ms);
-                ins = mLir->insLoad(rndPick(I_loads), base, rndOffset32(base->size()));
+                ins = mLir->insLoad(rndPick(I_loads), base, rndOffset32(base->size()), ACC_LOAD_ANY);
                 addOrReplace(Is, ins);
                 n++;
             }
@@ -1819,7 +1821,7 @@ FragmentAssembler::assembleRandomFragment(int nIns)
         case LLD_Q:
             if (!M8ps.empty()) {
                 LIns* base = rndPick(M8ps);
-                ins = mLir->insLoad(rndPick(Q_loads), base, rndOffset64(base->size()));
+                ins = mLir->insLoad(rndPick(Q_loads), base, rndOffset64(base->size()), ACC_LOAD_ANY);
                 addOrReplace(Qs, ins);
                 n++;
             }
@@ -1829,7 +1831,7 @@ FragmentAssembler::assembleRandomFragment(int nIns)
         case LLD_F:
             if (!M8ps.empty()) {
                 LIns* base = rndPick(M8ps);
-                ins = mLir->insLoad(rndPick(F_loads), base, rndOffset64(base->size()));
+                ins = mLir->insLoad(rndPick(F_loads), base, rndOffset64(base->size()), ACC_LOAD_ANY);
                 addOrReplace(Fs, ins);
                 n++;
             }
@@ -1839,7 +1841,7 @@ FragmentAssembler::assembleRandomFragment(int nIns)
             vector<LIns*> Ms = rnd(2) ? M4s : M8ps;
             if (!Ms.empty() && !Is.empty()) {
                 LIns* base = rndPick(Ms);
-                mLir->insStorei(rndPick(Is), base, rndOffset32(base->size()));
+                mLir->insStorei(rndPick(Is), base, rndOffset32(base->size()), ACC_STORE_ANY);
                 n++;
             }
             break;
@@ -1849,7 +1851,7 @@ FragmentAssembler::assembleRandomFragment(int nIns)
         case LST_Q:
             if (!M8ps.empty() && !Qs.empty()) {
                 LIns* base = rndPick(M8ps);
-                mLir->insStorei(rndPick(Qs), base, rndOffset64(base->size()));
+                mLir->insStorei(rndPick(Qs), base, rndOffset64(base->size()), ACC_STORE_ANY);
                 n++;
             }
             break;
@@ -1858,7 +1860,7 @@ FragmentAssembler::assembleRandomFragment(int nIns)
         case LST_F:
             if (!M8ps.empty() && !Fs.empty()) {
                 LIns* base = rndPick(M8ps);
-                mLir->insStorei(rndPick(Fs), base, rndOffset64(base->size()));
+                mLir->insStorei(rndPick(Fs), base, rndOffset64(base->size()), ACC_STORE_ANY);
                 n++;
             }
             break;
diff --git a/js/src/nanojit/LIR.cpp b/js/src/nanojit/LIR.cpp
index 0e3ad7a2e04..45fca4f3bde 100644
--- a/js/src/nanojit/LIR.cpp
+++ b/js/src/nanojit/LIR.cpp
@@ -1149,24 +1149,31 @@ namespace nanojit
             m_list[kind] = new (alloc) LInsp[m_cap[kind]];
         }
         clear();
-        m_find[LInsImm]   = &LInsHashSet::findImm;
-        m_find[LInsImmq]  = PTR_SIZE(NULL, &LInsHashSet::findImmq);
-        m_find[LInsImmf]  = &LInsHashSet::findImmf;
-        m_find[LIns1]     = &LInsHashSet::find1;
-        m_find[LIns2]     = &LInsHashSet::find2;
-        m_find[LIns3]     = &LInsHashSet::find3;
-        m_find[LInsLoad]  = &LInsHashSet::findLoad;
-        m_find[LInsCall]  = &LInsHashSet::findCall;
+        m_find[LInsImm]          = &LInsHashSet::findImm;
+        m_find[LInsImmq]         = PTR_SIZE(NULL, &LInsHashSet::findImmq);
+        m_find[LInsImmf]         = &LInsHashSet::findImmf;
+        m_find[LIns1]            = &LInsHashSet::find1;
+        m_find[LIns2]            = &LInsHashSet::find2;
+        m_find[LIns3]            = &LInsHashSet::find3;
+        m_find[LInsCall]         = &LInsHashSet::findCall;
+        m_find[LInsLoadReadOnly] = &LInsHashSet::findLoadReadOnly;
+        m_find[LInsLoadStack]    = &LInsHashSet::findLoadStack;
+        m_find[LInsLoadRStack]   = &LInsHashSet::findLoadRStack;
+        m_find[LInsLoadOther]    = &LInsHashSet::findLoadOther;
+        m_find[LInsLoadMultiple] = &LInsHashSet::findLoadMultiple;
+    }
+
+    void LInsHashSet::clear(LInsHashKind kind) {
+        VMPI_memset(m_list[kind], 0, sizeof(LInsp)*m_cap[kind]);
+        m_used[kind] = 0;
     }
 
     void LInsHashSet::clear() {
         for (LInsHashKind kind = LInsFirst; kind <= LInsLast; kind = nextKind(kind)) {
-            VMPI_memset(m_list[kind], 0, sizeof(LInsp)*m_cap[kind]);
-            m_used[kind] = 0;
+            clear(kind);
         }
     }
 
-
     inline uint32_t LInsHashSet::hashImm(int32_t a) {
         return _hashfinish(_hash32(0,a));
     }
@@ -1194,10 +1201,15 @@ namespace nanojit
         return _hashfinish(_hashptr(hash, c));
     }
 
-    inline uint32_t LInsHashSet::hashLoad(LOpcode op, LInsp a, int32_t d) {
+    NanoStaticAssert(sizeof(AccSet) == 1);  // required for hashLoad to work properly
+
+    // Nb: no need to hash the load's AccSet because each region's loads go in
+    // a different hash table.
+    inline uint32_t LInsHashSet::hashLoad(LOpcode op, LInsp a, int32_t d, AccSet accSet) {
         uint32_t hash = _hash8(0,uint8_t(op));
         hash = _hashptr(hash, a);
-        return _hashfinish(_hash32(hash, d));
+        hash = _hash32(hash, d);
+        return _hashfinish(_hash8(hash, accSet));
     }
 
     inline uint32_t LInsHashSet::hashCall(const CallInfo *ci, uint32_t argc, LInsp args[]) {
@@ -1219,11 +1231,12 @@ namespace nanojit
             LInsp ins = oldlist[i];
             if (!ins) continue;
             uint32_t j = (this->*find)(ins);
+            NanoAssert(!m_list[kind][j]);
             m_list[kind][j] = ins;
         }
     }
 
-    LInsp LInsHashSet::add(LInsHashKind kind, LInsp ins, uint32_t k)
+    void LInsHashSet::add(LInsHashKind kind, LInsp ins, uint32_t k)
     {
         NanoAssert(!m_list[kind][k]);
         m_used[kind]++;
@@ -1231,20 +1244,21 @@ namespace nanojit
         if ((m_used[kind] * 4) >= (m_cap[kind] * 3)) {  // load factor of 0.75
             grow(kind);
         }
-        return ins;
     }
 
     LInsp LInsHashSet::findImm(int32_t a, uint32_t &k)
     {
         LInsHashKind kind = LInsImm;
         const uint32_t bitmask = m_cap[kind] - 1;
-        uint32_t hash = hashImm(a) & bitmask;
+        k = hashImm(a) & bitmask;
         uint32_t n = 1;
-        LInsp ins;
-        while ((ins = m_list[kind][hash]) != NULL &&
-            (ins->imm32() != a))
-        {
+        while (true) {
+            LInsp ins = m_list[kind][k];
+            if (!ins)
+                return NULL;
             NanoAssert(ins->isconst());
+            if (ins->imm32() == a)
+                return ins;
             // Quadratic probe:  h(k,i) = h(k) + 0.5i + 0.5i^2, which gives the
             // sequence h(k), h(k)+1, h(k)+3, h(k)+6, h+10, ...  This is a
             // good sequence for 2^n-sized tables as the values h(k,i) for i
@@ -1252,11 +1266,9 @@ namespace nanojit
             // See http://portal.acm.org/citation.cfm?id=360737 and
             // http://en.wikipedia.org/wiki/Quadratic_probing (fetched
             // 06-Nov-2009) for more details.
-            hash = (hash + n) & bitmask;
+            k = (k + n) & bitmask;
             n += 1;
         }
-        k = hash;
-        return ins;
     }
 
     uint32_t LInsHashSet::findImm(LInsp ins)
@@ -1271,18 +1283,18 @@ namespace nanojit
     {
         LInsHashKind kind = LInsImmq;
         const uint32_t bitmask = m_cap[kind] - 1;
-        uint32_t hash = hashImmq(a) & bitmask;
+        k = hashImmq(a) & bitmask;
         uint32_t n = 1;
-        LInsp ins;
-        while ((ins = m_list[kind][hash]) != NULL &&
-            (ins->imm64() != a))
-        {
+        while (true) {
+            LInsp ins = m_list[kind][k];
+            if (!ins)
+                return NULL;
             NanoAssert(ins->isconstq());
-            hash = (hash + n) & bitmask;
+            if (ins->imm64() == a)
+                return ins;
+            k = (k + n) & bitmask;
             n += 1;
         }
-        k = hash;
-        return ins;
     }
 
     uint32_t LInsHashSet::findImmq(LInsp ins)
@@ -1297,18 +1309,18 @@ namespace nanojit
     {
         LInsHashKind kind = LInsImmf;
         const uint32_t bitmask = m_cap[kind] - 1;
-        uint32_t hash = hashImmq(a) & bitmask;
+        k = hashImmq(a) & bitmask;
         uint32_t n = 1;
-        LInsp ins;
-        while ((ins = m_list[kind][hash]) != NULL &&
-            (ins->imm64() != a))
-        {
+        while (true) {
+            LInsp ins = m_list[kind][k];
+            if (!ins)
+                return NULL;
             NanoAssert(ins->isconstf());
-            hash = (hash + n) & bitmask;
+            if (ins->imm64() == a)
+                return ins;
+            k = (k + n) & bitmask;
             n += 1;
         }
-        k = hash;
-        return ins;
     }
 
     uint32_t LInsHashSet::findImmf(LInsp ins)
@@ -1322,17 +1334,17 @@ namespace nanojit
     {
         LInsHashKind kind = LIns1;
         const uint32_t bitmask = m_cap[kind] - 1;
-        uint32_t hash = hash1(op,a) & bitmask;
+        k = hash1(op, a) & bitmask;
         uint32_t n = 1;
-        LInsp ins;
-        while ((ins = m_list[kind][hash]) != NULL &&
-            (ins->opcode() != op || ins->oprnd1() != a))
-        {
-            hash = (hash + n) & bitmask;
+        while (true) {
+            LInsp ins = m_list[kind][k];
+            if (!ins)
+                return NULL;
+            if (ins->isop(op) && ins->oprnd1() == a)
+                return ins;
+            k = (k + n) & bitmask;
             n += 1;
         }
-        k = hash;
-        return ins;
     }
 
     uint32_t LInsHashSet::find1(LInsp ins)
@@ -1346,17 +1358,17 @@ namespace nanojit
     {
         LInsHashKind kind = LIns2;
         const uint32_t bitmask = m_cap[kind] - 1;
-        uint32_t hash = hash2(op,a,b) & bitmask;
+        k = hash2(op, a, b) & bitmask;
         uint32_t n = 1;
-        LInsp ins;
-        while ((ins = m_list[kind][hash]) != NULL &&
-            (ins->opcode() != op || ins->oprnd1() != a || ins->oprnd2() != b))
-        {
-            hash = (hash + n) & bitmask;
+        while (true) {
+            LInsp ins = m_list[kind][k];
+            if (!ins)
+                return NULL;
+            if (ins->isop(op) && ins->oprnd1() == a && ins->oprnd2() == b)
+                return ins;
+            k = (k + n) & bitmask;
             n += 1;
         }
-        k = hash;
-        return ins;
     }
 
     uint32_t LInsHashSet::find2(LInsp ins)
@@ -1370,17 +1382,17 @@ namespace nanojit
     {
         LInsHashKind kind = LIns3;
         const uint32_t bitmask = m_cap[kind] - 1;
-        uint32_t hash = hash3(op,a,b,c) & bitmask;
+        k = hash3(op, a, b, c) & bitmask;
         uint32_t n = 1;
-        LInsp ins;
-        while ((ins = m_list[kind][hash]) != NULL &&
-            (ins->opcode() != op || ins->oprnd1() != a || ins->oprnd2() != b || ins->oprnd3() != c))
-        {
-            hash = (hash + n) & bitmask;
+        while (true) {
+            LInsp ins = m_list[kind][k];
+            if (!ins)
+                return NULL;
+            if (ins->isop(op) && ins->oprnd1() == a && ins->oprnd2() == b && ins->oprnd3() == c)
+                return ins;
+            k = (k + n) & bitmask;
             n += 1;
         }
-        k = hash;
-        return ins;
     }
 
     uint32_t LInsHashSet::find3(LInsp ins)
@@ -1390,27 +1402,57 @@ namespace nanojit
         return k;
     }
 
-    LInsp LInsHashSet::findLoad(LOpcode op, LInsp a, int32_t d, uint32_t &k)
+    LInsp LInsHashSet::findLoad(LOpcode op, LInsp a, int32_t d, AccSet accSet, LInsHashKind kind,
+                                uint32_t &k)
     {
-        LInsHashKind kind = LInsLoad;
+        (void)accSet;
         const uint32_t bitmask = m_cap[kind] - 1;
-        uint32_t hash = hashLoad(op,a,d) & bitmask;
+        k = hashLoad(op, a, d, accSet) & bitmask;
         uint32_t n = 1;
-        LInsp ins;
-        while ((ins = m_list[kind][hash]) != NULL &&
-            (ins->opcode() != op || ins->oprnd1() != a || ins->disp() != d))
-        {
-            hash = (hash + n) & bitmask;
+        while (true) {
+            LInsp ins = m_list[kind][k];
+            if (!ins)
+                return NULL;
+            NanoAssert(ins->accSet() == accSet);
+            if (ins->isop(op) && ins->oprnd1() == a && ins->disp() == d)
+                return ins;
+            k = (k + n) & bitmask;
             n += 1;
         }
-        k = hash;
-        return ins;
     }
 
-    uint32_t LInsHashSet::findLoad(LInsp ins)
+    uint32_t LInsHashSet::findLoadReadOnly(LInsp ins)
     {
         uint32_t k;
-        findLoad(ins->opcode(), ins->oprnd1(), ins->disp(), k);
+        findLoad(ins->opcode(), ins->oprnd1(), ins->disp(), ins->accSet(), LInsLoadReadOnly, k);
+        return k;
+    }
+
+    uint32_t LInsHashSet::findLoadStack(LInsp ins)
+    {
+        uint32_t k;
+        findLoad(ins->opcode(), ins->oprnd1(), ins->disp(), ins->accSet(), LInsLoadStack, k);
+        return k;
+    }
+
+    uint32_t LInsHashSet::findLoadRStack(LInsp ins)
+    {
+        uint32_t k;
+        findLoad(ins->opcode(), ins->oprnd1(), ins->disp(), ins->accSet(), LInsLoadRStack, k);
+        return k;
+    }
+
+    uint32_t LInsHashSet::findLoadOther(LInsp ins)
+    {
+        uint32_t k;
+        findLoad(ins->opcode(), ins->oprnd1(), ins->disp(), ins->accSet(), LInsLoadOther, k);
+        return k;
+    }
+
+    uint32_t LInsHashSet::findLoadMultiple(LInsp ins)
+    {
+        uint32_t k;
+        findLoad(ins->opcode(), ins->oprnd1(), ins->disp(), ins->accSet(), LInsLoadMultiple, k);
         return k;
     }
 
@@ -1426,17 +1468,17 @@ namespace nanojit
     {
         LInsHashKind kind = LInsCall;
         const uint32_t bitmask = m_cap[kind] - 1;
-        uint32_t hash = hashCall(ci, argc, args) & bitmask;
+        k = hashCall(ci, argc, args) & bitmask;
         uint32_t n = 1;
-        LInsp ins;
-        while ((ins = m_list[kind][hash]) != NULL &&
-            (!ins->isCall() || ins->callInfo() != ci || !argsmatch(ins, argc, args)))
-        {
-            hash = (hash + n) & bitmask;
+        while (true) {
+            LInsp ins = m_list[kind][k];
+            if (!ins)
+                return NULL;
+            if (ins->isCall() && ins->callInfo() == ci && argsmatch(ins, argc, args))
+                return ins;
+            k = (k + n) & bitmask;
             n += 1;
         }
-        k = hash;
-        return ins;
     }
 
     uint32_t LInsHashSet::findCall(LInsp ins)
@@ -1728,7 +1770,6 @@ namespace nanojit
         }
     }
 
-
     void LirNameMap::addNameWithSuffix(LInsp ins, const char *name, int suffix,
                                        bool ignoreOneSuffix) {
         // The lookup may succeed, ie. we may already have a name for this
@@ -1792,23 +1833,18 @@ namespace nanojit
     }
 
 
-    char* LInsPrinter::formatAccSet(RefBuf* buf, LInsp ins, bool isLoad) {
-        AccSet accSet = ins->accSet();
+    char* LInsPrinter::formatAccSet(RefBuf* buf, AccSet accSet) {
         int i = 0;
-        if ((isLoad && accSet == ACC_LOAD_ANY) ||
-            (!isLoad && accSet == ACC_STORE_ANY))
-        {
-            // boring, don't bother with a suffix
-        } else {
-            buf->buf[i++] = '.';
-            if (accSet & ACC_READONLY) { buf->buf[i++] = 'r'; accSet &= ~ACC_READONLY; }
-            if (accSet & ACC_STACK)    { buf->buf[i++] = 's'; accSet &= ~ACC_STACK; }
-            if (accSet & ACC_OTHER)    { buf->buf[i++] = 'o'; accSet &= ~ACC_OTHER; }
-            // This assertion will fail if we add a new accSet value but
-            // forget to handle it here.
-            NanoAssert(accSet == 0);
-        }
+        // 'c' is short for "const", because 'r' is used for RSTACK.
+        if (accSet & ACC_READONLY) { buf->buf[i++] = 'c'; accSet &= ~ACC_READONLY; }
+        if (accSet & ACC_STACK)    { buf->buf[i++] = 's'; accSet &= ~ACC_STACK; }
+        if (accSet & ACC_RSTACK)   { buf->buf[i++] = 'r'; accSet &= ~ACC_RSTACK; }
+        if (accSet & ACC_OTHER)    { buf->buf[i++] = 'o'; accSet &= ~ACC_OTHER; }
+        // This assertion will fail if we add a new accSet value but
+        // forget to handle it here.
+        NanoAssert(accSet == 0);
         buf->buf[i] = 0;
+        NanoAssert(size_t(i) < buf->len);
         return buf->buf;
     }
 
@@ -1919,11 +1955,12 @@ namespace nanojit
                 int32_t argc = i->argc();
                 int32_t m = int32_t(n);     // Windows doesn't have 'ssize_t'
                 if (call->isIndirect())
-                    m -= VMPI_snprintf(s, m, "%s = %s [%s] ( ", formatRef(&b1, i), lirNames[op],
-                                       formatRef(&b2, i->arg(--argc)));
+                    m -= VMPI_snprintf(s, m, "%s = %s.%s [%s] ( ", formatRef(&b1, i), lirNames[op],
+                                       formatAccSet(&b2, call->_storeAccSet),
+                                       formatRef(&b3, i->arg(--argc)));
                 else
-                    m -= VMPI_snprintf(s, m, "%s = %s #%s ( ", formatRef(&b1, i), lirNames[op],
-                                       call->_name);
+                    m -= VMPI_snprintf(s, m, "%s = %s.%s #%s ( ", formatRef(&b1, i), lirNames[op],
+                                       formatAccSet(&b2, call->_storeAccSet), call->_name);
                 if (m < 0) break;
                 for (int32_t j = argc - 1; j >= 0; j--) {
                     s += VMPI_strlen(s);
@@ -2074,8 +2111,8 @@ namespace nanojit
             case LIR_ldsb:
             case LIR_ldss:
             case LIR_ld32f:
-                VMPI_snprintf(s, n, "%s = %s%s %s[%d]", formatRef(&b1, i), lirNames[op],
-                    formatAccSet(&b2, i, /*isLoad*/true),
+                VMPI_snprintf(s, n, "%s = %s.%s %s[%d]", formatRef(&b1, i), lirNames[op],
+                    formatAccSet(&b2, i->accSet()),
                     formatRef(&b3, i->oprnd1()),
                     i->disp());
                 break;
@@ -2086,8 +2123,8 @@ namespace nanojit
             case LIR_stb:
             case LIR_sts:
             case LIR_st32f:
-                VMPI_snprintf(s, n, "%s%s %s[%d] = %s", lirNames[op],
-                    formatAccSet(&b1, i, /*isLoad*/false),
+                VMPI_snprintf(s, n, "%s.%s %s[%d] = %s", lirNames[op],
+                    formatAccSet(&b1, i->accSet()),
                     formatRef(&b2, i->oprnd2()),
                     i->disp(),
                     formatRef(&b3, i->oprnd1()));
@@ -2103,17 +2140,21 @@ namespace nanojit
 
 
     CseFilter::CseFilter(LirWriter *out, Allocator& alloc)
-        : LirWriter(out)
+        : LirWriter(out), storesSinceLastLoad(ACC_NONE)
     {
         uint32_t kInitialCaps[LInsLast + 1];
-        kInitialCaps[LInsImm]   = 128;
-        kInitialCaps[LInsImmq]  = PTR_SIZE(0, 16);
-        kInitialCaps[LInsImmf]  = 16;
-        kInitialCaps[LIns1]     = 256;
-        kInitialCaps[LIns2]     = 512;
-        kInitialCaps[LIns3]     = 16;
-        kInitialCaps[LInsLoad]  = 16;
-        kInitialCaps[LInsCall]  = 64;
+        kInitialCaps[LInsImm]          = 128;
+        kInitialCaps[LInsImmq]         = PTR_SIZE(0, 16);
+        kInitialCaps[LInsImmf]         = 16;
+        kInitialCaps[LIns1]            = 256;
+        kInitialCaps[LIns2]            = 512;
+        kInitialCaps[LIns3]            = 16;
+        kInitialCaps[LInsCall]         = 64;
+        kInitialCaps[LInsLoadReadOnly] = 16;
+        kInitialCaps[LInsLoadStack]    = 16;
+        kInitialCaps[LInsLoadRStack]   = 16;
+        kInitialCaps[LInsLoadOther]    = 16;
+        kInitialCaps[LInsLoadMultiple] = 16;
         exprs = new (alloc) LInsHashSet(alloc, kInitialCaps);
     }
 
@@ -2121,13 +2162,14 @@ namespace nanojit
     {
         uint32_t k;
         LInsp ins = exprs->findImm(imm, k);
-        if (ins)
-            return ins;
-        ins = out->insImm(imm);
+        if (!ins) {
+            ins = out->insImm(imm);
+            exprs->add(LInsImm, ins, k);
+        }
         // We assume that downstream stages do not modify the instruction, so
         // that we can insert 'ins' into slot 'k'.  Check this.
-        NanoAssert(ins->opcode() == LIR_int && ins->imm32() == imm);
-        return exprs->add(LInsImm, ins, k);
+        NanoAssert(ins->isop(LIR_int) && ins->imm32() == imm);
+        return ins;
     }
 
 #ifdef NANOJIT_64BIT
@@ -2135,11 +2177,12 @@ namespace nanojit
     {
         uint32_t k;
         LInsp ins = exprs->findImmq(q, k);
-        if (ins)
-            return ins;
-        ins = out->insImmq(q);
-        NanoAssert(ins->opcode() == LIR_quad && ins->imm64() == q);
-        return exprs->add(LInsImmq, ins, k);
+        if (!ins) {
+            ins = out->insImmq(q);
+            exprs->add(LInsImmq, ins, k);
+        }
+        NanoAssert(ins->isop(LIR_quad) && ins->imm64() == q);
+        return ins;
     }
 #endif
 
@@ -2154,85 +2197,121 @@ namespace nanojit
         } u;
         u.d = d;
         LInsp ins = exprs->findImmf(u.u64, k);
-        if (ins)
-            return ins;
-        ins = out->insImmf(d);
-        NanoAssert(ins->opcode() == LIR_float && ins->imm64() == u.u64);
-        return exprs->add(LInsImmf, ins, k);
+        if (!ins) {
+            ins = out->insImmf(d);
+            exprs->add(LInsImmf, ins, k);
+        }
+        NanoAssert(ins->isop(LIR_float) && ins->imm64() == u.u64);
+        return ins;
     }
 
-    LIns* CseFilter::ins0(LOpcode v)
+    LIns* CseFilter::ins0(LOpcode op)
     {
-        if (v == LIR_label)
+        if (op == LIR_label)
             exprs->clear();
-        return out->ins0(v);
+        return out->ins0(op);
     }
 
-    LIns* CseFilter::ins1(LOpcode v, LInsp a)
+    LIns* CseFilter::ins1(LOpcode op, LInsp a)
     {
-        if (isCseOpcode(v)) {
+        LInsp ins;
+        if (isCseOpcode(op)) {
             uint32_t k;
-            LInsp ins = exprs->find1(v, a, k);
-            if (ins)
-                return ins;
-            ins = out->ins1(v, a);
-            NanoAssert(ins->opcode() == v && ins->oprnd1() == a);
-            return exprs->add(LIns1, ins, k);
-        }
-        return out->ins1(v,a);
-    }
-
-    LIns* CseFilter::ins2(LOpcode v, LInsp a, LInsp b)
-    {
-        if (isCseOpcode(v)) {
-            uint32_t k;
-            LInsp ins = exprs->find2(v, a, b, k);
-            if (ins)
-                return ins;
-            ins = out->ins2(v, a, b);
-            NanoAssert(ins->opcode() == v && ins->oprnd1() == a && ins->oprnd2() == b);
-            return exprs->add(LIns2, ins, k);
-        }
-        return out->ins2(v,a,b);
-    }
-
-    LIns* CseFilter::ins3(LOpcode v, LInsp a, LInsp b, LInsp c)
-    {
-        NanoAssert(isCseOpcode(v));
-        uint32_t k;
-        LInsp ins = exprs->find3(v, a, b, c, k);
-        if (ins)
-            return ins;
-        ins = out->ins3(v, a, b, c);
-        NanoAssert(ins->opcode() == v && ins->oprnd1() == a && ins->oprnd2() == b &&
-                                                               ins->oprnd3() == c);
-        return exprs->add(LIns3, ins, k);
-    }
-
-    LIns* CseFilter::insLoad(LOpcode v, LInsp base, int32_t disp, AccSet accSet)
-    {
-        if (isS16(disp)) {
-            // XXX: This condition is overly strict.  Bug 517910 will make it better.
-            if (accSet == ACC_READONLY) {
-                uint32_t k;
-                LInsp ins = exprs->findLoad(v, base, disp, k);
-                if (ins)
-                    return ins;
-                ins = out->insLoad(v, base, disp, accSet);
-                NanoAssert(ins->opcode() == v && ins->oprnd1() == base && ins->disp() == disp);
-                return exprs->add(LInsLoad, ins, k);
+            ins = exprs->find1(op, a, k);
+            if (!ins) {
+                ins = out->ins1(op, a);
+                exprs->add(LIns1, ins, k);
             }
-            return out->insLoad(v, base, disp, accSet);
+        } else {
+            ins = out->ins1(op, a);
+        }
+        NanoAssert(ins->isop(op) && ins->oprnd1() == a);
+        return ins;
+    }
+
+    LIns* CseFilter::ins2(LOpcode op, LInsp a, LInsp b)
+    {
+        LInsp ins;
+        if (isCseOpcode(op)) {
+            uint32_t k;
+            ins = exprs->find2(op, a, b, k);
+            if (!ins) {
+                ins = out->ins2(op, a, b);
+                exprs->add(LIns2, ins, k);
+            }
+        } else {
+            ins = out->ins2(op, a, b);
+        }
+        NanoAssert(ins->isop(op) && ins->oprnd1() == a && ins->oprnd2() == b);
+        return ins;
+    }
+
+    LIns* CseFilter::ins3(LOpcode op, LInsp a, LInsp b, LInsp c)
+    {
+        NanoAssert(isCseOpcode(op));
+        uint32_t k;
+        LInsp ins = exprs->find3(op, a, b, c, k);
+        if (!ins) {
+            ins = out->ins3(op, a, b, c);
+            exprs->add(LIns3, ins, k);
+        }
+        NanoAssert(ins->isop(op) && ins->oprnd1() == a && ins->oprnd2() == b && ins->oprnd3() == c);
+        return ins;
+    }
+
+    LIns* CseFilter::insLoad(LOpcode op, LInsp base, int32_t disp, AccSet loadAccSet)
+    {
+        LInsp ins;
+        if (isS16(disp)) {
+            // Clear all loads aliased by stores and calls since the last time
+            // we were in this function.
+            if (storesSinceLastLoad != ACC_NONE) {
+                NanoAssert(!(storesSinceLastLoad & ACC_READONLY));  // can't store to READONLY
+                if (storesSinceLastLoad & ACC_STACK)  { exprs->clear(LInsLoadStack); }
+                if (storesSinceLastLoad & ACC_RSTACK) { exprs->clear(LInsLoadRStack); }
+                if (storesSinceLastLoad & ACC_OTHER)  { exprs->clear(LInsLoadOther); }
+                // Loads marked with multiple access regions must be treated
+                // conservatively -- we always clear all of them.
+                exprs->clear(LInsLoadMultiple);
+                storesSinceLastLoad = ACC_NONE;
+            }
+
+            LInsHashKind kind;
+            switch (loadAccSet) {
+            case ACC_READONLY:  kind = LInsLoadReadOnly;    break;
+            case ACC_STACK:     kind = LInsLoadStack;       break;
+            case ACC_RSTACK:    kind = LInsLoadRStack;      break;
+            case ACC_OTHER:     kind = LInsLoadOther;       break;
+            default:            kind = LInsLoadMultiple;    break;
+            }
+
+            uint32_t k;
+            ins = exprs->findLoad(op, base, disp, loadAccSet, kind, k);
+            if (!ins) {
+                ins = out->insLoad(op, base, disp, loadAccSet);
+                exprs->add(kind, ins, k);
+            }
+            NanoAssert(ins->isop(op) && ins->oprnd1() == base && ins->disp() == disp);
+
         } else {
             // If the displacement is more than 16 bits, put it in a separate
-            // instruction.  LirBufWriter also does this, we do it here as
-            // well because CseFilter relies on LirBufWriter not changing
-            // code.
-            return insLoad(v, ins2(LIR_addp, base, insImmWord(disp)), 0, accSet);
+            // instruction.  Nb: LirBufWriter also does this, we do it here
+            // too because CseFilter relies on LirBufWriter not changing code.
+            ins = insLoad(op, ins2(LIR_addp, base, insImmWord(disp)), 0, loadAccSet);
         }
+        return ins;
     }
 
-    LInsp CseFilter::insGuard(LOpcode v, LInsp c, GuardRecord *gr)
+    LIns* CseFilter::insStore(LOpcode op, LInsp value, LInsp base, int32_t disp, AccSet accSet)
+    {
+        storesSinceLastLoad |= accSet;
+        LIns* ins = out->insStore(op, value, base, disp, accSet);
+        NanoAssert(ins->isop(op) && ins->oprnd1() == value && ins->oprnd2() == base &&
+                   ins->disp() == disp && ins->accSet() == accSet);
+        return ins;
+    }
+
+    LInsp CseFilter::insGuard(LOpcode op, LInsp c, GuardRecord *gr)
     {
         // LIR_xt and LIR_xf guards are CSEable.  Note that we compare the
         // opcode and condition when determining if two guards are equivalent
@@ -2251,104 +2330,58 @@ namespace nanojit
         // - The CSE algorithm will always keep guard 1 and remove guard 2
         //   (not vice versa).  The current algorithm does this.
         //
-        if (isCseOpcode(v)) {
+        LInsp ins;
+        if (isCseOpcode(op)) {
             // conditional guard
             uint32_t k;
-            LInsp ins = exprs->find1(v, c, k);
-            if (ins)
-                return 0;
-            ins = out->insGuard(v, c, gr);
-            NanoAssert(ins->opcode() == v && ins->oprnd1() == c);
-            return exprs->add(LIns1, ins, k);
+            ins = exprs->find1(op, c, k);
+            if (!ins) {
+                ins = out->insGuard(op, c, gr);
+                exprs->add(LIns1, ins, k);
+            }
+        } else {
+            ins = out->insGuard(op, c, gr);
         }
-        return out->insGuard(v, c, gr);
+        NanoAssert(ins->isop(op) && ins->oprnd1() == c);
+        return ins;
     }
 
-    LInsp CseFilter::insGuardXov(LOpcode v, LInsp a, LInsp b, GuardRecord *gr)
+    LInsp CseFilter::insGuardXov(LOpcode op, LInsp a, LInsp b, GuardRecord *gr)
     {
         // LIR_*xov are CSEable.  See CseFilter::insGuard() for details.
-        NanoAssert(isCseOpcode(v));
+        NanoAssert(isCseOpcode(op));
         // conditional guard
         uint32_t k;
-        LInsp ins = exprs->find2(v, a, b, k);
-        if (ins)
-            return ins;
-        ins = out->insGuardXov(v, a, b, gr);
-        NanoAssert(ins->opcode() == v && ins->oprnd1() == a && ins->oprnd2() == b);
-        return exprs->add(LIns2, ins, k);
+        LInsp ins = exprs->find2(op, a, b, k);
+        if (!ins) {
+            ins = out->insGuardXov(op, a, b, gr);
+            exprs->add(LIns2, ins, k);
+        }
+        NanoAssert(ins->isop(op) && ins->oprnd1() == a && ins->oprnd2() == b);
+        return ins;
     }
 
     LInsp CseFilter::insCall(const CallInfo *ci, LInsp args[])
     {
+        LInsp ins;
+        uint32_t argc = ci->count_args();
         if (ci->_isPure) {
             NanoAssert(ci->_storeAccSet == ACC_NONE);
             uint32_t k;
-            uint32_t argc = ci->count_args();
-            LInsp ins = exprs->findCall(ci, argc, args, k);
-            if (ins)
-                return ins;
-            ins = out->insCall(ci, args);
-            NanoAssert(ins->isCall() && ins->callInfo() == ci && argsmatch(ins, argc, args));
-            return exprs->add(LInsCall, ins, k);
-        }
-        return out->insCall(ci, args);
-    }
-
-    LInsp LoadFilter::insLoad(LOpcode v, LInsp base, int32_t disp, AccSet accSet)
-    {
-        if (base != sp && base != rp)
-        {
-            switch (v)
-            {
-                case LIR_ld:
-                CASE64(LIR_ldq:)
-                case LIR_ldf:
-                case LIR_ld32f:
-                case LIR_ldsb:
-                case LIR_ldss:
-                case LIR_ldzb:
-                case LIR_ldzs:
-                {
-                    uint32_t k;
-                    LInsp ins = exprs->findLoad(v, base, disp, k);
-                    if (ins)
-                        return ins;
-                    ins = out->insLoad(v, base, disp, accSet);
-                    return exprs->add(LInsLoad, ins, k);
-                }
-                default:
-                    // fall thru
-                    break;
+            ins = exprs->findCall(ci, argc, args, k);
+            if (!ins) {
+                ins = out->insCall(ci, args);
+                exprs->add(LInsCall, ins, k);
             }
+        } else {
+            // We only need to worry about aliasing if !ci->_isPure.
+            storesSinceLastLoad |= ci->_storeAccSet;
+            ins = out->insCall(ci, args);
         }
-        return out->insLoad(v, base, disp, accSet);
+        NanoAssert(ins->isCall() && ins->callInfo() == ci && argsmatch(ins, argc, args));
+        return ins;
     }
 
-    void LoadFilter::clear(LInsp p)
-    {
-        if (p != sp && p != rp)
-            exprs->clear();
-    }
-
-    LInsp LoadFilter::insStore(LOpcode op, LInsp v, LInsp b, int32_t d, AccSet accSet)
-    {
-        clear(b);
-        return out->insStore(op, v, b, d, accSet);
-    }
-
-    LInsp LoadFilter::insCall(const CallInfo *ci, LInsp args[])
-    {
-        if (!ci->_isPure)
-            exprs->clear();
-        return out->insCall(ci, args);
-    }
-
-    LInsp LoadFilter::ins0(LOpcode op)
-    {
-        if (op == LIR_label)
-            exprs->clear();
-        return out->ins0(op);
-    }
 
 #if NJ_SOFTFLOAT_SUPPORTED
     static double FASTCALL i2f(int32_t i)           { return i; }
@@ -2564,7 +2597,7 @@ namespace nanojit
                 NanoAssertMsgf(0,
                     "LIR type error (%s): arg %d of '%s' is '%s' "
                     "which has type %s (expected %s)",
-                    _whereInPipeline, i+1, lirNames[op],
+                    whereInPipeline, i+1, lirNames[op],
                     lirNames[args[i]->opcode()],
                     type2string(actual), type2string(formal));
             }
@@ -2576,15 +2609,16 @@ namespace nanojit
     {
         NanoAssertMsgf(0,
             "LIR structure error (%s): %s %d of '%s' is '%s' (expected %s)",
-            _whereInPipeline, argDesc, argN, 
+            whereInPipeline, argDesc, argN, 
             lirNames[op], lirNames[arg->opcode()], shouldBeDesc);
     }
 
-    void ValidateWriter::errorAccSetShould(const char* what, AccSet accSet, const char* shouldDesc)
+    void ValidateWriter::errorAccSet(const char* what, AccSet accSet, const char* shouldDesc)
     {
+        RefBuf b;
         NanoAssertMsgf(0,
-            "LIR AccSet error (%s): '%s' AccSet is %d; it should %s",
-            _whereInPipeline, what, accSet, shouldDesc);
+            "LIR AccSet error (%s): '%s' AccSet is '%s'; %s",
+            whereInPipeline, what, printer->formatAccSet(&b, accSet), shouldDesc);
     }
 
     void ValidateWriter::checkLInsIsACondOrConst(LOpcode op, int argN, LIns* ins)
@@ -2609,17 +2643,60 @@ namespace nanojit
             errorStructureShouldBe(op, "argument", argN, ins, lirNames[op2]);
     }
 
-    ValidateWriter::ValidateWriter(LirWriter *out, const char* stageName)
-        : LirWriter(out), _whereInPipeline(stageName)
+    void ValidateWriter::checkAccSet(LOpcode op, LInsp base, AccSet accSet, AccSet maxAccSet)
+    {
+        if (accSet == ACC_NONE)
+            errorAccSet(lirNames[op], accSet, "it should not equal ACC_NONE");
+
+        if (accSet & ~maxAccSet)
+            errorAccSet(lirNames[op], accSet,
+                "it should not contain bits that aren't in ACC_LOAD_ANY/ACC_STORE_ANY");
+
+        // Some sanity checking, which is based on the following assumptions:
+        // - STACK ones should use 'sp' or 'sp+k' as the base.  (We could look
+        //   for more complex patterns, but that feels dangerous.  Better to
+        //   keep it really simple.)
+        // - RSTACK ones should use 'rp' as the base.
+        // - READONLY/OTHER ones should not use 'sp'/'sp+k' or 'rp' as the base.
+        //
+        // Things that aren't checked:
+        // - There's no easy way to check if READONLY ones really are read-only.
+
+        bool isStack = base == sp ||
+                      (base->isop(LIR_piadd) && base->oprnd1() == sp && base->oprnd2()->isconstp());
+        bool isRStack = base == rp;
+
+        switch (accSet) {
+        case ACC_STACK:
+            if (!isStack)
+                errorAccSet(lirNames[op], accSet, "but it's not a stack access");
+            break;
+
+        case ACC_RSTACK:
+            if (!isRStack)
+                errorAccSet(lirNames[op], accSet, "but it's not an rstack access");
+            break;
+
+        case ACC_READONLY:
+        case ACC_OTHER:
+            if (isStack)
+                errorAccSet(lirNames[op], accSet, "but it's a stack access");
+            if (isRStack)
+                errorAccSet(lirNames[op], accSet, "but it's an rstack access");
+            break;
+
+        default:
+            break;
+        }
+    }
+
+    ValidateWriter::ValidateWriter(LirWriter *out, LInsPrinter* printer, const char* where)
+        : LirWriter(out), printer(printer), whereInPipeline(where), sp(0), rp(0)
     {}
 
     LIns* ValidateWriter::insLoad(LOpcode op, LIns* base, int32_t d, AccSet accSet)
     {
-        if (accSet == ACC_NONE)
-            errorAccSetShould(lirNames[op], accSet, "not equal ACC_NONE");
-
-        if (accSet & ~ACC_LOAD_ANY)
-            errorAccSetShould(lirNames[op], accSet, "not contain bits that aren't in ACC_LOAD_ANY");
+        checkAccSet(op, base, accSet, ACC_LOAD_ANY);
 
         int nArgs = 1;
         LTy formals[1] = { LTy_Ptr };
@@ -2646,11 +2723,7 @@ namespace nanojit
 
     LIns* ValidateWriter::insStore(LOpcode op, LIns* value, LIns* base, int32_t d, AccSet accSet)
     {
-        if (accSet == ACC_NONE)
-            errorAccSetShould(lirNames[op], accSet, "not equal ACC_NONE");
-
-        if (accSet & ~ACC_STORE_ANY)
-            errorAccSetShould(lirNames[op], accSet, "not contain bits that aren't in ACC_STORE_ANY");
+        checkAccSet(op, base, accSet, ACC_STORE_ANY);
 
         int nArgs = 2;
         LTy formals[2] = { LTy_Void, LTy_Ptr };     // LTy_Void is overwritten shortly
@@ -2680,7 +2753,7 @@ namespace nanojit
 
         typeCheckArgs(op, nArgs, formals, args);
 
-        return out->insStore(op, value, base, d);
+        return out->insStore(op, value, base, d, accSet);
     }
 
     LIns* ValidateWriter::ins0(LOpcode op)
@@ -2923,11 +2996,11 @@ namespace nanojit
         LOpcode op = getCallOpcode(ci);
 
         if (ci->_isPure && ci->_storeAccSet != ACC_NONE)
-            errorAccSetShould(ci->_name, ci->_storeAccSet, "equal ACC_NONE for pure functions");
+            errorAccSet(ci->_name, ci->_storeAccSet, "it should be ACC_NONE for pure functions");
 
         if (ci->_storeAccSet & ~ACC_STORE_ANY)
-            errorAccSetShould(lirNames[op], ci->_storeAccSet,
-                "not contain bits that aren't in ACC_STORE_ANY");
+            errorAccSet(lirNames[op], ci->_storeAccSet,
+                "it should not contain bits that aren't in ACC_STORE_ANY");
 
         // This loop iterates over the args from right-to-left (because arg()
         // and getArgTypes() use right-to-left order), but puts the results
diff --git a/js/src/nanojit/LIR.h b/js/src/nanojit/LIR.h
index 783afeb79ff..7f16a6b7f58 100644
--- a/js/src/nanojit/LIR.h
+++ b/js/src/nanojit/LIR.h
@@ -210,7 +210,10 @@ namespace nanojit
     //   A load from a READONLY region will never alias with any stores.
     //
     // - STACK: the stack.  Stack loads/stores can usually be easily
-    //   identified because they use SP as the stack pointer.
+    //   identified because they use SP as the base pointer.
+    //
+    // - RSTACK: the return stack.  Return stack loads/stores can usually be
+    //   easily identified because they use RP as the base pointer.
     //
     // - OTHER: all other regions of memory.
     //
@@ -259,6 +262,14 @@ namespace nanojit
     //   true for the store set of a function.)
     //
     // Such imprecision is safe but may reduce optimisation opportunities.
+    //
+    // Optimisations that use access region info
+    // -----------------------------------------
+    // Currently only CseFilter uses this, and only for determining whether
+    // loads can be CSE'd.  Note that CseFilter treats loads that are marked
+    // with a single access region precisely, but all loads marked with
+    // multiple access regions get lumped together.  So if you can't mark a
+    // load with a single access region, you might as well use ACC_LOAD_ANY.
     //-----------------------------------------------------------------------
 
     // An access region set is represented as a bitset.  Nb: this restricts us
@@ -267,11 +278,13 @@ namespace nanojit
 
     // The access regions.  Note that because of the bitset representation
     // these constants are also valid (singleton) AccSet values.  If you add
-    // new ones please update ACC_ALL_WRITABLE and LirNameMap::formatAccSet().
+    // new ones please update ACC_ALL_STORABLE and formatAccSet() and
+    // CseFilter.
     //
     static const AccSet ACC_READONLY = 1 << 0;      // 0000_0001b
     static const AccSet ACC_STACK    = 1 << 1;      // 0000_0010b
-    static const AccSet ACC_OTHER    = 1 << 2;      // 0000_0100b
+    static const AccSet ACC_RSTACK   = 1 << 2;      // 0000_0100b
+    static const AccSet ACC_OTHER    = 1 << 3;      // 0000_1000b
 
     // Some common (non-singleton) access region sets.  ACC_NONE does not make
     // sense for loads or stores (which must access at least one region), it
@@ -279,15 +292,14 @@ namespace nanojit
     //
     // A convention that's worth using:  use ACC_LOAD_ANY/ACC_STORE_ANY for
     // cases that you're unsure about or haven't considered carefully.  Use
-    // ACC_ALL/ACC_ALL_WRITABLE for cases that you have considered carefully.
+    // ACC_ALL/ACC_ALL_STORABLE for cases that you have considered carefully.
     // That way it's easy to tell which ones have been considered and which
     // haven't.
     static const AccSet ACC_NONE         = 0x0;
-    static const AccSet ACC_ALL_WRITABLE = ACC_STACK | ACC_OTHER;
-    static const AccSet ACC_ALL          = ACC_READONLY | ACC_ALL_WRITABLE;
+    static const AccSet ACC_ALL_STORABLE = ACC_STACK | ACC_RSTACK | ACC_OTHER;
+    static const AccSet ACC_ALL          = ACC_READONLY | ACC_ALL_STORABLE;
     static const AccSet ACC_LOAD_ANY     = ACC_ALL;            // synonym
-    static const AccSet ACC_STORE_ANY    = ACC_ALL_WRITABLE;   // synonym
-
+    static const AccSet ACC_STORE_ANY    = ACC_ALL_STORABLE;   // synonym
 
     struct CallInfo
     {
@@ -1488,17 +1500,6 @@ namespace nanojit
 
         // Chooses LIR_sti or LIR_stqi based on size of value.
         LIns* insStorei(LIns* value, LIns* base, int32_t d, AccSet accSet);
-
-        // Insert a load/store with the most pessimistic region access info, which is always safe.
-        LIns* insLoad(LOpcode op, LIns* base, int32_t d) {
-            return insLoad(op, base, d, ACC_LOAD_ANY);
-        }
-        LIns* insStore(LOpcode op, LIns* value, LIns* base, int32_t d) {
-            return insStore(op, value, base, d, ACC_STORE_ANY);
-        }
-        LIns* insStorei(LIns* value, LIns* base, int32_t d) {
-            return insStorei(value, base, d, ACC_STORE_ANY);
-        }
     };
 
 
@@ -1598,7 +1599,6 @@ namespace nanojit
         void formatImmq(RefBuf* buf, uint64_t c);
         void formatGuard(InsBuf* buf, LInsp ins);
         void formatGuardXov(InsBuf* buf, LInsp ins);
-        char* formatAccSet(RefBuf* buf, LInsp ins, bool isLoad);
 
     public:
         LInsPrinter(Allocator& alloc)
@@ -1611,6 +1611,7 @@ namespace nanojit
         char *formatAddr(RefBuf* buf, void* p);
         char *formatRef(RefBuf* buf, LInsp ref);
         char *formatIns(InsBuf* buf, LInsp ins);
+        char *formatAccSet(RefBuf* buf, AccSet accSet);
 
         AddrNameMap* addrNameMap;
         LirNameMap* lirNameMap;
@@ -1739,23 +1740,35 @@ namespace nanojit
         // We divide instruction kinds into groups for the use of LInsHashSet.
         // LIns0 isn't present because we don't need to record any 0-ary
         // instructions.
-        LInsImm   = 0,
-        LInsImmq  = 1,  // only occurs on 64-bit platforms
-        LInsImmf  = 2,
-        LIns1     = 3,
-        LIns2     = 4,
-        LIns3     = 5,
-        LInsLoad  = 6,
-        LInsCall  = 7,
+        LInsImm  = 0,
+        LInsImmq = 1,   // only occurs on 64-bit platforms
+        LInsImmf = 2,
+        LIns1    = 3,
+        LIns2    = 4,
+        LIns3    = 5,
+        LInsCall = 6,
+
+        // Loads are special.  We group them by access region:  one table for
+        // each region, and then a catch-all table for any loads marked with
+        // multiple regions.  This arrangement makes the removal of
+        // invalidated loads fast -- eg. we can invalidate all STACK loads by
+        // just clearing the LInsLoadStack table.  The disadvantage is that
+        // loads marked with multiple regions must be invalidated
+        // conservatively, eg. if any intervening stores occur.  But loads
+        // marked with multiple regions should be rare.
+        LInsLoadReadOnly = 7,
+        LInsLoadStack    = 8,
+        LInsLoadRStack   = 9,
+        LInsLoadOther    = 10,
+        LInsLoadMultiple = 11,
 
         LInsFirst = 0,
-        LInsLast = 7,
+        LInsLast = 11,
         // need a value after "last" to outsmart compilers that will insist last+1 is impossible
-        LInsInvalid = 8
+        LInsInvalid = 12
     };
     #define nextKind(kind)  LInsHashKind(kind+1)
 
-    // @todo, this could be replaced by a generic HashMap or HashSet, if we had one
     class LInsHashSet
     {
         // Must be a power of 2.
@@ -1772,14 +1785,15 @@ namespace nanojit
         uint32_t m_used[LInsLast + 1];
         typedef uint32_t (LInsHashSet::*find_t)(LInsp);
         find_t m_find[LInsLast + 1];
+
         Allocator& alloc;
 
         static uint32_t hashImm(int32_t);
-        static uint32_t hashImmq(uint64_t);     // not NANOJIT_64BIT only used by findImmf()
-        static uint32_t hash1(LOpcode v, LInsp);
-        static uint32_t hash2(LOpcode v, LInsp, LInsp);
-        static uint32_t hash3(LOpcode v, LInsp, LInsp, LInsp);
-        static uint32_t hashLoad(LOpcode v, LInsp, int32_t);
+        static uint32_t hashImmq(uint64_t);     // not NANOJIT_64BIT-only -- used by findImmf()
+        static uint32_t hash1(LOpcode op, LInsp);
+        static uint32_t hash2(LOpcode op, LInsp, LInsp);
+        static uint32_t hash3(LOpcode op, LInsp, LInsp, LInsp);
+        static uint32_t hashLoad(LOpcode op, LInsp, int32_t, AccSet);
         static uint32_t hashCall(const CallInfo *call, uint32_t argc, LInsp args[]);
 
         // These private versions are used after an LIns has been created;
@@ -1792,8 +1806,12 @@ namespace nanojit
         uint32_t find1(LInsp ins);
         uint32_t find2(LInsp ins);
         uint32_t find3(LInsp ins);
-        uint32_t findLoad(LInsp ins);
         uint32_t findCall(LInsp ins);
+        uint32_t findLoadReadOnly(LInsp ins);
+        uint32_t findLoadStack(LInsp ins);
+        uint32_t findLoadRStack(LInsp ins);
+        uint32_t findLoadOther(LInsp ins);
+        uint32_t findLoadMultiple(LInsp ins);
 
         void grow(LInsHashKind kind);
 
@@ -1810,19 +1828,22 @@ namespace nanojit
         LInsp find1(LOpcode v, LInsp a, uint32_t &k);
         LInsp find2(LOpcode v, LInsp a, LInsp b, uint32_t &k);
         LInsp find3(LOpcode v, LInsp a, LInsp b, LInsp c, uint32_t &k);
-        LInsp findLoad(LOpcode v, LInsp a, int32_t b, uint32_t &k);
+        LInsp findLoad(LOpcode v, LInsp a, int32_t b, AccSet accSet, LInsHashKind kind,
+                       uint32_t &k);
         LInsp findCall(const CallInfo *call, uint32_t argc, LInsp args[], uint32_t &k);
 
         // 'k' is the index found by findXYZ().
-        LInsp add(LInsHashKind kind, LInsp ins, uint32_t k);
+        void add(LInsHashKind kind, LInsp ins, uint32_t k);
 
-        void clear();
+        void clear();               // clears all tables
+        void clear(LInsHashKind);   // clears one table
     };
 
     class CseFilter: public LirWriter
     {
     private:
         LInsHashSet* exprs;
+        AccSet       storesSinceLastLoad;   // regions stored to since the last load
 
     public:
         CseFilter(LirWriter *out, Allocator&);
@@ -1836,7 +1857,8 @@ namespace nanojit
         LIns* ins1(LOpcode v, LInsp);
         LIns* ins2(LOpcode v, LInsp, LInsp);
         LIns* ins3(LOpcode v, LInsp, LInsp, LInsp);
-        LIns* insLoad(LOpcode op, LInsp cond, int32_t d, AccSet accSet);
+        LIns* insLoad(LOpcode op, LInsp base, int32_t d, AccSet accSet);
+        LIns* insStore(LOpcode op, LInsp value, LInsp base, int32_t d, AccSet accSet);
         LIns* insCall(const CallInfo *call, LInsp args[]);
         LIns* insGuard(LOpcode op, LInsp cond, GuardRecord *gr);
         LIns* insGuardXov(LOpcode op, LInsp a, LInsp b, GuardRecord *gr);
@@ -1975,37 +1997,6 @@ namespace nanojit
         LInsp read();
     };
 
-    // eliminate redundant loads by watching for stores & mutator calls
-    class LoadFilter: public LirWriter
-    {
-    public:
-        LInsp sp, rp;
-        LInsHashSet* exprs;
-
-        void clear(LInsp p);
-
-    public:
-        LoadFilter(LirWriter *out, Allocator& alloc)
-            : LirWriter(out), sp(NULL), rp(NULL)
-        {
-            uint32_t kInitialCaps[LInsLast + 1];
-            kInitialCaps[LInsImm]   = 1;
-            kInitialCaps[LInsImmq]  = 1;
-            kInitialCaps[LInsImmf]  = 1;
-            kInitialCaps[LIns1]     = 1;
-            kInitialCaps[LIns2]     = 1;
-            kInitialCaps[LIns3]     = 1;
-            kInitialCaps[LInsLoad]  = 64;
-            kInitialCaps[LInsCall]  = 1;
-            exprs = new (alloc) LInsHashSet(alloc, kInitialCaps);
-        }
-
-        LInsp ins0(LOpcode);
-        LInsp insLoad(LOpcode op, LInsp base, int32_t disp, AccSet accSet);
-        LInsp insStore(LOpcode op, LInsp value, LInsp base, int32_t disp, AccSet accSet);
-        LInsp insCall(const CallInfo *call, LInsp args[]);
-    };
-
     struct SoftFloatOps
     {
         const CallInfo* opmap[LIR_sentinel];
@@ -2049,19 +2040,26 @@ namespace nanojit
     class ValidateWriter : public LirWriter
     {
     private:
-        const char* _whereInPipeline;
+        LInsPrinter* printer;
+        const char* whereInPipeline;
 
         const char* type2string(LTy type);
         void typeCheckArgs(LOpcode op, int nArgs, LTy formals[], LIns* args[]);
         void errorStructureShouldBe(LOpcode op, const char* argDesc, int argN, LIns* arg,
                                     const char* shouldBeDesc);
-        void errorAccSetShould(const char* what, AccSet accSet, const char* shouldDesc);
+        void errorAccSet(const char* what, AccSet accSet, const char* shouldDesc);
         void checkLInsHasOpcode(LOpcode op, int argN, LIns* ins, LOpcode op2);
         void checkLInsIsACondOrConst(LOpcode op, int argN, LIns* ins);
         void checkLInsIsNull(LOpcode op, int argN, LIns* ins);
+        void checkAccSet(LOpcode op, LInsp base, AccSet accSet, AccSet maxAccSet);
+
+        LInsp sp, rp;
 
     public:
-        ValidateWriter(LirWriter* out, const char* stageName);
+        ValidateWriter(LirWriter* out, LInsPrinter* printer, const char* where);
+        void setSp(LInsp ins) { sp = ins; }
+        void setRp(LInsp ins) { rp = ins; }
+
         LIns* insLoad(LOpcode op, LIns* base, int32_t d, AccSet accSet);
         LIns* insStore(LOpcode op, LIns* value, LIns* base, int32_t d, AccSet accSet);
         LIns* ins0(LOpcode v);