From e82972a6ac5c6cebb1419cfe98105c7cc9f09f6d Mon Sep 17 00:00:00 2001 From: Nicholas Nethercote Date: Sun, 25 Jul 2010 19:17:39 -0700 Subject: [PATCH] Bug 552812 - nanojit: factor out AccSet differences into TM and TR (take 2). r=edwsmith. --HG-- extra : convert_revision : 7e11df137a4d30bd8411cf3b35cb26a003188f86 --- js/src/lirasm/lirasm.cpp | 50 ++-- js/src/nanojit/LIR.cpp | 477 +++++++++++++++++++-------------------- js/src/nanojit/LIR.h | 376 ++++++++++++++++++------------ 3 files changed, 499 insertions(+), 404 deletions(-) diff --git a/js/src/lirasm/lirasm.cpp b/js/src/lirasm/lirasm.cpp index b211002406f..067066d955a 100644 --- a/js/src/lirasm/lirasm.cpp +++ b/js/src/lirasm/lirasm.cpp @@ -95,6 +95,10 @@ nanojit::StackFilter::getTop(LIns*) return 0; } +// We lump everything into a single access region for lirasm. +static const AccSet ACCSET_OTHER = (1 << 0); +static const uint8_t LIRASM_NUM_USED_ACCS = 1; + #if defined NJ_VERBOSE void nanojit::LInsPrinter::formatGuard(InsBuf *buf, LIns *ins) @@ -124,6 +128,22 @@ nanojit::LInsPrinter::formatGuardXov(InsBuf *buf, LIns *ins) (long)x->line, ins->record()->profGuardID); } + +const char* +nanojit::LInsPrinter::accNames[] = { + "o", // (1 << 0) == ACCSET_OTHER + "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", // 1..10 (unused) + "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", // 11..20 (unused) + "?", "?", "?", "?", "?", "?", "?", "?", "?", "?", // 21..30 (unused) + "?" // 31 (unused) +}; +#endif + +#ifdef DEBUG +void ValidateWriter::checkAccSet(LOpcode op, LIns* base, AccSet accSet) +{ + NanoAssert(accSet == ACCSET_OTHER); +} #endif typedef int32_t (FASTCALL *RetInt)(); @@ -148,7 +168,7 @@ enum ReturnType { #endif #define CI(name, args) \ - {(uintptr_t) (&name), args, nanojit::ABI_CDECL, /*isPure*/0, ACC_STORE_ANY \ + {(uintptr_t) (&name), args, nanojit::ABI_CDECL, /*isPure*/0, ACCSET_STORE_ANY \ DEBUG_ONLY_NAME(name)} #define FN(name, args) \ @@ -504,7 +524,7 @@ FragmentAssembler::FragmentAssembler(Lirasm &parent, const string &fragmentName, } #endif if (optimize) { - mLir = mCseFilter = new CseFilter(mLir, mParent.mAlloc); + mLir = mCseFilter = new CseFilter(mLir, LIRASM_NUM_USED_ACCS, mParent.mAlloc); } #if NJ_SOFTFLOAT_SUPPORTED if (avmplus::AvmCore::config.soft_float) { @@ -610,7 +630,7 @@ FragmentAssembler::assemble_load() mTokens[1].find_first_of("0123456789") == 0) { return mLir->insLoad(mOpcode, ref(mTokens[0]), - immI(mTokens[1]), ACC_LOAD_ANY); + immI(mTokens[1]), ACCSET_OTHER); } bad("immediate offset required for load"); return NULL; // not reached @@ -1061,7 +1081,7 @@ FragmentAssembler::assembleFragment(LirTokenStream &in, bool implicitBegin, cons need(3); ins = mLir->insStore(mOpcode, ref(mTokens[0]), ref(mTokens[1]), - immI(mTokens[2]), ACC_STORE_ANY); + immI(mTokens[2]), ACCSET_OTHER); break; #if NJ_EXPANDED_LOADSTORE_SUPPORTED @@ -1313,8 +1333,8 @@ const CallInfo ci_V_IQF = CI(f_V_IQF, CallInfo::typeSig3(ARGTYPE_V, ARGTYPE_I, A // - LIR_modd (not implemented in NJ backends) // // Other limitations: -// - Loads always use accSet==ACC_LOAD_ANY -// - Stores always use accSet==ACC_STORE_ANY +// - Loads always use accSet==ACCSET_OTHER +// - Stores always use accSet==ACCSET_OTHER // void FragmentAssembler::assembleRandomFragment(int nIns) @@ -1817,7 +1837,7 @@ FragmentAssembler::assembleRandomFragment(int nIns) vector Ms = rnd(2) ? 
M4s : M8ps; if (!Ms.empty()) { LIns* base = rndPick(Ms); - ins = mLir->insLoad(rndPick(I_loads), base, rndOffset32(base->size()), ACC_LOAD_ANY); + ins = mLir->insLoad(rndPick(I_loads), base, rndOffset32(base->size()), ACCSET_OTHER); addOrReplace(Is, ins); n++; } @@ -1828,7 +1848,7 @@ FragmentAssembler::assembleRandomFragment(int nIns) case LLD_Q: if (!M8ps.empty()) { LIns* base = rndPick(M8ps); - ins = mLir->insLoad(rndPick(Q_loads), base, rndOffset64(base->size()), ACC_LOAD_ANY); + ins = mLir->insLoad(rndPick(Q_loads), base, rndOffset64(base->size()), ACCSET_OTHER); addOrReplace(Qs, ins); n++; } @@ -1838,7 +1858,7 @@ FragmentAssembler::assembleRandomFragment(int nIns) case LLD_D: if (!M8ps.empty()) { LIns* base = rndPick(M8ps); - ins = mLir->insLoad(rndPick(D_loads), base, rndOffset64(base->size()), ACC_LOAD_ANY); + ins = mLir->insLoad(rndPick(D_loads), base, rndOffset64(base->size()), ACCSET_OTHER); addOrReplace(Ds, ins); n++; } @@ -1848,7 +1868,7 @@ FragmentAssembler::assembleRandomFragment(int nIns) vector Ms = rnd(2) ? M4s : M8ps; if (!Ms.empty() && !Is.empty()) { LIns* base = rndPick(Ms); - mLir->insStore(rndPick(Is), base, rndOffset32(base->size()), ACC_STORE_ANY); + mLir->insStore(rndPick(Is), base, rndOffset32(base->size()), ACCSET_OTHER); n++; } break; @@ -1858,7 +1878,7 @@ FragmentAssembler::assembleRandomFragment(int nIns) case LST_Q: if (!M8ps.empty() && !Qs.empty()) { LIns* base = rndPick(M8ps); - mLir->insStore(rndPick(Qs), base, rndOffset64(base->size()), ACC_STORE_ANY); + mLir->insStore(rndPick(Qs), base, rndOffset64(base->size()), ACCSET_OTHER); n++; } break; @@ -1867,7 +1887,7 @@ FragmentAssembler::assembleRandomFragment(int nIns) case LST_D: if (!M8ps.empty() && !Ds.empty()) { LIns* base = rndPick(M8ps); - mLir->insStore(rndPick(Ds), base, rndOffset64(base->size()), ACC_STORE_ANY); + mLir->insStore(rndPick(Ds), base, rndOffset64(base->size()), ACCSET_OTHER); n++; } break; @@ -1977,7 +1997,7 @@ Lirasm::Lirasm(bool verbose) : #ifdef DEBUG if (mVerbose) { mLogc.lcbits = LC_ReadLIR | LC_AfterDCE | LC_Native | LC_RegAlloc | LC_Activation; - mLirbuf->printer = new (mAlloc) LInsPrinter(mAlloc); + mLirbuf->printer = new (mAlloc) LInsPrinter(mAlloc, LIRASM_NUM_USED_ACCS); } #endif @@ -2016,13 +2036,13 @@ Lirasm::lookupFunction(const string &name, CallInfo *&ci) // The ABI, arg types and ret type will be overridden by the caller. if (func->second.mReturnType == RT_FLOAT) { CallInfo target = {(uintptr_t) func->second.rfloat, - 0, ABI_FASTCALL, /*isPure*/0, ACC_STORE_ANY + 0, ABI_FASTCALL, /*isPure*/0, ACCSET_STORE_ANY verbose_only(, func->first.c_str()) }; *ci = target; } else { CallInfo target = {(uintptr_t) func->second.rint, - 0, ABI_FASTCALL, /*isPure*/0, ACC_STORE_ANY + 0, ABI_FASTCALL, /*isPure*/0, ACCSET_STORE_ANY verbose_only(, func->first.c_str()) }; *ci = target; } diff --git a/js/src/nanojit/LIR.cpp b/js/src/nanojit/LIR.cpp index 7cf057882be..4ae1c1e461e 100644 --- a/js/src/nanojit/LIR.cpp +++ b/js/src/nanojit/LIR.cpp @@ -294,18 +294,18 @@ namespace nanojit return ins; } - LIns* LirBufWriter::insLoad(LOpcode op, LIns* base, int32_t d, AccSet accSet) + LIns* LirBufWriter::insLoad(LOpcode op, LIns* base, int32_t d, AccSet accSet, LoadQual loadQual) { if (isS16(d)) { LInsLd* insLd = (LInsLd*)_buf->makeRoom(sizeof(LInsLd)); LIns* ins = insLd->getLIns(); - ins->initLInsLd(op, base, d, accSet); + ins->initLInsLd(op, base, d, accSet, loadQual); return ins; } else { // If the displacement is more than 16 bits, put it in a separate instruction. 
// Note that CseFilter::insLoad() also does this, so this will // only occur if CseFilter has been removed from the pipeline. - return insLoad(op, ins2(LIR_addp, base, insImmWord(d)), 0, accSet); + return insLoad(op, ins2(LIR_addp, base, insImmWord(d)), 0, accSet, loadQual); } } @@ -1102,7 +1102,7 @@ namespace nanojit return out->insBranchJov(op, oprnd1, oprnd2, target); } - LIns* ExprFilter::insLoad(LOpcode op, LIns* base, int32_t off, AccSet accSet) { + LIns* ExprFilter::insLoad(LOpcode op, LIns* base, int32_t off, AccSet accSet, LoadQual loadQual) { if (base->isImmP() && !isS8(off)) { // if the effective address is constant, then transform: // ld const[bigconst] => ld (const+bigconst)[0] @@ -1110,9 +1110,9 @@ namespace nanojit // under the assumption that we're more likely to CSE-match the // constant base address if we dont const-fold small offsets. uintptr_t p = (uintptr_t)base->immP() + off; - return out->insLoad(op, insImmP((void*)p), 0, accSet); + return out->insLoad(op, insImmP((void*)p), 0, accSet, loadQual); } - return out->insLoad(op, base, off, accSet); + return out->insLoad(op, base, off, accSet, loadQual); } LIns* LirWriter::insStore(LIns* value, LIns* base, int32_t d, AccSet accSet) @@ -1602,19 +1602,27 @@ namespace nanojit return e ? e->name : NULL; } - char* LInsPrinter::formatAccSet(RefBuf* buf, AccSet accSet) { - int i = 0; - // 'c' is short for "const", because 'r' is used for RSTACK. - if (accSet & ACC_READONLY) { buf->buf[i++] = 'c'; accSet &= ~ACC_READONLY; } - if (accSet & ACC_STACK) { buf->buf[i++] = 's'; accSet &= ~ACC_STACK; } - if (accSet & ACC_RSTACK) { buf->buf[i++] = 'r'; accSet &= ~ACC_RSTACK; } - if (accSet & ACC_OTHER) { buf->buf[i++] = 'o'; accSet &= ~ACC_OTHER; } - // This assertion will fail if we add a new accSet value but - // forget to handle it here. - NanoAssert(accSet == 0); - buf->buf[i] = 0; - NanoAssert(size_t(i) < buf->len); + if (accSet == ACCSET_NONE) { + VMPI_sprintf(buf->buf, ".none"); + } else if (accSet == ACCSET_ALL) { + VMPI_sprintf(buf->buf, ".all"); + } else { + char* b = buf->buf; + b[0] = 0; + // The AccSet may contain bits set for regions not used by the + // embedding, if any have been specified via + // (ACCSET_ALL & ~ACCSET_XYZ). So only print those that are + // relevant. 
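+            // e.g. with two used regions named "a" and "b" (hypothetical names),
+            // an AccSet containing both bits prints as ".a.b", and a
+            // single-region AccSet prints as ".a".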
+ for (int i = 0; i < EMB_NUM_USED_ACCS; i++) { + if (accSet & (1 << i)) { + VMPI_strcat(b, "."); + VMPI_strcat(b, accNames[i]); + accSet &= ~(1 << i); + } + } + NanoAssert(VMPI_strlen(b) < buf->len); + } return buf->buf; } @@ -1740,11 +1748,11 @@ namespace nanojit int32_t argc = i->argc(); int32_t m = int32_t(n); // Windows doesn't have 'ssize_t' if (call->isIndirect()) - m -= VMPI_snprintf(s, m, "%s = %s.%s [%s] ( ", formatRef(&b1, i), lirNames[op], + m -= VMPI_snprintf(s, m, "%s = %s%s [%s] ( ", formatRef(&b1, i), lirNames[op], formatAccSet(&b2, call->_storeAccSet), formatRef(&b3, i->arg(--argc))); else - m -= VMPI_snprintf(s, m, "%s = %s.%s #%s ( ", formatRef(&b1, i), lirNames[op], + m -= VMPI_snprintf(s, m, "%s = %s%s #%s ( ", formatRef(&b1, i), lirNames[op], formatAccSet(&b2, call->_storeAccSet), call->_name); if (m < 0) break; for (int32_t j = argc - 1; j >= 0; j--) { @@ -1907,12 +1915,19 @@ namespace nanojit case LIR_ldus2ui: case LIR_ldc2i: case LIR_lds2i: - case LIR_ldf2d: - VMPI_snprintf(s, n, "%s = %s.%s %s[%d]", formatRef(&b1, i), lirNames[op], - formatAccSet(&b2, i->accSet()), - formatRef(&b3, i->oprnd1()), + case LIR_ldf2d: { + const char* qualStr; + switch (i->loadQual()) { + case LOAD_CONST: qualStr = "/c"; break; + case LOAD_NORMAL: qualStr = ""; break; + case LOAD_VOLATILE: qualStr = "/v"; break; + default: NanoAssert(0); qualStr = "/?"; break; + } + VMPI_snprintf(s, n, "%s = %s%s%s %s[%d]", formatRef(&b1, i), lirNames[op], + formatAccSet(&b2, i->accSet()), qualStr, formatRef(&b3, i->oprnd1()), i->disp()); break; + } case LIR_sti: CASE64(LIR_stq:) @@ -1920,7 +1935,7 @@ namespace nanojit case LIR_sti2c: case LIR_sti2s: case LIR_std2f: - VMPI_snprintf(s, n, "%s.%s %s[%d] = %s", lirNames[op], + VMPI_snprintf(s, n, "%s%s %s[%d] = %s", lirNames[op], formatAccSet(&b1, i->accSet()), formatRef(&b2, i->oprnd2()), i->disp(), @@ -1935,40 +1950,42 @@ namespace nanojit } #endif - - CseFilter::CseFilter(LirWriter *out, Allocator& alloc) - : LirWriter(out), storesSinceLastLoad(ACC_NONE), alloc(alloc) + CseFilter::CseFilter(LirWriter *out, uint8_t embNumUsedAccs, Allocator& alloc) + : LirWriter(out), + EMB_NUM_USED_ACCS(embNumUsedAccs), + CSE_NUM_USED_ACCS(EMB_NUM_USED_ACCS + 2), + CSE_ACC_CONST( EMB_NUM_USED_ACCS + 0), + CSE_ACC_MULTIPLE( EMB_NUM_USED_ACCS + 1), + storesSinceLastLoad(ACCSET_NONE), + alloc(alloc) { - m_find[LInsImmI] = &CseFilter::findImmI; - m_find[LInsImmQ] = PTR_SIZE(NULL, &CseFilter::findImmQ); - m_find[LInsImmD] = &CseFilter::findImmD; - m_find[LIns1] = &CseFilter::find1; - m_find[LIns2] = &CseFilter::find2; - m_find[LIns3] = &CseFilter::find3; - m_find[LInsCall] = &CseFilter::findCall; - m_find[LInsLoadReadOnly] = &CseFilter::findLoadReadOnly; - m_find[LInsLoadStack] = &CseFilter::findLoadStack; - m_find[LInsLoadRStack] = &CseFilter::findLoadRStack; - m_find[LInsLoadOther] = &CseFilter::findLoadOther; - m_find[LInsLoadMultiple] = &CseFilter::findLoadMultiple; - m_cap[LInsImmI] = 128; - m_cap[LInsImmQ] = PTR_SIZE(0, 16); - m_cap[LInsImmD] = 16; - m_cap[LIns1] = 256; - m_cap[LIns2] = 512; - m_cap[LIns3] = 16; - m_cap[LInsCall] = 64; - m_cap[LInsLoadReadOnly] = 16; - m_cap[LInsLoadStack] = 16; - m_cap[LInsLoadRStack] = 16; - m_cap[LInsLoadOther] = 16; - m_cap[LInsLoadMultiple] = 16; + m_findNL[LInsImmI] = &CseFilter::findImmI; + m_findNL[LInsImmQ] = PTR_SIZE(NULL, &CseFilter::findImmQ); + m_findNL[LInsImmD] = &CseFilter::findImmD; + m_findNL[LIns1] = &CseFilter::find1; + m_findNL[LIns2] = &CseFilter::find2; + m_findNL[LIns3] = &CseFilter::find3; + m_findNL[LInsCall] 
= &CseFilter::findCall; - for (LInsHashKind kind = LInsFirst; kind <= LInsLast; kind = nextKind(kind)) { - m_list[kind] = new (alloc) LIns*[m_cap[kind]]; + m_capNL[LInsImmI] = 128; + m_capNL[LInsImmQ] = PTR_SIZE(0, 16); + m_capNL[LInsImmD] = 16; + m_capNL[LIns1] = 256; + m_capNL[LIns2] = 512; + m_capNL[LIns3] = 16; + m_capNL[LInsCall] = 64; + + for (NLKind nlkind = LInsFirst; nlkind <= LInsLast; nlkind = nextNLKind(nlkind)) + m_listNL[nlkind] = new (alloc) LIns*[m_capNL[nlkind]]; + + // Note that this allocates the CONST and MULTIPLE tables as well. + for (CseAcc a = 0; a < CSE_NUM_USED_ACCS; a++) { + m_capL[a] = 16; + m_listL[a] = new (alloc) LIns*[m_capL[a]]; } - clear(); + + clearAll(); } // Inlined/separated version of SuperFastHash. @@ -2017,15 +2034,23 @@ namespace nanojit return hash; } - void CseFilter::clear(LInsHashKind kind) { - VMPI_memset(m_list[kind], 0, sizeof(LIns*)*m_cap[kind]); - m_used[kind] = 0; + void CseFilter::clearNL(NLKind nlkind) { + VMPI_memset(m_listNL[nlkind], 0, sizeof(LIns*)*m_capNL[nlkind]); + m_usedNL[nlkind] = 0; } - void CseFilter::clear() { - for (LInsHashKind kind = LInsFirst; kind <= LInsLast; kind = nextKind(kind)) { - clear(kind); - } + void CseFilter::clearL(CseAcc a) { + VMPI_memset(m_listL[a], 0, sizeof(LIns*)*m_capL[a]); + m_usedL[a] = 0; + } + + void CseFilter::clearAll() { + for (NLKind nlkind = LInsFirst; nlkind <= LInsLast; nlkind = nextNLKind(nlkind)) + clearNL(nlkind); + + // Note that this clears the CONST and MULTIPLE load tables as well. + for (CseAcc a = 0; a < CSE_NUM_USED_ACCS; a++) + clearL(a); } inline uint32_t CseFilter::hashImmI(int32_t a) { @@ -2055,15 +2080,12 @@ namespace nanojit return hashfinish(hashptr(hash, c)); } - NanoStaticAssert(sizeof(AccSet) == 1); // required for hashLoad to work properly - - // Nb: no need to hash the load's AccSet because each region's loads go in - // a different hash table. - inline uint32_t CseFilter::hashLoad(LOpcode op, LIns* a, int32_t d, AccSet accSet) { - uint32_t hash = hash8(0,uint8_t(op)); + // Nb: no need to hash the load's MiniAccSet because each every load goes + // into a table where all the loads have the same MiniAccSet. 
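+    // (findLoad() picks the per-CseAcc table via miniAccSetToCseAcc() before
+    // probing, so only the op, base and displacement need to feed the hash.)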
+ inline uint32_t CseFilter::hashLoad(LOpcode op, LIns* a, int32_t d) { + uint32_t hash = hash8(0, uint8_t(op)); hash = hashptr(hash, a); - hash = hash32(hash, d); - return hashfinish(hash8(hash, accSet)); + return hashfinish(hash32(hash, d)); } inline uint32_t CseFilter::hashCall(const CallInfo *ci, uint32_t argc, LIns* args[]) { @@ -2073,41 +2095,69 @@ namespace nanojit return hashfinish(hash); } - void CseFilter::grow(LInsHashKind kind) + void CseFilter::growNL(NLKind nlkind) { - const uint32_t oldcap = m_cap[kind]; - m_cap[kind] <<= 1; - LIns** oldlist = m_list[kind]; - m_list[kind] = new (alloc) LIns*[m_cap[kind]]; - VMPI_memset(m_list[kind], 0, m_cap[kind] * sizeof(LIns*)); - find_t find = m_find[kind]; + const uint32_t oldcap = m_capNL[nlkind]; + m_capNL[nlkind] <<= 1; + LIns** oldlist = m_listNL[nlkind]; + m_listNL[nlkind] = new (alloc) LIns*[m_capNL[nlkind]]; + VMPI_memset(m_listNL[nlkind], 0, m_capNL[nlkind] * sizeof(LIns*)); + find_t find = m_findNL[nlkind]; for (uint32_t i = 0; i < oldcap; i++) { LIns* ins = oldlist[i]; if (!ins) continue; uint32_t j = (this->*find)(ins); - NanoAssert(!m_list[kind][j]); - m_list[kind][j] = ins; + NanoAssert(!m_listNL[nlkind][j]); + m_listNL[nlkind][j] = ins; } } - void CseFilter::add(LInsHashKind kind, LIns* ins, uint32_t k) + void CseFilter::growL(CseAcc cseAcc) { - NanoAssert(!m_list[kind][k]); - m_used[kind]++; - m_list[kind][k] = ins; - if ((m_used[kind] * 4) >= (m_cap[kind] * 3)) { // load factor of 0.75 - grow(kind); + const uint32_t oldcap = m_capL[cseAcc]; + m_capL[cseAcc] <<= 1; + LIns** oldlist = m_listL[cseAcc]; + m_listL[cseAcc] = new (alloc) LIns*[m_capL[cseAcc]]; + VMPI_memset(m_listL[cseAcc], 0, m_capL[cseAcc] * sizeof(LIns*)); + find_t find = &CseFilter::findLoad; + for (uint32_t i = 0; i < oldcap; i++) { + LIns* ins = oldlist[i]; + if (!ins) continue; + uint32_t j = (this->*find)(ins); + NanoAssert(!m_listL[cseAcc][j]); + m_listL[cseAcc][j] = ins; + } + } + + void CseFilter::addNL(NLKind nlkind, LIns* ins, uint32_t k) + { + NanoAssert(!m_listNL[nlkind][k]); + m_usedNL[nlkind]++; + m_listNL[nlkind][k] = ins; + if ((m_usedNL[nlkind] * 4) >= (m_capNL[nlkind] * 3)) { // load factor of 0.75 + growNL(nlkind); + } + } + + void CseFilter::addL(LIns* ins, uint32_t k) + { + CseAcc cseAcc = miniAccSetToCseAcc(ins->miniAccSet(), ins->loadQual()); + NanoAssert(!m_listL[cseAcc][k]); + m_usedL[cseAcc]++; + m_listL[cseAcc][k] = ins; + if ((m_usedL[cseAcc] * 4) >= (m_capL[cseAcc] * 3)) { // load factor of 0.75 + growL(cseAcc); } } inline LIns* CseFilter::findImmI(int32_t a, uint32_t &k) { - LInsHashKind kind = LInsImmI; - const uint32_t bitmask = m_cap[kind] - 1; + NLKind nlkind = LInsImmI; + const uint32_t bitmask = m_capNL[nlkind] - 1; k = hashImmI(a) & bitmask; uint32_t n = 1; while (true) { - LIns* ins = m_list[kind][k]; + LIns* ins = m_listNL[nlkind][k]; if (!ins) return NULL; NanoAssert(ins->isImmI()); @@ -2135,12 +2185,12 @@ namespace nanojit #ifdef NANOJIT_64BIT inline LIns* CseFilter::findImmQ(uint64_t a, uint32_t &k) { - LInsHashKind kind = LInsImmQ; - const uint32_t bitmask = m_cap[kind] - 1; + NLKind nlkind = LInsImmQ; + const uint32_t bitmask = m_capNL[nlkind] - 1; k = hashImmQorD(a) & bitmask; uint32_t n = 1; while (true) { - LIns* ins = m_list[kind][k]; + LIns* ins = m_listNL[nlkind][k]; if (!ins) return NULL; NanoAssert(ins->isImmQ()); @@ -2161,12 +2211,12 @@ namespace nanojit inline LIns* CseFilter::findImmD(uint64_t a, uint32_t &k) { - LInsHashKind kind = LInsImmD; - const uint32_t bitmask = m_cap[kind] - 1; + NLKind nlkind = 
LInsImmD; + const uint32_t bitmask = m_capNL[nlkind] - 1; k = hashImmQorD(a) & bitmask; uint32_t n = 1; while (true) { - LIns* ins = m_list[kind][k]; + LIns* ins = m_listNL[nlkind][k]; if (!ins) return NULL; NanoAssert(ins->isImmD()); @@ -2186,12 +2236,12 @@ namespace nanojit inline LIns* CseFilter::find1(LOpcode op, LIns* a, uint32_t &k) { - LInsHashKind kind = LIns1; - const uint32_t bitmask = m_cap[kind] - 1; + NLKind nlkind = LIns1; + const uint32_t bitmask = m_capNL[nlkind] - 1; k = hash1(op, a) & bitmask; uint32_t n = 1; while (true) { - LIns* ins = m_list[kind][k]; + LIns* ins = m_listNL[nlkind][k]; if (!ins) return NULL; if (ins->isop(op) && ins->oprnd1() == a) @@ -2210,12 +2260,12 @@ namespace nanojit inline LIns* CseFilter::find2(LOpcode op, LIns* a, LIns* b, uint32_t &k) { - LInsHashKind kind = LIns2; - const uint32_t bitmask = m_cap[kind] - 1; + NLKind nlkind = LIns2; + const uint32_t bitmask = m_capNL[nlkind] - 1; k = hash2(op, a, b) & bitmask; uint32_t n = 1; while (true) { - LIns* ins = m_list[kind][k]; + LIns* ins = m_listNL[nlkind][k]; if (!ins) return NULL; if (ins->isop(op) && ins->oprnd1() == a && ins->oprnd2() == b) @@ -2234,12 +2284,12 @@ namespace nanojit inline LIns* CseFilter::find3(LOpcode op, LIns* a, LIns* b, LIns* c, uint32_t &k) { - LInsHashKind kind = LIns3; - const uint32_t bitmask = m_cap[kind] - 1; + NLKind nlkind = LIns3; + const uint32_t bitmask = m_capNL[nlkind] - 1; k = hash3(op, a, b, c) & bitmask; uint32_t n = 1; while (true) { - LIns* ins = m_list[kind][k]; + LIns* ins = m_listNL[nlkind][k]; if (!ins) return NULL; if (ins->isop(op) && ins->oprnd1() == a && ins->oprnd2() == b && ins->oprnd3() == c) @@ -2256,18 +2306,17 @@ namespace nanojit return k; } - inline LIns* CseFilter::findLoad(LOpcode op, LIns* a, int32_t d, AccSet accSet, - LInsHashKind kind, uint32_t &k) + inline LIns* CseFilter::findLoad(LOpcode op, LIns* a, int32_t d, MiniAccSet miniAccSet, + LoadQual loadQual, uint32_t &k) { - (void)accSet; - const uint32_t bitmask = m_cap[kind] - 1; - k = hashLoad(op, a, d, accSet) & bitmask; + CseAcc cseAcc = miniAccSetToCseAcc(miniAccSet, loadQual); + const uint32_t bitmask = m_capL[cseAcc] - 1; + k = hashLoad(op, a, d) & bitmask; uint32_t n = 1; while (true) { - LIns* ins = m_list[kind][k]; + LIns* ins = m_listL[cseAcc][k]; if (!ins) return NULL; - NanoAssert(ins->accSet() == accSet); if (ins->isop(op) && ins->oprnd1() == a && ins->disp() == d) return ins; k = (k + n) & bitmask; @@ -2275,38 +2324,10 @@ namespace nanojit } } - uint32_t CseFilter::findLoadReadOnly(LIns* ins) + uint32_t CseFilter::findLoad(LIns* ins) { uint32_t k; - findLoad(ins->opcode(), ins->oprnd1(), ins->disp(), ins->accSet(), LInsLoadReadOnly, k); - return k; - } - - uint32_t CseFilter::findLoadStack(LIns* ins) - { - uint32_t k; - findLoad(ins->opcode(), ins->oprnd1(), ins->disp(), ins->accSet(), LInsLoadStack, k); - return k; - } - - uint32_t CseFilter::findLoadRStack(LIns* ins) - { - uint32_t k; - findLoad(ins->opcode(), ins->oprnd1(), ins->disp(), ins->accSet(), LInsLoadRStack, k); - return k; - } - - uint32_t CseFilter::findLoadOther(LIns* ins) - { - uint32_t k; - findLoad(ins->opcode(), ins->oprnd1(), ins->disp(), ins->accSet(), LInsLoadOther, k); - return k; - } - - uint32_t CseFilter::findLoadMultiple(LIns* ins) - { - uint32_t k; - findLoad(ins->opcode(), ins->oprnd1(), ins->disp(), ins->accSet(), LInsLoadMultiple, k); + findLoad(ins->opcode(), ins->oprnd1(), ins->disp(), ins->miniAccSet(), ins->loadQual(), k); return k; } @@ -2320,12 +2341,12 @@ namespace nanojit inline 
LIns* CseFilter::findCall(const CallInfo *ci, uint32_t argc, LIns* args[], uint32_t &k) { - LInsHashKind kind = LInsCall; - const uint32_t bitmask = m_cap[kind] - 1; + NLKind nlkind = LInsCall; + const uint32_t bitmask = m_capNL[nlkind] - 1; k = hashCall(ci, argc, args) & bitmask; uint32_t n = 1; while (true) { - LIns* ins = m_list[kind][k]; + LIns* ins = m_listNL[nlkind][k]; if (!ins) return NULL; if (ins->isCall() && ins->callInfo() == ci && argsmatch(ins, argc, args)) @@ -2353,7 +2374,7 @@ namespace nanojit LIns* ins = findImmI(imm, k); if (!ins) { ins = out->insImmI(imm); - add(LInsImmI, ins, k); + addNL(LInsImmI, ins, k); } // We assume that downstream stages do not modify the instruction, so // that we can insert 'ins' into slot 'k'. Check this. @@ -2368,7 +2389,7 @@ namespace nanojit LIns* ins = findImmQ(q, k); if (!ins) { ins = out->insImmQ(q); - add(LInsImmQ, ins, k); + addNL(LInsImmQ, ins, k); } NanoAssert(ins->isop(LIR_immq) && ins->immQ() == q); return ins; @@ -2388,7 +2409,7 @@ namespace nanojit LIns* ins = findImmD(u.u64, k); if (!ins) { ins = out->insImmD(d); - add(LInsImmD, ins, k); + addNL(LInsImmD, ins, k); } NanoAssert(ins->isop(LIR_immd) && ins->immDasQ() == u.u64); return ins; @@ -2397,7 +2418,7 @@ namespace nanojit LIns* CseFilter::ins0(LOpcode op) { if (op == LIR_label) - clear(); + clearAll(); return out->ins0(op); } @@ -2409,7 +2430,7 @@ namespace nanojit ins = find1(op, a, k); if (!ins) { ins = out->ins1(op, a); - add(LIns1, ins, k); + addNL(LIns1, ins, k); } } else { ins = out->ins1(op, a); @@ -2426,7 +2447,7 @@ namespace nanojit ins = find2(op, a, b, k); if (!ins) { ins = out->ins2(op, a, b); - add(LIns2, ins, k); + addNL(LIns2, ins, k); } NanoAssert(ins->isop(op) && ins->oprnd1() == a && ins->oprnd2() == b); return ins; @@ -2439,51 +2460,56 @@ namespace nanojit LIns* ins = find3(op, a, b, c, k); if (!ins) { ins = out->ins3(op, a, b, c); - add(LIns3, ins, k); + addNL(LIns3, ins, k); } NanoAssert(ins->isop(op) && ins->oprnd1() == a && ins->oprnd2() == b && ins->oprnd3() == c); return ins; } - LIns* CseFilter::insLoad(LOpcode op, LIns* base, int32_t disp, AccSet loadAccSet) + LIns* CseFilter::insLoad(LOpcode op, LIns* base, int32_t disp, AccSet accSet, LoadQual loadQual) { LIns* ins; if (isS16(disp)) { - // Clear all loads aliased by stores and calls since the last time - // we were in this function. - if (storesSinceLastLoad != ACC_NONE) { - NanoAssert(!(storesSinceLastLoad & ACC_READONLY)); // can't store to READONLY - if (storesSinceLastLoad & ACC_STACK) { clear(LInsLoadStack); } - if (storesSinceLastLoad & ACC_RSTACK) { clear(LInsLoadRStack); } - if (storesSinceLastLoad & ACC_OTHER) { clear(LInsLoadOther); } - // Loads marked with multiple access regions must be treated - // conservatively -- we always clear all of them. - clear(LInsLoadMultiple); - storesSinceLastLoad = ACC_NONE; + if (storesSinceLastLoad != ACCSET_NONE) { + // Clear all normal (excludes CONST and MULTIPLE) loads + // aliased by stores and calls since the last time we were in + // this function. + for (CseAcc a = 0; a < EMB_NUM_USED_ACCS; a++) + if (storesSinceLastLoad & (1 << a)) + clearL(a); + + // No need to clear CONST loads (those in the CSE_ACC_CONST table). + + // Multi-region loads must be treated conservatively -- we + // always clear all of them. 
+ clearL(CSE_ACC_MULTIPLE); + + storesSinceLastLoad = ACCSET_NONE; } - LInsHashKind kind; - switch (loadAccSet) { - case ACC_READONLY: kind = LInsLoadReadOnly; break; - case ACC_STACK: kind = LInsLoadStack; break; - case ACC_RSTACK: kind = LInsLoadRStack; break; - case ACC_OTHER: kind = LInsLoadOther; break; - default: kind = LInsLoadMultiple; break; + if (loadQual == LOAD_VOLATILE) { + // Volatile loads are never CSE'd, don't bother looking for + // them or inserting them in the table. + ins = out->insLoad(op, base, disp, accSet, loadQual); + } else { + uint32_t k; + ins = findLoad(op, base, disp, compressAccSet(accSet), loadQual, k); + if (!ins) { + ins = out->insLoad(op, base, disp, accSet, loadQual); + addL(ins, k); + } } - - uint32_t k; - ins = findLoad(op, base, disp, loadAccSet, kind, k); - if (!ins) { - ins = out->insLoad(op, base, disp, loadAccSet); - add(kind, ins, k); - } - NanoAssert(ins->isop(op) && ins->oprnd1() == base && ins->disp() == disp); + // Nb: must compare miniAccSets, not AccSets, because the AccSet + // stored in the load may have lost info if it's multi-region. + NanoAssert(ins->isop(op) && ins->oprnd1() == base && ins->disp() == disp && + ins->miniAccSet().val == compressAccSet(accSet).val && + ins->loadQual() == loadQual); } else { // If the displacement is more than 16 bits, put it in a separate // instruction. Nb: LirBufWriter also does this, we do it here // too because CseFilter relies on LirBufWriter not changing code. - ins = insLoad(op, ins2(LIR_addp, base, insImmWord(disp)), 0, loadAccSet); + ins = insLoad(op, ins2(LIR_addp, base, insImmWord(disp)), 0, accSet, loadQual); } return ins; } @@ -2531,7 +2557,7 @@ namespace nanojit ins = find1(op, c, k); if (!ins) { ins = out->insGuard(op, c, gr); - add(LIns1, ins, k); + addNL(LIns1, ins, k); } } else { ins = out->insGuard(op, c, gr); @@ -2549,7 +2575,7 @@ namespace nanojit LIns* ins = find2(op, a, b, k); if (!ins) { ins = out->insGuardXov(op, a, b, gr); - add(LIns2, ins, k); + addNL(LIns2, ins, k); } NanoAssert(ins->isop(op) && ins->oprnd1() == a && ins->oprnd2() == b); return ins; @@ -2562,12 +2588,12 @@ namespace nanojit LIns* ins; uint32_t argc = ci->count_args(); if (ci->_isPure) { - NanoAssert(ci->_storeAccSet == ACC_NONE); + NanoAssert(ci->_storeAccSet == ACCSET_NONE); uint32_t k; ins = findCall(ci, argc, args, k); if (!ins) { ins = out->insCall(ci, args); - add(LInsCall, ins, k); + addNL(LInsCall, ins, k); } } else { // We only need to worry about aliasing if !ci->_isPure. 
@@ -2601,7 +2627,7 @@ namespace nanojit #define SF_CALLINFO(name, typesig) \ static const CallInfo name##_ci = \ - { (intptr_t)&name, typesig, ABI_FASTCALL, /*isPure*/1, ACC_NONE verbose_only(, #name) } + { (intptr_t)&name, typesig, ABI_FASTCALL, /*isPure*/1, ACCSET_NONE verbose_only(, #name) } SF_CALLINFO(i2d, SIG_D_I); SF_CALLINFO(ui2d, SIG_D_UI); @@ -2817,6 +2843,13 @@ namespace nanojit whereInPipeline, what, printer->formatAccSet(&b, accSet), shouldDesc); } + void ValidateWriter::errorLoadQual(const char* what, LoadQual loadQual) + { + NanoAssertMsgf(0, + "LIR LoadQual error (%s): '%s' loadQual is '%d'", + whereInPipeline, what, loadQual); + } + void ValidateWriter::checkLInsIsACondOrConst(LOpcode op, int argN, LIns* ins) { // We could introduce a LTy_B32 type in the type system but that's a @@ -2839,60 +2872,26 @@ namespace nanojit errorStructureShouldBe(op, "argument", argN, ins, lirNames[op2]); } - void ValidateWriter::checkAccSet(LOpcode op, LIns* base, AccSet accSet, AccSet maxAccSet) - { - if (accSet == ACC_NONE) - errorAccSet(lirNames[op], accSet, "it should not equal ACC_NONE"); - - if (accSet & ~maxAccSet) - errorAccSet(lirNames[op], accSet, - "it should not contain bits that aren't in ACC_LOAD_ANY/ACC_STORE_ANY"); - - // Some sanity checking, which is based on the following assumptions: - // - STACK ones should use 'sp' or 'sp+k' as the base. (We could look - // for more complex patterns, but that feels dangerous. Better to - // keep it really simple.) - // - RSTACK ones should use 'rp' as the base. - // - READONLY/OTHER ones should not use 'sp'/'sp+k' or 'rp' as the base. - // - // Things that aren't checked: - // - There's no easy way to check if READONLY ones really are read-only. - - bool isStack = base == sp || - (base->isop(LIR_addp) && base->oprnd1() == sp && base->oprnd2()->isImmP()); - bool isRStack = base == rp; - - switch (accSet) { - case ACC_STACK: - if (!isStack) - errorAccSet(lirNames[op], accSet, "but it's not a stack access"); - break; - - case ACC_RSTACK: - if (!isRStack) - errorAccSet(lirNames[op], accSet, "but it's not an rstack access"); - break; - - case ACC_READONLY: - case ACC_OTHER: - if (isStack) - errorAccSet(lirNames[op], accSet, "but it's a stack access"); - if (isRStack) - errorAccSet(lirNames[op], accSet, "but it's an rstack access"); - break; - - default: - break; - } - } - ValidateWriter::ValidateWriter(LirWriter *out, LInsPrinter* printer, const char* where) - : LirWriter(out), printer(printer), whereInPipeline(where), sp(0), rp(0) + : LirWriter(out), printer(printer), whereInPipeline(where), + checkAccSetIns1(0), checkAccSetIns2(0) {} - LIns* ValidateWriter::insLoad(LOpcode op, LIns* base, int32_t d, AccSet accSet) + LIns* ValidateWriter::insLoad(LOpcode op, LIns* base, int32_t d, AccSet accSet, + LoadQual loadQual) { - checkAccSet(op, base, accSet, ACC_LOAD_ANY); + checkAccSet(op, base, accSet); + + switch (loadQual) { + case LOAD_CONST: + case LOAD_NORMAL: + case LOAD_VOLATILE: + break; + default: + errorLoadQual(lirNames[op], loadQual); + break; + } + int nArgs = 1; LTy formals[1] = { LTy_P }; @@ -2914,12 +2913,12 @@ namespace nanojit typeCheckArgs(op, nArgs, formals, args); - return out->insLoad(op, base, d, accSet); + return out->insLoad(op, base, d, accSet, loadQual); } LIns* ValidateWriter::insStore(LOpcode op, LIns* value, LIns* base, int32_t d, AccSet accSet) { - checkAccSet(op, base, accSet, ACC_STORE_ANY); + checkAccSet(op, base, accSet); int nArgs = 2; LTy formals[2] = { LTy_V, LTy_P }; // LTy_V is overwritten shortly @@ -3189,12 
+3188,8 @@ namespace nanojit LOpcode op = getCallOpcode(ci); - if (ci->_isPure && ci->_storeAccSet != ACC_NONE) - errorAccSet(ci->_name, ci->_storeAccSet, "it should be ACC_NONE for pure functions"); - - if (ci->_storeAccSet & ~ACC_STORE_ANY) - errorAccSet(lirNames[op], ci->_storeAccSet, - "it should not contain bits that aren't in ACC_STORE_ANY"); + if (ci->_isPure && ci->_storeAccSet != ACCSET_NONE) + errorAccSet(ci->_name, ci->_storeAccSet, "it should be ACCSET_NONE for pure functions"); // This loop iterates over the args from right-to-left (because arg() // and getArgTypes() use right-to-left order), but puts the results diff --git a/js/src/nanojit/LIR.h b/js/src/nanojit/LIR.h index cc6ff89dbf2..f5c66d8ff19 100644 --- a/js/src/nanojit/LIR.h +++ b/js/src/nanojit/LIR.h @@ -199,11 +199,12 @@ namespace nanojit // Access regions // -------------- // Doing alias analysis precisely is difficult. But it turns out that - // keeping track of aliasing at a very coarse level is enough to help with - // many optimisations. So we conceptually divide the memory that is - // accessible from LIR into a small number of "access regions". An access - // region may be non-contiguous. No two access regions can overlap. The - // union of all access regions covers all memory accessible from LIR. + // keeping track of aliasing at a coarse level is enough to help with many + // optimisations. So we conceptually divide the memory that is accessible + // from LIR into a small number of "access regions" (aka. "Acc"). An + // access region may be non-contiguous. No two access regions can + // overlap. The union of all access regions covers all memory accessible + // from LIR. // // In general a (static) load or store may be executed more than once, and // thus may access multiple regions; however, in practice almost all @@ -214,64 +215,28 @@ namespace nanojit // If two loads/stores/calls are known to not access the same region(s), // then they do not alias. // - // The access regions used are as follows: - // - // - READONLY: all memory that is read-only, ie. never stored to. - // A load from a READONLY region will never alias with any stores. - // - // - STACK: the stack. Stack loads/stores can usually be easily - // identified because they use SP as the base pointer. - // - // - RSTACK: the return stack. Return stack loads/stores can usually be - // easily identified because they use RP as the base pointer. - // - // - OTHER: all other regions of memory. - // - // It makes sense to add new access regions when doing so will help with - // one or more optimisations. - // - // One subtlety is that the meanings of the access region markings only - // apply to the LIR fragment that they are in. For example, if a memory - // location M is read-only in a particular LIR fragment, all loads - // involving M in that fragment can be safely marked READONLY, even if M - // is modified elsewhere. This is safe because the a LIR fragment is the - // unit of analysis in which the markings are used. In other words alias - // region markings are only used for intra-fragment optimisations. + // All regions are defined by the embedding. It makes sense to add new + // embedding-specific access regions when doing so will help with one or + // more optimisations. // // Access region sets and instruction markings // ------------------------------------------- - // The LIR generator must mark each load/store with an "access region - // set", which is a set of one or more access regions. 
This indicates - // which parts of LIR-accessible memory the load/store may touch. + // Each load/store is marked with an "access region set" (aka. "AccSet"), + // which is a set of one or more access regions. This indicates which + // parts of LIR-accessible memory the load/store may touch. // - // The LIR generator must also mark each function called from LIR with an - // access region set for memory stored to by the function. (We could also - // have a marking for memory loads, but there's no need at the moment.) - // These markings apply to the function itself, not the call site (ie. - // they're not context-sensitive). + // Each function called from LIR is also marked with an access region set + // for memory stored to by the function. (We could also have a marking + // for memory loads done by the function, but there's no need at the + // moment.) These markings apply to the function itself, not the call + // site, ie. they're not context-sensitive. // - // These load/store/call markings MUST BE ACCURATE -- if they are wrong - // then invalid optimisations might occur that change the meaning of the - // code. However, they can safely be imprecise (ie. conservative), in the - // following ways: - // - // - A load that accesses a READONLY region can be safely marked instead - // as loading from OTHER. In other words, it's safe to underestimate - // the size of the READONLY region. (This would also apply to the load - // set of a function, if we recorded that.) - // - // - A load/store can safely be marked as accessing regions that it - // doesn't, so long as the regions it does access are also included (one - // exception: marking a store with READONLY is nonsense and will cause - // assertions). - // - // In other words, a load/store can be marked with an access region set - // that is a superset of its actual access region set. Taking this to - // its logical conclusion, any load can be safely marked with LOAD_ANY and - // any store can be safely marked with with STORE_ANY (and the latter is - // true for the store set of a function.) - // - // Such imprecision is safe but may reduce optimisation opportunities. + // These load/store/call markings MUST BE ACCURATE -- if not then invalid + // optimisations might occur that change the meaning of the code. + // However, they can safely be imprecise (ie. conservative), ie. a + // load/store/call can be marked with an access region set that is a + // superset of the actual access region set. Such imprecision is safe but + // may reduce optimisation opportunities. // // Optimisations that use access region info // ----------------------------------------- @@ -282,35 +247,100 @@ namespace nanojit // load with a single access region, you might as well use ACC_LOAD_ANY. //----------------------------------------------------------------------- - // An access region set is represented as a bitset. Nb: this restricts us - // to at most eight alias regions for the moment. - typedef uint8_t AccSet; + // An access region set is represented as a bitset. Using a uint32_t + // restricts us to at most 32 alias regions for the moment. This could be + // expanded to a uint64_t easily if needed. + typedef uint32_t AccSet; + static const int NUM_ACCS = sizeof(AccSet) * 8; - // The access regions. Note that because of the bitset representation - // these constants are also valid (singleton) AccSet values. If you add - // new ones please update ACC_ALL_STORABLE and formatAccSet() and - // CseFilter. 
- // - static const AccSet ACC_READONLY = 1 << 0; // 0000_0001b - static const AccSet ACC_STACK = 1 << 1; // 0000_0010b - static const AccSet ACC_RSTACK = 1 << 2; // 0000_0100b - static const AccSet ACC_OTHER = 1 << 3; // 0000_1000b - - // Some common (non-singleton) access region sets. ACC_NONE does not make + // Some common (non-singleton) access region sets. ACCSET_NONE does not make // sense for loads or stores (which must access at least one region), it // only makes sense for calls. // - // A convention that's worth using: use ACC_LOAD_ANY/ACC_STORE_ANY for - // cases that you're unsure about or haven't considered carefully. Use - // ACC_ALL/ACC_ALL_STORABLE for cases that you have considered carefully. - // That way it's easy to tell which ones have been considered and which - // haven't. - static const AccSet ACC_NONE = 0x0; - static const AccSet ACC_ALL_STORABLE = ACC_STACK | ACC_RSTACK | ACC_OTHER; - static const AccSet ACC_ALL = ACC_READONLY | ACC_ALL_STORABLE; - static const AccSet ACC_LOAD_ANY = ACC_ALL; // synonym - static const AccSet ACC_STORE_ANY = ACC_ALL_STORABLE; // synonym + static const AccSet ACCSET_NONE = 0x0; + static const AccSet ACCSET_ALL = 0xffffffff; + static const AccSet ACCSET_LOAD_ANY = ACCSET_ALL; // synonym + static const AccSet ACCSET_STORE_ANY = ACCSET_ALL; // synonym + // Full AccSets don't fit into load and store instructions. But + // load/store AccSets almost always contain a single access region. We + // take advantage of this to create a compressed AccSet, MiniAccSet, that + // does fit. + // + // The 32 single-region AccSets get compressed into a number in the range + // 0..31 (according to the position of the set bit), and all other + // (multi-region) AccSets get converted into MINI_ACCSET_MULTIPLE. So the + // representation is lossy in the latter case, but that case is rare for + // loads/stores. We use a full AccSet for the storeAccSets of calls, for + // which multi-region AccSets are common. + // + // We wrap the uint8_t inside a struct to avoid the possiblity of subtle + // bugs caused by mixing up AccSet and MiniAccSet, which is easy to do. + // However, the struct gets padded inside LInsLd in an inconsistent way on + // Windows, so we actually store a MiniAccSetVal inside LInsLd. Sigh. + // But we use MiniAccSet everywhere else. + // + typedef uint8_t MiniAccSetVal; + struct MiniAccSet { MiniAccSetVal val; }; + static const MiniAccSet MINI_ACCSET_MULTIPLE = { 255 }; + + static MiniAccSet compressAccSet(AccSet accSet) { + // As the number of regions increase, this may become a bottleneck. + // If it does we can first count the number of bits using Kernighan's + // technique + // (http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetKernighan) + // and if it's a single-region set, use a bit-scanning instruction to + // work out which single-region set it is. That would require + // factoring out the bit-scanning code currently in + // nRegisterAllocFromSet(). + // + // Try all the single-region AccSets first. + for (int i = 0; i < NUM_ACCS; i++) { + if (accSet == (1U << i)) { + MiniAccSet ret = { uint8_t(i) }; + return ret; + } + } + // If we got here, it must be a multi-region AccSet. + return MINI_ACCSET_MULTIPLE; + } + + static AccSet decompressMiniAccSet(MiniAccSet miniAccSet) { + return (miniAccSet.val == MINI_ACCSET_MULTIPLE.val) ? 
ACCSET_ALL : (1 << miniAccSet.val); + } + + // The LoadQual affects how a load can be optimised: + // + // - CONST: These loads are guaranteed to always return the same value + // during a single execution of a fragment (but the value is allowed to + // change between executions of the fragment). This means that the + // location is never stored to by the LIR, and is never modified by an + // external entity while the fragment is running. + // + // - NORMAL: These loads may be stored to by the LIR, but are never + // modified by an external entity while the fragment is running. + // + // - VOLATILE: These loads may be stored to by the LIR, and may be + // modified by an external entity while the fragment is running. + // + // This gives a lattice with the ordering: CONST < NORMAL < VOLATILE. + // As usual, it's safe to mark a load with a value higher (less precise) + // that actual, but it may result in fewer optimisations occurring. + // + // Generally CONST loads are highly amenable to optimisation (eg. CSE), + // VOLATILE loads are entirely unoptimisable, and NORMAL loads are in + // between and require some alias analysis to optimise. + // + // Note that CONST has a stronger meaning to "const" in C and C++; in C + // and C++ a "const" variable may be modified by an external entity, such + // as hardware. Hence "const volatile" makes sense in C and C++, but + // CONST+VOLATILE doesn't make sense in LIR. + // + enum LoadQual { + LOAD_CONST, + LOAD_NORMAL, + LOAD_VOLATILE + }; struct CallInfo { @@ -687,7 +717,7 @@ namespace nanojit inline void initLInsOp1(LOpcode opcode, LIns* oprnd1); inline void initLInsOp2(LOpcode opcode, LIns* oprnd1, LIns* oprnd2); inline void initLInsOp3(LOpcode opcode, LIns* oprnd1, LIns* oprnd2, LIns* oprnd3); - inline void initLInsLd(LOpcode opcode, LIns* val, int32_t d, AccSet accSet); + inline void initLInsLd(LOpcode opcode, LIns* val, int32_t d, AccSet accSet, LoadQual loadQual); inline void initLInsSt(LOpcode opcode, LIns* val, LIns* base, int32_t d, AccSet accSet); inline void initLInsSk(LIns* prevLIns); // Nb: args[] must be allocated and initialised before being passed in; @@ -790,8 +820,12 @@ namespace nanojit // For guards. inline GuardRecord* record() const; + // For loads. + inline LoadQual loadQual() const; + // For loads/stores. inline int32_t disp() const; + inline MiniAccSet miniAccSet() const; inline AccSet accSet() const; // For LInsSk. @@ -1085,7 +1119,8 @@ namespace nanojit // could go to 24 bits but then it would happen so rarely that the // handler code would be difficult to test and thus untrustworthy. 
int16_t disp; - AccSet accSet; + MiniAccSetVal miniAccSetVal; // not 'MiniAccSet' due to Windows padding; see above + LoadQual loadQual:2; LIns* oprnd_1; @@ -1102,7 +1137,7 @@ namespace nanojit friend class LIns; int16_t disp; - AccSet accSet; + MiniAccSetVal miniAccSetVal; LIns* oprnd_2; @@ -1251,12 +1286,13 @@ namespace nanojit toLInsOp3()->oprnd_3 = oprnd3; NanoAssert(isLInsOp3()); } - void LIns::initLInsLd(LOpcode opcode, LIns* val, int32_t d, AccSet accSet) { + void LIns::initLInsLd(LOpcode opcode, LIns* val, int32_t d, AccSet accSet, LoadQual loadQual) { initSharedFields(opcode); toLInsLd()->oprnd_1 = val; NanoAssert(d == int16_t(d)); toLInsLd()->disp = int16_t(d); - toLInsLd()->accSet = accSet; + toLInsLd()->miniAccSetVal = compressAccSet(accSet).val; + toLInsLd()->loadQual = loadQual; NanoAssert(isLInsLd()); } void LIns::initLInsSt(LOpcode opcode, LIns* val, LIns* base, int32_t d, AccSet accSet) { @@ -1265,7 +1301,7 @@ namespace nanojit toLInsSt()->oprnd_2 = base; NanoAssert(d == int16_t(d)); toLInsSt()->disp = int16_t(d); - toLInsSt()->accSet = accSet; + toLInsSt()->miniAccSetVal = compressAccSet(accSet).val; NanoAssert(isLInsSt()); } void LIns::initLInsSk(LIns* prevLIns) { @@ -1369,6 +1405,11 @@ namespace nanojit } } + LoadQual LIns::loadQual() const { + NanoAssert(isLInsLd()); + return toLInsLd()->loadQual; + } + int32_t LIns::disp() const { if (isLInsSt()) { return toLInsSt()->disp; @@ -1378,13 +1419,19 @@ namespace nanojit } } - AccSet LIns::accSet() const { + MiniAccSet LIns::miniAccSet() const { + MiniAccSet miniAccSet; if (isLInsSt()) { - return toLInsSt()->accSet; + miniAccSet.val = toLInsSt()->miniAccSetVal; } else { NanoAssert(isLInsLd()); - return toLInsLd()->accSet; + miniAccSet.val = toLInsLd()->miniAccSetVal; } + return miniAccSet; + } + + AccSet LIns::accSet() const { + return decompressMiniAccSet(miniAccSet()); } LIns* LIns::prevLIns() const { @@ -1510,8 +1557,8 @@ namespace nanojit virtual LIns* insImmD(double d) { return out->insImmD(d); } - virtual LIns* insLoad(LOpcode op, LIns* base, int32_t d, AccSet accSet) { - return out->insLoad(op, base, d, accSet); + virtual LIns* insLoad(LOpcode op, LIns* base, int32_t d, AccSet accSet, LoadQual loadQual) { + return out->insLoad(op, base, d, accSet, loadQual); } virtual LIns* insStore(LOpcode op, LIns* value, LIns* base, int32_t d, AccSet accSet) { return out->insStore(op, value, base, d, accSet); @@ -1584,6 +1631,11 @@ namespace nanojit #endif } + // Do a load with LoadQual==LOAD_NORMAL. + LIns* insLoad(LOpcode op, LIns* base, int32_t d, AccSet accSet) { + return insLoad(op, base, d, accSet, LOAD_NORMAL); + } + // Chooses LIR_sti, LIR_stq or LIR_std according to the type of 'value'. 
LIns* insStore(LIns* value, LIns* base, int32_t d, AccSet accSet); }; @@ -1680,16 +1732,19 @@ namespace nanojit { private: Allocator& alloc; + const int EMB_NUM_USED_ACCS; char *formatImmI(RefBuf* buf, int32_t c); char *formatImmQ(RefBuf* buf, uint64_t c); char *formatImmD(RefBuf* buf, double c); - void formatGuard(InsBuf* buf, LIns* ins); - void formatGuardXov(InsBuf* buf, LIns* ins); + void formatGuard(InsBuf* buf, LIns* ins); // defined by the embedder + void formatGuardXov(InsBuf* buf, LIns* ins); // defined by the embedder + static const char* accNames[]; // defined by the embedder public: - LInsPrinter(Allocator& alloc) - : alloc(alloc) + + LInsPrinter(Allocator& alloc, int embNumUsedAccs) + : alloc(alloc), EMB_NUM_USED_ACCS(embNumUsedAccs) { addrNameMap = new (alloc) AddrNameMap(alloc); lirNameMap = new (alloc) LirNameMap(alloc); @@ -1790,8 +1845,8 @@ namespace nanojit LIns* insParam(int32_t i, int32_t kind) { return add(out->insParam(i, kind)); } - LIns* insLoad(LOpcode v, LIns* base, int32_t disp, AccSet accSet) { - return add(out->insLoad(v, base, disp, accSet)); + LIns* insLoad(LOpcode v, LIns* base, int32_t disp, AccSet accSet, LoadQual loadQual) { + return add(out->insLoad(v, base, disp, accSet, loadQual)); } LIns* insStore(LOpcode op, LIns* v, LIns* b, int32_t d, AccSet accSet) { return add(out->insStore(op, v, b, d, accSet)); @@ -1825,16 +1880,17 @@ namespace nanojit LIns* insGuardXov(LOpcode, LIns* a, LIns* b, GuardRecord *); LIns* insBranch(LOpcode, LIns* cond, LIns* target); LIns* insBranchJov(LOpcode, LIns* a, LIns* b, LIns* target); - LIns* insLoad(LOpcode op, LIns* base, int32_t off, AccSet accSet); + LIns* insLoad(LOpcode op, LIns* base, int32_t off, AccSet accSet, LoadQual loadQual); private: LIns* simplifyOverflowArith(LOpcode op, LIns** opnd1, LIns** opnd2); }; class CseFilter: public LirWriter { - enum LInsHashKind { + enum NLKind { // We divide instruction kinds into groups. LIns0 isn't present - // because we don't need to record any 0-ary instructions. + // because we don't need to record any 0-ary instructions. Loads + // aren't here, they're handled separately. LInsImmI = 0, LInsImmQ = 1, // only occurs on 64-bit platforms LInsImmD = 2, @@ -1843,46 +1899,69 @@ namespace nanojit LIns3 = 5, LInsCall = 6, - // Loads are special. We group them by access region: one table for - // each region, and then a catch-all table for any loads marked with - // multiple regions. This arrangement makes the removal of - // invalidated loads fast -- eg. we can invalidate all STACK loads by - // just clearing the LInsLoadStack table. The disadvantage is that - // loads marked with multiple regions must be invalidated - // conservatively, eg. if any intervening stores occur. But loads - // marked with multiple regions should be rare. - LInsLoadReadOnly = 7, - LInsLoadStack = 8, - LInsLoadRStack = 9, - LInsLoadOther = 10, - LInsLoadMultiple = 11, - LInsFirst = 0, - LInsLast = 11, + LInsLast = 6, // Need a value after "last" to outsmart compilers that insist last+1 is impossible. - LInsInvalid = 12 + LInsInvalid = 7 }; - #define nextKind(kind) LInsHashKind(kind+1) + #define nextNLKind(kind) NLKind(kind+1) - // There is one list for each instruction kind. This lets us size the - // lists appropriately (some instructions are more common than others). - // It also lets us have kind-specific find/add/grow functions, which + // There is one table for each NLKind. This lets us size the lists + // appropriately (some instruction kinds are more common than others). 
+ // It also lets us have NLKind-specific find/add/grow functions, which // are faster than generic versions. // - // Nb: Size must be a power of 2. - // Don't start too small, or we'll waste time growing and rehashing. - // Don't start too large, will waste memory. + // Nb: m_listNL and m_capNL sizes must be a power of 2. + // Don't start m_capNL too small, or we'll waste time growing and rehashing. + // Don't start m_capNL too large, will waste memory. // - LIns** m_list[LInsLast + 1]; - uint32_t m_cap[LInsLast + 1]; - uint32_t m_used[LInsLast + 1]; + LIns** m_listNL[LInsLast + 1]; + uint32_t m_capNL[ LInsLast + 1]; + uint32_t m_usedNL[LInsLast + 1]; typedef uint32_t (CseFilter::*find_t)(LIns*); - find_t m_find[LInsLast + 1]; + find_t m_findNL[LInsLast + 1]; + + // Similarly, for loads, there is one table for each CseAcc. A CseAcc + // is like a normal access region, but there are two extra possible + // values: CSE_ACC_CONST, which is where we put all CONST-qualified + // loads, and CSE_ACC_MULTIPLE, where we put all multi-region loads. + // All remaining loads are single-region and go in the table entry for + // their region. + // + // This arrangement makes the removal of invalidated loads fast -- we + // can invalidate all loads from a single region by clearing that + // region's table. + // + typedef uint8_t CseAcc; // same type as MiniAccSet + + static const uint8_t CSE_NUM_ACCS = NUM_ACCS + 2; + + // These values would be 'static const' except they are defined in + // terms of EMB_NUM_USED_ACCS which is itself not 'static const' + // because it's passed in by the embedding. + const uint8_t EMB_NUM_USED_ACCS; // number of access regions used by the embedding + const uint8_t CSE_NUM_USED_ACCS; // EMB_NUM_USED_ACCS + 2 + const CseAcc CSE_ACC_CONST; // EMB_NUM_USED_ACCS + 0 + const CseAcc CSE_ACC_MULTIPLE; // EMB_NUM_USED_ACCS + 1 + + // We will only use CSE_NUM_USED_ACCS of these entries, ie. the + // number of lists allocated depends on the number of access regions + // in use by the embedding. + LIns** m_listL[CSE_NUM_ACCS]; + uint32_t m_capL[ CSE_NUM_ACCS]; + uint32_t m_usedL[CSE_NUM_ACCS]; AccSet storesSinceLastLoad; // regions stored to since the last load Allocator& alloc; + CseAcc miniAccSetToCseAcc(MiniAccSet miniAccSet, LoadQual loadQual) { + NanoAssert(miniAccSet.val < NUM_ACCS || miniAccSet.val == MINI_ACCSET_MULTIPLE.val); + return (loadQual == LOAD_CONST) ? CSE_ACC_CONST : + (miniAccSet.val == MINI_ACCSET_MULTIPLE.val) ? CSE_ACC_MULTIPLE : + miniAccSet.val; + } + static uint32_t hash8(uint32_t hash, const uint8_t data); static uint32_t hash32(uint32_t hash, const uint32_t data); static uint32_t hashptr(uint32_t hash, const void* data); @@ -1893,7 +1972,7 @@ namespace nanojit static uint32_t hash1(LOpcode op, LIns*); static uint32_t hash2(LOpcode op, LIns*, LIns*); static uint32_t hash3(LOpcode op, LIns*, LIns*, LIns*); - static uint32_t hashLoad(LOpcode op, LIns*, int32_t, AccSet); + static uint32_t hashLoad(LOpcode op, LIns*, int32_t); static uint32_t hashCall(const CallInfo *call, uint32_t argc, LIns* args[]); // These versions are used before an LIns has been created. 
@@ -1905,7 +1984,7 @@ namespace nanojit LIns* find1(LOpcode v, LIns* a, uint32_t &k); LIns* find2(LOpcode v, LIns* a, LIns* b, uint32_t &k); LIns* find3(LOpcode v, LIns* a, LIns* b, LIns* c, uint32_t &k); - LIns* findLoad(LOpcode v, LIns* a, int32_t b, AccSet accSet, LInsHashKind kind, + LIns* findLoad(LOpcode v, LIns* a, int32_t b, MiniAccSet miniAccSet, LoadQual loadQual, uint32_t &k); LIns* findCall(const CallInfo *call, uint32_t argc, LIns* args[], uint32_t &k); @@ -1921,22 +2000,21 @@ namespace nanojit uint32_t find2(LIns* ins); uint32_t find3(LIns* ins); uint32_t findCall(LIns* ins); - uint32_t findLoadReadOnly(LIns* ins); - uint32_t findLoadStack(LIns* ins); - uint32_t findLoadRStack(LIns* ins); - uint32_t findLoadOther(LIns* ins); - uint32_t findLoadMultiple(LIns* ins); + uint32_t findLoad(LIns* ins); - void grow(LInsHashKind kind); + void growNL(NLKind kind); + void growL(CseAcc cseAcc); // 'k' is the index found by findXYZ(). - void add(LInsHashKind kind, LIns* ins, uint32_t k); + void addNL(NLKind kind, LIns* ins, uint32_t k); + void addL(LIns* ins, uint32_t k); - void clear(); // clears all tables - void clear(LInsHashKind); // clears one table + void clearAll(); // clears all tables + void clearNL(NLKind); // clears one non-load table + void clearL(CseAcc); // clears one load table public: - CseFilter(LirWriter *out, Allocator&); + CseFilter(LirWriter *out, uint8_t embNumUsedAccs, Allocator&); LIns* insImmI(int32_t imm); #ifdef NANOJIT_64BIT @@ -1947,7 +2025,7 @@ namespace nanojit LIns* ins1(LOpcode v, LIns*); LIns* ins2(LOpcode v, LIns*, LIns*); LIns* ins3(LOpcode v, LIns*, LIns*, LIns*); - LIns* insLoad(LOpcode op, LIns* base, int32_t d, AccSet accSet); + LIns* insLoad(LOpcode op, LIns* base, int32_t d, AccSet accSet, LoadQual loadQual); LIns* insStore(LOpcode op, LIns* value, LIns* base, int32_t d, AccSet accSet); LIns* insCall(const CallInfo *call, LIns* args[]); LIns* insGuard(LOpcode op, LIns* cond, GuardRecord *gr); @@ -2006,7 +2084,7 @@ namespace nanojit } // LirWriter interface - LIns* insLoad(LOpcode op, LIns* base, int32_t disp, AccSet accSet); + LIns* insLoad(LOpcode op, LIns* base, int32_t disp, AccSet accSet, LoadQual loadQual); LIns* insStore(LOpcode op, LIns* o1, LIns* o2, int32_t disp, AccSet accSet); LIns* ins0(LOpcode op); LIns* ins1(LOpcode op, LIns* o1); @@ -2142,19 +2220,21 @@ namespace nanojit void errorStructureShouldBe(LOpcode op, const char* argDesc, int argN, LIns* arg, const char* shouldBeDesc); void errorAccSet(const char* what, AccSet accSet, const char* shouldDesc); + void errorLoadQual(const char* what, LoadQual loadQual); void checkLInsHasOpcode(LOpcode op, int argN, LIns* ins, LOpcode op2); void checkLInsIsACondOrConst(LOpcode op, int argN, LIns* ins); void checkLInsIsNull(LOpcode op, int argN, LIns* ins); - void checkAccSet(LOpcode op, LIns* base, AccSet accSet, AccSet maxAccSet); + void checkAccSet(LOpcode op, LIns* base, AccSet accSet); // defined by the embedder - LIns *sp, *rp; + // These can be set by the embedder and used in checkAccSet(). 
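+        // (e.g. an embedder that distinguishes stack and return-stack regions
+        // can stash its stack-pointer and return-stack-pointer instructions
+        // here, much as the old 'sp'/'rp' fields were used.)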
+ LIns *checkAccSetIns1, *checkAccSetIns2; public: ValidateWriter(LirWriter* out, LInsPrinter* printer, const char* where); - void setSp(LIns* ins) { sp = ins; } - void setRp(LIns* ins) { rp = ins; } + void setCheckAccSetIns1(LIns* ins) { checkAccSetIns1 = ins; } + void setCheckAccSetIns2(LIns* ins) { checkAccSetIns2 = ins; } - LIns* insLoad(LOpcode op, LIns* base, int32_t d, AccSet accSet); + LIns* insLoad(LOpcode op, LIns* base, int32_t d, AccSet accSet, LoadQual loadQual); LIns* insStore(LOpcode op, LIns* value, LIns* base, int32_t d, AccSet accSet); LIns* ins0(LOpcode v); LIns* ins1(LOpcode v, LIns* a);
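
For reference, below is a rough sketch of what a minimal embedding supplies after this change, modelled on the lirasm hunks above. The two-region split, the ACCSET_STK/ACCSET_OTHER/NUM_USED_ACCS names, emitExample() and its operands are all hypothetical; only the pieces it exercises -- the CseFilter constructor taking the number of used access regions, the embedder-defined ValidateWriter::checkAccSet(), and insLoad() taking an AccSet plus LoadQual (with a four-argument overload defaulting to LOAD_NORMAL) -- come from this patch, and the load/store opcodes are assumed to be the usual LIR_ldi/LIR_sti.

    #include "nanojit.h"   // or "nanojit/nanojit.h", depending on the embedding's include paths

    using namespace nanojit;

    // A hypothetical embedding with two access regions: its own stack area and
    // everything else.  Region bits must be allocated contiguously from bit 0,
    // because CseFilter (and LInsPrinter) are only told how many low bits are used.
    static const AccSet ACCSET_STK   = (1 << 0);
    static const AccSet ACCSET_OTHER = (1 << 1);
    static const uint8_t NUM_USED_ACCS = 2;

    #ifdef DEBUG
    void nanojit::ValidateWriter::checkAccSet(LOpcode op, LIns* base, AccSet accSet)
    {
        // Embedding-specific sanity check: every load/store must name exactly
        // one of the two regions defined above.
        (void)op; (void)base;
        NanoAssert(accSet == ACCSET_STK || accSet == ACCSET_OTHER);
    }
    #endif

    // Emit a few loads/stores through a CseFilter.  'out' is the rest of the
    // writer pipeline (eg. a LirBufWriter); 'stk' and 'obj' are pointer-typed LIns.
    static void emitExample(LirWriter* out, Allocator& alloc, LIns* stk, LIns* obj)
    {
        CseFilter cse(out, NUM_USED_ACCS, alloc);   // new: pass the number of used regions
        LirWriter* lir = &cse;

        // Loads now carry a LoadQual in addition to an AccSet; the four-argument
        // overload defaults to LOAD_NORMAL.
        LIns* slot  = lir->insLoad(LIR_ldi, stk, 8, ACCSET_STK);
        LIns* shape = lir->insLoad(LIR_ldi, obj, 0, ACCSET_OTHER, LOAD_CONST);    // CSE-able across stores
        LIns* flag  = lir->insLoad(LIR_ldi, obj, 4, ACCSET_OTHER, LOAD_VOLATILE); // never CSE'd

        // Stores (and calls) still take a full AccSet, with no LoadQual.
        lir->insStore(LIR_sti, slot, stk, 16, ACCSET_STK);
        (void)shape; (void)flag;

        // In verbose builds an LInsPrinter is constructed analogously, as
        // LInsPrinter(alloc, NUM_USED_ACCS), with LInsPrinter::accNames[]
        // defined as in the lirasm hunk above.
    }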