Bug 552812 - nanojit: factor out AccSet differences into TM and TR (take 2). r=edwsmith.

--HG--
extra : convert_revision : 7e11df137a4d30bd8411cf3b35cb26a003188f86
Nicholas Nethercote 2010-07-25 19:17:39 -07:00
parent 5272ac360b
commit e82972a6ac
3 changed files with 499 additions and 404 deletions

View File

@ -95,6 +95,10 @@ nanojit::StackFilter::getTop(LIns*)
return 0;
}
// We lump everything into a single access region for lirasm.
static const AccSet ACCSET_OTHER = (1 << 0);
static const uint8_t LIRASM_NUM_USED_ACCS = 1;
#if defined NJ_VERBOSE
void
nanojit::LInsPrinter::formatGuard(InsBuf *buf, LIns *ins)
@ -124,6 +128,22 @@ nanojit::LInsPrinter::formatGuardXov(InsBuf *buf, LIns *ins)
(long)x->line,
ins->record()->profGuardID);
}
const char*
nanojit::LInsPrinter::accNames[] = {
"o", // (1 << 0) == ACCSET_OTHER
"?", "?", "?", "?", "?", "?", "?", "?", "?", "?", // 1..10 (unused)
"?", "?", "?", "?", "?", "?", "?", "?", "?", "?", // 11..20 (unused)
"?", "?", "?", "?", "?", "?", "?", "?", "?", "?", // 21..30 (unused)
"?" // 31 (unused)
};
#endif
#ifdef DEBUG
void ValidateWriter::checkAccSet(LOpcode op, LIns* base, AccSet accSet)
{
NanoAssert(accSet == ACCSET_OTHER);
}
#endif
typedef int32_t (FASTCALL *RetInt)();
@ -148,7 +168,7 @@ enum ReturnType {
#endif
#define CI(name, args) \
{(uintptr_t) (&name), args, nanojit::ABI_CDECL, /*isPure*/0, ACC_STORE_ANY \
{(uintptr_t) (&name), args, nanojit::ABI_CDECL, /*isPure*/0, ACCSET_STORE_ANY \
DEBUG_ONLY_NAME(name)}
#define FN(name, args) \
@ -504,7 +524,7 @@ FragmentAssembler::FragmentAssembler(Lirasm &parent, const string &fragmentName,
}
#endif
if (optimize) {
mLir = mCseFilter = new CseFilter(mLir, mParent.mAlloc);
mLir = mCseFilter = new CseFilter(mLir, LIRASM_NUM_USED_ACCS, mParent.mAlloc);
}
#if NJ_SOFTFLOAT_SUPPORTED
if (avmplus::AvmCore::config.soft_float) {
@ -610,7 +630,7 @@ FragmentAssembler::assemble_load()
mTokens[1].find_first_of("0123456789") == 0) {
return mLir->insLoad(mOpcode,
ref(mTokens[0]),
immI(mTokens[1]), ACC_LOAD_ANY);
immI(mTokens[1]), ACCSET_OTHER);
}
bad("immediate offset required for load");
return NULL; // not reached
@ -1061,7 +1081,7 @@ FragmentAssembler::assembleFragment(LirTokenStream &in, bool implicitBegin, cons
need(3);
ins = mLir->insStore(mOpcode, ref(mTokens[0]),
ref(mTokens[1]),
immI(mTokens[2]), ACC_STORE_ANY);
immI(mTokens[2]), ACCSET_OTHER);
break;
#if NJ_EXPANDED_LOADSTORE_SUPPORTED
@ -1313,8 +1333,8 @@ const CallInfo ci_V_IQF = CI(f_V_IQF, CallInfo::typeSig3(ARGTYPE_V, ARGTYPE_I, A
// - LIR_modd (not implemented in NJ backends)
//
// Other limitations:
// - Loads always use accSet==ACC_LOAD_ANY
// - Stores always use accSet==ACC_STORE_ANY
// - Loads always use accSet==ACCSET_OTHER
// - Stores always use accSet==ACCSET_OTHER
//
void
FragmentAssembler::assembleRandomFragment(int nIns)
@ -1817,7 +1837,7 @@ FragmentAssembler::assembleRandomFragment(int nIns)
vector<LIns*> Ms = rnd(2) ? M4s : M8ps;
if (!Ms.empty()) {
LIns* base = rndPick(Ms);
ins = mLir->insLoad(rndPick(I_loads), base, rndOffset32(base->size()), ACC_LOAD_ANY);
ins = mLir->insLoad(rndPick(I_loads), base, rndOffset32(base->size()), ACCSET_OTHER);
addOrReplace(Is, ins);
n++;
}
@ -1828,7 +1848,7 @@ FragmentAssembler::assembleRandomFragment(int nIns)
case LLD_Q:
if (!M8ps.empty()) {
LIns* base = rndPick(M8ps);
ins = mLir->insLoad(rndPick(Q_loads), base, rndOffset64(base->size()), ACC_LOAD_ANY);
ins = mLir->insLoad(rndPick(Q_loads), base, rndOffset64(base->size()), ACCSET_OTHER);
addOrReplace(Qs, ins);
n++;
}
@ -1838,7 +1858,7 @@ FragmentAssembler::assembleRandomFragment(int nIns)
case LLD_D:
if (!M8ps.empty()) {
LIns* base = rndPick(M8ps);
ins = mLir->insLoad(rndPick(D_loads), base, rndOffset64(base->size()), ACC_LOAD_ANY);
ins = mLir->insLoad(rndPick(D_loads), base, rndOffset64(base->size()), ACCSET_OTHER);
addOrReplace(Ds, ins);
n++;
}
@ -1848,7 +1868,7 @@ FragmentAssembler::assembleRandomFragment(int nIns)
vector<LIns*> Ms = rnd(2) ? M4s : M8ps;
if (!Ms.empty() && !Is.empty()) {
LIns* base = rndPick(Ms);
mLir->insStore(rndPick(Is), base, rndOffset32(base->size()), ACC_STORE_ANY);
mLir->insStore(rndPick(Is), base, rndOffset32(base->size()), ACCSET_OTHER);
n++;
}
break;
@ -1858,7 +1878,7 @@ FragmentAssembler::assembleRandomFragment(int nIns)
case LST_Q:
if (!M8ps.empty() && !Qs.empty()) {
LIns* base = rndPick(M8ps);
mLir->insStore(rndPick(Qs), base, rndOffset64(base->size()), ACC_STORE_ANY);
mLir->insStore(rndPick(Qs), base, rndOffset64(base->size()), ACCSET_OTHER);
n++;
}
break;
@ -1867,7 +1887,7 @@ FragmentAssembler::assembleRandomFragment(int nIns)
case LST_D:
if (!M8ps.empty() && !Ds.empty()) {
LIns* base = rndPick(M8ps);
mLir->insStore(rndPick(Ds), base, rndOffset64(base->size()), ACC_STORE_ANY);
mLir->insStore(rndPick(Ds), base, rndOffset64(base->size()), ACCSET_OTHER);
n++;
}
break;
@ -1977,7 +1997,7 @@ Lirasm::Lirasm(bool verbose) :
#ifdef DEBUG
if (mVerbose) {
mLogc.lcbits = LC_ReadLIR | LC_AfterDCE | LC_Native | LC_RegAlloc | LC_Activation;
mLirbuf->printer = new (mAlloc) LInsPrinter(mAlloc);
mLirbuf->printer = new (mAlloc) LInsPrinter(mAlloc, LIRASM_NUM_USED_ACCS);
}
#endif
@ -2016,13 +2036,13 @@ Lirasm::lookupFunction(const string &name, CallInfo *&ci)
// The ABI, arg types and ret type will be overridden by the caller.
if (func->second.mReturnType == RT_FLOAT) {
CallInfo target = {(uintptr_t) func->second.rfloat,
0, ABI_FASTCALL, /*isPure*/0, ACC_STORE_ANY
0, ABI_FASTCALL, /*isPure*/0, ACCSET_STORE_ANY
verbose_only(, func->first.c_str()) };
*ci = target;
} else {
CallInfo target = {(uintptr_t) func->second.rint,
0, ABI_FASTCALL, /*isPure*/0, ACC_STORE_ANY
0, ABI_FASTCALL, /*isPure*/0, ACCSET_STORE_ANY
verbose_only(, func->first.c_str()) };
*ci = target;
}

View File

@ -294,18 +294,18 @@ namespace nanojit
return ins;
}
LIns* LirBufWriter::insLoad(LOpcode op, LIns* base, int32_t d, AccSet accSet)
LIns* LirBufWriter::insLoad(LOpcode op, LIns* base, int32_t d, AccSet accSet, LoadQual loadQual)
{
if (isS16(d)) {
LInsLd* insLd = (LInsLd*)_buf->makeRoom(sizeof(LInsLd));
LIns* ins = insLd->getLIns();
ins->initLInsLd(op, base, d, accSet);
ins->initLInsLd(op, base, d, accSet, loadQual);
return ins;
} else {
// If the displacement is more than 16 bits, put it in a separate instruction.
// Note that CseFilter::insLoad() also does this, so this will
// only occur if CseFilter has been removed from the pipeline.
return insLoad(op, ins2(LIR_addp, base, insImmWord(d)), 0, accSet);
return insLoad(op, ins2(LIR_addp, base, insImmWord(d)), 0, accSet, loadQual);
}
}
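// Illustrative sketch, not part of this patch ('writer' and 'base' are
// hypothetical; ACCSET_OTHER is the single region lirasm defines above): a
// load whose displacement does not fit in 16 bits is emitted as an address
// computation plus a zero-displacement load.
static LIns* bigDispLoadSketch(LirWriter* writer, LIns* base)
{
    // Conceptually requested: writer->insLoad(LIR_ldi, base, 0x12345, ACCSET_OTHER, LOAD_NORMAL)
    LIns* addr = writer->ins2(LIR_addp, base, writer->insImmWord(0x12345));
    return writer->insLoad(LIR_ldi, addr, 0, ACCSET_OTHER, LOAD_NORMAL);
}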
@ -1102,7 +1102,7 @@ namespace nanojit
return out->insBranchJov(op, oprnd1, oprnd2, target);
}
LIns* ExprFilter::insLoad(LOpcode op, LIns* base, int32_t off, AccSet accSet) {
LIns* ExprFilter::insLoad(LOpcode op, LIns* base, int32_t off, AccSet accSet, LoadQual loadQual) {
if (base->isImmP() && !isS8(off)) {
// if the effective address is constant, then transform:
// ld const[bigconst] => ld (const+bigconst)[0]
@ -1110,9 +1110,9 @@ namespace nanojit
// under the assumption that we're more likely to CSE-match the
// constant base address if we don't const-fold small offsets.
uintptr_t p = (uintptr_t)base->immP() + off;
return out->insLoad(op, insImmP((void*)p), 0, accSet);
return out->insLoad(op, insImmP((void*)p), 0, accSet, loadQual);
}
return out->insLoad(op, base, off, accSet);
return out->insLoad(op, base, off, accSet, loadQual);
}
LIns* LirWriter::insStore(LIns* value, LIns* base, int32_t d, AccSet accSet)
@ -1602,19 +1602,27 @@ namespace nanojit
return e ? e->name : NULL;
}
char* LInsPrinter::formatAccSet(RefBuf* buf, AccSet accSet) {
int i = 0;
// 'c' is short for "const", because 'r' is used for RSTACK.
if (accSet & ACC_READONLY) { buf->buf[i++] = 'c'; accSet &= ~ACC_READONLY; }
if (accSet & ACC_STACK) { buf->buf[i++] = 's'; accSet &= ~ACC_STACK; }
if (accSet & ACC_RSTACK) { buf->buf[i++] = 'r'; accSet &= ~ACC_RSTACK; }
if (accSet & ACC_OTHER) { buf->buf[i++] = 'o'; accSet &= ~ACC_OTHER; }
// This assertion will fail if we add a new accSet value but
// forget to handle it here.
NanoAssert(accSet == 0);
buf->buf[i] = 0;
NanoAssert(size_t(i) < buf->len);
if (accSet == ACCSET_NONE) {
VMPI_sprintf(buf->buf, ".none");
} else if (accSet == ACCSET_ALL) {
VMPI_sprintf(buf->buf, ".all");
} else {
char* b = buf->buf;
b[0] = 0;
// The AccSet may contain bits set for regions not used by the
// embedding, if any have been specified via
// (ACCSET_ALL & ~ACCSET_XYZ). So only print those that are
// relevant.
for (int i = 0; i < EMB_NUM_USED_ACCS; i++) {
if (accSet & (1 << i)) {
VMPI_strcat(b, ".");
VMPI_strcat(b, accNames[i]);
accSet &= ~(1 << i);
}
}
NanoAssert(VMPI_strlen(b) < buf->len);
}
return buf->buf;
}
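// Illustrative sketch, not part of this patch: with lirasm's setup above
// (EMB_NUM_USED_ACCS == 1, accNames[0] == "o"), the formatter produces
//     formatAccSet(&buf, ACCSET_NONE)  -> ".none"
//     formatAccSet(&buf, ACCSET_ALL)   -> ".all"
//     formatAccSet(&buf, ACCSET_OTHER) -> ".o"
// so a normal load prints as, e.g., "ld1 = ldi.o p1[8]" and a volatile load
// as "ld1 = ldi.o/v p1[8]".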
@ -1740,11 +1748,11 @@ namespace nanojit
int32_t argc = i->argc();
int32_t m = int32_t(n); // Windows doesn't have 'ssize_t'
if (call->isIndirect())
m -= VMPI_snprintf(s, m, "%s = %s.%s [%s] ( ", formatRef(&b1, i), lirNames[op],
m -= VMPI_snprintf(s, m, "%s = %s%s [%s] ( ", formatRef(&b1, i), lirNames[op],
formatAccSet(&b2, call->_storeAccSet),
formatRef(&b3, i->arg(--argc)));
else
m -= VMPI_snprintf(s, m, "%s = %s.%s #%s ( ", formatRef(&b1, i), lirNames[op],
m -= VMPI_snprintf(s, m, "%s = %s%s #%s ( ", formatRef(&b1, i), lirNames[op],
formatAccSet(&b2, call->_storeAccSet), call->_name);
if (m < 0) break;
for (int32_t j = argc - 1; j >= 0; j--) {
@ -1907,12 +1915,19 @@ namespace nanojit
case LIR_ldus2ui:
case LIR_ldc2i:
case LIR_lds2i:
case LIR_ldf2d:
VMPI_snprintf(s, n, "%s = %s.%s %s[%d]", formatRef(&b1, i), lirNames[op],
formatAccSet(&b2, i->accSet()),
formatRef(&b3, i->oprnd1()),
case LIR_ldf2d: {
const char* qualStr;
switch (i->loadQual()) {
case LOAD_CONST: qualStr = "/c"; break;
case LOAD_NORMAL: qualStr = ""; break;
case LOAD_VOLATILE: qualStr = "/v"; break;
default: NanoAssert(0); qualStr = "/?"; break;
}
VMPI_snprintf(s, n, "%s = %s%s%s %s[%d]", formatRef(&b1, i), lirNames[op],
formatAccSet(&b2, i->accSet()), qualStr, formatRef(&b3, i->oprnd1()),
i->disp());
break;
}
case LIR_sti:
CASE64(LIR_stq:)
@ -1920,7 +1935,7 @@ namespace nanojit
case LIR_sti2c:
case LIR_sti2s:
case LIR_std2f:
VMPI_snprintf(s, n, "%s.%s %s[%d] = %s", lirNames[op],
VMPI_snprintf(s, n, "%s%s %s[%d] = %s", lirNames[op],
formatAccSet(&b1, i->accSet()),
formatRef(&b2, i->oprnd2()),
i->disp(),
@ -1935,40 +1950,42 @@ namespace nanojit
}
#endif
CseFilter::CseFilter(LirWriter *out, Allocator& alloc)
: LirWriter(out), storesSinceLastLoad(ACC_NONE), alloc(alloc)
CseFilter::CseFilter(LirWriter *out, uint8_t embNumUsedAccs, Allocator& alloc)
: LirWriter(out),
EMB_NUM_USED_ACCS(embNumUsedAccs),
CSE_NUM_USED_ACCS(EMB_NUM_USED_ACCS + 2),
CSE_ACC_CONST( EMB_NUM_USED_ACCS + 0),
CSE_ACC_MULTIPLE( EMB_NUM_USED_ACCS + 1),
storesSinceLastLoad(ACCSET_NONE),
alloc(alloc)
{
m_find[LInsImmI] = &CseFilter::findImmI;
m_find[LInsImmQ] = PTR_SIZE(NULL, &CseFilter::findImmQ);
m_find[LInsImmD] = &CseFilter::findImmD;
m_find[LIns1] = &CseFilter::find1;
m_find[LIns2] = &CseFilter::find2;
m_find[LIns3] = &CseFilter::find3;
m_find[LInsCall] = &CseFilter::findCall;
m_find[LInsLoadReadOnly] = &CseFilter::findLoadReadOnly;
m_find[LInsLoadStack] = &CseFilter::findLoadStack;
m_find[LInsLoadRStack] = &CseFilter::findLoadRStack;
m_find[LInsLoadOther] = &CseFilter::findLoadOther;
m_find[LInsLoadMultiple] = &CseFilter::findLoadMultiple;
m_cap[LInsImmI] = 128;
m_cap[LInsImmQ] = PTR_SIZE(0, 16);
m_cap[LInsImmD] = 16;
m_cap[LIns1] = 256;
m_cap[LIns2] = 512;
m_cap[LIns3] = 16;
m_cap[LInsCall] = 64;
m_cap[LInsLoadReadOnly] = 16;
m_cap[LInsLoadStack] = 16;
m_cap[LInsLoadRStack] = 16;
m_cap[LInsLoadOther] = 16;
m_cap[LInsLoadMultiple] = 16;
m_findNL[LInsImmI] = &CseFilter::findImmI;
m_findNL[LInsImmQ] = PTR_SIZE(NULL, &CseFilter::findImmQ);
m_findNL[LInsImmD] = &CseFilter::findImmD;
m_findNL[LIns1] = &CseFilter::find1;
m_findNL[LIns2] = &CseFilter::find2;
m_findNL[LIns3] = &CseFilter::find3;
m_findNL[LInsCall] = &CseFilter::findCall;
for (LInsHashKind kind = LInsFirst; kind <= LInsLast; kind = nextKind(kind)) {
m_list[kind] = new (alloc) LIns*[m_cap[kind]];
m_capNL[LInsImmI] = 128;
m_capNL[LInsImmQ] = PTR_SIZE(0, 16);
m_capNL[LInsImmD] = 16;
m_capNL[LIns1] = 256;
m_capNL[LIns2] = 512;
m_capNL[LIns3] = 16;
m_capNL[LInsCall] = 64;
for (NLKind nlkind = LInsFirst; nlkind <= LInsLast; nlkind = nextNLKind(nlkind))
m_listNL[nlkind] = new (alloc) LIns*[m_capNL[nlkind]];
// Note that this allocates the CONST and MULTIPLE tables as well.
for (CseAcc a = 0; a < CSE_NUM_USED_ACCS; a++) {
m_capL[a] = 16;
m_listL[a] = new (alloc) LIns*[m_capL[a]];
}
clear();
clearAll();
}
// Inlined/separated version of SuperFastHash.
@ -2017,15 +2034,23 @@ namespace nanojit
return hash;
}
void CseFilter::clear(LInsHashKind kind) {
VMPI_memset(m_list[kind], 0, sizeof(LIns*)*m_cap[kind]);
m_used[kind] = 0;
void CseFilter::clearNL(NLKind nlkind) {
VMPI_memset(m_listNL[nlkind], 0, sizeof(LIns*)*m_capNL[nlkind]);
m_usedNL[nlkind] = 0;
}
void CseFilter::clear() {
for (LInsHashKind kind = LInsFirst; kind <= LInsLast; kind = nextKind(kind)) {
clear(kind);
}
void CseFilter::clearL(CseAcc a) {
VMPI_memset(m_listL[a], 0, sizeof(LIns*)*m_capL[a]);
m_usedL[a] = 0;
}
void CseFilter::clearAll() {
for (NLKind nlkind = LInsFirst; nlkind <= LInsLast; nlkind = nextNLKind(nlkind))
clearNL(nlkind);
// Note that this clears the CONST and MULTIPLE load tables as well.
for (CseAcc a = 0; a < CSE_NUM_USED_ACCS; a++)
clearL(a);
}
inline uint32_t CseFilter::hashImmI(int32_t a) {
@ -2055,15 +2080,12 @@ namespace nanojit
return hashfinish(hashptr(hash, c));
}
NanoStaticAssert(sizeof(AccSet) == 1); // required for hashLoad to work properly
// Nb: no need to hash the load's AccSet because each region's loads go in
// a different hash table.
inline uint32_t CseFilter::hashLoad(LOpcode op, LIns* a, int32_t d, AccSet accSet) {
uint32_t hash = hash8(0,uint8_t(op));
// Nb: no need to hash the load's MiniAccSet because every load goes
// into a table where all the loads have the same MiniAccSet.
inline uint32_t CseFilter::hashLoad(LOpcode op, LIns* a, int32_t d) {
uint32_t hash = hash8(0, uint8_t(op));
hash = hashptr(hash, a);
hash = hash32(hash, d);
return hashfinish(hash8(hash, accSet));
return hashfinish(hash32(hash, d));
}
inline uint32_t CseFilter::hashCall(const CallInfo *ci, uint32_t argc, LIns* args[]) {
@ -2073,41 +2095,69 @@ namespace nanojit
return hashfinish(hash);
}
void CseFilter::grow(LInsHashKind kind)
void CseFilter::growNL(NLKind nlkind)
{
const uint32_t oldcap = m_cap[kind];
m_cap[kind] <<= 1;
LIns** oldlist = m_list[kind];
m_list[kind] = new (alloc) LIns*[m_cap[kind]];
VMPI_memset(m_list[kind], 0, m_cap[kind] * sizeof(LIns*));
find_t find = m_find[kind];
const uint32_t oldcap = m_capNL[nlkind];
m_capNL[nlkind] <<= 1;
LIns** oldlist = m_listNL[nlkind];
m_listNL[nlkind] = new (alloc) LIns*[m_capNL[nlkind]];
VMPI_memset(m_listNL[nlkind], 0, m_capNL[nlkind] * sizeof(LIns*));
find_t find = m_findNL[nlkind];
for (uint32_t i = 0; i < oldcap; i++) {
LIns* ins = oldlist[i];
if (!ins) continue;
uint32_t j = (this->*find)(ins);
NanoAssert(!m_list[kind][j]);
m_list[kind][j] = ins;
NanoAssert(!m_listNL[nlkind][j]);
m_listNL[nlkind][j] = ins;
}
}
void CseFilter::add(LInsHashKind kind, LIns* ins, uint32_t k)
void CseFilter::growL(CseAcc cseAcc)
{
NanoAssert(!m_list[kind][k]);
m_used[kind]++;
m_list[kind][k] = ins;
if ((m_used[kind] * 4) >= (m_cap[kind] * 3)) { // load factor of 0.75
grow(kind);
const uint32_t oldcap = m_capL[cseAcc];
m_capL[cseAcc] <<= 1;
LIns** oldlist = m_listL[cseAcc];
m_listL[cseAcc] = new (alloc) LIns*[m_capL[cseAcc]];
VMPI_memset(m_listL[cseAcc], 0, m_capL[cseAcc] * sizeof(LIns*));
find_t find = &CseFilter::findLoad;
for (uint32_t i = 0; i < oldcap; i++) {
LIns* ins = oldlist[i];
if (!ins) continue;
uint32_t j = (this->*find)(ins);
NanoAssert(!m_listL[cseAcc][j]);
m_listL[cseAcc][j] = ins;
}
}
void CseFilter::addNL(NLKind nlkind, LIns* ins, uint32_t k)
{
NanoAssert(!m_listNL[nlkind][k]);
m_usedNL[nlkind]++;
m_listNL[nlkind][k] = ins;
if ((m_usedNL[nlkind] * 4) >= (m_capNL[nlkind] * 3)) { // load factor of 0.75
growNL(nlkind);
}
}
void CseFilter::addL(LIns* ins, uint32_t k)
{
CseAcc cseAcc = miniAccSetToCseAcc(ins->miniAccSet(), ins->loadQual());
NanoAssert(!m_listL[cseAcc][k]);
m_usedL[cseAcc]++;
m_listL[cseAcc][k] = ins;
if ((m_usedL[cseAcc] * 4) >= (m_capL[cseAcc] * 3)) { // load factor of 0.75
growL(cseAcc);
}
}
inline LIns* CseFilter::findImmI(int32_t a, uint32_t &k)
{
LInsHashKind kind = LInsImmI;
const uint32_t bitmask = m_cap[kind] - 1;
NLKind nlkind = LInsImmI;
const uint32_t bitmask = m_capNL[nlkind] - 1;
k = hashImmI(a) & bitmask;
uint32_t n = 1;
while (true) {
LIns* ins = m_list[kind][k];
LIns* ins = m_listNL[nlkind][k];
if (!ins)
return NULL;
NanoAssert(ins->isImmI());
@ -2135,12 +2185,12 @@ namespace nanojit
#ifdef NANOJIT_64BIT
inline LIns* CseFilter::findImmQ(uint64_t a, uint32_t &k)
{
LInsHashKind kind = LInsImmQ;
const uint32_t bitmask = m_cap[kind] - 1;
NLKind nlkind = LInsImmQ;
const uint32_t bitmask = m_capNL[nlkind] - 1;
k = hashImmQorD(a) & bitmask;
uint32_t n = 1;
while (true) {
LIns* ins = m_list[kind][k];
LIns* ins = m_listNL[nlkind][k];
if (!ins)
return NULL;
NanoAssert(ins->isImmQ());
@ -2161,12 +2211,12 @@ namespace nanojit
inline LIns* CseFilter::findImmD(uint64_t a, uint32_t &k)
{
LInsHashKind kind = LInsImmD;
const uint32_t bitmask = m_cap[kind] - 1;
NLKind nlkind = LInsImmD;
const uint32_t bitmask = m_capNL[nlkind] - 1;
k = hashImmQorD(a) & bitmask;
uint32_t n = 1;
while (true) {
LIns* ins = m_list[kind][k];
LIns* ins = m_listNL[nlkind][k];
if (!ins)
return NULL;
NanoAssert(ins->isImmD());
@ -2186,12 +2236,12 @@ namespace nanojit
inline LIns* CseFilter::find1(LOpcode op, LIns* a, uint32_t &k)
{
LInsHashKind kind = LIns1;
const uint32_t bitmask = m_cap[kind] - 1;
NLKind nlkind = LIns1;
const uint32_t bitmask = m_capNL[nlkind] - 1;
k = hash1(op, a) & bitmask;
uint32_t n = 1;
while (true) {
LIns* ins = m_list[kind][k];
LIns* ins = m_listNL[nlkind][k];
if (!ins)
return NULL;
if (ins->isop(op) && ins->oprnd1() == a)
@ -2210,12 +2260,12 @@ namespace nanojit
inline LIns* CseFilter::find2(LOpcode op, LIns* a, LIns* b, uint32_t &k)
{
LInsHashKind kind = LIns2;
const uint32_t bitmask = m_cap[kind] - 1;
NLKind nlkind = LIns2;
const uint32_t bitmask = m_capNL[nlkind] - 1;
k = hash2(op, a, b) & bitmask;
uint32_t n = 1;
while (true) {
LIns* ins = m_list[kind][k];
LIns* ins = m_listNL[nlkind][k];
if (!ins)
return NULL;
if (ins->isop(op) && ins->oprnd1() == a && ins->oprnd2() == b)
@ -2234,12 +2284,12 @@ namespace nanojit
inline LIns* CseFilter::find3(LOpcode op, LIns* a, LIns* b, LIns* c, uint32_t &k)
{
LInsHashKind kind = LIns3;
const uint32_t bitmask = m_cap[kind] - 1;
NLKind nlkind = LIns3;
const uint32_t bitmask = m_capNL[nlkind] - 1;
k = hash3(op, a, b, c) & bitmask;
uint32_t n = 1;
while (true) {
LIns* ins = m_list[kind][k];
LIns* ins = m_listNL[nlkind][k];
if (!ins)
return NULL;
if (ins->isop(op) && ins->oprnd1() == a && ins->oprnd2() == b && ins->oprnd3() == c)
@ -2256,18 +2306,17 @@ namespace nanojit
return k;
}
inline LIns* CseFilter::findLoad(LOpcode op, LIns* a, int32_t d, AccSet accSet,
LInsHashKind kind, uint32_t &k)
inline LIns* CseFilter::findLoad(LOpcode op, LIns* a, int32_t d, MiniAccSet miniAccSet,
LoadQual loadQual, uint32_t &k)
{
(void)accSet;
const uint32_t bitmask = m_cap[kind] - 1;
k = hashLoad(op, a, d, accSet) & bitmask;
CseAcc cseAcc = miniAccSetToCseAcc(miniAccSet, loadQual);
const uint32_t bitmask = m_capL[cseAcc] - 1;
k = hashLoad(op, a, d) & bitmask;
uint32_t n = 1;
while (true) {
LIns* ins = m_list[kind][k];
LIns* ins = m_listL[cseAcc][k];
if (!ins)
return NULL;
NanoAssert(ins->accSet() == accSet);
if (ins->isop(op) && ins->oprnd1() == a && ins->disp() == d)
return ins;
k = (k + n) & bitmask;
@ -2275,38 +2324,10 @@ namespace nanojit
}
}
uint32_t CseFilter::findLoadReadOnly(LIns* ins)
uint32_t CseFilter::findLoad(LIns* ins)
{
uint32_t k;
findLoad(ins->opcode(), ins->oprnd1(), ins->disp(), ins->accSet(), LInsLoadReadOnly, k);
return k;
}
uint32_t CseFilter::findLoadStack(LIns* ins)
{
uint32_t k;
findLoad(ins->opcode(), ins->oprnd1(), ins->disp(), ins->accSet(), LInsLoadStack, k);
return k;
}
uint32_t CseFilter::findLoadRStack(LIns* ins)
{
uint32_t k;
findLoad(ins->opcode(), ins->oprnd1(), ins->disp(), ins->accSet(), LInsLoadRStack, k);
return k;
}
uint32_t CseFilter::findLoadOther(LIns* ins)
{
uint32_t k;
findLoad(ins->opcode(), ins->oprnd1(), ins->disp(), ins->accSet(), LInsLoadOther, k);
return k;
}
uint32_t CseFilter::findLoadMultiple(LIns* ins)
{
uint32_t k;
findLoad(ins->opcode(), ins->oprnd1(), ins->disp(), ins->accSet(), LInsLoadMultiple, k);
findLoad(ins->opcode(), ins->oprnd1(), ins->disp(), ins->miniAccSet(), ins->loadQual(), k);
return k;
}
@ -2320,12 +2341,12 @@ namespace nanojit
inline LIns* CseFilter::findCall(const CallInfo *ci, uint32_t argc, LIns* args[], uint32_t &k)
{
LInsHashKind kind = LInsCall;
const uint32_t bitmask = m_cap[kind] - 1;
NLKind nlkind = LInsCall;
const uint32_t bitmask = m_capNL[nlkind] - 1;
k = hashCall(ci, argc, args) & bitmask;
uint32_t n = 1;
while (true) {
LIns* ins = m_list[kind][k];
LIns* ins = m_listNL[nlkind][k];
if (!ins)
return NULL;
if (ins->isCall() && ins->callInfo() == ci && argsmatch(ins, argc, args))
@ -2353,7 +2374,7 @@ namespace nanojit
LIns* ins = findImmI(imm, k);
if (!ins) {
ins = out->insImmI(imm);
add(LInsImmI, ins, k);
addNL(LInsImmI, ins, k);
}
// We assume that downstream stages do not modify the instruction, so
// that we can insert 'ins' into slot 'k'. Check this.
@ -2368,7 +2389,7 @@ namespace nanojit
LIns* ins = findImmQ(q, k);
if (!ins) {
ins = out->insImmQ(q);
add(LInsImmQ, ins, k);
addNL(LInsImmQ, ins, k);
}
NanoAssert(ins->isop(LIR_immq) && ins->immQ() == q);
return ins;
@ -2388,7 +2409,7 @@ namespace nanojit
LIns* ins = findImmD(u.u64, k);
if (!ins) {
ins = out->insImmD(d);
add(LInsImmD, ins, k);
addNL(LInsImmD, ins, k);
}
NanoAssert(ins->isop(LIR_immd) && ins->immDasQ() == u.u64);
return ins;
@ -2397,7 +2418,7 @@ namespace nanojit
LIns* CseFilter::ins0(LOpcode op)
{
if (op == LIR_label)
clear();
clearAll();
return out->ins0(op);
}
@ -2409,7 +2430,7 @@ namespace nanojit
ins = find1(op, a, k);
if (!ins) {
ins = out->ins1(op, a);
add(LIns1, ins, k);
addNL(LIns1, ins, k);
}
} else {
ins = out->ins1(op, a);
@ -2426,7 +2447,7 @@ namespace nanojit
ins = find2(op, a, b, k);
if (!ins) {
ins = out->ins2(op, a, b);
add(LIns2, ins, k);
addNL(LIns2, ins, k);
}
NanoAssert(ins->isop(op) && ins->oprnd1() == a && ins->oprnd2() == b);
return ins;
@ -2439,51 +2460,56 @@ namespace nanojit
LIns* ins = find3(op, a, b, c, k);
if (!ins) {
ins = out->ins3(op, a, b, c);
add(LIns3, ins, k);
addNL(LIns3, ins, k);
}
NanoAssert(ins->isop(op) && ins->oprnd1() == a && ins->oprnd2() == b && ins->oprnd3() == c);
return ins;
}
LIns* CseFilter::insLoad(LOpcode op, LIns* base, int32_t disp, AccSet loadAccSet)
LIns* CseFilter::insLoad(LOpcode op, LIns* base, int32_t disp, AccSet accSet, LoadQual loadQual)
{
LIns* ins;
if (isS16(disp)) {
// Clear all loads aliased by stores and calls since the last time
// we were in this function.
if (storesSinceLastLoad != ACC_NONE) {
NanoAssert(!(storesSinceLastLoad & ACC_READONLY)); // can't store to READONLY
if (storesSinceLastLoad & ACC_STACK) { clear(LInsLoadStack); }
if (storesSinceLastLoad & ACC_RSTACK) { clear(LInsLoadRStack); }
if (storesSinceLastLoad & ACC_OTHER) { clear(LInsLoadOther); }
// Loads marked with multiple access regions must be treated
// conservatively -- we always clear all of them.
clear(LInsLoadMultiple);
storesSinceLastLoad = ACC_NONE;
if (storesSinceLastLoad != ACCSET_NONE) {
// Clear all normal (excludes CONST and MULTIPLE) loads
// aliased by stores and calls since the last time we were in
// this function.
for (CseAcc a = 0; a < EMB_NUM_USED_ACCS; a++)
if (storesSinceLastLoad & (1 << a))
clearL(a);
// No need to clear CONST loads (those in the CSE_ACC_CONST table).
// Multi-region loads must be treated conservatively -- we
// always clear all of them.
clearL(CSE_ACC_MULTIPLE);
storesSinceLastLoad = ACCSET_NONE;
}
LInsHashKind kind;
switch (loadAccSet) {
case ACC_READONLY: kind = LInsLoadReadOnly; break;
case ACC_STACK: kind = LInsLoadStack; break;
case ACC_RSTACK: kind = LInsLoadRStack; break;
case ACC_OTHER: kind = LInsLoadOther; break;
default: kind = LInsLoadMultiple; break;
if (loadQual == LOAD_VOLATILE) {
// Volatile loads are never CSE'd, don't bother looking for
// them or inserting them in the table.
ins = out->insLoad(op, base, disp, accSet, loadQual);
} else {
uint32_t k;
ins = findLoad(op, base, disp, compressAccSet(accSet), loadQual, k);
if (!ins) {
ins = out->insLoad(op, base, disp, accSet, loadQual);
addL(ins, k);
}
}
uint32_t k;
ins = findLoad(op, base, disp, loadAccSet, kind, k);
if (!ins) {
ins = out->insLoad(op, base, disp, loadAccSet);
add(kind, ins, k);
}
NanoAssert(ins->isop(op) && ins->oprnd1() == base && ins->disp() == disp);
// Nb: must compare miniAccSets, not AccSets, because the AccSet
// stored in the load may have lost info if it's multi-region.
NanoAssert(ins->isop(op) && ins->oprnd1() == base && ins->disp() == disp &&
ins->miniAccSet().val == compressAccSet(accSet).val &&
ins->loadQual() == loadQual);
} else {
// If the displacement is more than 16 bits, put it in a separate
// instruction. Nb: LirBufWriter also does this, we do it here
// too because CseFilter relies on LirBufWriter not changing code.
ins = insLoad(op, ins2(LIR_addp, base, insImmWord(disp)), 0, loadAccSet);
ins = insLoad(op, ins2(LIR_addp, base, insImmWord(disp)), 0, accSet, loadQual);
}
return ins;
}
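// Illustrative sketch, not part of this patch ('cse', 'base' and 'val' are
// hypothetical; ACCSET_OTHER is the single region lirasm defines above):
static void cseLoadSketch(CseFilter* cse, LIns* base, LIns* val)
{
    LIns* a = cse->insLoad(LIR_ldi, base, 8, ACCSET_OTHER, LOAD_NORMAL);
    LIns* b = cse->insLoad(LIR_ldi, base, 8, ACCSET_OTHER, LOAD_NORMAL);
    // a == b: the second load hits the ACCSET_OTHER load table.

    cse->insStore(LIR_sti, val, base, 8, ACCSET_OTHER);
    LIns* c = cse->insLoad(LIR_ldi, base, 8, ACCSET_OTHER, LOAD_NORMAL);
    // c != a: the store invalidated the ACCSET_OTHER (and MULTIPLE) loads.

    LIns* v1 = cse->insLoad(LIR_ldi, base, 8, ACCSET_OTHER, LOAD_VOLATILE);
    LIns* v2 = cse->insLoad(LIR_ldi, base, 8, ACCSET_OTHER, LOAD_VOLATILE);
    // v1 != v2: volatile loads are never entered into (or found in) a table.
    (void)a; (void)b; (void)c; (void)v1; (void)v2;
}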
@ -2531,7 +2557,7 @@ namespace nanojit
ins = find1(op, c, k);
if (!ins) {
ins = out->insGuard(op, c, gr);
add(LIns1, ins, k);
addNL(LIns1, ins, k);
}
} else {
ins = out->insGuard(op, c, gr);
@ -2549,7 +2575,7 @@ namespace nanojit
LIns* ins = find2(op, a, b, k);
if (!ins) {
ins = out->insGuardXov(op, a, b, gr);
add(LIns2, ins, k);
addNL(LIns2, ins, k);
}
NanoAssert(ins->isop(op) && ins->oprnd1() == a && ins->oprnd2() == b);
return ins;
@ -2562,12 +2588,12 @@ namespace nanojit
LIns* ins;
uint32_t argc = ci->count_args();
if (ci->_isPure) {
NanoAssert(ci->_storeAccSet == ACC_NONE);
NanoAssert(ci->_storeAccSet == ACCSET_NONE);
uint32_t k;
ins = findCall(ci, argc, args, k);
if (!ins) {
ins = out->insCall(ci, args);
add(LInsCall, ins, k);
addNL(LInsCall, ins, k);
}
} else {
// We only need to worry about aliasing if !ci->_isPure.
@ -2601,7 +2627,7 @@ namespace nanojit
#define SF_CALLINFO(name, typesig) \
static const CallInfo name##_ci = \
{ (intptr_t)&name, typesig, ABI_FASTCALL, /*isPure*/1, ACC_NONE verbose_only(, #name) }
{ (intptr_t)&name, typesig, ABI_FASTCALL, /*isPure*/1, ACCSET_NONE verbose_only(, #name) }
SF_CALLINFO(i2d, SIG_D_I);
SF_CALLINFO(ui2d, SIG_D_UI);
@ -2817,6 +2843,13 @@ namespace nanojit
whereInPipeline, what, printer->formatAccSet(&b, accSet), shouldDesc);
}
void ValidateWriter::errorLoadQual(const char* what, LoadQual loadQual)
{
NanoAssertMsgf(0,
"LIR LoadQual error (%s): '%s' loadQual is '%d'",
whereInPipeline, what, loadQual);
}
void ValidateWriter::checkLInsIsACondOrConst(LOpcode op, int argN, LIns* ins)
{
// We could introduce a LTy_B32 type in the type system but that's a
@ -2839,60 +2872,26 @@ namespace nanojit
errorStructureShouldBe(op, "argument", argN, ins, lirNames[op2]);
}
void ValidateWriter::checkAccSet(LOpcode op, LIns* base, AccSet accSet, AccSet maxAccSet)
{
if (accSet == ACC_NONE)
errorAccSet(lirNames[op], accSet, "it should not equal ACC_NONE");
if (accSet & ~maxAccSet)
errorAccSet(lirNames[op], accSet,
"it should not contain bits that aren't in ACC_LOAD_ANY/ACC_STORE_ANY");
// Some sanity checking, which is based on the following assumptions:
// - STACK ones should use 'sp' or 'sp+k' as the base. (We could look
// for more complex patterns, but that feels dangerous. Better to
// keep it really simple.)
// - RSTACK ones should use 'rp' as the base.
// - READONLY/OTHER ones should not use 'sp'/'sp+k' or 'rp' as the base.
//
// Things that aren't checked:
// - There's no easy way to check if READONLY ones really are read-only.
bool isStack = base == sp ||
(base->isop(LIR_addp) && base->oprnd1() == sp && base->oprnd2()->isImmP());
bool isRStack = base == rp;
switch (accSet) {
case ACC_STACK:
if (!isStack)
errorAccSet(lirNames[op], accSet, "but it's not a stack access");
break;
case ACC_RSTACK:
if (!isRStack)
errorAccSet(lirNames[op], accSet, "but it's not an rstack access");
break;
case ACC_READONLY:
case ACC_OTHER:
if (isStack)
errorAccSet(lirNames[op], accSet, "but it's a stack access");
if (isRStack)
errorAccSet(lirNames[op], accSet, "but it's an rstack access");
break;
default:
break;
}
}
ValidateWriter::ValidateWriter(LirWriter *out, LInsPrinter* printer, const char* where)
: LirWriter(out), printer(printer), whereInPipeline(where), sp(0), rp(0)
: LirWriter(out), printer(printer), whereInPipeline(where),
checkAccSetIns1(0), checkAccSetIns2(0)
{}
LIns* ValidateWriter::insLoad(LOpcode op, LIns* base, int32_t d, AccSet accSet)
LIns* ValidateWriter::insLoad(LOpcode op, LIns* base, int32_t d, AccSet accSet,
LoadQual loadQual)
{
checkAccSet(op, base, accSet, ACC_LOAD_ANY);
checkAccSet(op, base, accSet);
switch (loadQual) {
case LOAD_CONST:
case LOAD_NORMAL:
case LOAD_VOLATILE:
break;
default:
errorLoadQual(lirNames[op], loadQual);
break;
}
int nArgs = 1;
LTy formals[1] = { LTy_P };
@ -2914,12 +2913,12 @@ namespace nanojit
typeCheckArgs(op, nArgs, formals, args);
return out->insLoad(op, base, d, accSet);
return out->insLoad(op, base, d, accSet, loadQual);
}
LIns* ValidateWriter::insStore(LOpcode op, LIns* value, LIns* base, int32_t d, AccSet accSet)
{
checkAccSet(op, base, accSet, ACC_STORE_ANY);
checkAccSet(op, base, accSet);
int nArgs = 2;
LTy formals[2] = { LTy_V, LTy_P }; // LTy_V is overwritten shortly
@ -3189,12 +3188,8 @@ namespace nanojit
LOpcode op = getCallOpcode(ci);
if (ci->_isPure && ci->_storeAccSet != ACC_NONE)
errorAccSet(ci->_name, ci->_storeAccSet, "it should be ACC_NONE for pure functions");
if (ci->_storeAccSet & ~ACC_STORE_ANY)
errorAccSet(lirNames[op], ci->_storeAccSet,
"it should not contain bits that aren't in ACC_STORE_ANY");
if (ci->_isPure && ci->_storeAccSet != ACCSET_NONE)
errorAccSet(ci->_name, ci->_storeAccSet, "it should be ACCSET_NONE for pure functions");
// This loop iterates over the args from right-to-left (because arg()
// and getArgTypes() use right-to-left order), but puts the results

View File

@ -199,11 +199,12 @@ namespace nanojit
// Access regions
// --------------
// Doing alias analysis precisely is difficult. But it turns out that
// keeping track of aliasing at a very coarse level is enough to help with
// many optimisations. So we conceptually divide the memory that is
// accessible from LIR into a small number of "access regions". An access
// region may be non-contiguous. No two access regions can overlap. The
// union of all access regions covers all memory accessible from LIR.
// keeping track of aliasing at a coarse level is enough to help with many
// optimisations. So we conceptually divide the memory that is accessible
// from LIR into a small number of "access regions" (aka. "Acc"). An
// access region may be non-contiguous. No two access regions can
// overlap. The union of all access regions covers all memory accessible
// from LIR.
//
// In general a (static) load or store may be executed more than once, and
// thus may access multiple regions; however, in practice almost all
@ -214,64 +215,28 @@ namespace nanojit
// If two loads/stores/calls are known to not access the same region(s),
// then they do not alias.
//
// The access regions used are as follows:
//
// - READONLY: all memory that is read-only, ie. never stored to.
// A load from a READONLY region will never alias with any stores.
//
// - STACK: the stack. Stack loads/stores can usually be easily
// identified because they use SP as the base pointer.
//
// - RSTACK: the return stack. Return stack loads/stores can usually be
// easily identified because they use RP as the base pointer.
//
// - OTHER: all other regions of memory.
//
// It makes sense to add new access regions when doing so will help with
// one or more optimisations.
//
// One subtlety is that the meanings of the access region markings only
// apply to the LIR fragment that they are in. For example, if a memory
// location M is read-only in a particular LIR fragment, all loads
// involving M in that fragment can be safely marked READONLY, even if M
// is modified elsewhere. This is safe because a LIR fragment is the
// unit of analysis in which the markings are used. In other words alias
// region markings are only used for intra-fragment optimisations.
// All regions are defined by the embedding. It makes sense to add new
// embedding-specific access regions when doing so will help with one or
// more optimisations.
//
// Access region sets and instruction markings
// -------------------------------------------
// The LIR generator must mark each load/store with an "access region
// set", which is a set of one or more access regions. This indicates
// which parts of LIR-accessible memory the load/store may touch.
// Each load/store is marked with an "access region set" (aka. "AccSet"),
// which is a set of one or more access regions. This indicates which
// parts of LIR-accessible memory the load/store may touch.
//
// The LIR generator must also mark each function called from LIR with an
// access region set for memory stored to by the function. (We could also
// have a marking for memory loads, but there's no need at the moment.)
// These markings apply to the function itself, not the call site (ie.
// they're not context-sensitive).
// Each function called from LIR is also marked with an access region set
// for memory stored to by the function. (We could also have a marking
// for memory loads done by the function, but there's no need at the
// moment.) These markings apply to the function itself, not the call
// site, ie. they're not context-sensitive.
//
// These load/store/call markings MUST BE ACCURATE -- if they are wrong
// then invalid optimisations might occur that change the meaning of the
// code. However, they can safely be imprecise (ie. conservative), in the
// following ways:
//
// - A load that accesses a READONLY region can be safely marked instead
// as loading from OTHER. In other words, it's safe to underestimate
// the size of the READONLY region. (This would also apply to the load
// set of a function, if we recorded that.)
//
// - A load/store can safely be marked as accessing regions that it
// doesn't, so long as the regions it does access are also included (one
// exception: marking a store with READONLY is nonsense and will cause
// assertions).
//
// In other words, a load/store can be marked with an access region set
// that is a superset of its actual access region set. Taking this to
// its logical conclusion, any load can be safely marked with LOAD_ANY and
// any store can be safely marked with with STORE_ANY (and the latter is
// true for the store set of a function.)
//
// Such imprecision is safe but may reduce optimisation opportunities.
// These load/store/call markings MUST BE ACCURATE -- if not then invalid
// optimisations might occur that change the meaning of the code.
// However, they can safely be imprecise (ie. conservative), ie. a
// load/store/call can be marked with an access region set that is a
// superset of the actual access region set. Such imprecision is safe but
// may reduce optimisation opportunities.
//
// Optimisations that use access region info
// -----------------------------------------
@ -282,35 +247,100 @@ namespace nanojit
// load with a single access region, you might as well use ACC_LOAD_ANY.
//-----------------------------------------------------------------------
// An access region set is represented as a bitset. Nb: this restricts us
// to at most eight alias regions for the moment.
typedef uint8_t AccSet;
// An access region set is represented as a bitset. Using a uint32_t
// restricts us to at most 32 alias regions for the moment. This could be
// expanded to a uint64_t easily if needed.
typedef uint32_t AccSet;
static const int NUM_ACCS = sizeof(AccSet) * 8;
// The access regions. Note that because of the bitset representation
// these constants are also valid (singleton) AccSet values. If you add
// new ones please update ACC_ALL_STORABLE and formatAccSet() and
// CseFilter.
//
static const AccSet ACC_READONLY = 1 << 0; // 0000_0001b
static const AccSet ACC_STACK = 1 << 1; // 0000_0010b
static const AccSet ACC_RSTACK = 1 << 2; // 0000_0100b
static const AccSet ACC_OTHER = 1 << 3; // 0000_1000b
// Some common (non-singleton) access region sets. ACC_NONE does not make
// Some common (non-singleton) access region sets. ACCSET_NONE does not make
// sense for loads or stores (which must access at least one region), it
// only makes sense for calls.
//
// A convention that's worth using: use ACC_LOAD_ANY/ACC_STORE_ANY for
// cases that you're unsure about or haven't considered carefully. Use
// ACC_ALL/ACC_ALL_STORABLE for cases that you have considered carefully.
// That way it's easy to tell which ones have been considered and which
// haven't.
static const AccSet ACC_NONE = 0x0;
static const AccSet ACC_ALL_STORABLE = ACC_STACK | ACC_RSTACK | ACC_OTHER;
static const AccSet ACC_ALL = ACC_READONLY | ACC_ALL_STORABLE;
static const AccSet ACC_LOAD_ANY = ACC_ALL; // synonym
static const AccSet ACC_STORE_ANY = ACC_ALL_STORABLE; // synonym
static const AccSet ACCSET_NONE = 0x0;
static const AccSet ACCSET_ALL = 0xffffffff;
static const AccSet ACCSET_LOAD_ANY = ACCSET_ALL; // synonym
static const AccSet ACCSET_STORE_ANY = ACCSET_ALL; // synonym
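// Illustrative sketch, not part of this patch: each embedding defines its own
// singleton regions from bit 0 upwards and passes the count it uses to
// CseFilter and LInsPrinter (lirasm above uses just ACCSET_OTHER; the names
// below are hypothetical).
static const AccSet  ACCSET_VARS      = 1 << 0;   // e.g. interpreter variables
static const AccSet  ACCSET_HEAP      = 1 << 1;   // e.g. all other memory
static const uint8_t MY_NUM_USED_ACCS = 2;        // passed to CseFilter/LInsPrinter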
// Full AccSets don't fit into load and store instructions. But
// load/store AccSets almost always contain a single access region. We
// take advantage of this to create a compressed AccSet, MiniAccSet, that
// does fit.
//
// The 32 single-region AccSets get compressed into a number in the range
// 0..31 (according to the position of the set bit), and all other
// (multi-region) AccSets get converted into MINI_ACCSET_MULTIPLE. So the
// representation is lossy in the latter case, but that case is rare for
// loads/stores. We use a full AccSet for the storeAccSets of calls, for
// which multi-region AccSets are common.
//
// We wrap the uint8_t inside a struct to avoid the possibility of subtle
// bugs caused by mixing up AccSet and MiniAccSet, which is easy to do.
// However, the struct gets padded inside LInsLd in an inconsistent way on
// Windows, so we actually store a MiniAccSetVal inside LInsLd. Sigh.
// But we use MiniAccSet everywhere else.
//
typedef uint8_t MiniAccSetVal;
struct MiniAccSet { MiniAccSetVal val; };
static const MiniAccSet MINI_ACCSET_MULTIPLE = { 255 };
static MiniAccSet compressAccSet(AccSet accSet) {
// As the number of regions increase, this may become a bottleneck.
// If it does we can first count the number of bits using Kernighan's
// technique
// (http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetKernighan)
// and if it's a single-region set, use a bit-scanning instruction to
// work out which single-region set it is. That would require
// factoring out the bit-scanning code currently in
// nRegisterAllocFromSet().
//
// Try all the single-region AccSets first.
for (int i = 0; i < NUM_ACCS; i++) {
if (accSet == (1U << i)) {
MiniAccSet ret = { uint8_t(i) };
return ret;
}
}
// If we got here, it must be a multi-region AccSet.
return MINI_ACCSET_MULTIPLE;
}
static AccSet decompressMiniAccSet(MiniAccSet miniAccSet) {
return (miniAccSet.val == MINI_ACCSET_MULTIPLE.val) ? ACCSET_ALL : (1 << miniAccSet.val);
}
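// Illustrative sketch, not part of this patch (uses the hypothetical
// ACCSET_VARS/ACCSET_HEAP regions from the sketch above): compression is
// exact for singleton AccSets and lossy for multi-region ones.
static void miniAccSetSketch()
{
    NanoAssert(compressAccSet(ACCSET_VARS).val == 0);
    NanoAssert(compressAccSet(ACCSET_HEAP).val == 1);
    NanoAssert(compressAccSet(ACCSET_VARS | ACCSET_HEAP).val == MINI_ACCSET_MULTIPLE.val);
    // Decompressing a multi-region MiniAccSet conservatively gives ACCSET_ALL.
    NanoAssert(decompressMiniAccSet(compressAccSet(ACCSET_VARS | ACCSET_HEAP)) == ACCSET_ALL);
}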
// The LoadQual affects how a load can be optimised:
//
// - CONST: These loads are guaranteed to always return the same value
// during a single execution of a fragment (but the value is allowed to
// change between executions of the fragment). This means that the
// location is never stored to by the LIR, and is never modified by an
// external entity while the fragment is running.
//
// - NORMAL: These loads may be stored to by the LIR, but are never
// modified by an external entity while the fragment is running.
//
// - VOLATILE: These loads may be stored to by the LIR, and may be
// modified by an external entity while the fragment is running.
//
// This gives a lattice with the ordering: CONST < NORMAL < VOLATILE.
// As usual, it's safe to mark a load with a value higher (less precise)
// than actual, but it may result in fewer optimisations occurring.
//
// Generally CONST loads are highly amenable to optimisation (eg. CSE),
// VOLATILE loads are entirely unoptimisable, and NORMAL loads are in
// between and require some alias analysis to optimise.
//
// Note that CONST has a stronger meaning than "const" in C and C++; in C
// and C++ a "const" variable may be modified by an external entity, such
// as hardware. Hence "const volatile" makes sense in C and C++, but
// CONST+VOLATILE doesn't make sense in LIR.
//
enum LoadQual {
LOAD_CONST,
LOAD_NORMAL,
LOAD_VOLATILE
};
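// Illustrative sketch, not part of this patch ('lir' and 'obj' are
// hypothetical; ACCSET_HEAP is from the sketch above): pick the qualifier
// according to how the loaded location can change while the fragment runs.
static void loadQualSketch(LirWriter* lir, LIns* obj)
{
    // Never stored to by this fragment nor changed externally while it runs,
    // so CSE can reuse it even after intervening stores.
    lir->insLoad(LIR_ldi, obj, 0, ACCSET_HEAP, LOAD_CONST);

    // May be stored to by this fragment, but not changed externally; the
    // four-argument overload defaults to LOAD_NORMAL.
    lir->insLoad(LIR_ldi, obj, 4, ACCSET_HEAP);

    // May be changed by another thread or by hardware: never CSE'd.
    lir->insLoad(LIR_ldi, obj, 8, ACCSET_HEAP, LOAD_VOLATILE);
}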
struct CallInfo
{
@ -687,7 +717,7 @@ namespace nanojit
inline void initLInsOp1(LOpcode opcode, LIns* oprnd1);
inline void initLInsOp2(LOpcode opcode, LIns* oprnd1, LIns* oprnd2);
inline void initLInsOp3(LOpcode opcode, LIns* oprnd1, LIns* oprnd2, LIns* oprnd3);
inline void initLInsLd(LOpcode opcode, LIns* val, int32_t d, AccSet accSet);
inline void initLInsLd(LOpcode opcode, LIns* val, int32_t d, AccSet accSet, LoadQual loadQual);
inline void initLInsSt(LOpcode opcode, LIns* val, LIns* base, int32_t d, AccSet accSet);
inline void initLInsSk(LIns* prevLIns);
// Nb: args[] must be allocated and initialised before being passed in;
@ -790,8 +820,12 @@ namespace nanojit
// For guards.
inline GuardRecord* record() const;
// For loads.
inline LoadQual loadQual() const;
// For loads/stores.
inline int32_t disp() const;
inline MiniAccSet miniAccSet() const;
inline AccSet accSet() const;
// For LInsSk.
@ -1085,7 +1119,8 @@ namespace nanojit
// could go to 24 bits but then it would happen so rarely that the
// handler code would be difficult to test and thus untrustworthy.
int16_t disp;
AccSet accSet;
MiniAccSetVal miniAccSetVal; // not 'MiniAccSet' due to Windows padding; see above
LoadQual loadQual:2;
LIns* oprnd_1;
@ -1102,7 +1137,7 @@ namespace nanojit
friend class LIns;
int16_t disp;
AccSet accSet;
MiniAccSetVal miniAccSetVal;
LIns* oprnd_2;
@ -1251,12 +1286,13 @@ namespace nanojit
toLInsOp3()->oprnd_3 = oprnd3;
NanoAssert(isLInsOp3());
}
void LIns::initLInsLd(LOpcode opcode, LIns* val, int32_t d, AccSet accSet) {
void LIns::initLInsLd(LOpcode opcode, LIns* val, int32_t d, AccSet accSet, LoadQual loadQual) {
initSharedFields(opcode);
toLInsLd()->oprnd_1 = val;
NanoAssert(d == int16_t(d));
toLInsLd()->disp = int16_t(d);
toLInsLd()->accSet = accSet;
toLInsLd()->miniAccSetVal = compressAccSet(accSet).val;
toLInsLd()->loadQual = loadQual;
NanoAssert(isLInsLd());
}
void LIns::initLInsSt(LOpcode opcode, LIns* val, LIns* base, int32_t d, AccSet accSet) {
@ -1265,7 +1301,7 @@ namespace nanojit
toLInsSt()->oprnd_2 = base;
NanoAssert(d == int16_t(d));
toLInsSt()->disp = int16_t(d);
toLInsSt()->accSet = accSet;
toLInsSt()->miniAccSetVal = compressAccSet(accSet).val;
NanoAssert(isLInsSt());
}
void LIns::initLInsSk(LIns* prevLIns) {
@ -1369,6 +1405,11 @@ namespace nanojit
}
}
LoadQual LIns::loadQual() const {
NanoAssert(isLInsLd());
return toLInsLd()->loadQual;
}
int32_t LIns::disp() const {
if (isLInsSt()) {
return toLInsSt()->disp;
@ -1378,13 +1419,19 @@ namespace nanojit
}
}
AccSet LIns::accSet() const {
MiniAccSet LIns::miniAccSet() const {
MiniAccSet miniAccSet;
if (isLInsSt()) {
return toLInsSt()->accSet;
miniAccSet.val = toLInsSt()->miniAccSetVal;
} else {
NanoAssert(isLInsLd());
return toLInsLd()->accSet;
miniAccSet.val = toLInsLd()->miniAccSetVal;
}
return miniAccSet;
}
AccSet LIns::accSet() const {
return decompressMiniAccSet(miniAccSet());
}
LIns* LIns::prevLIns() const {
@ -1510,8 +1557,8 @@ namespace nanojit
virtual LIns* insImmD(double d) {
return out->insImmD(d);
}
virtual LIns* insLoad(LOpcode op, LIns* base, int32_t d, AccSet accSet) {
return out->insLoad(op, base, d, accSet);
virtual LIns* insLoad(LOpcode op, LIns* base, int32_t d, AccSet accSet, LoadQual loadQual) {
return out->insLoad(op, base, d, accSet, loadQual);
}
virtual LIns* insStore(LOpcode op, LIns* value, LIns* base, int32_t d, AccSet accSet) {
return out->insStore(op, value, base, d, accSet);
@ -1584,6 +1631,11 @@ namespace nanojit
#endif
}
// Do a load with LoadQual==LOAD_NORMAL.
LIns* insLoad(LOpcode op, LIns* base, int32_t d, AccSet accSet) {
return insLoad(op, base, d, accSet, LOAD_NORMAL);
}
// Chooses LIR_sti, LIR_stq or LIR_std according to the type of 'value'.
LIns* insStore(LIns* value, LIns* base, int32_t d, AccSet accSet);
};
@ -1680,16 +1732,19 @@ namespace nanojit
{
private:
Allocator& alloc;
const int EMB_NUM_USED_ACCS;
char *formatImmI(RefBuf* buf, int32_t c);
char *formatImmQ(RefBuf* buf, uint64_t c);
char *formatImmD(RefBuf* buf, double c);
void formatGuard(InsBuf* buf, LIns* ins);
void formatGuardXov(InsBuf* buf, LIns* ins);
void formatGuard(InsBuf* buf, LIns* ins); // defined by the embedder
void formatGuardXov(InsBuf* buf, LIns* ins); // defined by the embedder
static const char* accNames[]; // defined by the embedder
public:
LInsPrinter(Allocator& alloc)
: alloc(alloc)
LInsPrinter(Allocator& alloc, int embNumUsedAccs)
: alloc(alloc), EMB_NUM_USED_ACCS(embNumUsedAccs)
{
addrNameMap = new (alloc) AddrNameMap(alloc);
lirNameMap = new (alloc) LirNameMap(alloc);
@ -1790,8 +1845,8 @@ namespace nanojit
LIns* insParam(int32_t i, int32_t kind) {
return add(out->insParam(i, kind));
}
LIns* insLoad(LOpcode v, LIns* base, int32_t disp, AccSet accSet) {
return add(out->insLoad(v, base, disp, accSet));
LIns* insLoad(LOpcode v, LIns* base, int32_t disp, AccSet accSet, LoadQual loadQual) {
return add(out->insLoad(v, base, disp, accSet, loadQual));
}
LIns* insStore(LOpcode op, LIns* v, LIns* b, int32_t d, AccSet accSet) {
return add(out->insStore(op, v, b, d, accSet));
@ -1825,16 +1880,17 @@ namespace nanojit
LIns* insGuardXov(LOpcode, LIns* a, LIns* b, GuardRecord *);
LIns* insBranch(LOpcode, LIns* cond, LIns* target);
LIns* insBranchJov(LOpcode, LIns* a, LIns* b, LIns* target);
LIns* insLoad(LOpcode op, LIns* base, int32_t off, AccSet accSet);
LIns* insLoad(LOpcode op, LIns* base, int32_t off, AccSet accSet, LoadQual loadQual);
private:
LIns* simplifyOverflowArith(LOpcode op, LIns** opnd1, LIns** opnd2);
};
class CseFilter: public LirWriter
{
enum LInsHashKind {
enum NLKind {
// We divide instruction kinds into groups. LIns0 isn't present
// because we don't need to record any 0-ary instructions.
// because we don't need to record any 0-ary instructions. Loads
// aren't here, they're handled separately.
LInsImmI = 0,
LInsImmQ = 1, // only occurs on 64-bit platforms
LInsImmD = 2,
@ -1843,46 +1899,69 @@ namespace nanojit
LIns3 = 5,
LInsCall = 6,
// Loads are special. We group them by access region: one table for
// each region, and then a catch-all table for any loads marked with
// multiple regions. This arrangement makes the removal of
// invalidated loads fast -- eg. we can invalidate all STACK loads by
// just clearing the LInsLoadStack table. The disadvantage is that
// loads marked with multiple regions must be invalidated
// conservatively, eg. if any intervening stores occur. But loads
// marked with multiple regions should be rare.
LInsLoadReadOnly = 7,
LInsLoadStack = 8,
LInsLoadRStack = 9,
LInsLoadOther = 10,
LInsLoadMultiple = 11,
LInsFirst = 0,
LInsLast = 11,
LInsLast = 6,
// Need a value after "last" to outsmart compilers that insist last+1 is impossible.
LInsInvalid = 12
LInsInvalid = 7
};
#define nextKind(kind) LInsHashKind(kind+1)
#define nextNLKind(kind) NLKind(kind+1)
// There is one list for each instruction kind. This lets us size the
// lists appropriately (some instructions are more common than others).
// It also lets us have kind-specific find/add/grow functions, which
// There is one table for each NLKind. This lets us size the lists
// appropriately (some instruction kinds are more common than others).
// It also lets us have NLKind-specific find/add/grow functions, which
// are faster than generic versions.
//
// Nb: Size must be a power of 2.
// Don't start too small, or we'll waste time growing and rehashing.
// Don't start too large, will waste memory.
// Nb: m_listNL and m_capNL sizes must be a power of 2.
// Don't start m_capNL too small, or we'll waste time growing and rehashing.
// Don't start m_capNL too large, will waste memory.
//
LIns** m_list[LInsLast + 1];
uint32_t m_cap[LInsLast + 1];
uint32_t m_used[LInsLast + 1];
LIns** m_listNL[LInsLast + 1];
uint32_t m_capNL[ LInsLast + 1];
uint32_t m_usedNL[LInsLast + 1];
typedef uint32_t (CseFilter::*find_t)(LIns*);
find_t m_find[LInsLast + 1];
find_t m_findNL[LInsLast + 1];
// Similarly, for loads, there is one table for each CseAcc. A CseAcc
// is like a normal access region, but there are two extra possible
// values: CSE_ACC_CONST, which is where we put all CONST-qualified
// loads, and CSE_ACC_MULTIPLE, where we put all multi-region loads.
// All remaining loads are single-region and go in the table entry for
// their region.
//
// This arrangement makes the removal of invalidated loads fast -- we
// can invalidate all loads from a single region by clearing that
// region's table.
//
typedef uint8_t CseAcc; // same type as MiniAccSet
static const uint8_t CSE_NUM_ACCS = NUM_ACCS + 2;
// These values would be 'static const' except they are defined in
// terms of EMB_NUM_USED_ACCS which is itself not 'static const'
// because it's passed in by the embedding.
const uint8_t EMB_NUM_USED_ACCS; // number of access regions used by the embedding
const uint8_t CSE_NUM_USED_ACCS; // EMB_NUM_USED_ACCS + 2
const CseAcc CSE_ACC_CONST; // EMB_NUM_USED_ACCS + 0
const CseAcc CSE_ACC_MULTIPLE; // EMB_NUM_USED_ACCS + 1
// We will only use CSE_NUM_USED_ACCS of these entries, ie. the
// number of lists allocated depends on the number of access regions
// in use by the embedding.
LIns** m_listL[CSE_NUM_ACCS];
uint32_t m_capL[ CSE_NUM_ACCS];
uint32_t m_usedL[CSE_NUM_ACCS];
AccSet storesSinceLastLoad; // regions stored to since the last load
Allocator& alloc;
CseAcc miniAccSetToCseAcc(MiniAccSet miniAccSet, LoadQual loadQual) {
NanoAssert(miniAccSet.val < NUM_ACCS || miniAccSet.val == MINI_ACCSET_MULTIPLE.val);
return (loadQual == LOAD_CONST) ? CSE_ACC_CONST :
(miniAccSet.val == MINI_ACCSET_MULTIPLE.val) ? CSE_ACC_MULTIPLE :
miniAccSet.val;
}
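// Worked example (illustrative note, not part of this patch): with lirasm's
// single region above (EMB_NUM_USED_ACCS == 1), CSE_ACC_CONST == 1,
// CSE_ACC_MULTIPLE == 2, and a LOAD_NORMAL load marked ACCSET_OTHER (bit 0)
// maps to CseAcc 0.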
static uint32_t hash8(uint32_t hash, const uint8_t data);
static uint32_t hash32(uint32_t hash, const uint32_t data);
static uint32_t hashptr(uint32_t hash, const void* data);
@ -1893,7 +1972,7 @@ namespace nanojit
static uint32_t hash1(LOpcode op, LIns*);
static uint32_t hash2(LOpcode op, LIns*, LIns*);
static uint32_t hash3(LOpcode op, LIns*, LIns*, LIns*);
static uint32_t hashLoad(LOpcode op, LIns*, int32_t, AccSet);
static uint32_t hashLoad(LOpcode op, LIns*, int32_t);
static uint32_t hashCall(const CallInfo *call, uint32_t argc, LIns* args[]);
// These versions are used before an LIns has been created.
@ -1905,7 +1984,7 @@ namespace nanojit
LIns* find1(LOpcode v, LIns* a, uint32_t &k);
LIns* find2(LOpcode v, LIns* a, LIns* b, uint32_t &k);
LIns* find3(LOpcode v, LIns* a, LIns* b, LIns* c, uint32_t &k);
LIns* findLoad(LOpcode v, LIns* a, int32_t b, AccSet accSet, LInsHashKind kind,
LIns* findLoad(LOpcode v, LIns* a, int32_t b, MiniAccSet miniAccSet, LoadQual loadQual,
uint32_t &k);
LIns* findCall(const CallInfo *call, uint32_t argc, LIns* args[], uint32_t &k);
@ -1921,22 +2000,21 @@ namespace nanojit
uint32_t find2(LIns* ins);
uint32_t find3(LIns* ins);
uint32_t findCall(LIns* ins);
uint32_t findLoadReadOnly(LIns* ins);
uint32_t findLoadStack(LIns* ins);
uint32_t findLoadRStack(LIns* ins);
uint32_t findLoadOther(LIns* ins);
uint32_t findLoadMultiple(LIns* ins);
uint32_t findLoad(LIns* ins);
void grow(LInsHashKind kind);
void growNL(NLKind kind);
void growL(CseAcc cseAcc);
// 'k' is the index found by findXYZ().
void add(LInsHashKind kind, LIns* ins, uint32_t k);
void addNL(NLKind kind, LIns* ins, uint32_t k);
void addL(LIns* ins, uint32_t k);
void clear(); // clears all tables
void clear(LInsHashKind); // clears one table
void clearAll(); // clears all tables
void clearNL(NLKind); // clears one non-load table
void clearL(CseAcc); // clears one load table
public:
CseFilter(LirWriter *out, Allocator&);
CseFilter(LirWriter *out, uint8_t embNumUsedAccs, Allocator&);
LIns* insImmI(int32_t imm);
#ifdef NANOJIT_64BIT
@ -1947,7 +2025,7 @@ namespace nanojit
LIns* ins1(LOpcode v, LIns*);
LIns* ins2(LOpcode v, LIns*, LIns*);
LIns* ins3(LOpcode v, LIns*, LIns*, LIns*);
LIns* insLoad(LOpcode op, LIns* base, int32_t d, AccSet accSet);
LIns* insLoad(LOpcode op, LIns* base, int32_t d, AccSet accSet, LoadQual loadQual);
LIns* insStore(LOpcode op, LIns* value, LIns* base, int32_t d, AccSet accSet);
LIns* insCall(const CallInfo *call, LIns* args[]);
LIns* insGuard(LOpcode op, LIns* cond, GuardRecord *gr);
@ -2006,7 +2084,7 @@ namespace nanojit
}
// LirWriter interface
LIns* insLoad(LOpcode op, LIns* base, int32_t disp, AccSet accSet);
LIns* insLoad(LOpcode op, LIns* base, int32_t disp, AccSet accSet, LoadQual loadQual);
LIns* insStore(LOpcode op, LIns* o1, LIns* o2, int32_t disp, AccSet accSet);
LIns* ins0(LOpcode op);
LIns* ins1(LOpcode op, LIns* o1);
@ -2142,19 +2220,21 @@ namespace nanojit
void errorStructureShouldBe(LOpcode op, const char* argDesc, int argN, LIns* arg,
const char* shouldBeDesc);
void errorAccSet(const char* what, AccSet accSet, const char* shouldDesc);
void errorLoadQual(const char* what, LoadQual loadQual);
void checkLInsHasOpcode(LOpcode op, int argN, LIns* ins, LOpcode op2);
void checkLInsIsACondOrConst(LOpcode op, int argN, LIns* ins);
void checkLInsIsNull(LOpcode op, int argN, LIns* ins);
void checkAccSet(LOpcode op, LIns* base, AccSet accSet, AccSet maxAccSet);
void checkAccSet(LOpcode op, LIns* base, AccSet accSet); // defined by the embedder
LIns *sp, *rp;
// These can be set by the embedder and used in checkAccSet().
LIns *checkAccSetIns1, *checkAccSetIns2;
public:
ValidateWriter(LirWriter* out, LInsPrinter* printer, const char* where);
void setSp(LIns* ins) { sp = ins; }
void setRp(LIns* ins) { rp = ins; }
void setCheckAccSetIns1(LIns* ins) { checkAccSetIns1 = ins; }
void setCheckAccSetIns2(LIns* ins) { checkAccSetIns2 = ins; }
LIns* insLoad(LOpcode op, LIns* base, int32_t d, AccSet accSet);
LIns* insLoad(LOpcode op, LIns* base, int32_t d, AccSet accSet, LoadQual loadQual);
LIns* insStore(LOpcode op, LIns* value, LIns* base, int32_t d, AccSet accSet);
LIns* ins0(LOpcode v);
LIns* ins1(LOpcode v, LIns* a);