Bug 553518 - nanojit: avoid 'test r,r' where possible on i386. r=edwsmith.

--HG-- extra : convert_revision : ec4d959e1cc9337cf30a08bf75b246516a1728a4
2024-09-13 09:24:08 -07:00 · 2010-03-24 15:34:34 -07:00 · 2010-03-24 15:34:34 -07:00 · 70b954205a
commit 70b954205a
parent 36ee148277
5 changed files with 118 additions and 78 deletions
--- a/js/src/nanojit/Assembler.cpp
+++ b/js/src/nanojit/Assembler.cpp
@ -80,6 +80,7 @@ namespace nanojit
        , _config(config)
    {
        VMPI_memset(&_stats, 0, sizeof(_stats));
+        VMPI_memset(lookahead, 0, N_LOOKAHEAD * sizeof(LInsp));
        nInit(core);
        (void)logc;
        verbose_only( _logc = logc; )
@ -1208,67 +1209,77 @@ namespace nanojit
        NanoAssert(_thisfrag->nStaticExits == 0);

        // The trace must end with one of these opcodes.
-        NanoAssert(reader->pos()->isop(LIR_x)    ||
-                   reader->pos()->isop(LIR_xtbl) ||
-                   reader->pos()->isRet()        ||
-                   reader->pos()->isLive());
+        NanoAssert(reader->finalIns()->isop(LIR_x)    ||
+                   reader->finalIns()->isop(LIR_xtbl) ||
+                   reader->finalIns()->isRet()        ||
+                   reader->finalIns()->isLive());

        InsList pending_lives(alloc);

        NanoAssert(!error());
-        for (LInsp ins = reader->read(); !ins->isop(LIR_start); ins = reader->read())
+
+        // What's going on here: we're visiting all the LIR instructions in
+        // the buffer, working strictly backwards in buffer-order, and
+        // generating machine instructions for them as we go.
+        //
+        // For each LIns, we first determine whether it's actually necessary,
+        // and if not skip it.  Otherwise we generate code for it.  There are
+        // two kinds of "necessary" instructions:
+        //
+        // - "Statement" instructions, which have side effects.  Anything that
+        //   could change control flow or the state of memory.
+        //
+        // - "Value" or "expression" instructions, which compute a value based
+        //   only on the operands to the instruction (and, in the case of
+        //   loads, the state of memory).  Because we visit instructions in
+        //   reverse order, if some previously visited instruction uses the
+        //   value computed by this instruction, then this instruction will
+        //   already have a register assigned to hold that value.  Hence we
+        //   can consult the instruction to detect whether its value is in
+        //   fact used (i.e. not dead).
+        //
+        // Note that the backwards code traversal can make register allocation
+        // confusing.  (For example, we restore a value before we spill it!)
+        // In particular, words like "before" and "after" must be used very
+        // carefully -- their meaning at regalloc-time is opposite to their
+        // meaning at run-time.  We use the term "pre-regstate" to refer to
+        // the register allocation state that occurs prior to an instruction's
+        // execution, and "post-regstate" to refer to the state that occurs
+        // after an instruction's execution, e.g.:
+        //
+        //   pre-regstate:  ebx(ins)
+        //   instruction:   mov eax, ebx     // mov dst, src
+        //   post-regstate: eax(ins)
+        //
+        // At run-time, the instruction updates the pre-regstate into the
+        // post-regstate (and these states are the real machine's regstates).
+        // But when allocating registers, because we go backwards, the
+        // pre-regstate is constructed from the post-regstate (and these
+        // regstates are those stored in RegAlloc).
+        //
+        // One consequence of generating code backwards is that we tend to
+        // both spill and restore registers as early (at run-time) as
+        // possible;  this is good for tolerating memory latency.  If we
+        // generated code forwards, we would expect to both spill and restore
+        // registers as late (at run-time) as possible;  this might be better
+        // for reducing register pressure.
+        //
+        // Another thing to note: we provide N_LOOKAHEAD instructions' worth
+        // of lookahead because it's useful for backends.  This is nice and
+        // easy because once read() gets to the LIR_start at the beginning of
+        // the buffer it'll just keep returning it.
+
+        for (int32_t i = 0; i < N_LOOKAHEAD; i++)
+            lookahead[i] = reader->read();
+
+        while (!lookahead[0]->isop(LIR_start))
        {
-            /* What's going on here: we're visiting all the LIR instructions
-               in the buffer, working strictly backwards in buffer-order, and
-               generating machine instructions for them as we go.
+            LInsp ins = lookahead[0];   // give it a shorter name for local use
+            LOpcode op = ins->opcode();

-               For each LIns, we first determine whether it's actually
-               necessary, and if not skip it.  Otherwise we generate code for
-               it.  There are two kinds of "necessary" instructions:
-
-               - "Statement" instructions, which have side effects.  Anything
-                 that could change control flow or the state of memory.
-
-               - "Value" or "expression" instructions, which compute a value
-                 based only on the operands to the instruction (and, in the
-                 case of loads, the state of memory).  Because we visit
-                 instructions in reverse order, if some previously visited
-                 instruction uses the value computed by this instruction, then
-                 this instruction will already have a register assigned to
-                 hold that value.  Hence we can consult the instruction to
-                 detect whether its value is in fact used (i.e. not dead).
-
-              Note that the backwards code traversal can make register
-              allocation confusing.  (For example, we restore a value before
-              we spill it!)  In particular, words like "before" and "after"
-              must be used very carefully -- their meaning at regalloc-time is
-              opposite to their meaning at run-time.  We use the term
-              "pre-regstate" to refer to the register allocation state that
-              occurs prior to an instruction's execution, and "post-regstate"
-              to refer to the state that occurs after an instruction's
-              execution, e.g.:
-
-                pre-regstate:  ebx(ins)
-                instruction:   mov eax, ebx     // mov dst, src
-                post-regstate: eax(ins)
-
-              At run-time, the instruction updates the pre-regstate into the
-              post-regstate (and these states are the real machine's
-              regstates).  But when allocating registers, because we go
-              backwards, the pre-regstate is constructed from the
-              post-regstate (and these regstates are those stored in
-              RegAlloc).
-
-              One consequence of generating code backwards is that we tend to
-              both spill and restore registers as early (at run-time) as
-              possible;  this is good for tolerating memory latency.  If we
-              generated code forwards, we would expect to both spill and
-              restore registers as late (at run-time) as possible;  this might
-              be better for reducing register pressure.
-            */
            bool required = ins->isStmt() || ins->isUsed();
            if (!required)
-                continue;
+                goto end_of_loop;

 #ifdef NJ_VERBOSE
            // Output the post-regstate (registers and/or activation).
@ -1281,8 +1292,7 @@ namespace nanojit
                printRegState();
 #endif

-            LOpcode op = ins->opcode();
-            switch(op)
+            switch (op)
            {
                default:
                    NanoAssertMsgf(false, "unsupported LIR instruction: %d\n", op);
@ -1851,6 +1861,11 @@ namespace nanojit
            // check that all is well (don't check in exit paths since its more complicated)
            debug_only( pageValidate(); )
            debug_only( resourceConsistencyCheck();  )
+
+          end_of_loop:
+            for (int32_t i = 1; i < N_LOOKAHEAD; i++)
+                lookahead[i-1] = lookahead[i];
+            lookahead[N_LOOKAHEAD-1] = reader->read();
        }
    }

--- a/js/src/nanojit/Assembler.h
+++ b/js/src/nanojit/Assembler.h
@ -413,6 +413,13 @@ namespace nanojit
            NIns*       pedanticTop;
        #endif

+
+            // Instruction lookahead in gen().  lookahead[0] is the current
+            // instruction.  Nb: lookahead[1..N_LOOKAHEAD-1] may include dead
+            // instructions, but we won't know that they're dead yet.
+            static const int N_LOOKAHEAD = 3;
+            LInsp       lookahead[N_LOOKAHEAD];
+
            AR          _activation;
            RegAlloc    _allocator;

--- a/js/src/nanojit/LIR.cpp
+++ b/js/src/nanojit/LIR.cpp
@ -136,13 +136,18 @@ namespace nanojit

    LInsp ReverseLister::read()
    {
-        LInsp i = in->read();
+        // This check is necessary to avoid printing the LIR_start multiple
+        // times due to lookahead in Assembler::gen().
+        if (_prevIns && _prevIns->isop(LIR_start))
+            return _prevIns;
+        LInsp ins = in->read();
        InsBuf b;
-        const char* str = _printer->formatIns(&b, i);
+        const char* str = _printer->formatIns(&b, ins);
        char* cpy = new (_alloc) char[strlen(str)+1];
        VMPI_strcpy(cpy, str);
        _strs.insert(cpy);
-        return i;
+        _prevIns = ins;
+        return ins;
    }
 #endif

@ -402,20 +407,20 @@ namespace nanojit
            0
        };

-        // Check the invariant: _i never points to a skip.
-        NanoAssert(_i && !_i->isop(LIR_skip));
+        // Check the invariant: _ins never points to a skip.
+        NanoAssert(_ins && !_ins->isop(LIR_skip));

        // Step back one instruction.  Use a table lookup rather than a switch
        // to avoid branch mispredictions.  LIR_start is given a special size
        // of zero so that we don't step back past the start of the block.
        // (Callers of this function should stop once they see a LIR_start.)
-        LInsp ret = _i;
-        _i = (LInsp)(uintptr_t(_i) - insSizes[_i->opcode()]);
+        LInsp ret = _ins;
+        _ins = (LInsp)(uintptr_t(_ins) - insSizes[_ins->opcode()]);

-        // Ensure _i doesn't end up pointing to a skip.
-        while (_i->isop(LIR_skip)) {
-            NanoAssert(_i->prevLIns() != _i);
-            _i = _i->prevLIns();
+        // Ensure _ins doesn't end up pointing to a skip.
+        while (_ins->isop(LIR_skip)) {
+            NanoAssert(_ins->prevLIns() != _ins);
+            _ins = _ins->prevLIns();
        }

        return ret;
@ -1567,7 +1572,7 @@ namespace nanojit
        uint32_t exits = 0;
        int total = 0;
        if (frag->lirbuf->state)
-            live.add(frag->lirbuf->state, in->pos());
+            live.add(frag->lirbuf->state, in->finalIns());
        for (LInsp ins = in->read(); !ins->isop(LIR_start); ins = in->read())
        {
            total++;
--- a/js/src/nanojit/LIR.h
+++ b/js/src/nanojit/LIR.h
@ -891,7 +891,7 @@ namespace nanojit
        // Note, this assumes that loads will never fault and hence cannot
        // affect the control flow.
        bool isStmt() {
-            NanoAssert(!isop(LIR_start) && !isop(LIR_skip));
+            NanoAssert(!isop(LIR_skip));
            // All instructions with Void retType are statements, as are calls
            // to impure functions.
            if (isCall())
@ -1943,21 +1943,25 @@ namespace nanojit
        LirFilter(LirFilter *in) : in(in) {}
        virtual ~LirFilter(){}

+        // It's crucial that once this reaches the LIR_start at the beginning
+        // of the buffer, it just keeps returning that LIR_start LIns on any
+        // subsequent calls.
        virtual LInsp read() {
            return in->read();
        }
-        virtual LInsp pos() {
-            return in->pos();
+        virtual LInsp finalIns() {
+            return in->finalIns();
        }
    };

    // concrete
    class LirReader : public LirFilter
    {
-        LInsp _i; // next instruction to be read;  invariant: is never a skip
+        LInsp _ins;         // next instruction to be read;  invariant: is never a skip
+        LInsp _finalIns;    // final instruction in the stream;  ie. the first one to be read

    public:
-        LirReader(LInsp i) : LirFilter(0), _i(i)
+        LirReader(LInsp ins) : LirFilter(0), _ins(ins), _finalIns(ins)
        {
            // The last instruction for a fragment shouldn't be a skip.
            // (Actually, if the last *inserted* instruction exactly fills up
@ -1966,7 +1970,7 @@ namespace nanojit
            // cross-chunk link.  But the last *inserted* instruction is what
            // is recorded and used to initialise each LirReader, and that is
            // what is seen here, and therefore this assertion holds.)
-            NanoAssert(i && !i->isop(LIR_skip));
+            NanoAssert(ins && !ins->isop(LIR_skip));
        }
        virtual ~LirReader() {}

@ -1974,9 +1978,8 @@ namespace nanojit
        // Invariant: never returns a skip.
        LInsp read();

-        // Returns next instruction.  Invariant: never returns a skip.
-        LInsp pos() {
-            return _i;
+        LInsp finalIns() {
+            return _finalIns;
        }
    };

@ -2102,6 +2105,7 @@ namespace nanojit
        const char*  _title;
        StringList   _strs;
        LogControl*  _logc;
+        LIns*        _prevIns;
    public:
        ReverseLister(LirFilter* in, Allocator& alloc,
                      LInsPrinter* printer, LogControl* logc, const char* title)
@ -2111,6 +2115,7 @@ namespace nanojit
            , _title(title)
            , _strs(alloc)
            , _logc(logc)
+            , _prevIns(NULL)
        { }

        void finish();
--- a/js/src/nanojit/Nativei386.cpp
+++ b/js/src/nanojit/Nativei386.cpp
@ -742,11 +742,19 @@ namespace nanojit
            // disturb the CCs!
            Register r = findRegFor(lhs, GpRegs);
            if (c == 0 && cond->isop(LIR_eq)) {
-                TEST(r, r);
+                NanoAssert(N_LOOKAHEAD >= 3);
+                if ((lhs->isop(LIR_and) || lhs->isop(LIR_or)) &&
+                    cond == lookahead[1] && lhs == lookahead[2])
+                {
+                    // Do nothing.  At run-time, 'lhs' will have just been
+                    // computed by an i386 instruction that sets ZF for us
+                    // ('and' or 'or'), so we don't have to do it ourselves.
+                } else {
+                    TEST(r, r);     // sets ZF according to the value of 'lhs'
+                }
            } else {
                CMPi(r, c);
            }
-
        } else {
            Register ra, rb;
            findRegFor2(GpRegs, lhs, ra, GpRegs, rhs, rb);