diff --git a/js/src/nanojit/Assembler.cpp b/js/src/nanojit/Assembler.cpp
index 6b7e30a19d7..b3f62b2a7b6 100755
--- a/js/src/nanojit/Assembler.cpp
+++ b/js/src/nanojit/Assembler.cpp
@@ -80,6 +80,7 @@ namespace nanojit
         , _config(config)
     {
         VMPI_memset(&_stats, 0, sizeof(_stats));
+        VMPI_memset(lookahead, 0, N_LOOKAHEAD * sizeof(LInsp));
         nInit(core);
         (void)logc;
         verbose_only( _logc = logc; )
@@ -1208,67 +1209,77 @@ namespace nanojit
         NanoAssert(_thisfrag->nStaticExits == 0);
 
         // The trace must end with one of these opcodes.
-        NanoAssert(reader->pos()->isop(LIR_x)    ||
-                   reader->pos()->isop(LIR_xtbl) ||
-                   reader->pos()->isRet()        ||
-                   reader->pos()->isLive());
+        NanoAssert(reader->finalIns()->isop(LIR_x)    ||
+                   reader->finalIns()->isop(LIR_xtbl) ||
+                   reader->finalIns()->isRet()        ||
+                   reader->finalIns()->isLive());
 
         InsList pending_lives(alloc);
 
         NanoAssert(!error());
-        for (LInsp ins = reader->read(); !ins->isop(LIR_start); ins = reader->read())
+
+        // What's going on here: we're visiting all the LIR instructions in
+        // the buffer, working strictly backwards in buffer-order, and
+        // generating machine instructions for them as we go.
+        //
+        // For each LIns, we first determine whether it's actually necessary,
+        // and if not skip it.  Otherwise we generate code for it.  There are
+        // two kinds of "necessary" instructions:
+        //
+        // - "Statement" instructions, which have side effects.  Anything that
+        //   could change control flow or the state of memory.
+        //
+        // - "Value" or "expression" instructions, which compute a value based
+        //   only on the operands to the instruction (and, in the case of
+        //   loads, the state of memory).  Because we visit instructions in
+        //   reverse order, if some previously visited instruction uses the
+        //   value computed by this instruction, then this instruction will
+        //   already have a register assigned to hold that value.  Hence we
+        //   can consult the instruction to detect whether its value is in
+        //   fact used (i.e. not dead).
+        //
+        // Note that the backwards code traversal can make register allocation
+        // confusing.  (For example, we restore a value before we spill it!)
+        // In particular, words like "before" and "after" must be used very
+        // carefully -- their meaning at regalloc-time is opposite to their
+        // meaning at run-time.  We use the term "pre-regstate" to refer to
+        // the register allocation state that occurs prior to an instruction's
+        // execution, and "post-regstate" to refer to the state that occurs
+        // after an instruction's execution, e.g.:
+        //
+        //   pre-regstate:  ebx(ins)
+        //   instruction:   mov eax, ebx     // mov dst, src
+        //   post-regstate: eax(ins)
+        //
+        // At run-time, the instruction updates the pre-regstate into the
+        // post-regstate (and these states are the real machine's regstates).
+        // But when allocating registers, because we go backwards, the
+        // pre-regstate is constructed from the post-regstate (and these
+        // regstates are those stored in RegAlloc).
+        //
+        // One consequence of generating code backwards is that we tend to
+        // both spill and restore registers as early (at run-time) as
+        // possible;  this is good for tolerating memory latency.  If we
+        // generated code forwards, we would expect to both spill and restore
+        // registers as late (at run-time) as possible;  this might be better
+        // for reducing register pressure.
+        //
+        // Another thing to note: we keep a window of N_LOOKAHEAD instructions
+        // (lookahead[0] is the current one) because it's useful for backends.
+        // This is easy because once read() gets to the LIR_start at the
+        // beginning of the buffer it'll just keep returning it.
+
+        for (int32_t i = 0; i < N_LOOKAHEAD; i++)
+            lookahead[i] = reader->read();
+
+        while (!lookahead[0]->isop(LIR_start))
         {
-            /* What's going on here: we're visiting all the LIR instructions
-               in the buffer, working strictly backwards in buffer-order, and
-               generating machine instructions for them as we go.
+            LInsp ins = lookahead[0];   // give it a shorter name for local use
+            LOpcode op = ins->opcode();
 
-               For each LIns, we first determine whether it's actually
-               necessary, and if not skip it.  Otherwise we generate code for
-               it.  There are two kinds of "necessary" instructions:
-
-               - "Statement" instructions, which have side effects.  Anything
-                 that could change control flow or the state of memory.
-
-               - "Value" or "expression" instructions, which compute a value
-                 based only on the operands to the instruction (and, in the
-                 case of loads, the state of memory).  Because we visit
-                 instructions in reverse order, if some previously visited
-                 instruction uses the value computed by this instruction, then
-                 this instruction will already have a register assigned to
-                 hold that value.  Hence we can consult the instruction to
-                 detect whether its value is in fact used (i.e. not dead).
-
-              Note that the backwards code traversal can make register
-              allocation confusing.  (For example, we restore a value before
-              we spill it!)  In particular, words like "before" and "after"
-              must be used very carefully -- their meaning at regalloc-time is
-              opposite to their meaning at run-time.  We use the term
-              "pre-regstate" to refer to the register allocation state that
-              occurs prior to an instruction's execution, and "post-regstate"
-              to refer to the state that occurs after an instruction's
-              execution, e.g.:
-
-                pre-regstate:  ebx(ins)
-                instruction:   mov eax, ebx     // mov dst, src
-                post-regstate: eax(ins)
-
-              At run-time, the instruction updates the pre-regstate into the
-              post-regstate (and these states are the real machine's
-              regstates).  But when allocating registers, because we go
-              backwards, the pre-regstate is constructed from the
-              post-regstate (and these regstates are those stored in
-              RegAlloc).
-
-              One consequence of generating code backwards is that we tend to
-              both spill and restore registers as early (at run-time) as
-              possible;  this is good for tolerating memory latency.  If we
-              generated code forwards, we would expect to both spill and
-              restore registers as late (at run-time) as possible;  this might
-              be better for reducing register pressure.
-            */
             bool required = ins->isStmt() || ins->isUsed();
             if (!required)
-                continue;
+                goto end_of_loop;
 
 #ifdef NJ_VERBOSE
             // Output the post-regstate (registers and/or activation).
@@ -1281,8 +1292,7 @@ namespace nanojit
                 printRegState();
 #endif
 
-            LOpcode op = ins->opcode();
-            switch(op)
+            switch (op)
             {
                 default:
                     NanoAssertMsgf(false, "unsupported LIR instruction: %d\n", op);
@@ -1851,6 +1861,11 @@ namespace nanojit
             // check that all is well (don't check in exit paths since its more complicated)
             debug_only( pageValidate(); )
             debug_only( resourceConsistencyCheck();  )
+
+          end_of_loop:
+            for (int32_t i = 1; i < N_LOOKAHEAD; i++)
+                lookahead[i-1] = lookahead[i];
+            lookahead[N_LOOKAHEAD-1] = reader->read();
         }
     }
 
diff --git a/js/src/nanojit/Assembler.h b/js/src/nanojit/Assembler.h
index cf36b617a4d..e0f47e5060c 100644
--- a/js/src/nanojit/Assembler.h
+++ b/js/src/nanojit/Assembler.h
@@ -413,6 +413,13 @@ namespace nanojit
             NIns*       pedanticTop;
         #endif
 
+
+            // Instruction lookahead in gen().  lookahead[0] is the current
+            // instruction.  Nb: lookahead[1..N_LOOKAHEAD-1] may include dead
+            // instructions, but we won't know that they're dead yet.
+            static const int N_LOOKAHEAD = 3;
+            LInsp       lookahead[N_LOOKAHEAD];
+
             AR          _activation;
             RegAlloc    _allocator;
 
diff --git a/js/src/nanojit/LIR.cpp b/js/src/nanojit/LIR.cpp
index 2a71fa4f697..9a0caad6f33 100644
--- a/js/src/nanojit/LIR.cpp
+++ b/js/src/nanojit/LIR.cpp
@@ -136,13 +136,18 @@ namespace nanojit
 
     LInsp ReverseLister::read()
     {
-        LInsp i = in->read();
+        // This check is necessary to avoid printing the LIR_start multiple
+        // times due to lookahead in Assembler::gen().
+        if (_prevIns && _prevIns->isop(LIR_start))
+            return _prevIns;
+        LInsp ins = in->read();
         InsBuf b;
-        const char* str = _printer->formatIns(&b, i);
+        const char* str = _printer->formatIns(&b, ins);
         char* cpy = new (_alloc) char[strlen(str)+1];
         VMPI_strcpy(cpy, str);
         _strs.insert(cpy);
-        return i;
+        _prevIns = ins;
+        return ins;
     }
 #endif
 
@@ -402,20 +407,20 @@ namespace nanojit
             0
         };
 
-        // Check the invariant: _i never points to a skip.
-        NanoAssert(_i && !_i->isop(LIR_skip));
+        // Check the invariant: _ins never points to a skip.
+        NanoAssert(_ins && !_ins->isop(LIR_skip));
 
         // Step back one instruction.  Use a table lookup rather than a switch
         // to avoid branch mispredictions.  LIR_start is given a special size
         // of zero so that we don't step back past the start of the block.
         // (Callers of this function should stop once they see a LIR_start.)
-        LInsp ret = _i;
-        _i = (LInsp)(uintptr_t(_i) - insSizes[_i->opcode()]);
+        LInsp ret = _ins;
+        _ins = (LInsp)(uintptr_t(_ins) - insSizes[_ins->opcode()]);
 
-        // Ensure _i doesn't end up pointing to a skip.
-        while (_i->isop(LIR_skip)) {
-            NanoAssert(_i->prevLIns() != _i);
-            _i = _i->prevLIns();
+        // Ensure _ins doesn't end up pointing to a skip.
+        while (_ins->isop(LIR_skip)) {
+            NanoAssert(_ins->prevLIns() != _ins);
+            _ins = _ins->prevLIns();
         }
 
         return ret;
@@ -1567,7 +1572,7 @@ namespace nanojit
         uint32_t exits = 0;
         int total = 0;
         if (frag->lirbuf->state)
-            live.add(frag->lirbuf->state, in->pos());
+            live.add(frag->lirbuf->state, in->finalIns());
         for (LInsp ins = in->read(); !ins->isop(LIR_start); ins = in->read())
         {
             total++;
diff --git a/js/src/nanojit/LIR.h b/js/src/nanojit/LIR.h
index 7f16a6b7f58..def1b45a8a2 100644
--- a/js/src/nanojit/LIR.h
+++ b/js/src/nanojit/LIR.h
@@ -891,7 +891,7 @@ namespace nanojit
         // Note, this assumes that loads will never fault and hence cannot
         // affect the control flow.
         bool isStmt() {
-            NanoAssert(!isop(LIR_start) && !isop(LIR_skip));
+            NanoAssert(!isop(LIR_skip));
             // All instructions with Void retType are statements, as are calls
             // to impure functions.
             if (isCall())
@@ -1943,21 +1943,25 @@ namespace nanojit
         LirFilter(LirFilter *in) : in(in) {}
         virtual ~LirFilter(){}
 
+        // It's crucial that, once this reaches the LIR_start at the beginning
+        // of the buffer, it just keeps returning that same LIR_start LIns on
+        // any subsequent calls.
         virtual LInsp read() {
             return in->read();
         }
-        virtual LInsp pos() {
-            return in->pos();
+        virtual LInsp finalIns() {
+            return in->finalIns();
         }
     };
 
     // concrete
     class LirReader : public LirFilter
     {
-        LInsp _i; // next instruction to be read;  invariant: is never a skip
+        LInsp _ins;         // next instruction to be read;  invariant: is never a skip
+        LInsp _finalIns;    // final instruction in the stream;  ie. the first one to be read
 
     public:
-        LirReader(LInsp i) : LirFilter(0), _i(i)
+        LirReader(LInsp ins) : LirFilter(0), _ins(ins), _finalIns(ins)
         {
             // The last instruction for a fragment shouldn't be a skip.
             // (Actually, if the last *inserted* instruction exactly fills up
@@ -1966,7 +1970,7 @@ namespace nanojit
             // cross-chunk link.  But the last *inserted* instruction is what
             // is recorded and used to initialise each LirReader, and that is
             // what is seen here, and therefore this assertion holds.)
-            NanoAssert(i && !i->isop(LIR_skip));
+            NanoAssert(ins && !ins->isop(LIR_skip));
         }
         virtual ~LirReader() {}
 
@@ -1974,9 +1978,8 @@ namespace nanojit
         // Invariant: never returns a skip.
         LInsp read();
 
-        // Returns next instruction.  Invariant: never returns a skip.
-        LInsp pos() {
-            return _i;
+        LInsp finalIns() {
+            return _finalIns;
         }
     };
 
@@ -2102,6 +2105,7 @@ namespace nanojit
         const char*  _title;
         StringList   _strs;
         LogControl*  _logc;
+        LIns*        _prevIns;
     public:
         ReverseLister(LirFilter* in, Allocator& alloc,
                       LInsPrinter* printer, LogControl* logc, const char* title)
@@ -2111,6 +2115,7 @@ namespace nanojit
             , _title(title)
             , _strs(alloc)
             , _logc(logc)
+            , _prevIns(NULL)
         { }
 
         void finish();
diff --git a/js/src/nanojit/Nativei386.cpp b/js/src/nanojit/Nativei386.cpp
index 5fe50506184..8ec0ccdc38e 100644
--- a/js/src/nanojit/Nativei386.cpp
+++ b/js/src/nanojit/Nativei386.cpp
@@ -742,11 +742,19 @@ namespace nanojit
             // disturb the CCs!
             Register r = findRegFor(lhs, GpRegs);
             if (c == 0 && cond->isop(LIR_eq)) {
-                TEST(r, r);
+                NanoAssert(N_LOOKAHEAD >= 3);
+                if ((lhs->isop(LIR_and) || lhs->isop(LIR_or)) &&
+                    cond == lookahead[1] && lhs == lookahead[2])
+                {
+                    // Do nothing.  At run-time, 'lhs' will have just been
+                    // computed by an i386 instruction that sets ZF for us
+                    // ('and' or 'or'), so we don't have to do it ourselves.
+                } else {
+                    TEST(r, r);     // sets ZF according to the value of 'lhs'
+                }
             } else {
                 CMPi(r, c);
             }
-
         } else {
             Register ra, rb;
             findRegFor2(GpRegs, lhs, ra, GpRegs, rhs, rb);