Bug 976110 - Part 1: Optimize signed integer division by constants; r=sunfish

2024-09-13 09:24:08 -07:00 · 2014-04-19 10:37:51 -07:00 · 2014-04-19 10:37:51 -07:00 · e6753fd672
commit e6753fd672
parent 395ac257b1
10 changed files with 219 additions and 4 deletions
--- a/js/src/assembler/assembler/X86Assembler.h
+++ b/js/src/assembler/assembler/X86Assembler.h
@ -366,6 +366,7 @@ private:
        GROUP3_OP_TEST = 0,
        GROUP3_OP_NOT  = 2,
        GROUP3_OP_NEG  = 3,
+        GROUP3_OP_IMUL = 5,
        GROUP3_OP_DIV  = 6,
        GROUP3_OP_IDIV = 7,

@ -1155,6 +1156,13 @@ public:
        m_formatter.twoByteOp(OP2_IMUL_GvEv, dst, src);
    }

+    void imull_r(RegisterID multiplier)
+    {
+        spew("imull      %s",
+             nameIReg(4, multiplier));
+        m_formatter.oneByteOp(OP_GROUP3_Ev, GROUP3_OP_IMUL, multiplier);
+    }
+
    void imull_mr(int offset, RegisterID base, RegisterID dst)
    {
        spew("imull      %s0x%x(%s), %s",
--- a/js/src/jit/shared/Assembler-x86-shared.h
+++ b/js/src/jit/shared/Assembler-x86-shared.h
@ -1062,12 +1062,18 @@ class AssemblerX86Shared
            MOZ_ASSUME_UNREACHABLE("unexpected operand kind");
        }
    }
+    void imull(const Register &multiplier) {
+        masm.imull_r(multiplier.code());
+    }
    void imull(Imm32 imm, const Register &dest) {
        masm.imull_i32r(dest.code(), imm.value, dest.code());
    }
    void imull(const Register &src, const Register &dest) {
        masm.imull_rr(src.code(), dest.code());
    }
+    void imull(Imm32 imm, const Register &src, const Register &dest) {
+        masm.imull_i32r(src.code(), imm.value, dest.code());
+    }
    void imull(const Operand &src, const Register &dest) {
        switch (src.kind()) {
          case Operand::REG:
--- a/js/src/jit/shared/CodeGenerator-shared.cpp
+++ b/js/src/jit/shared/CodeGenerator-shared.cpp
@ -1007,6 +1007,74 @@ CodeGeneratorShared::addCacheLocations(const CacheLocationList &locs, size_t *nu
    return firstIndex;
 }

+ReciprocalMulConstants
+CodeGeneratorShared::computeDivisionConstants(int d) {
+    // In what follows, d is positive and is not a power of 2.
+    JS_ASSERT(d > 0 && (d & (d - 1)) != 0);
+
+    // Speeding up division by non power-of-2 constants is possible by
+    // calculating, during compilation, a value M such that high-order
+    // bits of M*n correspond to the result of the division. Formally,
+    // we compute values 0 <= M < 2^32 and 0 <= s < 31 such that
+    //         (M * n) >> (32 + s) = floor(n/d)    if n >= 0
+    //         (M * n) >> (32 + s) = ceil(n/d) - 1 if n < 0.
+    // The original presentation of this technique appears in Hacker's
+    // Delight, a book by Henry S. Warren, Jr.. A proof of correctness
+    // for our version follows.
+
+    // Define p = 32 + s, M = ceil(2^p/d), and assume that s satisfies
+    //                     M - 2^p/d <= 2^(s+1)/d.                 (1)
+    // (Observe that s = FloorLog32(d) satisfies this, because in this
+    // case d <= 2^(s+1) and so the RHS of (1) is at least one). Then,
+    //
+    // a) If s <= FloorLog32(d), then M <= 2^32 - 1.
+    // Proof: Indeed, M is monotone in s and, for s = FloorLog32(d),
+    // the inequalities 2^31 > d >= 2^s + 1 readily imply
+    //    2^p / d  = 2^p/(d - 1) * (d - 1)/d
+    //            <= 2^32 * (1 - 1/d) < 2 * (2^31 - 1) = 2^32 - 2.
+    // The claim follows by applying the ceiling function.
+    //
+    // b) For any 0 <= n < 2^31, floor(Mn/2^p) = floor(n/d).
+    // Proof: Put x = floor(Mn/2^p); it's the unique integer for which
+    //                    Mn/2^p - 1 < x <= Mn/2^p.                (2)
+    // Using M >= 2^p/d on the LHS and (1) on the RHS, we get
+    //           n/d - 1 < x <= n/d + n/(2^31 d) < n/d + 1/d.
+    // Since x is an integer, it's not in the interval (n/d, (n+1)/d),
+    // and so n/d - 1 < x <= n/d, which implies x = floor(n/d).
+    //
+    // c) For any -2^31 <= n < 0, floor(Mn/2^p) + 1 = ceil(n/d).
+    // Proof: The proof is similar. Equation (2) holds as above. Using
+    // M > 2^p/d (d isn't a power of 2) on the RHS and (1) on the LHS,
+    //                 n/d + n/(2^31 d) - 1 < x < n/d.
+    // Using n >= -2^31 and summing 1,
+    //                  n/d - 1/d < x + 1 < n/d + 1.
+    // Since x + 1 is an integer, this implies n/d <= x + 1 < n/d + 1.
+    // In other words, x + 1 = ceil(n/d).
+    //
+    // Condition (1) isn't necessary for the existence of M and s with
+    // the properties above. Hacker's Delight provides a slightly less
+    // restrictive condition when d >= 196611, at the cost of a 3-page
+    // proof of correctness.
+
+    // Note that, since d*M - 2^p = d - (2^p)%d, (1) can be written as
+    //                   2^(s+1) >= d - (2^p)%d.
+    // We now compute the least s with this property...
+
+    int32_t shift = 0;
+    while ((int64_t(1) << (shift+1)) + (int64_t(1) << (shift+32)) % d < d)
+        shift++;
+
+    // ...and the corresponding M. This may not fit in a signed 32-bit
+    // integer; we will compute (M - 2^32) * n + (2^32 * n) instead of
+    // M * n if this is the case (cf. item (a) above).
+    ReciprocalMulConstants rmc;
+    rmc.multiplier = int32_t((int64_t(1) << (shift+32))/d + 1);
+    rmc.shift_amount = shift;
+
+    return rmc;
+}
+
+
 #ifdef JS_TRACE_LOGGING

 bool
--- a/js/src/jit/shared/CodeGenerator-shared.h
+++ b/js/src/jit/shared/CodeGenerator-shared.h
@ -45,6 +45,11 @@ struct PatchableBackedgeInfo
    {}
 };

+struct ReciprocalMulConstants {
+    int32_t multiplier;
+    int32_t shift_amount;
+};
+
 class CodeGeneratorShared : public LInstructionVisitor
 {
    js::Vector<OutOfLineCode *, 0, SystemAllocPolicy> outOfLineCode_;
@ -402,6 +407,7 @@ class CodeGeneratorShared : public LInstructionVisitor

    bool addCache(LInstruction *lir, size_t cacheIndex);
    size_t addCacheLocations(const CacheLocationList &locs, size_t *numLocs);
+    ReciprocalMulConstants computeDivisionConstants(int d);

  protected:
    bool addOutOfLineCode(OutOfLineCode *code);
--- a/js/src/jit/shared/CodeGenerator-x86-shared.cpp
+++ b/js/src/jit/shared/CodeGenerator-x86-shared.cpp
@ -22,6 +22,7 @@
 using namespace js;
 using namespace js::jit;

+using mozilla::Abs;
 using mozilla::FloatingPoint;
 using mozilla::FloorLog2;
 using mozilla::NegativeInfinity;
@ -918,6 +919,88 @@ CodeGeneratorX86Shared::visitDivPowTwoI(LDivPowTwoI *ins)
    return true;
 }

+bool
+CodeGeneratorX86Shared::visitDivOrModConstantI(LDivOrModConstantI *ins) {
+    Register lhs = ToRegister(ins->numerator());
+    Register output = ToRegister(ins->output());
+    int32_t d = ins->denominator();
+
+    // This emits the division answer into edx or the modulus answer into eax.
+    JS_ASSERT(output == eax || output == edx);
+    JS_ASSERT(lhs != eax && lhs != edx);
+
+    // The absolute value of the denominator isn't a power of 2 (see LDivPowTwoI
+    // and LModPowTwoI).
+    JS_ASSERT((Abs(d) & (Abs(d) - 1)) != 0);
+
+    // We will first divide by Abs(d), and negate the answer if d is negative.
+    // If desired, this can be avoided by generalizing computeDivisionConstants.
+    ReciprocalMulConstants rmc = computeDivisionConstants(Abs(d));
+
+    // As explained in the comments of computeDivisionConstants, we first compute
+    // X >> (32 + shift), where X is either (rmc.multiplier * n) if the multiplier
+    // is non-negative or (rmc.multiplier * n) + (2^32 * n) otherwise. This is the
+    // desired division result if n is non-negative, and is one less than the result
+    // otherwise.
+    masm.movl(lhs, eax);
+    masm.movl(Imm32(rmc.multiplier), edx);
+    masm.imull(edx);
+    if (rmc.multiplier < 0)
+        masm.addl(lhs, edx);
+    masm.sarl(Imm32(rmc.shift_amount), edx);
+
+    // We'll subtract -1 instead of adding 1, because (n < 0 ? -1 : 0) can be
+    // computed with just a sign-extending shift of 31 bits.
+    if (ins->canBeNegativeDividend()) {
+        masm.movl(lhs, eax);
+        masm.sarl(Imm32(31), eax);
+        masm.subl(eax, edx);
+    }
+
+    // After this, edx contains the correct division result.
+    if (d < 0)
+        masm.negl(edx);
+
+    if (output == eax) {
+        masm.imull(Imm32(-d), edx, eax);
+        masm.addl(lhs, eax);
+    }
+
+    if (!ins->mir()->isTruncated()) {
+        if (output == edx) {
+            // This is a division op. Multiply the obtained value by d to check if
+            // the correct answer is an integer. This cannot overflow, since |d| > 1.
+            masm.imull(Imm32(d), edx, eax);
+            masm.cmpl(lhs, eax);
+            if (!bailoutIf(Assembler::NotEqual, ins->snapshot()))
+                return false;
+
+            // If lhs is zero and the divisor is negative, the answer should have
+            // been -0.
+            if (d < 0) {
+                masm.testl(lhs, lhs);
+                if (!bailoutIf(Assembler::Zero, ins->snapshot()))
+                    return false;
+            }
+        } else if (ins->canBeNegativeDividend()) {
+            // This is a mod op. If the computed value is zero and lhs
+            // is negative, the answer should have been -0.
+            Label done;
+
+            masm.cmpl(lhs, Imm32(0));
+            masm.j(Assembler::GreaterThanOrEqual, &done);
+
+            masm.testl(eax, eax);
+            if (!bailoutIf(Assembler::Zero, ins->snapshot()))
+                return false;
+
+            masm.bind(&done);
+        }
+    }
+
+    return true;
+}
+
 bool
 CodeGeneratorX86Shared::visitDivI(LDivI *ins)
 {
--- a/js/src/jit/shared/CodeGenerator-x86-shared.h
+++ b/js/src/jit/shared/CodeGenerator-x86-shared.h
@ -124,6 +124,7 @@ class CodeGeneratorX86Shared : public CodeGeneratorShared
    virtual bool visitMulI(LMulI *ins);
    virtual bool visitDivI(LDivI *ins);
    virtual bool visitDivPowTwoI(LDivPowTwoI *ins);
+    virtual bool visitDivOrModConstantI(LDivOrModConstantI *ins);
    virtual bool visitModI(LModI *ins);
    virtual bool visitModPowTwoI(LModPowTwoI *ins);
    virtual bool visitBitNotI(LBitNotI *ins);
--- a/js/src/jit/shared/LIR-x86-shared.h
+++ b/js/src/jit/shared/LIR-x86-shared.h
@ -76,6 +76,37 @@ class LDivPowTwoI : public LBinaryMath<0>
    }
 };

+class LDivOrModConstantI : public LInstructionHelper<1, 1, 1>
+{
+    const int32_t denominator_;
+
+  public:
+    LIR_HEADER(DivOrModConstantI)
+
+    LDivOrModConstantI(const LAllocation &lhs, int32_t denominator, const LDefinition& temp)
+    : denominator_(denominator)
+    {
+        setOperand(0, lhs);
+        setTemp(0, temp);
+    }
+
+    const LAllocation *numerator() {
+        return getOperand(0);
+    }
+    int32_t denominator() const {
+        return denominator_;
+    }
+    MBinaryArithInstruction *mir() const {
+        JS_ASSERT(mir_->isDiv() || mir_->isMod());
+        return static_cast<MBinaryArithInstruction *>(mir_);
+    }
+    bool canBeNegativeDividend() const {
+        if (mir_->isMod())
+            return mir_->toMod()->canBeNegativeDividend();
+        return mir_->toDiv()->canBeNegativeDividend();
+    }
+};
+
 class LModI : public LBinaryMath<1>
 {
  public:
--- a/js/src/jit/shared/Lowering-x86-shared.cpp
+++ b/js/src/jit/shared/Lowering-x86-shared.cpp
@ -138,10 +138,8 @@ LIRGeneratorX86Shared::lowerDivI(MDiv *div)
    if (div->rhs()->isConstant()) {
        int32_t rhs = div->rhs()->toConstant()->value().toInt32();

-        // Check for division by a power of two, which is an easy and
-        // important case to optimize. Note that other optimizations
-        // are also possible: division by other constants can be
-        // optimized by a reciprocal multiplication technique.
+        // Division by powers of two can be done by shifting, and division by
+        // other numbers can be done by a reciprocal multiplication technique.
        int32_t shift = FloorLog2(Abs(rhs));
        if (rhs != 0 && uint32_t(1) << shift == Abs(rhs)) {
            LAllocation lhs = useRegisterAtStart(div->lhs());
@ -157,6 +155,12 @@ LIRGeneratorX86Shared::lowerDivI(MDiv *div)
            if (div->fallible() && !assignSnapshot(lir, Bailout_BaselineInfo))
                return false;
            return defineReuseInput(lir, div, 0);
+        } else if (rhs != 0) {
+            LDivOrModConstantI *lir;
+            lir = new(alloc()) LDivOrModConstantI(useRegister(div->lhs()), rhs, tempFixed(eax));
+            if (div->fallible() && !assignSnapshot(lir, Bailout_BaselineInfo))
+                return false;
+            return defineFixed(lir, div, LAllocation(AnyRegister(edx)));
        }
    }

@ -181,6 +185,12 @@ LIRGeneratorX86Shared::lowerModI(MMod *mod)
            if (mod->fallible() && !assignSnapshot(lir, Bailout_BaselineInfo))
                return false;
            return defineReuseInput(lir, mod, 0);
+        } else if (rhs != 0) {
+            LDivOrModConstantI *lir;
+            lir = new(alloc()) LDivOrModConstantI(useRegister(mod->lhs()), rhs, tempFixed(edx));
+            if (mod->fallible() && !assignSnapshot(lir, Bailout_BaselineInfo))
+                return false;
+            return defineFixed(lir, mod, LAllocation(AnyRegister(eax)));
        }
    }

--- a/js/src/jit/x64/LOpcodes-x64.h
+++ b/js/src/jit/x64/LOpcodes-x64.h
@ -13,6 +13,7 @@
    _(UnboxFloatingPoint)           \
    _(DivI)                         \
    _(DivPowTwoI)                   \
+    _(DivOrModConstantI)            \
    _(ModI)                         \
    _(ModPowTwoI)                   \
    _(PowHalfD)                     \
--- a/js/src/jit/x86/LOpcodes-x86.h
+++ b/js/src/jit/x86/LOpcodes-x86.h
@ -14,6 +14,7 @@
    _(BoxFloatingPoint)         \
    _(DivI)                     \
    _(DivPowTwoI)               \
+    _(DivOrModConstantI)        \
    _(ModI)                     \
    _(ModPowTwoI)               \
    _(PowHalfD)                 \