From 6e0c6962f0368bdf7cd8d249c64b4353dffd190f Mon Sep 17 00:00:00 2001 From: Shu-yu Guo Date: Fri, 14 Feb 2014 19:18:07 -0800 Subject: [PATCH] Bug 969722 - Part 2: Inline ForkJoinGetSlice. (r=nmatsakis) --- js/src/assembler/assembler/X86Assembler.h | 16 +- js/src/jit/CompileWrappers.cpp | 6 + js/src/jit/CompileWrappers.h | 2 + js/src/jit/Ion.cpp | 20 ++ js/src/jit/IonBuilder.h | 3 + js/src/jit/IonMacroAssembler.h | 10 + js/src/jit/JitCompartment.h | 9 + js/src/jit/LIR-Common.h | 32 ++++ js/src/jit/LOpcodes.h | 1 + js/src/jit/MCallOptimize.cpp | 36 ++++ js/src/jit/MIR.h | 27 +++ js/src/jit/MOpcodes.h | 1 + js/src/jit/ParallelSafetyAnalysis.cpp | 1 + js/src/jit/arm/CodeGenerator-arm.cpp | 12 ++ js/src/jit/arm/CodeGenerator-arm.h | 3 +- js/src/jit/arm/Lowering-arm.cpp | 6 + js/src/jit/arm/Lowering-arm.h | 1 + js/src/jit/shared/Assembler-x86-shared.h | 13 +- .../jit/shared/CodeGenerator-x86-shared.cpp | 175 ++++++++++++++++++ js/src/jit/shared/CodeGenerator-x86-shared.h | 2 + js/src/jit/shared/Lowering-shared.h | 5 + js/src/jit/shared/Lowering-x86-shared.cpp | 13 ++ js/src/jit/shared/Lowering-x86-shared.h | 1 + js/src/jit/shared/MacroAssembler-x86-shared.h | 14 ++ js/src/jit/x64/Assembler-x64.h | 7 + js/src/jit/x64/Lowering-x64.h | 4 + js/src/jit/x86/Assembler-x86.h | 7 + js/src/jit/x86/Lowering-x86.h | 4 + js/src/vm/ForkJoin.cpp | 4 +- js/src/vm/ForkJoin.h | 5 + js/src/vm/ThreadPool.h | 16 ++ 31 files changed, 451 insertions(+), 5 deletions(-) diff --git a/js/src/assembler/assembler/X86Assembler.h b/js/src/assembler/assembler/X86Assembler.h index 7c17552b1f9..4e16b33f3d2 100644 --- a/js/src/assembler/assembler/X86Assembler.h +++ b/js/src/assembler/assembler/X86Assembler.h @@ -1201,7 +1201,6 @@ public: // Otherwise, [%base+offset] -> %eax. 
spew("cmpxchg %s, %s0x%x(%s)", nameIReg(src), PRETTY_PRINT_OFFSET(offset), nameIReg(base)); - m_formatter.oneByteOp(PRE_LOCK); m_formatter.twoByteOp(OP2_CMPXCHG_GvEw, src, base, offset); } @@ -1406,6 +1405,14 @@ public: } } + void cmpw_rr(RegisterID src, RegisterID dst) + { + spew("cmpw %s, %s", + nameIReg(2, src), nameIReg(2, dst)); + m_formatter.prefix(PRE_OPERAND_SIZE); + m_formatter.oneByteOp(OP_CMP_EvGv, src, dst); + } + void cmpw_rm(RegisterID src, int offset, RegisterID base, RegisterID index, int scale) { FIXME_INSN_PRINTING; @@ -2064,6 +2071,13 @@ public: } #endif + void movzwl_rr(RegisterID src, RegisterID dst) + { + spew("movzwl %s, %s", + nameIReg(2, src), nameIReg(4, dst)); + m_formatter.twoByteOp(OP2_MOVZX_GvEw, dst, src); + } + void movzwl_mr(int offset, RegisterID base, RegisterID dst) { spew("movzwl %s0x%x(%s), %s", diff --git a/js/src/jit/CompileWrappers.cpp b/js/src/jit/CompileWrappers.cpp index 6cee6be8cd8..c519cecfde3 100644 --- a/js/src/jit/CompileWrappers.cpp +++ b/js/src/jit/CompileWrappers.cpp @@ -79,6 +79,12 @@ CompileRuntime::addressOfInterruptPar() } #endif +const void * +CompileRuntime::addressOfThreadPool() +{ + return &runtime()->threadPool; +} + const JitRuntime * CompileRuntime::jitRuntime() { diff --git a/js/src/jit/CompileWrappers.h b/js/src/jit/CompileWrappers.h index 6cd84daa235..41969ff05cc 100644 --- a/js/src/jit/CompileWrappers.h +++ b/js/src/jit/CompileWrappers.h @@ -56,6 +56,8 @@ class CompileRuntime const void *addressOfInterruptPar(); #endif + const void *addressOfThreadPool(); + const JitRuntime *jitRuntime(); // Compilation does not occur off thread when the SPS profiler is enabled. 
diff --git a/js/src/jit/Ion.cpp b/js/src/jit/Ion.cpp index 5093e3079b4..c271f92d546 100644 --- a/js/src/jit/Ion.cpp +++ b/js/src/jit/Ion.cpp @@ -157,6 +157,7 @@ JitRuntime::JitRuntime() parallelArgumentsRectifier_(nullptr), invalidator_(nullptr), debugTrapHandler_(nullptr), + forkJoinGetSliceStub_(nullptr), functionWrappers_(nullptr), osrTempData_(nullptr), flusher_(nullptr), @@ -288,6 +289,18 @@ JitRuntime::debugTrapHandler(JSContext *cx) return debugTrapHandler_; } +bool +JitRuntime::ensureForkJoinGetSliceStubExists(JSContext *cx) +{ + if (!forkJoinGetSliceStub_) { + IonSpew(IonSpew_Codegen, "# Emitting ForkJoinGetSlice stub"); + AutoLockForExclusiveAccess lock(cx); + AutoCompartment ac(cx, cx->runtime()->atomsCompartment()); + forkJoinGetSliceStub_ = generateForkJoinGetSliceStub(cx); + } + return !!forkJoinGetSliceStub_; +} + uint8_t * JitRuntime::allocateOsrTempData(size_t size) { @@ -1667,6 +1680,13 @@ IonCompile(JSContext *cx, JSScript *script, if (!cx->compartment()->jitCompartment()->ensureIonStubsExist(cx)) return AbortReason_Alloc; + if (executionMode == ParallelExecution && + LIRGenerator::allowInlineForkJoinGetSlice() && + !cx->runtime()->jitRuntime()->ensureForkJoinGetSliceStubExists(cx)) + { + return AbortReason_Alloc; + } + MIRGraph *graph = alloc->new_(temp); if (!graph) return AbortReason_Alloc; diff --git a/js/src/jit/IonBuilder.h b/js/src/jit/IonBuilder.h index fa3912c9ae0..8ce560581e6 100644 --- a/js/src/jit/IonBuilder.h +++ b/js/src/jit/IonBuilder.h @@ -674,6 +674,9 @@ class IonBuilder : public MIRGenerator InliningStatus inlineUnsafeSetReservedSlot(CallInfo &callInfo); InliningStatus inlineUnsafeGetReservedSlot(CallInfo &callInfo); + // ForkJoin intrinsics + InliningStatus inlineForkJoinGetSlice(CallInfo &callInfo); + // Utility intrinsics. 
InliningStatus inlineIsCallable(CallInfo &callInfo); InliningStatus inlineHaveSameClass(CallInfo &callInfo); diff --git a/js/src/jit/IonMacroAssembler.h b/js/src/jit/IonMacroAssembler.h index 6b7f83023a1..230428fc0d0 100644 --- a/js/src/jit/IonMacroAssembler.h +++ b/js/src/jit/IonMacroAssembler.h @@ -377,6 +377,10 @@ class MacroAssembler : public MacroAssemblerSpecific rshiftPtr(Imm32(JSString::LENGTH_SHIFT), dest); } + void loadSliceBounds(Register worker, Register dest) { + loadPtr(Address(worker, ThreadPoolWorker::offsetOfSliceBounds()), dest); + } + void loadJSContext(const Register &dest) { loadPtr(AbsoluteAddress(GetIonContext()->runtime->addressOfJSContext()), dest); } @@ -833,6 +837,12 @@ class MacroAssembler : public MacroAssemblerSpecific Push(ImmPtr(nullptr)); } + void loadThreadPool(Register pool) { + // JitRuntimes are tied to JSRuntimes and there is one ThreadPool per + // JSRuntime, so we can hardcode the ThreadPool address here. + movePtr(ImmPtr(GetIonContext()->runtime->addressOfThreadPool()), pool); + } + void loadForkJoinContext(Register cx, Register scratch); void loadContext(Register cxReg, Register scratch, ExecutionMode executionMode); diff --git a/js/src/jit/JitCompartment.h b/js/src/jit/JitCompartment.h index aa28b54f42b..859dcc7112b 100644 --- a/js/src/jit/JitCompartment.h +++ b/js/src/jit/JitCompartment.h @@ -189,6 +189,9 @@ class JitRuntime // Thunk used by the debugger for breakpoint and step mode. JitCode *debugTrapHandler_; + // Stub used to inline the ForkJoinGetSlice intrinsic. + JitCode *forkJoinGetSliceStub_; + // Map VMFunction addresses to the JitCode of the wrapper. 
typedef WeakCache VMWrapperMap; VMWrapperMap *functionWrappers_; @@ -219,6 +222,7 @@ class JitRuntime JitCode *generateInvalidator(JSContext *cx); JitCode *generatePreBarrier(JSContext *cx, MIRType type); JitCode *generateDebugTrapHandler(JSContext *cx); + JitCode *generateForkJoinGetSliceStub(JSContext *cx); JitCode *generateVMWrapper(JSContext *cx, const VMFunction &f); JSC::ExecutableAllocator *createIonAlloc(JSContext *cx); @@ -321,6 +325,11 @@ class JitRuntime JitCode *shapePreBarrier() const { return shapePreBarrier_; } + + bool ensureForkJoinGetSliceStubExists(JSContext *cx); + JitCode *forkJoinGetSliceStub() const { + return forkJoinGetSliceStub_; + } }; class JitCompartment diff --git a/js/src/jit/LIR-Common.h b/js/src/jit/LIR-Common.h index 41d50fe443d..bc539d4ffec 100644 --- a/js/src/jit/LIR-Common.h +++ b/js/src/jit/LIR-Common.h @@ -4785,6 +4785,38 @@ class LForkJoinContext : public LCallInstructionHelper<1, 0, 1> } }; +class LForkJoinGetSlice : public LInstructionHelper<1, 1, 4> +{ + public: + LIR_HEADER(ForkJoinGetSlice); + + LForkJoinGetSlice(const LAllocation &cx, + const LDefinition &temp1, const LDefinition &temp2, + const LDefinition &temp3, const LDefinition &temp4) { + setOperand(0, cx); + setTemp(0, temp1); + setTemp(1, temp2); + setTemp(2, temp3); + setTemp(3, temp4); + } + + const LAllocation *forkJoinContext() { + return getOperand(0); + } + const LDefinition *temp1() { + return getTemp(0); + } + const LDefinition *temp2() { + return getTemp(1); + } + const LDefinition *temp3() { + return getTemp(2); + } + const LDefinition *temp4() { + return getTemp(3); + } +}; + class LCallGetProperty : public LCallInstructionHelper { public: diff --git a/js/src/jit/LOpcodes.h b/js/src/jit/LOpcodes.h index fe3eb17a05f..25d4dd02ddd 100644 --- a/js/src/jit/LOpcodes.h +++ b/js/src/jit/LOpcodes.h @@ -214,6 +214,7 @@ _(StoreFixedSlotT) \ _(FunctionEnvironment) \ _(ForkJoinContext) \ + _(ForkJoinGetSlice) \ _(GetPropertyCacheV) \ _(GetPropertyCacheT) \ 
_(GetPropertyPolymorphicV) \ diff --git a/js/src/jit/MCallOptimize.cpp b/js/src/jit/MCallOptimize.cpp index 110c123fb02..9b00fd1240d 100644 --- a/js/src/jit/MCallOptimize.cpp +++ b/js/src/jit/MCallOptimize.cpp @@ -146,6 +146,8 @@ IonBuilder::inlineNativeCall(CallInfo &callInfo, JSNative native) if (native == intrinsic_ShouldForceSequential || native == intrinsic_InParallelSection) return inlineForceSequentialOrInParallelSection(callInfo); + if (native == intrinsic_ForkJoinGetSlice) + return inlineForkJoinGetSlice(callInfo); // Utility intrinsics. if (native == intrinsic_IsCallable) @@ -1389,6 +1391,40 @@ IonBuilder::inlineForceSequentialOrInParallelSection(CallInfo &callInfo) MOZ_ASSUME_UNREACHABLE("Invalid execution mode"); } +IonBuilder::InliningStatus +IonBuilder::inlineForkJoinGetSlice(CallInfo &callInfo) +{ + if (info().executionMode() != ParallelExecution) + return InliningStatus_NotInlined; + + // Assert the way the function is used instead of testing, as it is a + // self-hosted function which must be used in a particular fashion. + MOZ_ASSERT(callInfo.argc() == 1 && !callInfo.constructing()); + MOZ_ASSERT(callInfo.getArg(0)->type() == MIRType_Int32); + MOZ_ASSERT(getInlineReturnType() == MIRType_Int32); + + callInfo.setImplicitlyUsedUnchecked(); + + switch (info().executionMode()) { + case SequentialExecution: + case DefinitePropertiesAnalysis: + // ForkJoinGetSlice acts as identity for sequential execution. 
+ current->push(callInfo.getArg(0)); + return InliningStatus_Inlined; + case ParallelExecution: + if (LIRGenerator::allowInlineForkJoinGetSlice()) { + MForkJoinGetSlice *getSlice = MForkJoinGetSlice::New(alloc(), + graph().forkJoinContext()); + current->add(getSlice); + current->push(getSlice); + return InliningStatus_Inlined; + } + return InliningStatus_NotInlined; + } + + MOZ_ASSUME_UNREACHABLE("Invalid execution mode"); +} + IonBuilder::InliningStatus IonBuilder::inlineNewDenseArray(CallInfo &callInfo) { diff --git a/js/src/jit/MIR.h b/js/src/jit/MIR.h index 459a331ec45..8ff1a88005b 100644 --- a/js/src/jit/MIR.h +++ b/js/src/jit/MIR.h @@ -7489,6 +7489,33 @@ class MForkJoinContext } }; +// Calls the ForkJoinGetSlice stub, used for inlining the eponymous intrinsic. +// Only applicable in ParallelExecution. +class MForkJoinGetSlice + : public MUnaryInstruction +{ + MForkJoinGetSlice(MDefinition *cx) + : MUnaryInstruction(cx) + { + setResultType(MIRType_Int32); + } + + public: + INSTRUCTION_HEADER(ForkJoinGetSlice); + + static MForkJoinGetSlice *New(TempAllocator &alloc, MDefinition *cx) { + return new(alloc) MForkJoinGetSlice(cx); + } + + MDefinition *forkJoinContext() { + return getOperand(0); + } + + bool possiblyCalls() const { + return true; + } +}; + // Store to vp[slot] (slots that are not inline in an object). 
class MStoreSlot : public MBinaryInstruction, diff --git a/js/src/jit/MOpcodes.h b/js/src/jit/MOpcodes.h index b267e1bc7b5..e598a711a5e 100644 --- a/js/src/jit/MOpcodes.h +++ b/js/src/jit/MOpcodes.h @@ -214,6 +214,7 @@ namespace jit { _(LambdaPar) \ _(RestPar) \ _(ForkJoinContext) \ + _(ForkJoinGetSlice) \ _(GuardThreadExclusive) \ _(InterruptCheckPar) \ _(RecompileCheck) diff --git a/js/src/jit/ParallelSafetyAnalysis.cpp b/js/src/jit/ParallelSafetyAnalysis.cpp index 1e0a35e0e22..791b38614cf 100644 --- a/js/src/jit/ParallelSafetyAnalysis.cpp +++ b/js/src/jit/ParallelSafetyAnalysis.cpp @@ -268,6 +268,7 @@ class ParallelSafetyVisitor : public MInstructionVisitor UNSAFE_OP(InstanceOf) CUSTOM_OP(InterruptCheck) SAFE_OP(ForkJoinContext) + SAFE_OP(ForkJoinGetSlice) SAFE_OP(NewPar) SAFE_OP(NewDenseArrayPar) SAFE_OP(NewCallObjectPar) diff --git a/js/src/jit/arm/CodeGenerator-arm.cpp b/js/src/jit/arm/CodeGenerator-arm.cpp index 39c85f10a0b..2ae03452a3c 100644 --- a/js/src/jit/arm/CodeGenerator-arm.cpp +++ b/js/src/jit/arm/CodeGenerator-arm.cpp @@ -2310,3 +2310,15 @@ CodeGeneratorARM::visitNegF(LNegF *ins) masm.ma_vneg_f32(input, ToFloatRegister(ins->output())); return true; } + +bool +CodeGeneratorARM::visitForkJoinGetSlice(LForkJoinGetSlice *ins) +{ + MOZ_ASSUME_UNREACHABLE("NYI"); +} + +JitCode * +JitRuntime::generateForkJoinGetSliceStub(JSContext *cx) +{ + MOZ_ASSUME_UNREACHABLE("NYI"); +} diff --git a/js/src/jit/arm/CodeGenerator-arm.h b/js/src/jit/arm/CodeGenerator-arm.h index c714382236c..120c1b058c5 100644 --- a/js/src/jit/arm/CodeGenerator-arm.h +++ b/js/src/jit/arm/CodeGenerator-arm.h @@ -170,9 +170,10 @@ class CodeGeneratorARM : public CodeGeneratorShared bool visitAsmJSStoreGlobalVar(LAsmJSStoreGlobalVar *ins); bool visitAsmJSLoadFuncPtr(LAsmJSLoadFuncPtr *ins); bool visitAsmJSLoadFFIFunc(LAsmJSLoadFFIFunc *ins); - bool visitAsmJSPassStackArg(LAsmJSPassStackArg *ins); + bool visitForkJoinGetSlice(LForkJoinGetSlice *ins); + bool generateInvalidateEpilogue(); 
protected: void postAsmJSCall(LAsmJSCall *lir) { diff --git a/js/src/jit/arm/Lowering-arm.cpp b/js/src/jit/arm/Lowering-arm.cpp index b74971f3e3f..c306e5f763c 100644 --- a/js/src/jit/arm/Lowering-arm.cpp +++ b/js/src/jit/arm/Lowering-arm.cpp @@ -544,4 +544,10 @@ LIRGeneratorARM::visitStoreTypedArrayElementStatic(MStoreTypedArrayElementStatic MOZ_ASSUME_UNREACHABLE("NYI"); } +bool +LIRGeneratorARM::visitForkJoinGetSlice(MForkJoinGetSlice *ins) +{ + MOZ_ASSUME_UNREACHABLE("NYI"); +} + //__aeabi_uidiv diff --git a/js/src/jit/arm/Lowering-arm.h b/js/src/jit/arm/Lowering-arm.h index bc460fc16a2..38a3ff45a1b 100644 --- a/js/src/jit/arm/Lowering-arm.h +++ b/js/src/jit/arm/Lowering-arm.h @@ -88,6 +88,7 @@ class LIRGeneratorARM : public LIRGeneratorShared bool visitAsmJSStoreHeap(MAsmJSStoreHeap *ins); bool visitAsmJSLoadFuncPtr(MAsmJSLoadFuncPtr *ins); bool visitStoreTypedArrayElementStatic(MStoreTypedArrayElementStatic *ins); + bool visitForkJoinGetSlice(MForkJoinGetSlice *ins); static bool allowFloat32Optimizations() { return true; diff --git a/js/src/jit/shared/Assembler-x86-shared.h b/js/src/jit/shared/Assembler-x86-shared.h index 0b8e38ef072..164fb5c56ea 100644 --- a/js/src/jit/shared/Assembler-x86-shared.h +++ b/js/src/jit/shared/Assembler-x86-shared.h @@ -525,6 +525,9 @@ class AssemblerX86Shared } void movzwl(const Operand &src, const Register &dest) { switch (src.kind()) { + case Operand::REG: + masm.movzwl_rr(src.reg(), dest.code()); + break; case Operand::MEM_REG_DISP: masm.movzwl_mr(src.disp(), src.base(), dest.code()); break; @@ -535,7 +538,9 @@ class AssemblerX86Shared MOZ_ASSUME_UNREACHABLE("unexpected operand kind"); } } - + void movzwl(const Register &src, const Register &dest) { + masm.movzwl_rr(src.code(), dest.code()); + } void movw(const Register &src, const Operand &dest) { switch (dest.kind()) { case Operand::MEM_REG_DISP: @@ -858,6 +863,9 @@ class AssemblerX86Shared void cmpl(const Operand &op, ImmPtr imm) { cmpl(op, ImmWord(uintptr_t(imm.value))); } 
+ void cmpw(const Register &lhs, const Register &rhs) { + masm.cmpw_rr(lhs.code(), rhs.code()); + } void setCC(Condition cond, const Register &r) { masm.setCC_r(static_cast(cond), r.code()); } @@ -866,6 +874,9 @@ class AssemblerX86Shared JS_ASSERT(GeneralRegisterSet(Registers::SingleByteRegs).has(rhs)); masm.testb_rr(rhs.code(), lhs.code()); } + void testw(const Register &lhs, const Register &rhs) { + masm.testw_rr(rhs.code(), lhs.code()); + } void testl(const Register &lhs, const Register &rhs) { masm.testl_rr(rhs.code(), lhs.code()); } diff --git a/js/src/jit/shared/CodeGenerator-x86-shared.cpp b/js/src/jit/shared/CodeGenerator-x86-shared.cpp index 8e0e8b12e01..317121f7741 100644 --- a/js/src/jit/shared/CodeGenerator-x86-shared.cpp +++ b/js/src/jit/shared/CodeGenerator-x86-shared.cpp @@ -9,7 +9,10 @@ #include "mozilla/DebugOnly.h" #include "mozilla/MathAlgorithms.h" +#include "jsmath.h" + #include "jit/IonFrames.h" +#include "jit/IonLinker.h" #include "jit/JitCompartment.h" #include "jit/RangeAnalysis.h" @@ -1768,5 +1771,177 @@ CodeGeneratorX86Shared::visitNegF(LNegF *ins) return true; } +bool +CodeGeneratorX86Shared::visitForkJoinGetSlice(LForkJoinGetSlice *ins) +{ + MOZ_ASSERT(gen->info().executionMode() == ParallelExecution); + MOZ_ASSERT(ToRegister(ins->forkJoinContext()) == ForkJoinGetSliceReg_cx); + MOZ_ASSERT(ToRegister(ins->temp1()) == eax); + MOZ_ASSERT(ToRegister(ins->temp2()) == edx); + MOZ_ASSERT(ToRegister(ins->temp3()) == ForkJoinGetSliceReg_temp0); + MOZ_ASSERT(ToRegister(ins->temp4()) == ForkJoinGetSliceReg_temp1); + MOZ_ASSERT(ToRegister(ins->output()) == ForkJoinGetSliceReg_output); + + masm.call(gen->jitRuntime()->forkJoinGetSliceStub()); + return true; +} + +JitCode * +JitRuntime::generateForkJoinGetSliceStub(JSContext *cx) +{ +#ifdef JS_THREADSAFE + MacroAssembler masm(cx); + + // We need two fixed temps. We need to fix eax for cmpxchg, and edx for + // div. 
+ Register cxReg = ForkJoinGetSliceReg_cx, worker = cxReg; + Register pool = ForkJoinGetSliceReg_temp0; + Register bounds = ForkJoinGetSliceReg_temp1; + Register output = ForkJoinGetSliceReg_output; + + MOZ_ASSERT(worker != eax && worker != edx); + MOZ_ASSERT(pool != eax && pool != edx); + MOZ_ASSERT(bounds != eax && bounds != edx); + MOZ_ASSERT(output != eax && output != edx); + + Label stealWork, noMoreWork, gotSlice; + Operand workerSliceBounds(Address(worker, ThreadPoolWorker::offsetOfSliceBounds())); + + // Clobber cx to load the worker. + masm.push(cxReg); + masm.loadPtr(Address(cxReg, ForkJoinContext::offsetOfWorker()), worker); + + // Load the thread pool, which is used in all cases below. + masm.loadThreadPool(pool); + + { + // Try to get a slice from the current thread. + Label getOwnSliceLoopHead; + masm.bind(&getOwnSliceLoopHead); + + // Load the slice bounds for the current thread. + masm.loadSliceBounds(worker, bounds); + + // The slice bounds is a uint32 composed from two uint16s: + // [ from , to ] + // ^~~~ ^~ + // upper 16 bits | lower 16 bits + masm.move32(bounds, output); + masm.shrl(Imm32(16), output); + + // If we don't have any slices left ourselves, move on to stealing. + masm.branch16(Assembler::Equal, output, bounds, &stealWork); + + // If we still have work, try to CAS [ from+1, to ]. + masm.move32(bounds, edx); + masm.add32(Imm32(0x10000), edx); + masm.move32(bounds, eax); + masm.atomic_cmpxchg32(edx, workerSliceBounds, eax); + masm.j(Assembler::NonZero, &getOwnSliceLoopHead); + + // If the CAS succeeded, return |from| in output. + masm.jump(&gotSlice); + } + + // Try to steal work. + masm.bind(&stealWork); + + // It's not technically correct to test whether work-stealing is turned on + // only during stub-generation time, but it's a DEBUG only thing. + if (cx->runtime()->threadPool.workStealing()) { + Label stealWorkLoopHead; + masm.bind(&stealWorkLoopHead); + + // Check if we have work. 
+ masm.branch32(Assembler::Equal, + Address(pool, ThreadPool::offsetOfPendingSlices()), + Imm32(0), &noMoreWork); + + // Get an id at random. The following is an inline of + // the 32-bit xorshift in ThreadPoolWorker::randomWorker(). + { + // Reload the current worker. + masm.loadPtr(Address(StackPointer, 0), cxReg); + masm.loadPtr(Address(cxReg, ForkJoinContext::offsetOfWorker()), worker); + + // Perform the xorshift to get a random number in eax, using edx + // as a temp. + Address rngState(worker, ThreadPoolWorker::offsetOfSchedulerRNGState()); + masm.load32(rngState, eax); + masm.move32(eax, edx); + masm.shll(Imm32(ThreadPoolWorker::XORSHIFT_A), eax); + masm.xor32(edx, eax); + masm.move32(eax, edx); + masm.shrl(Imm32(ThreadPoolWorker::XORSHIFT_B), eax); + masm.xor32(edx, eax); + masm.move32(eax, edx); + masm.shll(Imm32(ThreadPoolWorker::XORSHIFT_C), eax); + masm.xor32(edx, eax); + masm.store32(eax, rngState); + + // Compute the random worker id by computing % numWorkers. Reuse + // output as a temp. + masm.move32(Imm32(0), edx); + masm.move32(Imm32(cx->runtime()->threadPool.numWorkers()), output); + masm.udiv(output); + } + + // Load the worker from the workers array. + masm.loadPtr(Address(pool, ThreadPool::offsetOfWorkers()), worker); + masm.loadPtr(BaseIndex(worker, edx, ScalePointer), worker); + + // Try to get a slice from the designated victim worker. + Label stealSliceFromWorkerLoopHead; + masm.bind(&stealSliceFromWorkerLoopHead); + + // Load the slice bounds and decompose for the victim worker. + masm.loadSliceBounds(worker, bounds); + masm.move32(bounds, eax); + masm.shrl(Imm32(16), eax); + + // If the victim worker has no more slices left, find another worker. + masm.branch16(Assembler::Equal, eax, bounds, &stealWorkLoopHead); + + // If the victim worker still has work, try to CAS [ from, to-1 ]. 
+ masm.move32(bounds, output); + masm.sub32(Imm32(1), output); + masm.move32(bounds, eax); + masm.atomic_cmpxchg32(output, workerSliceBounds, eax); + masm.j(Assembler::NonZero, &stealSliceFromWorkerLoopHead); + + // If the CAS succeeded, return |to-1| in output. +#ifdef DEBUG + masm.atomic_inc32(Operand(Address(pool, ThreadPool::offsetOfStolenSlices()))); +#endif + // Copies lower 16 bits only. + masm.movzwl(output, output); + } + + // If we successfully got a slice, decrement pool->pendingSlices_ and + // return the slice. + masm.bind(&gotSlice); + masm.atomic_dec32(Operand(Address(pool, ThreadPool::offsetOfPendingSlices()))); + masm.pop(cxReg); + masm.ret(); + + // There are no more slices to give out; return -1. + masm.bind(&noMoreWork); + masm.move32(Imm32(-1), output); + masm.pop(cxReg); + masm.ret(); + + Linker linker(masm); + JitCode *code = linker.newCode(cx, JSC::OTHER_CODE); + +#ifdef JS_ION_PERF + writePerfSpewerJitCodeProfile(code, "ForkJoinGetSliceStub"); +#endif + + return code; +#else + return nullptr; +#endif // JS_THREADSAFE +} + } // namespace jit } // namespace js diff --git a/js/src/jit/shared/CodeGenerator-x86-shared.h b/js/src/jit/shared/CodeGenerator-x86-shared.h index 6a7f5f65a5d..1ad0ae07d16 100644 --- a/js/src/jit/shared/CodeGenerator-x86-shared.h +++ b/js/src/jit/shared/CodeGenerator-x86-shared.h @@ -122,6 +122,8 @@ class CodeGeneratorX86Shared : public CodeGeneratorShared virtual bool visitUDivOrMod(LUDivOrMod *ins); virtual bool visitAsmJSPassStackArg(LAsmJSPassStackArg *ins); + bool visitForkJoinGetSlice(LForkJoinGetSlice *ins); + bool visitNegI(LNegI *lir); bool visitNegD(LNegD *lir); bool visitNegF(LNegF *lir); diff --git a/js/src/jit/shared/Lowering-shared.h b/js/src/jit/shared/Lowering-shared.h index 35075335af1..d26982909f6 100644 --- a/js/src/jit/shared/Lowering-shared.h +++ b/js/src/jit/shared/Lowering-shared.h @@ -191,6 +191,11 @@ class LIRGeneratorShared : public MInstructionVisitorWithDefaults static bool
allowFloat32Optimizations() { return false; } + + // Whether we can inline ForkJoinGetSlice. + static bool allowInlineForkJoinGetSlice() { + return false; + } }; } // namespace jit diff --git a/js/src/jit/shared/Lowering-x86-shared.cpp b/js/src/jit/shared/Lowering-x86-shared.cpp index 8128ec6fd33..684fc7c77b6 100644 --- a/js/src/jit/shared/Lowering-x86-shared.cpp +++ b/js/src/jit/shared/Lowering-x86-shared.cpp @@ -295,3 +295,16 @@ LIRGeneratorX86Shared::lowerTruncateFToInt32(MTruncateToInt32 *ins) LDefinition maybeTemp = Assembler::HasSSE3() ? LDefinition::BogusTemp() : tempFloat32(); return define(new(alloc()) LTruncateFToInt32(useRegister(opd), maybeTemp), ins); } + +bool +LIRGeneratorX86Shared::visitForkJoinGetSlice(MForkJoinGetSlice *ins) +{ + // We fix eax and edx for cmpxchg and div. + LForkJoinGetSlice *lir = new(alloc()) + LForkJoinGetSlice(useFixed(ins->forkJoinContext(), ForkJoinGetSliceReg_cx), + tempFixed(eax), + tempFixed(edx), + tempFixed(ForkJoinGetSliceReg_temp0), + tempFixed(ForkJoinGetSliceReg_temp1)); + return defineFixed(lir, ins, LAllocation(AnyRegister(ForkJoinGetSliceReg_output))); +} diff --git a/js/src/jit/shared/Lowering-x86-shared.h b/js/src/jit/shared/Lowering-x86-shared.h index 19d6700edf6..eeece101301 100644 --- a/js/src/jit/shared/Lowering-x86-shared.h +++ b/js/src/jit/shared/Lowering-x86-shared.h @@ -47,6 +47,7 @@ class LIRGeneratorX86Shared : public LIRGeneratorShared bool lowerConstantFloat32(float d, MInstruction *ins); bool lowerTruncateDToInt32(MTruncateToInt32 *ins); bool lowerTruncateFToInt32(MTruncateToInt32 *ins); + bool visitForkJoinGetSlice(MForkJoinGetSlice *ins); }; } // namespace jit diff --git a/js/src/jit/shared/MacroAssembler-x86-shared.h b/js/src/jit/shared/MacroAssembler-x86-shared.h index 8a2b8eefaa9..ba258ef6d57 100644 --- a/js/src/jit/shared/MacroAssembler-x86-shared.h +++ b/js/src/jit/shared/MacroAssembler-x86-shared.h @@ -111,6 +111,9 @@ class MacroAssemblerX86Shared : public Assembler void and32(const Imm32 
&imm, const Address &dest) { andl(imm, Operand(dest)); } + void or32(const Register &src, const Register &dest) { + orl(src, dest); + } void or32(const Imm32 &imm, const Register &dest) { orl(imm, dest); } @@ -156,6 +159,9 @@ class MacroAssemblerX86Shared : public Assembler void xor32(Imm32 imm, Register dest) { xorl(imm, dest); } + void xor32(Register src, Register dest) { + xorl(src, dest); + } void not32(Register reg) { notl(reg); } @@ -177,6 +183,10 @@ class MacroAssemblerX86Shared : public Assembler lock_cmpxchg32(src, addr); } + void branch16(Condition cond, const Register &lhs, const Register &rhs, Label *label) { + cmpw(lhs, rhs); + j(cond, label); + } void branch32(Condition cond, const Operand &lhs, const Register &rhs, Label *label) { cmpl(lhs, rhs); j(cond, label); @@ -201,6 +211,10 @@ class MacroAssemblerX86Shared : public Assembler cmpl(lhs, rhs); j(cond, label); } + void branchTest16(Condition cond, const Register &lhs, const Register &rhs, Label *label) { + testw(lhs, rhs); + j(cond, label); + } void branchTest32(Condition cond, const Register &lhs, const Register &rhs, Label *label) { testl(lhs, rhs); j(cond, label); diff --git a/js/src/jit/x64/Assembler-x64.h b/js/src/jit/x64/Assembler-x64.h index b02bfd80c33..6ea67251571 100644 --- a/js/src/jit/x64/Assembler-x64.h +++ b/js/src/jit/x64/Assembler-x64.h @@ -129,6 +129,13 @@ static MOZ_CONSTEXPR_VAR uint32_t NumFloatArgRegs = 8; static MOZ_CONSTEXPR_VAR FloatRegister FloatArgRegs[NumFloatArgRegs] = { xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7 }; #endif +// The convention used by the ForkJoinGetSlice stub. None of these can be rax +// or rdx, which the stub also needs for cmpxchg and div, respectively. 
+static MOZ_CONSTEXPR_VAR Register ForkJoinGetSliceReg_cx = rdi; +static MOZ_CONSTEXPR_VAR Register ForkJoinGetSliceReg_temp0 = rbx; +static MOZ_CONSTEXPR_VAR Register ForkJoinGetSliceReg_temp1 = rcx; +static MOZ_CONSTEXPR_VAR Register ForkJoinGetSliceReg_output = rsi; + class ABIArgGenerator { #if defined(XP_WIN) diff --git a/js/src/jit/x64/Lowering-x64.h b/js/src/jit/x64/Lowering-x64.h index 8294ceb3d75..6b58f17d282 100644 --- a/js/src/jit/x64/Lowering-x64.h +++ b/js/src/jit/x64/Lowering-x64.h @@ -55,6 +55,10 @@ class LIRGeneratorX64 : public LIRGeneratorX86Shared static bool allowFloat32Optimizations() { return true; } + + static bool allowInlineForkJoinGetSlice() { + return true; + } }; typedef LIRGeneratorX64 LIRGeneratorSpecific; diff --git a/js/src/jit/x86/Assembler-x86.h b/js/src/jit/x86/Assembler-x86.h index 55886db02a0..2cc3e0e7174 100644 --- a/js/src/jit/x86/Assembler-x86.h +++ b/js/src/jit/x86/Assembler-x86.h @@ -55,6 +55,13 @@ static MOZ_CONSTEXPR_VAR Register CallTempReg3 = ecx; static MOZ_CONSTEXPR_VAR Register CallTempReg4 = esi; static MOZ_CONSTEXPR_VAR Register CallTempReg5 = edx; +// The convention used by the ForkJoinGetSlice stub. None of these can be eax +// or edx, which the stub also needs for cmpxchg and div, respectively. 
+static MOZ_CONSTEXPR_VAR Register ForkJoinGetSliceReg_cx = edi; +static MOZ_CONSTEXPR_VAR Register ForkJoinGetSliceReg_temp0 = ebx; +static MOZ_CONSTEXPR_VAR Register ForkJoinGetSliceReg_temp1 = ecx; +static MOZ_CONSTEXPR_VAR Register ForkJoinGetSliceReg_output = esi; + // We have no arg regs, so our NonArgRegs are just our CallTempReg* static MOZ_CONSTEXPR_VAR Register CallTempNonArgRegs[] = { edi, eax, ebx, ecx, esi, edx }; static const uint32_t NumCallTempNonArgRegs = diff --git a/js/src/jit/x86/Lowering-x86.h b/js/src/jit/x86/Lowering-x86.h index 1062e28cab2..1590aaab915 100644 --- a/js/src/jit/x86/Lowering-x86.h +++ b/js/src/jit/x86/Lowering-x86.h @@ -66,6 +66,10 @@ class LIRGeneratorX86 : public LIRGeneratorX86Shared static bool allowFloat32Optimizations() { return true; } + + static bool allowInlineForkJoinGetSlice() { + return true; + } }; typedef LIRGeneratorX86 LIRGeneratorSpecific; diff --git a/js/src/vm/ForkJoin.cpp b/js/src/vm/ForkJoin.cpp index 171cb5c5406..10788e5489f 100644 --- a/js/src/vm/ForkJoin.cpp +++ b/js/src/vm/ForkJoin.cpp @@ -1422,8 +1422,8 @@ ForkJoinShared::execute() } #ifdef DEBUG - Spew(SpewOps, "Completed parallel job [slices %d, threads: %d (+1), stolen: %d (work stealing:%s)]", - sliceTo_ - sliceFrom_, + Spew(SpewOps, "Completed parallel job [slices: %d, threads: %d, stolen: %d (work stealing:%s)]", + sliceTo_ - sliceFrom_ + 1, threadPool_->numWorkers(), threadPool_->stolenSlices(), threadPool_->workStealing() ? "ON" : "OFF"); diff --git a/js/src/vm/ForkJoin.h b/js/src/vm/ForkJoin.h index 08bcd442cec..8d02a910c9b 100644 --- a/js/src/vm/ForkJoin.h +++ b/js/src/vm/ForkJoin.h @@ -406,6 +406,11 @@ class ForkJoinContext : public ThreadSafeContext // Initializes the thread-local state. static bool initialize(); + // Used in inlining ForkJoinGetSlice.
+ static size_t offsetOfWorker() { + return offsetof(ForkJoinContext, worker_); + } + private: friend class AutoSetForkJoinContext; diff --git a/js/src/vm/ThreadPool.h b/js/src/vm/ThreadPool.h index c1b5b720c35..70720a0c0c8 100644 --- a/js/src/vm/ThreadPool.h +++ b/js/src/vm/ThreadPool.h @@ -99,6 +99,10 @@ class ThreadPoolWorker static size_t offsetOfSliceBounds() { return offsetof(ThreadPoolWorker, sliceBounds_); } + + static size_t offsetOfSchedulerRNGState() { + return offsetof(ThreadPoolWorker, schedulerRNGState_); + } }; ///////////////////////////////////////////////////////////////////////////// @@ -192,6 +196,18 @@ class ThreadPool : public Monitor ThreadPoolWorker *mainThreadWorker() { return workers_[0]; } public: +#ifdef DEBUG + static size_t offsetOfStolenSlices() { + return offsetof(ThreadPool, stolenSlices_); + } +#endif + static size_t offsetOfPendingSlices() { + return offsetof(ThreadPool, pendingSlices_); + } + static size_t offsetOfWorkers() { + return offsetof(ThreadPool, workers_); + } + ThreadPool(JSRuntime *rt); ~ThreadPool();