Bug 958370 - Unify chunks and slices; self-host the scheduler's slice processing loop. (r=nmatsakis)

This commit is contained in:
Shu-yu Guo 2014-02-07 14:40:29 -08:00
parent b674035d7c
commit c6165cfa30
11 changed files with 561 additions and 711 deletions

View File

@ -569,20 +569,6 @@ function ArrayKeys() {
#define ASSERT_SEQUENTIAL_IS_OK(MODE) \
do { if (MODE) AssertSequentialIsOK(MODE) } while(false)
/* Slice array: see ComputeAllSliceBounds() */
#define SLICE_INFO(START, END) START, END, START, 0
#define SLICE_START(ID) ((ID << 2) + 0)
#define SLICE_END(ID) ((ID << 2) + 1)
#define SLICE_POS(ID) ((ID << 2) + 2)
/*
* How many items at a time do we do recomputation for parallel execution.
* Note that filter currently assumes that this is no greater than 32
* in order to make use of a bitset.
*/
#define CHUNK_SHIFT 5
#define CHUNK_SIZE 32
/* Safe versions of ARRAY.push(ELEMENT) */
#define ARRAY_PUSH(ARRAY, ELEMENT) \
callFunction(std_Array_push, ARRAY, ELEMENT);
@ -597,74 +583,94 @@ function ArrayKeys() {
#define ParallelSpew(args)
#endif
/**
* Determine the number of chunks of size CHUNK_SIZE;
* note that the final chunk may be smaller than CHUNK_SIZE.
*/
function ComputeNumChunks(length) {
var chunks = length >>> CHUNK_SHIFT;
if (chunks << CHUNK_SHIFT === length)
return chunks;
return chunks + 1;
}
#define SLICES_PER_WORKER 8
#define MAX_SLICE_SHIFT 6
#define MAX_SLICE_SIZE 64
#define MAX_SLICES_PER_WORKER 8
/**
* Compute the number of slices given an array length and the number of
* chunks. Used in tandem with the workstealing scheduler.
* Determine the number and size of slices.
*/
function ComputeNumSlices(workers, length, chunks) {
if (length !== 0) {
var slices = workers * SLICES_PER_WORKER;
if (chunks < slices)
return workers;
return slices;
}
return workers;
function ComputeSlicesInfo(length) {
var count = length >>> MAX_SLICE_SHIFT;
var numWorkers = ForkJoinNumWorkers();
if (count < numWorkers)
count = numWorkers;
else if (count >= numWorkers * MAX_SLICES_PER_WORKER)
count = numWorkers * MAX_SLICES_PER_WORKER;
// Round the slice size to be a power of 2.
var shift = std_Math_max(std_Math_log2(length / count) | 0, 1);
// Recompute count with the rounded size.
count = length >>> shift;
if (count << shift !== length)
count += 1;
return { shift: shift, statuses: new Uint8Array(count), lastSequentialId: 0 };
}
/**
* Computes the bounds for slice |sliceIndex| of |numItems| items,
* assuming |numSlices| total slices. If numItems is not evenly
* divisible by numSlices, then the final thread may have a bit of
* extra work.
* Macros to help compute the start and end indices of slices based on id. Use
* with the object returned by ComputeSliceInfo.
*/
function ComputeSliceBounds(numItems, sliceIndex, numSlices) {
var sliceWidth = (numItems / numSlices) | 0;
var extraChunks = (numItems % numSlices) | 0;
#define SLICE_START(info, id) \
(id << info.shift)
#define SLICE_END(info, start, length) \
std_Math_min(start + (1 << info.shift), length)
#define SLICE_COUNT(info) \
info.statuses.length
var startIndex = sliceWidth * sliceIndex + std_Math_min(extraChunks, sliceIndex);
var endIndex = startIndex + sliceWidth;
if (sliceIndex < extraChunks)
endIndex += 1;
return [startIndex, endIndex];
/**
* ForkJoinGetSlice acts as identity when we are not in a parallel section, so
* pass in the next sequential value when we are in sequential mode. The
* reason for this odd API is because intrinsics *need* to be called during
* ForkJoin's warmup to fill the TI info.
*/
#define GET_SLICE(info, id) \
((id = ForkJoinGetSlice(InParallelSection() ? -1 : NextSequentialSliceId(info, -1))) >= 0)
#define SLICE_STATUS_DONE 1
/**
* Macro to mark a slice as completed in the info object.
*/
#define MARK_SLICE_DONE(info, id) \
UnsafePutElements(info.statuses, id, SLICE_STATUS_DONE)
/**
* Reset the status array of the slices info object.
*/
function SlicesInfoClearStatuses(info) {
var statuses = info.statuses;
var length = statuses.length;
for (var i = 0; i < length; i++)
UnsafePutElements(statuses, i, 0);
info.lastSequentialId = 0;
}
/**
* Divides |numItems| items amongst |numSlices| slices. The result
* is an array containing multiple values per slice: the start
* index, end index, current position, and some padding. The
* current position is initially the same as the start index. To
* access the values for a particular slice, use the macros
* SLICE_START() and so forth.
* Compute the slice such that all slices before it (but not including it) are
* completed.
*/
function ComputeAllSliceBounds(numItems, numSlices) {
// FIXME(bug 844890): Use typed arrays here.
var sliceWidth = (numItems / numSlices) | 0;
var extraChunks = (numItems % numSlices) | 0;
var counter = 0;
var info = [];
var i = 0;
for (; i < extraChunks; i++) {
ARRAY_PUSH(info, SLICE_INFO(counter, counter + sliceWidth + 1));
counter += sliceWidth + 1;
function NextSequentialSliceId(info, doneMarker) {
var statuses = info.statuses;
var length = statuses.length;
for (var i = info.lastSequentialId; i < length; i++) {
if (statuses[i] === SLICE_STATUS_DONE)
continue;
info.lastSequentialId = i;
return i;
}
for (; i < numSlices; i++) {
ARRAY_PUSH(info, SLICE_INFO(counter, counter + sliceWidth));
counter += sliceWidth;
}
return info;
return doneMarker == undefined ? length : doneMarker;
}
/**
* Determinism-preserving bounds function.
*/
function ShrinkLeftmost(info) {
return function () {
return [NextSequentialSliceId(info), SLICE_COUNT(info)]
};
}
/**
@ -691,41 +697,28 @@ function ArrayMapPar(func, mode) {
if (!TRY_PARALLEL(mode))
break parallel;
var chunks = ComputeNumChunks(length);
var numWorkers = ForkJoinNumWorkers();
var numSlices = ComputeNumSlices(numWorkers, length, chunks);
var info = ComputeAllSliceBounds(chunks, numSlices);
ForkJoin(mapSlice, ForkJoinMode(mode), numSlices);
var slicesInfo = ComputeSlicesInfo(length);
ForkJoin(mapThread, ShrinkLeftmost(slicesInfo), ForkJoinMode(mode));
return buffer;
}
// Sequential fallback:
ASSERT_SEQUENTIAL_IS_OK(mode);
for (var i = 0; i < length; i++) {
// Note: Unlike JS arrays, parallel arrays cannot have holes.
var v = func(self[i], i, self);
UnsafePutElements(buffer, i, v);
}
for (var i = 0; i < length; i++)
UnsafePutElements(buffer, i, func(self[i], i, self));
return buffer;
function mapSlice(sliceId, warmup) {
var chunkPos = info[SLICE_POS(sliceId)];
var chunkEnd = info[SLICE_END(sliceId)];
if (warmup && chunkEnd > chunkPos + 1)
chunkEnd = chunkPos + 1;
while (chunkPos < chunkEnd) {
var indexStart = chunkPos << CHUNK_SHIFT;
var indexEnd = std_Math_min(indexStart + CHUNK_SIZE, length);
function mapThread(warmup) {
var sliceId;
while (GET_SLICE(slicesInfo, sliceId)) {
var indexStart = SLICE_START(slicesInfo, sliceId);
var indexEnd = SLICE_END(slicesInfo, indexStart, length);
for (var i = indexStart; i < indexEnd; i++)
UnsafePutElements(buffer, i, func(self[i], i, self));
UnsafePutElements(info, SLICE_POS(sliceId), ++chunkPos);
MARK_SLICE_DONE(slicesInfo, sliceId);
if (warmup)
return;
}
return chunkEnd === info[SLICE_END(sliceId)];
}
return undefined;
@ -751,15 +744,12 @@ function ArrayReducePar(func, mode) {
if (!TRY_PARALLEL(mode))
break parallel;
var chunks = ComputeNumChunks(length);
var numWorkers = ForkJoinNumWorkers();
if (chunks < numWorkers)
break parallel;
var numSlices = ComputeNumSlices(numWorkers, length, chunks);
var info = ComputeAllSliceBounds(chunks, numSlices);
var slicesInfo = ComputeSlicesInfo(length);
var numSlices = SLICE_COUNT(slicesInfo);
var subreductions = NewDenseArray(numSlices);
ForkJoin(reduceSlice, ForkJoinMode(mode), numSlices);
ForkJoin(reduceThread, ShrinkLeftmost(slicesInfo), ForkJoinMode(mode));
var accumulator = subreductions[0];
for (var i = 1; i < numSlices; i++)
accumulator = func(accumulator, subreductions[i]);
@ -773,46 +763,19 @@ function ArrayReducePar(func, mode) {
accumulator = func(accumulator, self[i]);
return accumulator;
function reduceSlice(sliceId, warmup) {
var chunkStart = info[SLICE_START(sliceId)];
var chunkPos = info[SLICE_POS(sliceId)];
var chunkEnd = info[SLICE_END(sliceId)];
// (*) This function is carefully designed so that the warmup
// (which executes with chunkStart === chunkPos) will execute all
// potential loads and stores. In particular, the warmup run
// processes two chunks rather than one. Moreover, it stores
// accumulator into subreductions and then loads it again to
// ensure that the load is executed during the warmup, as it will
// certainly be executed during subsequent runs.
if (warmup && chunkEnd > chunkPos + 2)
chunkEnd = chunkPos + 2;
if (chunkStart === chunkPos) {
var indexPos = chunkStart << CHUNK_SHIFT;
var accumulator = reduceChunk(self[indexPos], indexPos + 1, indexPos + CHUNK_SIZE);
UnsafePutElements(subreductions, sliceId, accumulator, // see (*) above
info, SLICE_POS(sliceId), ++chunkPos);
function reduceThread(warmup) {
var sliceId;
while (GET_SLICE(slicesInfo, sliceId)) {
var indexStart = SLICE_START(slicesInfo, sliceId);
var indexEnd = SLICE_END(slicesInfo, indexStart, length);
var accumulator = self[indexStart];
for (var i = indexStart + 1; i < indexEnd; i++)
accumulator = func(accumulator, self[i]);
UnsafePutElements(subreductions, sliceId, accumulator);
MARK_SLICE_DONE(slicesInfo, sliceId);
if (warmup)
return;
}
var accumulator = subreductions[sliceId]; // see (*) above
while (chunkPos < chunkEnd) {
var indexPos = chunkPos << CHUNK_SHIFT;
accumulator = reduceChunk(accumulator, indexPos, indexPos + CHUNK_SIZE);
UnsafePutElements(subreductions, sliceId, accumulator, info, SLICE_POS(sliceId), ++chunkPos);
}
return chunkEnd === info[SLICE_END(sliceId)];
}
function reduceChunk(accumulator, from, to) {
to = std_Math_min(to, length);
for (var i = from; i < to; i++)
accumulator = func(accumulator, self[i]);
return accumulator;
}
return undefined;
@ -841,16 +804,11 @@ function ArrayScanPar(func, mode) {
if (!TRY_PARALLEL(mode))
break parallel;
var chunks = ComputeNumChunks(length);
var numWorkers = ForkJoinNumWorkers();
if (chunks < numWorkers)
break parallel;
var numSlices = ComputeNumSlices(numWorkers, length, chunks);
var info = ComputeAllSliceBounds(chunks, numSlices);
var slicesInfo = ComputeSlicesInfo(length);
var numSlices = SLICE_COUNT(slicesInfo);
// Scan slices individually (see comment on phase1()).
ForkJoin(phase1, ForkJoinMode(mode), numSlices);
ForkJoin(phase1, ShrinkLeftmost(slicesInfo), ForkJoinMode(mode));
// Compute intermediates array (see comment on phase2()).
var intermediates = [];
@ -861,16 +819,14 @@ function ArrayScanPar(func, mode) {
ARRAY_PUSH(intermediates, accumulator);
}
// Reset the current position information for each slice, but
// convert from chunks to indices (see comment on phase2()).
for (var i = 0; i < numSlices; i++) {
info[SLICE_POS(i)] = info[SLICE_START(i)] << CHUNK_SHIFT;
info[SLICE_END(i)] = info[SLICE_END(i)] << CHUNK_SHIFT;
}
info[SLICE_END(numSlices - 1)] = std_Math_min(info[SLICE_END(numSlices - 1)], length);
// Clear the slices' statuses in between phases.
SlicesInfoClearStatuses(slicesInfo);
// There is no work to be done for slice 0, so mark it as done.
MARK_SLICE_DONE(slicesInfo, 0);
// Complete each slice using intermediates array (see comment on phase2()).
ForkJoin(phase2, ForkJoinMode(mode), numSlices);
ForkJoin(phase2, ShrinkLeftmost(slicesInfo), ForkJoinMode(mode));
return buffer;
}
@ -904,46 +860,23 @@ function ArrayScanPar(func, mode) {
*
* Read on in phase2 to see what we do next!
*/
function phase1(sliceId, warmup) {
var chunkStart = info[SLICE_START(sliceId)];
var chunkPos = info[SLICE_POS(sliceId)];
var chunkEnd = info[SLICE_END(sliceId)];
if (warmup && chunkEnd > chunkPos + 2)
chunkEnd = chunkPos + 2;
if (chunkPos === chunkStart) {
// For the first chunk, the accumulator begins as the value in
// the input at the start of the chunk.
var indexStart = chunkPos << CHUNK_SHIFT;
var indexEnd = std_Math_min(indexStart + CHUNK_SIZE, length);
function phase1(warmup) {
var sliceId;
while (GET_SLICE(slicesInfo, sliceId)) {
var indexStart = SLICE_START(slicesInfo, sliceId);
var indexEnd = SLICE_END(slicesInfo, indexStart, length);
scan(self[indexStart], indexStart, indexEnd);
UnsafePutElements(info, SLICE_POS(sliceId), ++chunkPos);
MARK_SLICE_DONE(slicesInfo, sliceId);
if (warmup)
return;
}
while (chunkPos < chunkEnd) {
// For each subsequent chunk, the accumulator begins as the
// combination of the final value of prev chunk and the value in
// the input at the start of this chunk. Note that this loop is
// written as simple as possible, at the cost of an extra read
// from the buffer per iteration.
var indexStart = chunkPos << CHUNK_SHIFT;
var indexEnd = std_Math_min(indexStart + CHUNK_SIZE, length);
var accumulator = func(buffer[indexStart - 1], self[indexStart]);
scan(accumulator, indexStart, indexEnd);
UnsafePutElements(info, SLICE_POS(sliceId), ++chunkPos);
}
return chunkEnd === info[SLICE_END(sliceId)];
}
/**
* Computes the index of the final element computed by the slice |sliceId|.
*/
function finalElement(sliceId) {
var chunkEnd = info[SLICE_END(sliceId)]; // last chunk written by |sliceId| is endChunk - 1
var indexStart = std_Math_min(chunkEnd << CHUNK_SHIFT, length);
return indexStart - 1;
return SLICE_END(slicesInfo, SLICE_START(slicesInfo, sliceId), length) - 1;
}
/**
@ -978,32 +911,21 @@ function ArrayScanPar(func, mode) {
* |intermediates[1-1]|, which is |A+B+C|, so that the final
* result is [(A+B+C)+D, (A+B+C)+(D+E), (A+B+C)+(D+E+F)]. Again I
* am using parentheses to clarify how these results were reduced.
*
* SUBTLE: Because we are mutating |buffer| in place, we have to
* be very careful about bailouts! We cannot checkpoint a chunk
* at a time as we do elsewhere because that assumes it is safe to
* replay the portion of a chunk which was already processed.
* Therefore, in this phase, we track the current position at an
* index granularity, although this requires two memory writes per
* index.
*/
function phase2(sliceId, warmup) {
if (sliceId === 0)
return true; // No work to do for the 0th slice.
function phase2(warmup) {
var sliceId;
while (GET_SLICE(slicesInfo, sliceId)) {
var indexPos = SLICE_START(slicesInfo, sliceId);
var indexEnd = SLICE_END(slicesInfo, indexPos, length);
var indexPos = info[SLICE_POS(sliceId)];
var indexEnd = info[SLICE_END(sliceId)];
var intermediate = intermediates[sliceId - 1];
for (; indexPos < indexEnd; indexPos++)
UnsafePutElements(buffer, indexPos, func(intermediate, buffer[indexPos]));
if (warmup)
indexEnd = std_Math_min(indexEnd, indexPos + CHUNK_SIZE);
var intermediate = intermediates[sliceId - 1];
for (; indexPos < indexEnd; indexPos++) {
UnsafePutElements(buffer, indexPos, func(intermediate, buffer[indexPos]),
info, SLICE_POS(sliceId), indexPos + 1);
MARK_SLICE_DONE(slicesInfo, sliceId);
if (warmup)
return;
}
return indexEnd === info[SLICE_END(sliceId)];
}
return undefined;
@ -1039,210 +961,17 @@ function ArrayScatterPar(targets, defaultValue, conflictFunc, length, mode) {
if (length === undefined)
length = self.length;
// The Divide-Scatter-Vector strategy:
// 1. Slice |targets| array of indices ("scatter-vector") into N
// parts.
// 2. Each of the N threads prepares an output buffer and a
// write-log.
// 3. Each thread scatters according to one of the N parts into its
// own output buffer, tracking written indices in the write-log
// and resolving any resulting local collisions in parallel.
// 4. Merge the parts (either in parallel or sequentially), using
// the write-logs as both the basis for finding merge-inputs and
// for detecting collisions.
// The Divide-Output-Range strategy:
// 1. Slice the range of indices [0..|length|-1] into N parts.
// Allocate a single shared output buffer of length |length|.
// 2. Each of the N threads scans (the entirety of) the |targets|
// array, seeking occurrences of indices from that thread's part
// of the range, and writing the results into the shared output
// buffer.
// 3. Since each thread has its own portion of the output range,
// every collision that occurs can be handled thread-locally.
// SO:
//
// If |targets.length| >> |length|, Divide-Scatter-Vector seems like
// a clear win over Divide-Output-Range, since for the latter, the
// expense of redundantly scanning the |targets| will diminish the
// gain from processing |length| in parallel, while for the former,
// the total expense of building separate output buffers and the
// merging post-process is small compared to the gain from
// processing |targets| in parallel.
//
// If |targets.length| << |length|, then Divide-Output-Range seems
// like it *could* win over Divide-Scatter-Vector. (But when is
// |targets.length| << |length| or even |targets.length| < |length|?
// Seems like an odd situation and an uncommon case at best.)
//
// The unanswered question is which strategy performs better when
// |targets.length| approximately equals |length|, especially for
// special cases like collision-free scatters and permutations.
var targetsLength = std_Math_min(targets.length, self.length);
if (!IS_UINT32(targetsLength) || !IS_UINT32(length))
ThrowError(JSMSG_BAD_ARRAY_LENGTH);
parallel: for (;;) { // see ArrayMapPar() to explain why for(;;) etc
if (ShouldForceSequential())
break parallel;
if (!TRY_PARALLEL(mode))
break parallel;
if (forceDivideScatterVector())
return parDivideScatterVector();
else if (forceDivideOutputRange())
return parDivideOutputRange();
else if (conflictFunc === undefined && targetsLength < length)
return parDivideOutputRange();
return parDivideScatterVector();
}
// FIXME: Bug 965609: Find a better parallel strategy for scatter.
// Sequential fallback:
ASSERT_SEQUENTIAL_IS_OK(mode);
return seq();
function forceDivideScatterVector() {
return mode && mode.strategy && mode.strategy === "divide-scatter-vector";
}
function forceDivideOutputRange() {
return mode && mode.strategy && mode.strategy === "divide-output-range";
}
function collide(elem1, elem2) {
if (conflictFunc === undefined)
ThrowError(JSMSG_PAR_ARRAY_SCATTER_CONFLICT);
return conflictFunc(elem1, elem2);
}
function parDivideOutputRange() {
var chunks = ComputeNumChunks(targetsLength);
var numSlices = ComputeNumSlices(ForkJoinNumWorkers(), length, chunks);
var checkpoints = NewDenseArray(numSlices);
for (var i = 0; i < numSlices; i++)
UnsafePutElements(checkpoints, i, 0);
var buffer = NewDenseArray(length);
var conflicts = NewDenseArray(length);
for (var i = 0; i < length; i++) {
UnsafePutElements(buffer, i, defaultValue);
UnsafePutElements(conflicts, i, false);
}
ForkJoin(fill, ForkJoinMode(mode), numSlices);
return buffer;
function fill(sliceId, warmup) {
var indexPos = checkpoints[sliceId];
var indexEnd = targetsLength;
if (warmup)
indexEnd = std_Math_min(indexEnd, indexPos + CHUNK_SIZE);
// Range in the output for which we are responsible:
var [outputStart, outputEnd] = ComputeSliceBounds(length, sliceId, numSlices);
for (; indexPos < indexEnd; indexPos++) {
var x = self[indexPos];
var t = checkTarget(indexPos, targets[indexPos]);
if (t < outputStart || t >= outputEnd)
continue;
if (conflicts[t])
x = collide(x, buffer[t]);
UnsafePutElements(buffer, t, x, conflicts, t, true, checkpoints, sliceId, indexPos + 1);
}
return indexEnd === targetsLength;
}
return undefined;
}
function parDivideScatterVector() {
// Subtle: because we will be mutating the localBuffers and
// conflict arrays in place, we can never replay an entry in the
// target array for fear of inducing a conflict where none existed
// before. Therefore, we must proceed not by chunks but rather by
// individual indices.
var numSlices = ComputeNumSlices(ForkJoinNumWorkers(), length, ComputeNumChunks(length));
var info = ComputeAllSliceBounds(targetsLength, numSlices);
// FIXME(bug 844890): Use typed arrays here.
var localBuffers = NewDenseArray(numSlices);
for (var i = 0; i < numSlices; i++)
UnsafePutElements(localBuffers, i, NewDenseArray(length));
var localConflicts = NewDenseArray(numSlices);
for (var i = 0; i < numSlices; i++) {
var conflicts_i = NewDenseArray(length);
for (var j = 0; j < length; j++)
UnsafePutElements(conflicts_i, j, false);
UnsafePutElements(localConflicts, i, conflicts_i);
}
// Initialize the 0th buffer, which will become the output. For
// the other buffers, we track which parts have been written to
// using the conflict buffer so they do not need to be
// initialized.
var outputBuffer = localBuffers[0];
for (var i = 0; i < length; i++)
UnsafePutElements(outputBuffer, i, defaultValue);
ForkJoin(fill, ForkJoinMode(mode), numSlices);
mergeBuffers();
return outputBuffer;
function fill(sliceId, warmup) {
var indexPos = info[SLICE_POS(sliceId)];
var indexEnd = info[SLICE_END(sliceId)];
if (warmup)
indexEnd = std_Math_min(indexEnd, indexPos + CHUNK_SIZE);
var localbuffer = localBuffers[sliceId];
var conflicts = localConflicts[sliceId];
while (indexPos < indexEnd) {
var x = self[indexPos];
var t = checkTarget(indexPos, targets[indexPos]);
if (conflicts[t])
x = collide(x, localbuffer[t]);
UnsafePutElements(localbuffer, t, x, conflicts, t, true,
info, SLICE_POS(sliceId), ++indexPos);
}
return indexEnd === info[SLICE_END(sliceId)];
}
/**
* Merge buffers 1..NUMSLICES into buffer 0. In principle, we could
* parallelize the merge work as well. But for this first cut,
* just do the merge sequentially.
*/
function mergeBuffers() {
var buffer = localBuffers[0];
var conflicts = localConflicts[0];
for (var i = 1; i < numSlices; i++) {
var otherbuffer = localBuffers[i];
var otherconflicts = localConflicts[i];
for (var j = 0; j < length; j++) {
if (otherconflicts[j]) {
if (conflicts[j]) {
buffer[j] = collide(otherbuffer[j], buffer[j]);
} else {
buffer[j] = otherbuffer[j];
conflicts[j] = true;
}
}
}
}
}
return undefined;
}
function seq() {
var buffer = NewDenseArray(length);
var conflicts = NewDenseArray(length);
@ -1294,13 +1023,7 @@ function ArrayFilterPar(func, mode) {
if (!TRY_PARALLEL(mode))
break parallel;
var chunks = ComputeNumChunks(length);
var numWorkers = ForkJoinNumWorkers();
if (chunks < numWorkers * 2)
break parallel;
var numSlices = ComputeNumSlices(numWorkers, length, chunks);
var info = ComputeAllSliceBounds(chunks, numSlices);
var slicesInfo = ComputeSlicesInfo(length);
// Step 1. Compute which items from each slice of the result
// buffer should be preserved. When we're done, we have an array
@ -1310,11 +1033,15 @@ function ArrayFilterPar(func, mode) {
// preserved from within one slice.
//
// FIXME(bug 844890): Use typed arrays here.
var numSlices = SLICE_COUNT(slicesInfo);
var counts = NewDenseArray(numSlices);
for (var i = 0; i < numSlices; i++)
UnsafePutElements(counts, i, 0);
var survivors = NewDenseArray(chunks);
ForkJoin(findSurvivorsInSlice, ForkJoinMode(mode), numSlices);
var survivors = NewDenseArray(computeNum32BitChunks(length));
ForkJoin(findSurvivorsThread, ShrinkLeftmost(slicesInfo), ForkJoinMode(mode));
// Clear the slices' statuses in between phases.
SlicesInfoClearStatuses(slicesInfo);
// Step 2. Compress the slices into one contiguous set.
var count = 0;
@ -1322,7 +1049,7 @@ function ArrayFilterPar(func, mode) {
count += counts[i];
var buffer = NewDenseArray(count);
if (count > 0)
ForkJoin(copySurvivorsInSlice, ForkJoinMode(mode), numSlices);
ForkJoin(copySurvivorsThread, ShrinkLeftmost(slicesInfo), ForkJoinMode(mode));
return buffer;
}
@ -1337,80 +1064,97 @@ function ArrayFilterPar(func, mode) {
}
return buffer;
/**
* Determine the number of 32-bit chunks for use with the survivors bitset.
*/
function computeNum32BitChunks(length) {
var chunks = length >>> 5;
if (chunks << 5 === length)
return chunks;
return chunks + 1;
}
/**
* As described above, our goal is to determine which items we
* will preserve from a given slice. We do this one chunk at a
* time. When we finish a chunk, we record our current count and
* the next chunk sliceId, lest we should bail.
*/
function findSurvivorsInSlice(sliceId, warmup) {
var chunkPos = info[SLICE_POS(sliceId)];
var chunkEnd = info[SLICE_END(sliceId)];
if (warmup && chunkEnd > chunkPos)
chunkEnd = chunkPos + 1;
var count = counts[sliceId];
while (chunkPos < chunkEnd) {
var indexStart = chunkPos << CHUNK_SHIFT;
var indexEnd = std_Math_min(indexStart + CHUNK_SIZE, length);
var chunkBits = 0;
for (var bit = 0; indexStart + bit < indexEnd; bit++) {
var keep = !!func(self[indexStart + bit], indexStart + bit, self);
chunkBits |= keep << bit;
count += keep;
function findSurvivorsThread(warmup) {
var sliceId;
while (GET_SLICE(slicesInfo, sliceId)) {
var count = 0;
var indexStart = SLICE_START(slicesInfo, sliceId);
var indexEnd = SLICE_END(slicesInfo, indexStart, length);
var chunkStart = computeNum32BitChunks(indexStart);
var chunkEnd = computeNum32BitChunks(indexEnd);
for (var chunkPos = chunkStart; chunkPos < chunkEnd; chunkPos++, indexStart += 32) {
var chunkBits = 0;
for (var bit = 0, indexPos = indexStart; bit < 32 && indexPos < indexEnd; bit++, indexPos++) {
var keep = !!func(self[indexPos], indexPos, self);
chunkBits |= keep << bit;
count += keep;
}
UnsafePutElements(survivors, chunkPos, chunkBits);
}
UnsafePutElements(counts, sliceId, count);
UnsafePutElements(survivors, chunkPos, chunkBits,
counts, sliceId, count,
info, SLICE_POS(sliceId), ++chunkPos);
MARK_SLICE_DONE(slicesInfo, sliceId);
if (warmup)
return;
}
return chunkEnd === info[SLICE_END(sliceId)];
}
function copySurvivorsInSlice(sliceId, warmup) {
// Copies the survivors from this slice into the correct position.
// Note that this is an idempotent operation that does not invoke
// user code. Therefore, we don't expect bailouts and make an
// effort to proceed chunk by chunk or avoid duplicating work.
function copySurvivorsThread(warmup) {
var sliceId;
while (GET_SLICE(slicesInfo, sliceId)) {
// Copies the survivors from this slice into the correct position.
// Note that this is an idempotent operation that does not invoke
// user code. Therefore, we don't expect bailouts and make an
// effort to proceed chunk by chunk or avoid duplicating work.
// Total up the items preserved by previous slices.
var count = 0;
if (sliceId > 0) { // FIXME(#819219)---work around a bug in Ion's range checks
for (var i = 0; i < sliceId; i++)
count += counts[i];
}
// Total up the items preserved by previous slices.
var total = 0;
for (var i = 0; i < sliceId + 1; i++)
total += counts[i];
// Compute the final index we expect to write.
var total = count + counts[sliceId];
if (count === total)
return true;
// Iterate over the chunks assigned to us. Read the bitset for
// each chunk. Copy values where a 1 appears until we have
// written all the values that we expect to. We can just iterate
// from 0...CHUNK_SIZE without fear of a truncated final chunk
// because we are already checking for when count==total.
var chunkStart = info[SLICE_START(sliceId)];
var chunkEnd = info[SLICE_END(sliceId)];
for (var chunk = chunkStart; chunk < chunkEnd; chunk++) {
var chunkBits = survivors[chunk];
if (!chunkBits)
// Compute the final index we expect to write.
var count = total - counts[sliceId];
if (count === total) {
MARK_SLICE_DONE(slicesInfo, sliceId);
continue;
var indexStart = chunk << CHUNK_SHIFT;
for (var i = 0; i < CHUNK_SIZE; i++) {
if (chunkBits & (1 << i)) {
UnsafePutElements(buffer, count++, self[indexStart + i]);
if (count === total)
break;
}
}
}
return true;
// Iterate over the chunks assigned to us. Read the bitset for
// each chunk. Copy values where a 1 appears until we have
// written all the values that we expect to. We can just iterate
// from 0...CHUNK_SIZE without fear of a truncated final chunk
// because we are already checking for when count==total.
var indexStart = SLICE_START(slicesInfo, sliceId);
var indexEnd = SLICE_END(slicesInfo, indexStart, length);
var chunkStart = computeNum32BitChunks(indexStart);
var chunkEnd = computeNum32BitChunks(indexEnd);
for (var chunkPos = chunkStart; chunkPos < chunkEnd; chunkPos++, indexStart += 32) {
var chunkBits = survivors[chunkPos];
if (!chunkBits)
continue;
for (var i = 0; i < 32; i++) {
if (chunkBits & (1 << i)) {
UnsafePutElements(buffer, count++, self[indexStart + i]);
if (count === total)
break;
}
}
if (count == total)
break;
}
MARK_SLICE_DONE(slicesInfo, sliceId);
if (warmup)
return;
}
}
return undefined;
@ -1452,39 +1196,28 @@ function ArrayStaticBuildPar(length, func, mode) {
if (!TRY_PARALLEL(mode))
break parallel;
var chunks = ComputeNumChunks(length);
var numWorkers = ForkJoinNumWorkers();
var numSlices = ComputeNumSlices(numWorkers, length, chunks);
var info = ComputeAllSliceBounds(chunks, numSlices);
ForkJoin(constructSlice, ForkJoinMode(mode), numSlices);
var slicesInfo = ComputeSlicesInfo(length);
ForkJoin(constructThread, ShrinkLeftmost(slicesInfo), ForkJoinMode(mode));
return buffer;
}
// Sequential fallback:
ASSERT_SEQUENTIAL_IS_OK(mode);
fill(0, length);
for (var i = 0; i < length; i++)
UnsafePutElements(buffer, i, func(i));
return buffer;
function constructSlice(sliceId, warmup) {
var chunkPos = info[SLICE_POS(sliceId)];
var chunkEnd = info[SLICE_END(sliceId)];
if (warmup && chunkEnd > chunkPos)
chunkEnd = chunkPos + 1;
while (chunkPos < chunkEnd) {
var indexStart = chunkPos << CHUNK_SHIFT;
var indexEnd = std_Math_min(indexStart + CHUNK_SIZE, length);
fill(indexStart, indexEnd);
UnsafePutElements(info, SLICE_POS(sliceId), ++chunkPos);
function constructThread(warmup) {
var sliceId;
while (GET_SLICE(slicesInfo, sliceId)) {
var indexStart = SLICE_START(slicesInfo, sliceId);
var indexEnd = SLICE_END(slicesInfo, indexStart, length);
for (var i = indexStart; i < indexEnd; i++)
UnsafePutElements(buffer, i, func(i));
MARK_SLICE_DONE(slicesInfo, sliceId);
if (warmup)
return;
}
return chunkEnd === info[SLICE_END(sliceId)];
}
function fill(indexStart, indexEnd) {
for (var i = indexStart; i < indexEnd; i++)
UnsafePutElements(buffer, i, func(i));
}
return undefined;

View File

@ -57,6 +57,7 @@ var std_Math_floor = Math.floor;
var std_Math_max = Math.max;
var std_Math_min = Math.min;
var std_Math_imul = Math.imul;
var std_Math_log2 = Math.log2;
var std_Number_valueOf = Number.prototype.valueOf;
var std_Number_POSITIVE_INFINITY = Number.POSITIVE_INFINITY;
var std_Object_create = Object.create;

View File

@ -100,7 +100,7 @@ function assertEqArray(a, b) {
try {
assertStructuralEq(a[i], b[i]);
} catch (e) {
print("...in index ", i, " of ", l);
print("...in index", i, "of", l);
throw e;
}
}

View File

@ -17,5 +17,6 @@ function test() {
}
}
if (getBuildConfiguration().parallelJS)
test();
// FIXME: Bug 949296. Broken due to all interrupt triggers aborting PJS.
//if (getBuildConfiguration().parallelJS)
// test();

View File

@ -143,7 +143,8 @@ IonBuilder::inlineNativeCall(CallInfo &callInfo, JSNative native)
return inlineUnsafeGetReservedSlot(callInfo);
// Parallel intrinsics.
if (native == intrinsic_ShouldForceSequential)
if (native == intrinsic_ShouldForceSequential ||
native == intrinsic_InParallelSection)
return inlineForceSequentialOrInParallelSection(callInfo);
// Utility intrinsics.

View File

@ -1020,6 +1020,8 @@ bool intrinsic_IsPackedArray(JSContext *cx, unsigned argc, Value *vp);
bool intrinsic_ShouldForceSequential(JSContext *cx, unsigned argc, Value *vp);
bool intrinsic_NewParallelArray(JSContext *cx, unsigned argc, Value *vp);
bool intrinsic_ForkJoinGetSlice(JSContext *cx, unsigned argc, Value *vp);
bool intrinsic_InParallelSection(JSContext *cx, unsigned argc, Value *vp);
class AutoLockForExclusiveAccess
{

View File

@ -48,17 +48,14 @@ using mozilla::ThreadLocal;
// altogether.
static bool
ExecuteSequentially(JSContext *cx_, HandleValue funVal, bool *complete,
uint16_t sliceStart, uint16_t numSlices);
ExecuteSequentially(JSContext *cx_, HandleValue funVal);
#if !defined(JS_THREADSAFE) || !defined(JS_ION)
bool
js::ForkJoin(JSContext *cx, CallArgs &args)
{
RootedValue argZero(cx, args[0]);
bool complete = false; // since warmup is false, will always complete
uint32_t numSlices = args[2].toInt32();
return ExecuteSequentially(cx, argZero, &complete, 0, numSlices);
return ExecuteSequentially(cx, argZero);
}
JSContext *
@ -169,25 +166,16 @@ JS_JITINFO_NATIVE_PARALLEL(js::intrinsic_SetForkJoinTargetRegionInfo,
// Some code that is shared between degenerate and parallel configurations.
static bool
ExecuteSequentially(JSContext *cx, HandleValue funVal, bool *complete,
uint16_t sliceStart, uint16_t numSlices)
ExecuteSequentially(JSContext *cx, HandleValue funVal)
{
bool allComplete = true;
for (uint16_t i = sliceStart; i < numSlices; i++) {
FastInvokeGuard fig(cx, funVal);
InvokeArgs &args = fig.args();
if (!args.init(2))
return false;
args.setCallee(funVal);
args.setThis(UndefinedValue());
args[0].setInt32(i);
args[1].setBoolean(!!cx->runtime()->parallelWarmup);
if (!fig.invoke(cx))
return false;
allComplete = allComplete & args.rval().toBoolean();
}
*complete = allComplete;
return true;
FastInvokeGuard fig(cx, funVal);
InvokeArgs &args = fig.args();
if (!args.init(1))
return false;
args.setCallee(funVal);
args.setThis(UndefinedValue());
args[0].setBoolean(!!cx->runtime()->parallelWarmup);
return fig.invoke(cx);
}
ThreadLocal<ForkJoinContext*> ForkJoinContext::tlsForkJoinContext;
@ -260,7 +248,8 @@ class ForkJoinOperation
RootedScript bailoutScript;
jsbytecode *bailoutBytecode;
ForkJoinOperation(JSContext *cx, HandleObject fun, ForkJoinMode mode, uint16_t numSlices);
ForkJoinOperation(JSContext *cx, HandleFunction fun, HandleFunction boundsFun,
ForkJoinMode mode);
ExecutionStatus apply();
private:
@ -298,18 +287,16 @@ class ForkJoinOperation
};
JSContext *cx_;
HandleObject fun_;
HandleFunction fun_;
HandleFunction boundsFun_;
Vector<ParallelBailoutRecord, 16> bailoutRecords_;
AutoScriptVector worklist_;
Vector<WorklistData, 16> worklistData_;
ForkJoinMode mode_;
uint16_t warmupSlice_;
uint16_t numSlices_;
TrafficLight enqueueInitialScript(ExecutionStatus *status);
TrafficLight compileForParallelExecution(ExecutionStatus *status);
TrafficLight warmupExecution(bool stopIfComplete,
ExecutionStatus *status);
TrafficLight warmupExecution(bool stopIfComplete, ExecutionStatus *status);
TrafficLight parallelExecution(ExecutionStatus *status);
TrafficLight sequentialExecution(bool disqualified, ExecutionStatus *status);
TrafficLight recoverFromBailout(ExecutionStatus *status);
@ -318,12 +305,12 @@ class ForkJoinOperation
bool invalidateBailedOutScripts();
ExecutionStatus sequentialExecution(bool disqualified);
TrafficLight appendCallTargetsToWorklist(uint32_t index,
ExecutionStatus *status);
TrafficLight appendCallTargetToWorklist(HandleScript script,
ExecutionStatus *status);
TrafficLight appendCallTargetsToWorklist(uint32_t index, ExecutionStatus *status);
TrafficLight appendCallTargetToWorklist(HandleScript script, ExecutionStatus *status);
bool addToWorklist(HandleScript script);
inline bool hasScript(Vector<types::RecompileInfo> &scripts, JSScript *script);
bool computeBounds(uint16_t *start, uint16_t *end);
}; // class ForkJoinOperation
class ForkJoinShared : public ParallelJob, public Monitor
@ -331,12 +318,13 @@ class ForkJoinShared : public ParallelJob, public Monitor
/////////////////////////////////////////////////////////////////////////
// Constant fields
JSContext *const cx_; // Current context
ThreadPool *const threadPool_; // The thread pool.
HandleObject fun_; // The JavaScript function to execute.
uint16_t numSlices_; // Total number of slices. Dynamically changed
PRLock *cxLock_; // Locks cx_ for parallel VM calls.
ParallelBailoutRecord *const records_; // Bailout records for each slice
JSContext *const cx_; // Current context
ThreadPool *const threadPool_; // The thread pool
HandleFunction fun_; // The JavaScript function to execute
uint16_t sliceFrom_; // The starting slice id.
uint16_t sliceTo_; // The ending slice id + 1.
PRLock *cxLock_; // Locks cx_ for parallel VM calls
ParallelBailoutRecord *const records_; // Bailout records for each worker
/////////////////////////////////////////////////////////////////////////
// Per-thread arenas
@ -369,8 +357,9 @@ class ForkJoinShared : public ParallelJob, public Monitor
public:
ForkJoinShared(JSContext *cx,
ThreadPool *threadPool,
HandleObject fun,
uint16_t numSlices,
HandleFunction fun,
uint16_t sliceFrom,
uint16_t sliceTo,
ParallelBailoutRecord *records);
~ForkJoinShared();
@ -379,14 +368,13 @@ class ForkJoinShared : public ParallelJob, public Monitor
ParallelResult execute();
// Invoked from parallel worker threads:
virtual bool executeFromWorker(uint16_t sliceId, uint32_t workerId,
uintptr_t stackLimit) MOZ_OVERRIDE;
virtual bool executeFromWorker(uint32_t workerId, uintptr_t stackLimit) MOZ_OVERRIDE;
// Invoked only from the main thread:
virtual bool executeFromMainThread(uint16_t sliceId) MOZ_OVERRIDE;
virtual bool executeFromMainThread() MOZ_OVERRIDE;
// Executes slice |sliceId| either from a worker or the main thread.
void executePortion(PerThreadData *perThread, uint16_t sliceId, uint32_t workerId);
// Executes the user-supplied function a worker or the main thread.
void executePortion(PerThreadData *perThread, uint32_t workerId);
// Moves all the per-thread arenas into the main compartment and processes
// any pending requests for a GC. This can only safely be invoked on the
@ -489,16 +477,16 @@ js::ForkJoin(JSContext *cx, CallArgs &args)
JS_ASSERT(args.length() == 3); // else the self-hosted code is wrong
JS_ASSERT(args[0].isObject());
JS_ASSERT(args[0].toObject().is<JSFunction>());
JS_ASSERT(args[1].isInt32());
JS_ASSERT(args[1].toInt32() < NumForkJoinModes);
JS_ASSERT(args[1].isObject());
JS_ASSERT(args[1].toObject().is<JSFunction>());
JS_ASSERT(args[2].isInt32());
JS_ASSERT(args[2].toInt32() < NumForkJoinModes);
RootedObject fun(cx, &args[0].toObject());
ForkJoinMode mode = (ForkJoinMode) args[1].toInt32();
uint32_t numSlices = args[2].toInt32();
MOZ_ASSERT(uint32_t(uint16_t(numSlices)) == numSlices);
RootedFunction fun(cx, &args[0].toObject().as<JSFunction>());
RootedFunction boundsFun(cx, &args[1].toObject().as<JSFunction>());
ForkJoinMode mode = (ForkJoinMode) args[2].toInt32();
ForkJoinOperation op(cx, fun, mode, numSlices);
ForkJoinOperation op(cx, fun, boundsFun, mode);
ExecutionStatus status = op.apply();
if (status == ExecutionFatal)
return false;
@ -557,24 +545,23 @@ ForkJoinModeString(ForkJoinMode mode) {
return "???";
}
js::ForkJoinOperation::ForkJoinOperation(JSContext *cx, HandleObject fun, ForkJoinMode mode,
uint16_t numSlices)
ForkJoinOperation::ForkJoinOperation(JSContext *cx, HandleFunction fun, HandleFunction boundsFun,
ForkJoinMode mode)
: bailouts(0),
bailoutCause(ParallelBailoutNone),
bailoutScript(cx),
bailoutBytecode(nullptr),
cx_(cx),
fun_(fun),
boundsFun_(boundsFun),
bailoutRecords_(cx),
worklist_(cx),
worklistData_(cx),
mode_(mode),
warmupSlice_(0),
numSlices_(numSlices)
mode_(mode)
{ }
ExecutionStatus
js::ForkJoinOperation::apply()
ForkJoinOperation::apply()
{
ExecutionStatus status;
@ -660,8 +647,8 @@ js::ForkJoinOperation::apply()
return SpewEndOp(sequentialExecution(true));
}
js::ForkJoinOperation::TrafficLight
js::ForkJoinOperation::enqueueInitialScript(ExecutionStatus *status)
ForkJoinOperation::TrafficLight
ForkJoinOperation::enqueueInitialScript(ExecutionStatus *status)
{
// GreenLight: script successfully enqueued if necessary
// RedLight: fatal error or fell back to sequential
@ -698,8 +685,8 @@ js::ForkJoinOperation::enqueueInitialScript(ExecutionStatus *status)
return GreenLight;
}
js::ForkJoinOperation::TrafficLight
js::ForkJoinOperation::compileForParallelExecution(ExecutionStatus *status)
ForkJoinOperation::TrafficLight
ForkJoinOperation::compileForParallelExecution(ExecutionStatus *status)
{
// GreenLight: all scripts compiled
// RedLight: fatal error or completed work via warmups or fallback
@ -889,8 +876,8 @@ js::ForkJoinOperation::compileForParallelExecution(ExecutionStatus *status)
return GreenLight;
}
js::ForkJoinOperation::TrafficLight
js::ForkJoinOperation::appendCallTargetsToWorklist(uint32_t index, ExecutionStatus *status)
ForkJoinOperation::TrafficLight
ForkJoinOperation::appendCallTargetsToWorklist(uint32_t index, ExecutionStatus *status)
{
// GreenLight: call targets appended
// RedLight: fatal error or completed work via warmups or fallback
@ -918,8 +905,8 @@ js::ForkJoinOperation::appendCallTargetsToWorklist(uint32_t index, ExecutionStat
return GreenLight;
}
js::ForkJoinOperation::TrafficLight
js::ForkJoinOperation::appendCallTargetToWorklist(HandleScript script, ExecutionStatus *status)
ForkJoinOperation::TrafficLight
ForkJoinOperation::appendCallTargetToWorklist(HandleScript script, ExecutionStatus *status)
{
// GreenLight: call target appended if necessary
// RedLight: fatal error or completed work via warmups or fallback
@ -949,7 +936,7 @@ js::ForkJoinOperation::appendCallTargetToWorklist(HandleScript script, Execution
}
bool
js::ForkJoinOperation::addToWorklist(HandleScript script)
ForkJoinOperation::addToWorklist(HandleScript script)
{
for (uint32_t i = 0; i < worklist_.length(); i++) {
if (worklist_[i] == script) {
@ -977,8 +964,8 @@ js::ForkJoinOperation::addToWorklist(HandleScript script)
return true;
}
js::ForkJoinOperation::TrafficLight
js::ForkJoinOperation::sequentialExecution(bool disqualified, ExecutionStatus *status)
ForkJoinOperation::TrafficLight
ForkJoinOperation::sequentialExecution(bool disqualified, ExecutionStatus *status)
{
// RedLight: fatal error or completed work
@ -987,26 +974,21 @@ js::ForkJoinOperation::sequentialExecution(bool disqualified, ExecutionStatus *s
}
ExecutionStatus
js::ForkJoinOperation::sequentialExecution(bool disqualified)
ForkJoinOperation::sequentialExecution(bool disqualified)
{
// XXX use disqualified to set parallelIon to ION_DISABLED_SCRIPT?
Spew(SpewOps, "Executing sequential execution (disqualified=%d).",
disqualified);
bool complete = false;
RootedValue funVal(cx_, ObjectValue(*fun_));
if (!ExecuteSequentially(cx_, funVal, &complete, 0, numSlices_))
if (!ExecuteSequentially(cx_, funVal))
return ExecutionFatal;
// When invoked without the warmup flag set to true, the kernel
// function OUGHT to complete successfully, barring an exception.
JS_ASSERT(complete);
return ExecutionSequential;
}
js::ForkJoinOperation::TrafficLight
js::ForkJoinOperation::fatalError(ExecutionStatus *status)
ForkJoinOperation::TrafficLight
ForkJoinOperation::fatalError(ExecutionStatus *status)
{
// RedLight: fatal error
@ -1058,7 +1040,7 @@ BailoutExplanation(ParallelBailoutCause cause)
}
void
js::ForkJoinOperation::determineBailoutCause()
ForkJoinOperation::determineBailoutCause()
{
bailoutCause = ParallelBailoutNone;
for (uint32_t i = 0; i < bailoutRecords_.length(); i++) {
@ -1096,7 +1078,7 @@ js::ForkJoinOperation::determineBailoutCause()
}
bool
js::ForkJoinOperation::invalidateBailedOutScripts()
ForkJoinOperation::invalidateBailedOutScripts()
{
Vector<types::RecompileInfo> invalid(cx_);
for (uint32_t i = 0; i < bailoutRecords_.length(); i++) {
@ -1148,48 +1130,52 @@ js::ForkJoinOperation::invalidateBailedOutScripts()
return true;
}
js::ForkJoinOperation::TrafficLight
js::ForkJoinOperation::warmupExecution(bool stopIfComplete, ExecutionStatus *status)
ForkJoinOperation::TrafficLight
ForkJoinOperation::warmupExecution(bool stopIfComplete, ExecutionStatus *status)
{
// GreenLight: warmup succeeded, still more work to do
// RedLight: fatal error or warmup completed all work (check status)
Spew(SpewOps, "Executing warmup of slice %u.", warmupSlice_);
AutoEnterWarmup warmup(cx_->runtime());
RootedValue funVal(cx_, ObjectValue(*fun_));
bool complete;
uint32_t warmupTo = Min<uint16_t>(warmupSlice_ + 1, numSlices_);
if (!ExecuteSequentially(cx_, funVal, &complete, warmupSlice_, warmupTo)) {
uint16_t from, to;
if (!computeBounds(&from, &to)) {
*status = ExecutionFatal;
return RedLight;
}
if (complete) {
warmupSlice_ = warmupTo;
if (warmupSlice_ == numSlices_) {
if (stopIfComplete) {
Spew(SpewOps, "Warmup execution finished all the work.");
*status = ExecutionWarmup;
return RedLight;
}
if (from == to) {
Spew(SpewOps, "Warmup execution finished all the work.");
// If we finished all slices in warmup, be sure to check the
// interrupt flag. This is because we won't be running more JS
// code, and thus no more automatic checking of the interrupt
// flag.
if (!js_HandleExecutionInterrupt(cx_)) {
*status = ExecutionFatal;
return RedLight;
}
if (stopIfComplete) {
*status = ExecutionWarmup;
return RedLight;
}
// If we finished all slices in warmup, be sure to check the
// interrupt flag. This is because we won't be running more JS
// code, and thus no more automatic checking of the interrupt
// flag.
if (!js_HandleExecutionInterrupt(cx_)) {
*status = ExecutionFatal;
return RedLight;
}
return GreenLight;
}
Spew(SpewOps, "Executing warmup.");
AutoEnterWarmup warmup(cx_->runtime());
RootedValue funVal(cx_, ObjectValue(*fun_));
if (!ExecuteSequentially(cx_, funVal)) {
*status = ExecutionFatal;
return RedLight;
}
return GreenLight;
}
js::ForkJoinOperation::TrafficLight
js::ForkJoinOperation::parallelExecution(ExecutionStatus *status)
ForkJoinOperation::TrafficLight
ForkJoinOperation::parallelExecution(ExecutionStatus *status)
{
// GreenLight: bailout occurred, keep trying
// RedLight: fatal error or all work completed
@ -1201,10 +1187,20 @@ js::ForkJoinOperation::parallelExecution(ExecutionStatus *status)
ForkJoinActivation activation(cx_);
ThreadPool *threadPool = &cx_->runtime()->threadPool;
uint16_t from, to;
if (!computeBounds(&from, &to)) {
*status = ExecutionFatal;
return RedLight;
}
RootedObject rootedFun(cx_, fun_);
ForkJoinShared shared(cx_, threadPool, rootedFun, numSlices_, &bailoutRecords_[0]);
if (from == to) {
Spew(SpewOps, "Warmup execution finished all the work.");
*status = ExecutionWarmup;
return RedLight;
}
ThreadPool *threadPool = &cx_->runtime()->threadPool;
ForkJoinShared shared(cx_, threadPool, fun_, from, to, &bailoutRecords_[0]);
if (!shared.init()) {
*status = ExecutionFatal;
return RedLight;
@ -1227,8 +1223,8 @@ js::ForkJoinOperation::parallelExecution(ExecutionStatus *status)
return GreenLight;
}
js::ForkJoinOperation::TrafficLight
js::ForkJoinOperation::recoverFromBailout(ExecutionStatus *status)
ForkJoinOperation::TrafficLight
ForkJoinOperation::recoverFromBailout(ExecutionStatus *status)
{
// GreenLight: bailout recovered, try to compile-and-run again
// RedLight: fatal error
@ -1240,7 +1236,7 @@ js::ForkJoinOperation::recoverFromBailout(ExecutionStatus *status)
// After any bailout, we always scan over callee list of main
// function, if nothing else
RootedScript mainScript(cx_, fun_->as<JSFunction>().nonLazyScript());
RootedScript mainScript(cx_, fun_->nonLazyScript());
if (!addToWorklist(mainScript))
return fatalError(status);
@ -1256,7 +1252,7 @@ js::ForkJoinOperation::recoverFromBailout(ExecutionStatus *status)
}
bool
js::ForkJoinOperation::hasScript(Vector<types::RecompileInfo> &scripts, JSScript *script)
ForkJoinOperation::hasScript(Vector<types::RecompileInfo> &scripts, JSScript *script)
{
for (uint32_t i = 0; i < scripts.length(); i++) {
if (scripts[i] == script->parallelIonScript()->recompileInfo())
@ -1265,6 +1261,36 @@ js::ForkJoinOperation::hasScript(Vector<types::RecompileInfo> &scripts, JSScript
return false;
}
bool
ForkJoinOperation::computeBounds(uint16_t *start, uint16_t *end)
{
RootedValue funVal(cx_, ObjectValue(*boundsFun_));
FastInvokeGuard fig(cx_, funVal);
InvokeArgs &args = fig.args();
if (!args.init(0))
return false;
args.setCallee(funVal);
args.setThis(UndefinedValue());
if (!fig.invoke(cx_))
return false;
MOZ_ASSERT(args.rval().toObject().is<ArrayObject>());
MOZ_ASSERT(args.rval().toObject().getDenseInitializedLength() == 2);
int32_t start32 = args.rval().toObject().getDenseElement(0).toInt32();
int32_t end32 = args.rval().toObject().getDenseElement(1).toInt32();
MOZ_ASSERT(int32_t(uint16_t(start32)) == start32);
MOZ_ASSERT(int32_t(uint16_t(end32)) == end32);
*start = uint16_t(start32);
*end = uint16_t(end32);
return true;
}
// Can only enter callees with a valid IonScript.
template <uint32_t maxArgc>
class ParallelIonInvoke
@ -1312,13 +1338,15 @@ class ParallelIonInvoke
ForkJoinShared::ForkJoinShared(JSContext *cx,
ThreadPool *threadPool,
HandleObject fun,
uint16_t numSlices,
HandleFunction fun,
uint16_t sliceFrom,
uint16_t sliceTo,
ParallelBailoutRecord *records)
: cx_(cx),
threadPool_(threadPool),
fun_(fun),
numSlices_(numSlices),
sliceFrom_(sliceFrom),
sliceTo_(sliceTo),
cxLock_(nullptr),
records_(records),
allocators_(cx),
@ -1388,7 +1416,7 @@ ForkJoinShared::execute()
AutoUnlockMonitor unlock(*this);
// Push parallel tasks and wait until they're all done.
jobResult = threadPool_->executeJob(cx_, this, numSlices_);
jobResult = threadPool_->executeJob(cx_, this, sliceFrom_, sliceTo_);
if (jobResult == TP_FATAL)
return TP_FATAL;
}
@ -1404,7 +1432,7 @@ ForkJoinShared::execute()
#ifdef DEBUG
Spew(SpewOps, "Completed parallel job [slices %d, threads: %d (+1), stolen: %d (work stealing:%s)]",
numSlices_,
sliceTo_ - sliceFrom_,
threadPool_->numWorkers(),
threadPool_->stolenSlices(),
threadPool_->workStealing() ? "ON" : "OFF");
@ -1432,10 +1460,8 @@ ForkJoinShared::transferArenasToCompartmentAndProcessGCRequests()
}
bool
ForkJoinShared::executeFromWorker(uint16_t sliceId, uint32_t workerId, uintptr_t stackLimit)
ForkJoinShared::executeFromWorker(uint32_t workerId, uintptr_t stackLimit)
{
JS_ASSERT(sliceId <= numSlices_);
PerThreadData thisThread(cx_->runtime());
if (!thisThread.init()) {
setAbortFlag(true);
@ -1450,21 +1476,21 @@ ForkJoinShared::executeFromWorker(uint16_t sliceId, uint32_t workerId, uintptr_t
// Don't use setIonStackLimit() because that acquires the ionStackLimitLock, and the
// lock has not been initialized in these cases.
thisThread.ionStackLimit = stackLimit;
executePortion(&thisThread, sliceId, workerId);
executePortion(&thisThread, workerId);
TlsPerThreadData.set(nullptr);
return !abort_;
}
bool
ForkJoinShared::executeFromMainThread(uint16_t sliceId)
ForkJoinShared::executeFromMainThread()
{
executePortion(&cx_->mainThread(), sliceId, threadPool_->numWorkers());
executePortion(&cx_->mainThread(), threadPool_->numWorkers());
return !abort_;
}
void
ForkJoinShared::executePortion(PerThreadData *perThread, uint16_t sliceId, uint32_t workerId)
ForkJoinShared::executePortion(PerThreadData *perThread, uint32_t workerId)
{
// WARNING: This code runs ON THE PARALLEL WORKER THREAD.
// Be careful when accessing cx_.
@ -1475,16 +1501,15 @@ ForkJoinShared::executePortion(PerThreadData *perThread, uint16_t sliceId, uint3
JS::AutoAssertNoGC nogc(runtime());
Allocator *allocator = allocators_[workerId];
ForkJoinContext cx(perThread, sliceId, workerId, allocator, this, &records_[workerId]);
ForkJoinContext cx(perThread, workerId, allocator, this, &records_[workerId]);
AutoSetForkJoinContext autoContext(&cx);
#ifdef DEBUG
// Set the maximum worker and slice number for prettier spewing.
cx.maxSliceId = numSlices_ - 1;
cx.maxWorkerId = threadPool_->numWorkers();
#endif
Spew(SpewOps, "Slice up");
Spew(SpewOps, "Up");
// Make a new IonContext for the slice, which is needed if we need to
// re-enter the VM.
@ -1494,10 +1519,7 @@ ForkJoinShared::executePortion(PerThreadData *perThread, uint16_t sliceId, uint3
JS_ASSERT(cx.bailoutRecord->topScript == nullptr);
RootedObject fun(perThread, fun_);
JS_ASSERT(fun->is<JSFunction>());
RootedFunction callee(perThread, &fun->as<JSFunction>());
if (!callee->nonLazyScript()->hasParallelIonScript()) {
if (!fun_->nonLazyScript()->hasParallelIonScript()) {
// Sometimes, particularly with GCZeal, the parallel ion
// script can be collected between starting the parallel
// op and reaching this point. In that case, we just fail
@ -1506,10 +1528,9 @@ ForkJoinShared::executePortion(PerThreadData *perThread, uint16_t sliceId, uint3
cx.bailoutRecord->setCause(ParallelBailoutMainScriptNotPresent);
setAbortFlag(false);
} else {
ParallelIonInvoke<2> fii(cx_->runtime(), callee, 2);
ParallelIonInvoke<2> fii(cx_->runtime(), fun_, 1);
fii.args[0] = Int32Value(cx.sliceId);
fii.args[1] = BooleanValue(false);
fii.args[0] = BooleanValue(false);
bool ok = fii.invoke(perThread);
JS_ASSERT(ok == !cx.bailoutRecord->topScript);
@ -1517,7 +1538,7 @@ ForkJoinShared::executePortion(PerThreadData *perThread, uint16_t sliceId, uint3
setAbortFlag(false);
}
Spew(SpewOps, "Slice down");
Spew(SpewOps, "Down");
}
bool
@ -1595,12 +1616,10 @@ ForkJoinShared::requestZoneGC(JS::Zone *zone, JS::gcreason::Reason reason)
// ForkJoinContext
//
ForkJoinContext::ForkJoinContext(PerThreadData *perThreadData,
uint16_t sliceId, uint32_t workerId,
ForkJoinContext::ForkJoinContext(PerThreadData *perThreadData, uint32_t workerId,
Allocator *allocator, ForkJoinShared *shared,
ParallelBailoutRecord *bailoutRecord)
: ThreadSafeContext(shared->runtime(), perThreadData, Context_ForkJoin),
sliceId(sliceId),
workerId(workerId),
bailoutRecord(bailoutRecord),
targetRegionStart(nullptr),
@ -1884,12 +1903,12 @@ class ParallelSpewer
if (ForkJoinContext *cx = ForkJoinContext::current()) {
// Print the format first into a buffer to right-justify the
// worker and slice ids.
// worker ids.
char bufbuf[BufferSize];
JS_snprintf(bufbuf, BufferSize, "[%%sParallel:%%0%du(%%0%du)%%s] ",
NumberOfDigits(cx->maxWorkerId), NumberOfDigits(cx->maxSliceId));
JS_snprintf(bufbuf, BufferSize, "[%%sParallel:%%0%du%%s] ",
NumberOfDigits(cx->maxWorkerId));
JS_snprintf(buf, BufferSize, bufbuf, workerColor(cx->workerId),
cx->workerId, cx->sliceId, reset());
cx->workerId, reset());
} else {
JS_snprintf(buf, BufferSize, "[Parallel:M] ");
}

View File

@ -30,38 +30,49 @@
// to enable parallel execution. At the top-level, it consists of a native
// function (exposed as the ForkJoin intrinsic) that is used like so:
//
// ForkJoin(func, feedback, N)
// ForkJoin(func, boundsFunc, mode)
//
// The intention of this statement is to start |N| copies of |func()|
// running in parallel. Each copy will then do more or less 1/Nth of
// the total work, depending on workstealing-based load balancing.
// The intention of this statement is to start some number (usually the
// number of hardware threads) of copies of |func()| running in parallel. Each
// copy will then do a portion of the total work, depending on
// workstealing-based load balancing.
//
// Typically, each of the N slices runs in a different worker thread,
// but that is not something you should rely upon---if work-stealing
// is enabled it could be that a single worker thread winds up
// handling multiple slices.
// Typically, each of the N slices runs in a different worker thread, but that
// is not something you should rely upon---if work-stealing is enabled it
// could be that a single worker thread winds up handling multiple slices.
//
// The second argument, |feedback|, is an optional callback that will
// receiver information about how execution proceeded. This is
// intended for use in unit testing but also for providing feedback to
// users. Note that gathering the data to provide to |feedback| is
// not free and so execution will run somewhat slower if |feedback| is
// provided.
// The second argument, |boundsFunc|, is a function that must return an array
// of exactly two integers. This function is called before every attempt at
// execution: warmup, sequential, or parallel. The bounds are taken from a
// function call instead of taken as two static integers so that the bounds
// may be shrunk when recovering from bailout.
//
// The third argument, |mode|, is an internal mode integer giving finer
// control over the behavior of ForkJoin. See the |ForkJoinMode| enum.
//
// func() should expect the following arguments:
//
// func(id, n, warmup)
// func(warmup)
//
// Here, |id| is the slice id. |n| is the total number of slices. The
// parameter |warmup| is true for a *warmup or recovery phase*.
// Warmup phases are discussed below in more detail, but the general
// idea is that if |warmup| is true, |func| should only do a fixed
// amount of work. If |warmup| is false, |func| should try to do all
// remaining work it is assigned.
// The parameter |warmup| is true for a *warmup or recovery phase*. Warmup
// phases are discussed below in more detail, but the general idea is that if
// |warmup| is true, |func| should only do a fixed amount of work. If |warmup|
// is false, |func| should try to do all remaining work it is assigned.
//
// Note that we implicitly assume that |func| is tracking how much
// work it has accomplished thus far; some techniques for doing this
// are discussed in |ParallelArray.js|.
// |func| can keep asking for more work from the scheduler by calling the
// intrinsic |ForkJoinGetSlice(id)|. When there are no more slices to hand
// out, -1 is returned as a sentinel value. By exposing this function as an
// intrinsic, we reduce the number of JS-C++ boundary crossings incurred by
// workstealing, which may have many slices.
//
// |func| MUST PROCESS ALL SLICES BEFORE RETURNING! Not doing so is an error
// and is protected by debug asserts in ThreadPool.
//
// Note well that there is a separation of concern between *scheduling* slices
// and *interpreting* slices. ForkJoin only schedules slices by handing out
// slice ids; it does not interpret what slice ids mean. Instead, |func|
// should track how much work it has accomplished thus far; consult |Array.js|
// for some examples.
//
// Warmups and Sequential Fallbacks
// --------------------------------
@ -301,9 +312,6 @@ struct ForkJoinShared;
class ForkJoinContext : public ThreadSafeContext
{
public:
// The slice that is being processed.
const uint16_t sliceId;
// The worker that is doing the work.
const uint32_t workerId;
@ -314,8 +322,7 @@ class ForkJoinContext : public ThreadSafeContext
// Records the last instr. to execute on this thread.
IonLIRTraceData traceData;
// The maximum worker and slice id.
uint16_t maxSliceId;
// The maximum worker id.
uint32_t maxWorkerId;
#endif
@ -336,10 +343,18 @@ class ForkJoinContext : public ThreadSafeContext
uint8_t *targetRegionStart;
uint8_t *targetRegionEnd;
ForkJoinContext(PerThreadData *perThreadData, uint16_t sliceId, uint32_t workerId,
ForkJoinContext(PerThreadData *perThreadData, uint32_t workerId,
Allocator *allocator, ForkJoinShared *shared,
ParallelBailoutRecord *bailoutRecord);
// Get a slice of work for the worker associated with the context.
bool getSlice(uint16_t *sliceId) {
ThreadPool &pool = runtime()->threadPool;
return (isMainThread()
? pool.getSliceForMainThread(sliceId)
: pool.getSliceForWorker(workerId, sliceId));
}
// True if this is the main thread, false if it is one of the parallel workers.
bool isMainThread() const;

View File

@ -316,6 +316,46 @@ intrinsic_ForkJoinNumWorkers(JSContext *cx, unsigned argc, Value *vp)
return true;
}
/*
* ForkJoinGetSlice(id): Returns the id of the next slice to be worked
* on.
*
* Acts as the identity function when called from outside of a ForkJoin
* thread. This odd API is because intrinsics must be called during the
* parallel warm up phase to populate observed type sets, so we must call it
* even during sequential execution. But since there is no thread pool during
sequential execution, the self-hosted code is responsible for computing the
* next sequential slice id and passing it in itself.
*/
// Sequential version of ForkJoinGetSlice(id). Outside of a parallel
// section there is no thread pool scheduler, so this simply echoes back
// the slice id that the self-hosted caller computed itself (identity).
bool
js::intrinsic_ForkJoinGetSlice(JSContext *cx, unsigned argc, Value *vp)
{
    CallArgs callArgs = CallArgsFromVp(argc, vp);
    MOZ_ASSERT(callArgs.length() == 1);
    MOZ_ASSERT(callArgs[0].isInt32());
    callArgs.rval().set(callArgs[0]);
    return true;
}
// Parallel version of ForkJoinGetSlice(id): asks the thread pool for the
// next slice id, returning -1 as the sentinel once no slices remain.
static bool
intrinsic_ForkJoinGetSlicePar(ForkJoinContext *cx, unsigned argc, Value *vp)
{
    CallArgs callArgs = CallArgsFromVp(argc, vp);
    MOZ_ASSERT(callArgs.length() == 1);
    MOZ_ASSERT(callArgs[0].isInt32());
    uint16_t nextSlice;
    int32_t result = cx->getSlice(&nextSlice) ? int32_t(nextSlice) : -1;
    callArgs.rval().setInt32(result);
    return true;
}
JS_JITINFO_NATIVE_PARALLEL(intrinsic_ForkJoinGetSlice_jitInfo,
intrinsic_ForkJoinGetSlicePar);
/*
* NewDenseArray(length): Allocates and returns a new dense array with
* the given length where all values are initialized to holes.
@ -573,6 +613,25 @@ js::intrinsic_ShouldForceSequential(JSContext *cx, unsigned argc, Value *vp)
return true;
}
// Sequential version of InParallelSection(): by definition we are not in
// a parallel section here, so always answer false.
bool
js::intrinsic_InParallelSection(JSContext *cx, unsigned argc, Value *vp)
{
    CallArgs callArgs = CallArgsFromVp(argc, vp);
    callArgs.rval().setBoolean(false);
    return true;
}
// Parallel version of InParallelSection(): this entry point only runs
// inside a parallel section, so always answer true.
static bool
intrinsic_InParallelSectionPar(ForkJoinContext *cx, unsigned argc, Value *vp)
{
    CallArgs callArgs = CallArgsFromVp(argc, vp);
    callArgs.rval().setBoolean(true);
    return true;
}
JS_JITINFO_NATIVE_PARALLEL(intrinsic_InParallelSection_jitInfo,
intrinsic_InParallelSectionPar);
/**
* Returns the default locale as a well-formed, but not necessarily canonicalized,
* BCP-47 language tag.
@ -630,6 +689,12 @@ static const JSFunctionSpec intrinsic_functions[] = {
JS_FNINFO("SetForkJoinTargetRegion",
intrinsic_SetForkJoinTargetRegion,
&intrinsic_SetForkJoinTargetRegionInfo, 2, 0),
JS_FNINFO("ForkJoinGetSlice",
intrinsic_ForkJoinGetSlice,
&intrinsic_ForkJoinGetSlice_jitInfo, 1, 0),
JS_FNINFO("InParallelSection",
intrinsic_InParallelSection,
&intrinsic_InParallelSection_jitInfo, 0, 0),
// See builtin/TypedObject.h for descriptors of the typedobj functions.
JS_FN("NewTypedHandle",

View File

@ -68,7 +68,6 @@ class js::ThreadPoolBaseWorker
void submitSlices(uint16_t sliceFrom, uint16_t sliceTo) {
MOZ_ASSERT(!hasWork());
MOZ_ASSERT(sliceFrom < sliceTo);
sliceBounds_ = ComposeSliceBounds(sliceFrom, sliceTo);
}
@ -98,16 +97,16 @@ class js::ThreadPoolWorker : public ThreadPoolBaseWorker
static void ThreadMain(void *arg);
void run();
// Get a slice of work, from ourself or steal work from other workers
// (or from the main thread).
bool getSlice(uint16_t *sliceId);
public:
ThreadPoolWorker(uint32_t workerId, ThreadPool *pool)
: ThreadPoolBaseWorker(workerId, pool),
state_(CREATED)
{ }
// Get a slice of work, from ourself or steal work from other workers
// (or from the main thread).
bool getSlice(uint16_t *sliceId);
// Invoked from main thread; signals worker to start.
bool start();
@ -124,9 +123,6 @@ class js::ThreadPoolMainWorker : public ThreadPoolBaseWorker
{
friend class ThreadPoolWorker;
// Get a slice of work, from ourself or steal work from other workers.
bool getSlice(uint16_t *sliceId);
public:
bool isActive;
@ -135,6 +131,9 @@ class js::ThreadPoolMainWorker : public ThreadPoolBaseWorker
isActive(false)
{ }
// Get a slice of work, from ourself or steal work from other workers.
bool getSlice(uint16_t *sliceId);
// Execute a job on the main thread.
void executeJob();
};
@ -287,14 +286,8 @@ ThreadPoolWorker::run()
pool_->activeWorkers_++;
}
ParallelJob *job = pool_->job();
uint16_t sliceId;
while (getSlice(&sliceId)) {
if (!job->executeFromWorker(sliceId, workerId_, stackLimit)) {
pool_->abortJob();
break;
}
}
if (!pool_->job()->executeFromWorker(workerId_, stackLimit))
pool_->abortJob();
// Join the pool.
{
@ -315,14 +308,8 @@ ThreadPoolWorker::terminate(AutoLockMonitor &lock)
void
ThreadPoolMainWorker::executeJob()
{
ParallelJob *job = pool_->job();
uint16_t sliceId;
while (getSlice(&sliceId)) {
if (!job->executeFromMainThread(sliceId)) {
pool_->abortJob();
return;
}
}
if (!pool_->job()->executeFromMainThread())
pool_->abortJob();
}
bool
@ -514,8 +501,9 @@ ThreadPool::waitForWorkers(AutoLockMonitor &lock)
}
ParallelResult
ThreadPool::executeJob(JSContext *cx, ParallelJob *job, uint16_t numSlices)
ThreadPool::executeJob(JSContext *cx, ParallelJob *job, uint16_t sliceFrom, uint16_t sliceMax)
{
MOZ_ASSERT(sliceFrom < sliceMax);
MOZ_ASSERT(CurrentThreadCanAccessRuntime(runtime_));
MOZ_ASSERT(activeWorkers_ == 0);
MOZ_ASSERT(!hasWork());
@ -533,10 +521,10 @@ ThreadPool::executeJob(JSContext *cx, ParallelJob *job, uint16_t numSlices)
return TP_FATAL;
// Evenly distribute slices to the workers.
uint16_t numSlices = sliceMax - sliceFrom;
uint16_t slicesPerWorker = numSlices / (numWorkers() + 1);
uint16_t leftover = numSlices % slicesPerWorker;
uint16_t sliceFrom = 0;
uint16_t sliceTo = 0;
uint16_t leftover = numSlices % (numWorkers() + 1);
uint16_t sliceTo = sliceFrom;
for (uint32_t workerId = 0; workerId < numWorkers(); workerId++) {
if (leftover > 0) {
sliceTo += slicesPerWorker + 1;
@ -573,10 +561,28 @@ ThreadPool::executeJob(JSContext *cx, ParallelJob *job, uint16_t numSlices)
waitForWorkers(lock);
}
// Guard against errors in the self-hosted slice processing function. If
// we still have work at this point, it is the user function's fault.
MOZ_ASSERT(!hasWork(), "User function did not process all the slices!");
// Everything went swimmingly. Give yourself a pat on the back.
return TP_SUCCESS;
}
bool
ThreadPool::getSliceForWorker(uint32_t workerId, uint16_t *sliceId)
{
// Hand out the next slice for the given worker; the worker's own
// getSlice may steal a slice from a peer when its local supply is
// exhausted. Returns false when no slices remain anywhere.
MOZ_ASSERT(workers_[workerId]);
return workers_[workerId]->getSlice(sliceId);
}
bool
ThreadPool::getSliceForMainThread(uint16_t *sliceId)
{
// Same as getSliceForWorker, but for the dedicated main-thread worker.
// Returns false when no slices remain to hand out.
MOZ_ASSERT(mainWorker_);
return mainWorker_->getSlice(sliceId);
}
void
ThreadPool::abortJob()
{

View File

@ -26,14 +26,15 @@ class ThreadPoolWorker;
class ThreadPoolMainWorker;
// A ParallelJob is the main runnable abstraction in the ThreadPool.
// ParallelJobs are composed of one or more slices. Each slice is executed by
// the pool by calling one of the execute method with the unique |sliceId|
// as argument. The pool executes multiple slices in parallel.
//
// The unit of work here is in terms of threads, *not* slices. The
// user-provided function has the responsibility of getting slices of work via
// the |ForkJoinGetSlice| intrinsic.
class ParallelJob
{
public:
virtual bool executeFromWorker(uint16_t sliceId, uint32_t workerId, uintptr_t stackLimit) = 0;
virtual bool executeFromMainThread(uint16_t sliceId) = 0;
virtual bool executeFromWorker(uint32_t workerId, uintptr_t stackLimit) = 0;
virtual bool executeFromMainThread() = 0;
};
// ThreadPool used for parallel JavaScript execution. Unless you are building
@ -147,7 +148,13 @@ class ThreadPool : public Monitor
// Execute the given ParallelJob using the main thread and any available worker.
// Blocks until the main thread has completed execution.
ParallelResult executeJob(JSContext *cx, ParallelJob *job, uint16_t numSlices);
ParallelResult executeJob(JSContext *cx, ParallelJob *job, uint16_t sliceStart,
uint16_t numSlices);
// Get the next slice, stealing work from other workers when work
// stealing is enabled. Returns false if there are no more slices to hand out.
bool getSliceForWorker(uint32_t workerId, uint16_t *sliceId);
bool getSliceForMainThread(uint16_t *sliceId);
// Abort the current job.
void abortJob();