Bug 1137339 - [manifestparser] implement a chunk_by_runtime filter, r=jmaher

With this chunking strategy, the runtimes of tests are taken into account, such that each chunk
takes roughly the same amount of time to finish. Tests belonging to the same manifest will not get
split up.

The algorithm works by sorting the manifests from slowest to fastest total runtime. Each manifest is
popped off in turn and its tests are added to whichever chunk currently has the smallest total runtime,
until no manifests are left. Chunk runtimes are re-calculated after every addition.
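
For illustration, here is a minimal standalone sketch of that greedy strategy. The names greedy_chunk,
manifest_runtimes and total_chunks are placeholders for this example only; the real filter (in the diff
below) operates on manifestparser test dicts rather than a plain {manifest: runtime} mapping.

def greedy_chunk(manifest_runtimes, total_chunks):
    # each chunk tracks [accumulated runtime, manifests assigned so far]
    chunks = [[0, []] for _ in range(total_chunks)]
    # hand out the slowest manifests first
    ordered = sorted(manifest_runtimes.items(), key=lambda kv: kv[1],
                     reverse=True)
    for manifest, runtime in ordered:
        # the chunk with the smallest accumulated runtime gets the next one
        chunks.sort(key=lambda c: c[0])
        chunks[0][0] += runtime
        chunks[0][1].append(manifest)
    return chunks

# e.g. greedy_chunk({'a.ini': 30, 'b.ini': 10, 'c.ini': 25, 'd.ini': 5}, 2)
# -> [[35, ['a.ini', 'd.ini']], [35, ['c.ini', 'b.ini']]]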
Author: Andrew Halberstadt
Date:   2015-03-05 09:12:55 -05:00
parent 56c951c61f
commit aa85a0d1dc

2 changed files with 210 additions and 24 deletions

@@ -135,18 +135,18 @@ class chunk_by_slice(InstanceFilter):
     """
     Basic chunking algorithm that splits tests evenly across total chunks.
 
-    :param this: the current chunk, 1 <= this <= total
-    :param total: the total number of chunks
+    :param this_chunk: the current chunk, 1 <= this_chunk <= total_chunks
+    :param total_chunks: the total number of chunks
     :param disabled: Whether to include disabled tests in the chunking
                      algorithm. If False, each chunk contains an equal number
                      of non-disabled tests. If True, each chunk contains an
                      equal number of tests (default False)
     """
 
-    def __init__(self, this, total, disabled=False):
-        assert 1 <= this <= total
-        self.this = this
-        self.total = total
+    def __init__(self, this_chunk, total_chunks, disabled=False):
+        assert 1 <= this_chunk <= total_chunks
+        self.this_chunk = this_chunk
+        self.total_chunks = total_chunks
         self.disabled = disabled
 
     def __call__(self, tests, values):
@@ -156,20 +156,20 @@ class chunk_by_slice(InstanceFilter):
         else:
             chunk_tests = [t for t in tests if 'disabled' not in t]
 
-        tests_per_chunk = float(len(chunk_tests)) / self.total
-        start = int(round((self.this - 1) * tests_per_chunk))
-        end = int(round(self.this * tests_per_chunk))
+        tests_per_chunk = float(len(chunk_tests)) / self.total_chunks
+        start = int(round((self.this_chunk - 1) * tests_per_chunk))
+        end = int(round(self.this_chunk * tests_per_chunk))
 
         if not self.disabled:
             # map start and end back onto original list of tests. Disabled
             # tests will still be included in the returned list, but each
             # chunk will contain an equal number of enabled tests.
-            if self.this == 1:
+            if self.this_chunk == 1:
                 start = 0
             else:
                 start = tests.index(chunk_tests[start])
 
-            if self.this == self.total:
+            if self.this_chunk == self.total_chunks:
                 end = len(tests)
             else:
                 end = tests.index(chunk_tests[end])
@@ -188,15 +188,15 @@ class chunk_by_dir(InstanceFilter):
     paths must be relative to the same root (typically the root of the source
     repository).
 
-    :param this: the current chunk, 1 <= this <= total
-    :param total: the total number of chunks
+    :param this_chunk: the current chunk, 1 <= this_chunk <= total_chunks
+    :param total_chunks: the total number of chunks
     :param depth: the minimum depth of a subdirectory before it will be
                   considered unique
     """
 
-    def __init__(self, this, total, depth):
-        self.this = this
-        self.total = total
+    def __init__(self, this_chunk, total_chunks, depth):
+        self.this_chunk = this_chunk
+        self.total_chunks = total_chunks
         self.depth = depth
 
     def __call__(self, tests, values):
@@ -216,15 +216,63 @@ class chunk_by_dir(InstanceFilter):
                 ordered_dirs.append(path)
             tests_by_dir[path].append(test)
 
-        tests_per_chunk = float(len(tests_by_dir)) / self.total
-        start = int(round((self.this - 1) * tests_per_chunk))
-        end = int(round(self.this * tests_per_chunk))
+        tests_per_chunk = float(len(tests_by_dir)) / self.total_chunks
+        start = int(round((self.this_chunk - 1) * tests_per_chunk))
+        end = int(round(self.this_chunk * tests_per_chunk))
 
         for i in range(start, end):
             for test in tests_by_dir[ordered_dirs[i]]:
                 yield test
 
 
+class chunk_by_runtime(InstanceFilter):
+    """
+    Chunking algorithm that attempts to group tests into chunks based on their
+    average runtimes. It keeps manifests of tests together and pairs slow
+    running manifests with fast ones.
+
+    :param this_chunk: the current chunk, 1 <= this_chunk <= total_chunks
+    :param total_chunks: the total number of chunks
+    :param runtimes: dictionary of test runtime data, of the form
+                     {<test path>: <average runtime>}
+    """
+
+    def __init__(self, this_chunk, total_chunks, runtimes):
+        self.this_chunk = this_chunk
+        self.total_chunks = total_chunks
+
+        # defaultdict(int) assigns all non-existent keys a value of 0. This
+        # essentially means all tests we encounter that don't exist in the
+        # runtimes file won't factor in to the chunking determination.
+        self.runtimes = defaultdict(int)
+        self.runtimes.update(runtimes)
+
+    def __call__(self, tests, values):
+        tests = list(tests)
+        manifests = set(t['manifest'] for t in tests)
+
+        def total_runtime(tests):
+            return sum(self.runtimes[t['relpath']] for t in tests
+                       if 'disabled' not in t)
+
+        tests_by_manifest = []
+        for manifest in manifests:
+            mtests = [t for t in tests if t['manifest'] == manifest]
+            tests_by_manifest.append((total_runtime(mtests), mtests))
+        tests_by_manifest.sort(reverse=True)
+
+        tests_by_chunk = [[0, []] for i in range(self.total_chunks)]
+        for runtime, batch in tests_by_manifest:
+            # sort first by runtime, then by number of tests in case of a tie.
+            # This guarantees the chunk with the fastest runtime will always
+            # get the next batch of tests.
+            tests_by_chunk.sort(key=lambda x: (x[0], len(x[1])))
+            tests_by_chunk[0][0] += runtime
+            tests_by_chunk[0][1].extend(batch)
+
+        return (t for t in tests_by_chunk[self.this_chunk-1][1])
+
 
 # filter container
 DEFAULT_FILTERS = (
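
For context, a rough usage sketch of the new filter. The manifest path and runtime numbers below are
invented, and it assumes manifestparser's existing TestManifest.active_tests(..., filters=...) hook,
which is how the other chunking filters are applied:

from manifestparser import TestManifest
from manifestparser.filters import chunk_by_runtime

# hypothetical runtime data, keyed by test relpath as the filter expects
runtimes = {
    'browser/test_foo.js': 12.3,
    'browser/test_bar.js': 0.8,
}

manifest = TestManifest(manifests=['manifest.ini'])  # invented manifest path
# select chunk 1 of 4, keeping whole manifests together based on runtime
tests = manifest.active_tests(exists=False,
                              filters=[chunk_by_runtime(1, 4, runtimes)])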

@@ -1,18 +1,20 @@
 #!/usr/bin/env python
 
 from itertools import chain
+from unittest import TestCase
 import os
-import unittest
+import random
 
 from manifestparser.filters import (
     chunk_by_dir,
+    chunk_by_runtime,
     chunk_by_slice,
 )
 
 here = os.path.dirname(os.path.abspath(__file__))
 
 
-class ChunkBySlice(unittest.TestCase):
+class ChunkBySlice(TestCase):
     """Test chunking related filters"""
 
     def generate_tests(self, num, disabled=None):
@@ -50,7 +52,8 @@ class ChunkBySlice(unittest.TestCase):
         if disabled:
             lengths = [len(c) for c in res_disabled]
             self.assertLessEqual(max(lengths) - min(lengths), 1)
-            self.assertEqual(list(chain.from_iterable(res_disabled)), list(tests))
+            self.assertEqual(list(chain.from_iterable(res_disabled)),
+                             list(tests))
 
     def test_chunk_by_slice(self):
         chunk = chunk_by_slice(1, 1)
@@ -64,13 +67,13 @@ class ChunkBySlice(unittest.TestCase):
             self.run_all_combos(num_tests=num_tests, disabled=disabled)
 
 
-class ChunkByDir(unittest.TestCase):
+class ChunkByDir(TestCase):
     """Test chunking related filters"""
 
     def generate_tests(self, dirs):
         """
         :param dirs: dict of the form,
-                        { <dir>: <num tests>
+                        { <dir>: <num tests> }
         """
         i = 0
         for d, num in dirs.iteritems():
@@ -153,3 +156,138 @@ class ChunkByDir(unittest.TestCase):
             'c/e': 1,
         }
         self.run_all_combos(dirs)
+
+
+class ChunkByRuntime(TestCase):
+    """Test chunking related filters"""
+
+    def generate_tests(self, dirs):
+        """
+        :param dirs: dict of the form,
+                        { <dir>: <num tests> }
+        """
+        i = 0
+        for d, num in dirs.iteritems():
+            for j in range(num):
+                i += 1
+                name = 'test%i' % i
+                test = {'name': name,
+                        'relpath': os.path.join(d, name),
+                        'manifest': os.path.join(d, 'manifest.ini')}
+                yield test
+
+    def get_runtimes(self, tests):
+        runtimes = {}
+        for test in tests:
+            runtimes[test['relpath']] = random.randint(0, 100)
+        return runtimes
+
+    def chunk_by_round_robin(self, tests, total, runtimes):
+        manifests = set(t['manifest'] for t in tests)
+        tests_by_manifest = []
+        for manifest in manifests:
+            mtests = [t for t in tests if t['manifest'] == manifest]
+            total_runtime = sum(runtimes[t['relpath']] for t in mtests
+                                if 'disabled' not in t)
+            tests_by_manifest.append((total_runtime, mtests))
+        tests_by_manifest.sort()
+
+        chunks = [[] for i in range(total)]
+        d = 1  # direction
+        i = 0
+        for runtime, batch in tests_by_manifest:
+            chunks[i].extend(batch)
+            # "draft" style (last pick goes first in the next round)
+            if (i == 0 and d == -1) or (i == total-1 and d == 1):
+                d = -d
+            else:
+                i += d
+
+        # make sure this test algorithm is valid
+        all_chunks = list(chain.from_iterable(chunks))
+        self.assertEqual(len(all_chunks), len(tests))
+        for t in tests:
+            self.assertIn(t, all_chunks)
+        return chunks
+
+    def run_all_combos(self, dirs):
+        tests = list(self.generate_tests(dirs))
+        runtimes = self.get_runtimes(tests)
+
+        for total in range(1, len(dirs)+1):
+            chunks = []
+            for this in range(1, total+1):
+                f = chunk_by_runtime(this, total, runtimes)
+                ret = list(f(tests, {}))
+                chunks.append(ret)
+
+            # chunk_by_runtime will mess up order, but chained chunks should
+            # contain all of the original tests and be the same length
+            all_chunks = list(chain.from_iterable(chunks))
+            self.assertEqual(len(all_chunks), len(tests))
+            for t in tests:
+                self.assertIn(t, all_chunks)
+
+            # calculate delta between slowest and fastest chunks
+            def runtime_delta(chunks):
+                totals = []
+                for chunk in chunks:
+                    total = sum(runtimes[t['relpath']] for t in chunk
+                                if 'disabled' not in t)
+                    totals.append(total)
+                return max(totals) - min(totals)
+            delta = runtime_delta(chunks)
+
+            # redo the chunking a second time using a round robin style
+            # algorithm
+            chunks = self.chunk_by_round_robin(tests, total, runtimes)
+
+            # since chunks will never have exactly equal runtimes, it's hard
+            # to tell if they were chunked optimally. Make sure it at least
+            # beats a naive round robin approach.
+            self.assertLessEqual(delta, runtime_delta(chunks))
+
+    def test_chunk_by_runtime(self):
+        random.seed(42)
+
+        chunk = chunk_by_runtime(1, 1, {})
+        self.assertEqual(list(chunk([], {})), [])
+
+        dirs = {
+            'a': 2,
+        }
+        self.run_all_combos(dirs)
+
+        dirs = {
+            '': 1,
+            'foo': 1,
+            'bar': 0,
+            '/foobar': 1,
+        }
+        self.run_all_combos(dirs)
+
+        dirs = {
+            'a': 1,
+            'b': 1,
+            'a/b': 2,
+            'a/c': 1,
+        }
+        self.run_all_combos(dirs)
+
+        dirs = {
+            'a': 5,
+            'a/b': 4,
+            'a/b/c': 7,
+            'a/b/c/d': 1,
+            'a/b/c/e': 3,
+            'b/c': 2,
+            'b/d': 5,
+            'b/d/e': 6,
+            'c': 8,
+            'c/d/e/f/g/h/i/j/k/l': 5,
+            'c/d/e/f/g/i/j/k/l/m/n': 2,
+            'c/e': 1,
+        }
+        self.run_all_combos(dirs)