Bug 1137339 - [manifestparser] implement a chunk_by_runtime filter, r=jmaher

With this chunking strategy, the runtimes of tests are taken into account, such that each chunk
takes roughly the same amount of time to finish. Tests belonging to the same manifest will not get
split up.

The algorithm works by sorting the manifests from slowest to fastest total runtime. Each manifest is
popped off in turn and its tests are added to whichever chunk currently has the smallest total runtime,
until no manifests are left. Chunk runtimes are re-calculated after every addition.
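
For illustration, here is a minimal standalone sketch of that greedy strategy. The names greedy_chunk,
manifest_runtimes and total_chunks are placeholders for this example only; the real filter (in the diff
below) operates on manifestparser test dicts rather than a plain {manifest: runtime} mapping.

def greedy_chunk(manifest_runtimes, total_chunks):
    # each chunk tracks [accumulated runtime, manifests assigned so far]
    chunks = [[0, []] for _ in range(total_chunks)]
    # hand out the slowest manifests first
    ordered = sorted(manifest_runtimes.items(), key=lambda kv: kv[1],
                     reverse=True)
    for manifest, runtime in ordered:
        # the chunk with the smallest accumulated runtime gets the next one
        chunks.sort(key=lambda c: c[0])
        chunks[0][0] += runtime
        chunks[0][1].append(manifest)
    return chunks

# e.g. greedy_chunk({'a.ini': 30, 'b.ini': 10, 'c.ini': 25, 'd.ini': 5}, 2)
# -> [[35, ['a.ini', 'd.ini']], [35, ['c.ini', 'b.ini']]]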
Author: Andrew Halberstadt
Date:   2015-03-05 09:12:55 -05:00
parent 56c951c61f
commit aa85a0d1dc

2 changed files with 210 additions and 24 deletions

@@ -135,18 +135,18 @@ class chunk_by_slice(InstanceFilter):
     """
     Basic chunking algorithm that splits tests evenly across total chunks.
 
-    :param this: the current chunk, 1 <= this <= total
-    :param total: the total number of chunks
+    :param this_chunk: the current chunk, 1 <= this_chunk <= total_chunks
+    :param total_chunks: the total number of chunks
     :param disabled: Whether to include disabled tests in the chunking
                      algorithm. If False, each chunk contains an equal number
                      of non-disabled tests. If True, each chunk contains an
                      equal number of tests (default False)
     """
 
-    def __init__(self, this, total, disabled=False):
-        assert 1 <= this <= total
-        self.this = this
-        self.total = total
+    def __init__(self, this_chunk, total_chunks, disabled=False):
+        assert 1 <= this_chunk <= total_chunks
+        self.this_chunk = this_chunk
+        self.total_chunks = total_chunks
         self.disabled = disabled
 
     def __call__(self, tests, values):
@@ -156,20 +156,20 @@ class chunk_by_slice(InstanceFilter):
         else:
             chunk_tests = [t for t in tests if 'disabled' not in t]
 
-        tests_per_chunk = float(len(chunk_tests)) / self.total
-        start = int(round((self.this - 1) * tests_per_chunk))
-        end = int(round(self.this * tests_per_chunk))
+        tests_per_chunk = float(len(chunk_tests)) / self.total_chunks
+        start = int(round((self.this_chunk - 1) * tests_per_chunk))
+        end = int(round(self.this_chunk * tests_per_chunk))
 
         if not self.disabled:
             # map start and end back onto original list of tests. Disabled
             # tests will still be included in the returned list, but each
             # chunk will contain an equal number of enabled tests.
-            if self.this == 1:
+            if self.this_chunk == 1:
                 start = 0
             else:
                 start = tests.index(chunk_tests[start])
 
-            if self.this == self.total:
+            if self.this_chunk == self.total_chunks:
                 end = len(tests)
             else:
                 end = tests.index(chunk_tests[end])
@@ -188,15 +188,15 @@ class chunk_by_dir(InstanceFilter):
     paths must be relative to the same root (typically the root of the source
     repository).
 
-    :param this: the current chunk, 1 <= this <= total
-    :param total: the total number of chunks
+    :param this_chunk: the current chunk, 1 <= this_chunk <= total_chunks
+    :param total_chunks: the total number of chunks
     :param depth: the minimum depth of a subdirectory before it will be
                   considered unique
     """
 
-    def __init__(self, this, total, depth):
-        self.this = this
-        self.total = total
+    def __init__(self, this_chunk, total_chunks, depth):
+        self.this_chunk = this_chunk
+        self.total_chunks = total_chunks
         self.depth = depth
 
     def __call__(self, tests, values):
@@ -216,15 +216,63 @@ class chunk_by_dir(InstanceFilter):
                 ordered_dirs.append(path)
             tests_by_dir[path].append(test)
 
-        tests_per_chunk = float(len(tests_by_dir)) / self.total
-        start = int(round((self.this - 1) * tests_per_chunk))
-        end = int(round(self.this * tests_per_chunk))
+        tests_per_chunk = float(len(tests_by_dir)) / self.total_chunks
+        start = int(round((self.this_chunk - 1) * tests_per_chunk))
+        end = int(round(self.this_chunk * tests_per_chunk))
 
         for i in range(start, end):
             for test in tests_by_dir[ordered_dirs[i]]:
                 yield test
 
 
+class chunk_by_runtime(InstanceFilter):
+    """
+    Chunking algorithm that attempts to group tests into chunks based on their
+    average runtimes. It keeps manifests of tests together and pairs slow
+    running manifests with fast ones.
+
+    :param this_chunk: the current chunk, 1 <= this_chunk <= total_chunks
+    :param total_chunks: the total number of chunks
+    :param runtimes: dictionary of test runtime data, of the form
+                     {<test path>: <average runtime>}
+    """
+
+    def __init__(self, this_chunk, total_chunks, runtimes):
+        self.this_chunk = this_chunk
+        self.total_chunks = total_chunks
+
+        # defaultdict(int) assigns all non-existent keys a value of 0. This
+        # essentially means all tests we encounter that don't exist in the
+        # runtimes file won't factor in to the chunking determination.
+        self.runtimes = defaultdict(int)
+        self.runtimes.update(runtimes)
+
+    def __call__(self, tests, values):
+        tests = list(tests)
+        manifests = set(t['manifest'] for t in tests)
+
+        def total_runtime(tests):
+            return sum(self.runtimes[t['relpath']] for t in tests
+                       if 'disabled' not in t)
+
+        tests_by_manifest = []
+        for manifest in manifests:
+            mtests = [t for t in tests if t['manifest'] == manifest]
+            tests_by_manifest.append((total_runtime(mtests), mtests))
+        tests_by_manifest.sort(reverse=True)
+
+        tests_by_chunk = [[0, []] for i in range(self.total_chunks)]
+        for runtime, batch in tests_by_manifest:
+            # sort first by runtime, then by number of tests in case of a tie.
+            # This guarantees the chunk with the fastest runtime will always
+            # get the next batch of tests.
+            tests_by_chunk.sort(key=lambda x: (x[0], len(x[1])))
+            tests_by_chunk[0][0] += runtime
+            tests_by_chunk[0][1].extend(batch)
+
+        return (t for t in tests_by_chunk[self.this_chunk-1][1])
+
 
 # filter container
 DEFAULT_FILTERS = (
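
For context, a rough usage sketch of the new filter. The manifest path and runtime numbers below are
invented, and it assumes manifestparser's existing TestManifest.active_tests(..., filters=...) hook,
which is how the other chunking filters are applied:

from manifestparser import TestManifest
from manifestparser.filters import chunk_by_runtime

# hypothetical runtime data, keyed by test relpath as the filter expects
runtimes = {
    'browser/test_foo.js': 12.3,
    'browser/test_bar.js': 0.8,
}

manifest = TestManifest(manifests=['manifest.ini'])  # invented manifest path
# select chunk 1 of 4, keeping whole manifests together based on runtime
tests = manifest.active_tests(exists=False,
                              filters=[chunk_by_runtime(1, 4, runtimes)])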

@@ -1,18 +1,20 @@
 #!/usr/bin/env python
 
 from itertools import chain
+from unittest import TestCase
 import os
-import unittest
+import random
 
 from manifestparser.filters import (
     chunk_by_dir,
+    chunk_by_runtime,
     chunk_by_slice,
 )
 
 here = os.path.dirname(os.path.abspath(__file__))
 
 
-class ChunkBySlice(unittest.TestCase):
+class ChunkBySlice(TestCase):
     """Test chunking related filters"""
 
     def generate_tests(self, num, disabled=None):
@@ -50,7 +52,8 @@ class ChunkBySlice(unittest.TestCase):
         if disabled:
             lengths = [len(c) for c in res_disabled]
             self.assertLessEqual(max(lengths) - min(lengths), 1)
-            self.assertEqual(list(chain.from_iterable(res_disabled)), list(tests))
+            self.assertEqual(list(chain.from_iterable(res_disabled)),
+                             list(tests))
 
     def test_chunk_by_slice(self):
         chunk = chunk_by_slice(1, 1)
@@ -64,13 +67,13 @@ class ChunkBySlice(unittest.TestCase):
             self.run_all_combos(num_tests=num_tests, disabled=disabled)
 
 
-class ChunkByDir(unittest.TestCase):
+class ChunkByDir(TestCase):
     """Test chunking related filters"""
 
     def generate_tests(self, dirs):
         """
         :param dirs: dict of the form,
-                        { <dir>: <num tests>
+                        { <dir>: <num tests> }
         """
         i = 0
         for d, num in dirs.iteritems():
@@ -153,3 +156,138 @@ class ChunkByDir(unittest.TestCase):
             'c/e': 1,
         }
         self.run_all_combos(dirs)
+
+
+class ChunkByRuntime(TestCase):
+    """Test chunking related filters"""
+
+    def generate_tests(self, dirs):
+        """
+        :param dirs: dict of the form,
+                        { <dir>: <num tests> }
+        """
+        i = 0
+        for d, num in dirs.iteritems():
+            for j in range(num):
+                i += 1
+                name = 'test%i' % i
+                test = {'name': name,
+                        'relpath': os.path.join(d, name),
+                        'manifest': os.path.join(d, 'manifest.ini')}
+                yield test
+
+    def get_runtimes(self, tests):
+        runtimes = {}
+        for test in tests:
+            runtimes[test['relpath']] = random.randint(0, 100)
+        return runtimes
+
+    def chunk_by_round_robin(self, tests, total, runtimes):
+        manifests = set(t['manifest'] for t in tests)
+        tests_by_manifest = []
+        for manifest in manifests:
+            mtests = [t for t in tests if t['manifest'] == manifest]
+            total_runtime = sum(runtimes[t['relpath']] for t in mtests
+                                if 'disabled' not in t)
+            tests_by_manifest.append((total_runtime, mtests))
+        tests_by_manifest.sort()
+
+        chunks = [[] for i in range(total)]
+        d = 1  # direction
+        i = 0
+        for runtime, batch in tests_by_manifest:
+            chunks[i].extend(batch)
+            # "draft" style (last pick goes first in the next round)
+            if (i == 0 and d == -1) or (i == total-1 and d == 1):
+                d = -d
+            else:
+                i += d
+
+        # make sure this test algorithm is valid
+        all_chunks = list(chain.from_iterable(chunks))
+        self.assertEqual(len(all_chunks), len(tests))
+        for t in tests:
+            self.assertIn(t, all_chunks)
+        return chunks
+
+    def run_all_combos(self, dirs):
+        tests = list(self.generate_tests(dirs))
+        runtimes = self.get_runtimes(tests)
+
+        for total in range(1, len(dirs)+1):
+            chunks = []
+            for this in range(1, total+1):
+                f = chunk_by_runtime(this, total, runtimes)
+                ret = list(f(tests, {}))
+                chunks.append(ret)
+
+            # chunk_by_runtime will mess up order, but chained chunks should
+            # contain all of the original tests and be the same length
+            all_chunks = list(chain.from_iterable(chunks))
+            self.assertEqual(len(all_chunks), len(tests))
+            for t in tests:
+                self.assertIn(t, all_chunks)
+
+            # calculate delta between slowest and fastest chunks
+            def runtime_delta(chunks):
+                totals = []
+                for chunk in chunks:
+                    total = sum(runtimes[t['relpath']] for t in chunk
+                                if 'disabled' not in t)
+                    totals.append(total)
+                return max(totals) - min(totals)
+            delta = runtime_delta(chunks)
+
+            # redo the chunking a second time using a round robin style
+            # algorithm
+            chunks = self.chunk_by_round_robin(tests, total, runtimes)
+
+            # since chunks will never have exactly equal runtimes, it's hard
+            # to tell if they were chunked optimally. Make sure it at least
+            # beats a naive round robin approach.
+            self.assertLessEqual(delta, runtime_delta(chunks))
+
+    def test_chunk_by_runtime(self):
+        random.seed(42)
+
+        chunk = chunk_by_runtime(1, 1, {})
+        self.assertEqual(list(chunk([], {})), [])
+
+        dirs = {
+            'a': 2,
+        }
+        self.run_all_combos(dirs)
+
+        dirs = {
+            '': 1,
+            'foo': 1,
+            'bar': 0,
+            '/foobar': 1,
+        }
+        self.run_all_combos(dirs)
+
+        dirs = {
+            'a': 1,
+            'b': 1,
+            'a/b': 2,
+            'a/c': 1,
+        }
+        self.run_all_combos(dirs)
+
+        dirs = {
+            'a': 5,
+            'a/b': 4,
+            'a/b/c': 7,
+            'a/b/c/d': 1,
+            'a/b/c/e': 3,
+            'b/c': 2,
+            'b/d': 5,
+            'b/d/e': 6,
+            'c': 8,
+            'c/d/e/f/g/h/i/j/k/l': 5,
+            'c/d/e/f/g/i/j/k/l/m/n': 2,
+            'c/e': 1,
+        }
+        self.run_all_combos(dirs)