#include #include "Common/Thread/ParallelLoop.h" #include "Common/CPUDetect.h" class LoopRangeTask : public Task { public: LoopRangeTask(WaitableCounter *counter, const std::function &loop, int lower, int upper) : counter_(counter), loop_(loop), lower_(lower), upper_(upper) {} void Run() override { loop_(lower_, upper_); counter_->Count(); } std::function loop_; WaitableCounter *counter_; int lower_; int upper_; }; WaitableCounter *ParallelRangeLoopWaitable(ThreadManager *threadMan, const std::function &loop, int lower, int upper, int minSize) { if (minSize == -1) { minSize = 1; } int numTasks = threadMan->GetNumLooperThreads(); int range = upper - lower; if (range <= 0) { // Bad range. A finished counter allocated. return new WaitableCounter(0); } if (range <= numTasks) { // Just assign one task per thread, as many as we have. WaitableCounter *waitableCounter = new WaitableCounter(range); for (int i = 0; i < range; i++) { threadMan->EnqueueTaskOnThread(i, new LoopRangeTask(waitableCounter, loop, i, i + 1), TaskType::CPU_COMPUTE); } return waitableCounter; } else { // Split the range between threads. Allow for some fractional bits. const int fractionalBits = 8; int64_t totalFrac = (int64_t)range << fractionalBits; int64_t delta = totalFrac / numTasks; delta = std::max(delta, (int64_t)minSize << fractionalBits); // Now we can compute the actual number of tasks. // Remember that stragglers are done on the current thread // so we don't round up. numTasks = (int)(totalFrac / delta); printf("numTasks: %d\n", numTasks); WaitableCounter *waitableCounter = new WaitableCounter(numTasks); int64_t counter = (int64_t)lower << fractionalBits; // Split up tasks as equitable as possible. for (int i = 0; i < numTasks; i++) { int start = (int)(counter >> fractionalBits); int end = (int)((counter + delta) >> fractionalBits); if (end > upper) { // Let's do the stragglers on the current thread. 
break; } threadMan->EnqueueTaskOnThread(i, new LoopRangeTask(waitableCounter, loop, start, end), TaskType::CPU_COMPUTE); counter += delta; if ((counter >> fractionalBits) > upper) { break; } } // Run stragglers on the calling thread directly. // We might add a flag later to avoid this for some cases. int stragglerStart = (int)(counter >> fractionalBits); int stragglerEnd = upper; if (stragglerStart < stragglerEnd) { // printf("doing stragglers: %d-%d\n", start, upper); loop(stragglerStart, stragglerEnd); } return waitableCounter; } } void ParallelRangeLoop(ThreadManager *threadMan, const std::function &loop, int lower, int upper, int minSize) { if (cpu_info.num_cores == 1 || (minSize >= (upper - lower) && upper > lower)) { // "Optimization" for single-core devices, or minSize larger than the range. // No point in adding threading overhead, let's just do it inline (since this is the blocking variant). loop(lower, upper); return; } if (minSize < 1) { // There's no obvious value to default to. minSize = 1; } WaitableCounter *counter = ParallelRangeLoopWaitable(threadMan, loop, lower, upper, minSize); // TODO: Optimize using minSize. We'll just compute whether there's a remainer, remove it from the call to ParallelRangeLoopWaitable, // and process the remainder right here. If there's no remainer, we'll steal a whole chunk. if (counter) { counter->WaitAndRelease(); } } // NOTE: Supports a max of 2GB. void ParallelMemcpy(ThreadManager *threadMan, void *dst, const void *src, size_t bytes) { // This threshold can probably be a lot bigger. if (bytes < 512) { memcpy(dst, src, bytes); return; } // 128 is the largest cacheline size on common CPUs. // Still I suspect that the optimal minSize is a lot higher. char *d = (char *)dst; char *s = (char *)src; ParallelRangeLoop(threadMan, [&](int l, int h) { memmove(d + l, s + l, h - l); }, 0, (int)bytes, 128); } // NOTE: Supports a max of 2GB. 
// Fills `bytes` bytes starting at dst with `value`. Small fills are done inline; larger ones
// are split into ranges via ParallelRangeLoop (the blocking variant).
// The byte count is narrowed to int for the range loop.
void ParallelMemset(ThreadManager *threadMan, void *dst, uint8_t value, size_t bytes) {
	// This threshold can probably be a lot bigger.
	if (bytes < 512) {
		// Must fill with the requested value here too, not a hard-coded 0.
		memset(dst, value, bytes);
		return;
	}

	// 128 is the largest cacheline size on common CPUs.
	// Still I suspect that the optimal minSize is a lot higher.

	char *d = (char *)dst;
	ParallelRangeLoop(threadMan, [&](int l, int h) {
		memset(d + l, value, h - l);
	}, 0, (int)bytes, 128);
}