Files
llvm-project/bolt/lib/Core/ParallelUtilities.cpp

Ignoring revisions in .git-blame-ignore-revs. Click here to bypass and see the normal blame view.

247 lines
7.7 KiB
C++
Raw Permalink Normal View History

//===- bolt/Core/ParallelUtilities.cpp - Parallel utilities ---------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Implementation of the class that manages parallel work on BinaryFunctions.
//
//===----------------------------------------------------------------------===//
#include "bolt/Core/ParallelUtilities.h"
#include "bolt/Core/BinaryContext.h"
#include "bolt/Core/BinaryFunction.h"
#include "llvm/Support/RWMutex.h"
#include "llvm/Support/ThreadPool.h"
#include "llvm/Support/Timer.h"
#include <mutex>
#define DEBUG_TYPE "par-utils"
namespace opts {

// Option category the flags below are registered under; it is only declared
// here (extern) and defined in another translation unit.
extern cl::OptionCategory BoltCategory;

/// -thread-count: number of threads. Defaults to the value reported by
/// hardware_concurrency().compute_thread_count().
cl::opt<unsigned> ThreadCount(
    "thread-count", cl::desc("number of threads"),
    cl::init(hardware_concurrency().compute_thread_count()),
    cl::cat(BoltCategory));

/// -no-threads: disable multithreading. Off by default.
cl::opt<bool> NoThreads("no-threads", cl::desc("disable multithreading"),
                        cl::init(false), cl::cat(BoltCategory));

/// -tasks-per-thread: number of tasks to be created per thread. Defaults
/// to 20.
cl::opt<unsigned> TaskCount(
    "tasks-per-thread",
    cl::desc("number of tasks to be created per thread"), cl::init(20),
    cl::cat(BoltCategory));

} // namespace opts
namespace llvm {
namespace bolt {
namespace ParallelUtilities {
namespace {
/// A single thread pool that is used to run parallel tasks. It starts out
/// null; getThreadPool() creates it on first use and returns the same
/// instance on every later call.
std::unique_ptr<ThreadPoolInterface> ThreadPoolPtr;
/// Return the estimated cost of processing \p BF under \p SchedPolicy.
///
/// - SP_TRIVIAL: always 1, without consulting \p SkipPredicate.
/// - Any other policy: 0 when \p SkipPredicate is set and rejects \p BF.
/// - SP_CONSTANT: 1.
/// - SP_INST_LINEAR / SP_INST_QUADRATIC: BF.getSize(), or its square.
/// - SP_BB_LINEAR / SP_BB_QUADRATIC: BF.size(), or its square.
///
/// The result is narrowed to `unsigned` on return.
unsigned computeCostFor(const BinaryFunction &BF,
                        const PredicateTy &SkipPredicate,
                        const SchedulingPolicy &SchedPolicy) {
  // Trivial scheduling charges every function one unit, skipped or not.
  if (SchedPolicy == SchedulingPolicy::SP_TRIVIAL)
    return 1;

  // Under every other policy a skipped function contributes no work.
  const bool IsSkipped = SkipPredicate && SkipPredicate(BF);
  if (IsSkipped)
    return 0;

  if (SchedPolicy == SchedulingPolicy::SP_CONSTANT)
    return 1;

  if (SchedPolicy == SchedulingPolicy::SP_INST_LINEAR)
    return BF.getSize();

  if (SchedPolicy == SchedulingPolicy::SP_INST_QUADRATIC)
    return BF.getSize() * BF.getSize();

  if (SchedPolicy == SchedulingPolicy::SP_BB_LINEAR)
    return BF.size();

  if (SchedPolicy == SchedulingPolicy::SP_BB_QUADRATIC)
    return BF.size() * BF.size();

  llvm_unreachable("unsupported scheduling policy");
}
inline unsigned estimateTotalCost(const BinaryContext &BC,
const PredicateTy &SkipPredicate,
SchedulingPolicy &SchedPolicy) {
if (SchedPolicy == SchedulingPolicy::SP_TRIVIAL)
return BC.getBinaryFunctions().size();
unsigned TotalCost = 0;
for (auto &BFI : BC.getBinaryFunctions()) {
const BinaryFunction &BF = BFI.second;
TotalCost += computeCostFor(BF, SkipPredicate, SchedPolicy);
}
// Switch to trivial scheduling if total estimated work is zero
if (TotalCost == 0) {
[BOLT][NFC] Log through JournalingStreams (#81524) Make core BOLT functionality more friendly to being used as a library instead of in our standalone driver llvm-bolt. To accomplish this, we augment BinaryContext with journaling streams that are to be used by most BOLT code whenever something needs to be logged to the screen. Users of the library can decide if logs should be printed to a file, no file or to the screen, as before. To illustrate this, this patch adds a new option `--log-file` that allows the user to redirect BOLT logging to a file on disk or completely hide it by using `--log-file=/dev/null`. Future BOLT code should now use `BinaryContext::outs()` for printing important messages instead of `llvm::outs()`. A new test log.test enforces this by verifying that no strings are print to screen once the `--log-file` option is used. In previous patches we also added a new BOLTError class to report common and fatal errors, so code shouldn't call exit(1) now. To easily handle problems as before (by quitting with exit(1)), callers can now use `BinaryContext::logBOLTErrorsAndQuitOnFatal(Error)` whenever code needs to deal with BOLT errors. To test this, we have fatal.s that checks we are correctly quitting and printing a fatal error to the screen. Because this is a significant change by itself, not all code was yet ported. Code from Profiler libs (DataAggregator and friends) still print errors directly to screen. Co-authored-by: Rafael Auler <rafaelauler@fb.com> Test Plan: NFC
2024-02-12 14:53:53 -08:00
BC.outs()
<< "BOLT-WARNING: Running parallel work of 0 estimated cost, will "
"switch to trivial scheduling.\n";
SchedPolicy = SP_TRIVIAL;
TotalCost = BC.getBinaryFunctions().size();
}
return TotalCost;
}
} // namespace
/// Return the shared thread pool, creating it on the first call.
///
/// \param ThreadsCount only consulted when the pool does not exist yet: a
/// value greater than 1 creates a DefaultThreadPool configured with
/// llvm::hardware_concurrency(ThreadsCount); otherwise a SingleThreadExecutor
/// is created. Later calls return the existing pool regardless of the value
/// passed.
ThreadPoolInterface &getThreadPool(const unsigned ThreadsCount) {
  if (!ThreadPoolPtr) {
    const bool UseSingleThread = ThreadsCount <= 1;
    if (UseSingleThread)
      ThreadPoolPtr = std::make_unique<SingleThreadExecutor>();
    else
      ThreadPoolPtr = std::make_unique<DefaultThreadPool>(
          llvm::hardware_concurrency(ThreadsCount));
  }
  return *ThreadPoolPtr;
}
/// Run \p WorkFunction on every BinaryFunction in \p BC, except those for
/// which \p SkipPredicate is set and returns true.
///
/// When opts::NoThreads or \p ForceSequential is set, all functions are
/// processed in order on the calling thread. Otherwise the functions are
/// split into consecutive blocks of roughly equal estimated cost (according to
/// \p SchedPolicy), each block is submitted to the shared thread pool, and the
/// call returns only after the pool has finished.
///
/// \param BC context whose function map is iterated.
/// \param SchedPolicy cost model used to balance the blocks.
/// \param WorkFunction callback invoked for each non-skipped function.
/// \param SkipPredicate optional filter; functions it accepts are skipped.
/// \param LogName name given to the per-block timer (started only under
/// LLVM_DEBUG).
/// \param ForceSequential run on the calling thread even if threading is
/// enabled.
/// \param TasksPerThread number of blocks aimed for per thread.
void runOnEachFunction(BinaryContext &BC, SchedulingPolicy SchedPolicy,
                       WorkFuncTy WorkFunction, PredicateTy SkipPredicate,
                       std::string LogName, bool ForceSequential,
                       unsigned TasksPerThread) {
  auto &Functions = BC.getBinaryFunctions();
  if (Functions.empty())
    return;

  // Process the half-open range [BlockBegin, BlockEnd) on the current thread.
  auto runBlock = [&](std::map<uint64_t, BinaryFunction>::iterator BlockBegin,
                      std::map<uint64_t, BinaryFunction>::iterator BlockEnd) {
    Timer T(LogName, LogName);
    LLVM_DEBUG(T.startTimer());

    for (auto It = BlockBegin; It != BlockEnd; ++It) {
      BinaryFunction &BF = It->second;
      if (SkipPredicate && SkipPredicate(BF))
        continue;

      WorkFunction(BF);
    }
    LLVM_DEBUG(T.stopTimer());
  };

  if (opts::NoThreads || ForceSequential) {
    runBlock(Functions.begin(), Functions.end());
    return;
  }

  // Estimate the overall runtime cost using the scheduling policy
  const unsigned TotalCost = estimateTotalCost(BC, SkipPredicate, SchedPolicy);

  // Either factor may be zero (e.g. a thread count of 0 on the command line);
  // clamp to one block so the division below is always well defined.
  const unsigned RequestedBlocks = TasksPerThread * opts::ThreadCount;
  const unsigned BlocksCount = RequestedBlocks != 0 ? RequestedBlocks : 1;
  const unsigned BlockCost =
      TotalCost > BlocksCount ? TotalCost / BlocksCount : 1;

  // Divide work into blocks of equal cost
  ThreadPoolInterface &Pool = getThreadPool();
  auto BlockBegin = Functions.begin();
  unsigned CurrentCost = 0;

  for (auto It = Functions.begin(); It != Functions.end(); ++It) {
    BinaryFunction &BF = It->second;
    CurrentCost += computeCostFor(BF, SkipPredicate, SchedPolicy);

    if (CurrentCost >= BlockCost) {
      Pool.async(runBlock, BlockBegin, std::next(It));
      BlockBegin = std::next(It);
      CurrentCost = 0;
    }
  }

  // Schedule whatever is left over; nothing to do if the last function
  // happened to close a block.
  if (BlockBegin != Functions.end())
    Pool.async(runBlock, BlockBegin, Functions.end());
  Pool.wait();
}
void runOnEachFunctionWithUniqueAllocId(
BinaryContext &BC, SchedulingPolicy SchedPolicy,
WorkFuncWithAllocTy WorkFunction, PredicateTy SkipPredicate,
std::string LogName, bool ForceSequential, unsigned TasksPerThread) {
if (BC.getBinaryFunctions().size() == 0)
return;
llvm::sys::RWMutex MainLock;
auto runBlock = [&](std::map<uint64_t, BinaryFunction>::iterator BlockBegin,
std::map<uint64_t, BinaryFunction>::iterator BlockEnd,
MCPlusBuilder::AllocatorIdTy AllocId) {
Timer T(LogName, LogName);
LLVM_DEBUG(T.startTimer());
std::shared_lock<llvm::sys::RWMutex> Lock(MainLock);
for (auto It = BlockBegin; It != BlockEnd; ++It) {
BinaryFunction &BF = It->second;
if (SkipPredicate && SkipPredicate(BF))
continue;
WorkFunction(BF, AllocId);
}
LLVM_DEBUG(T.stopTimer());
};
[BOLT] Fix runOnEachFunctionWithUniqueAllocId (#90039) When runOnEachFunctionWithUniqueAllocId is invoked with ForceSequential=true, then the current implementation runs the function with AllocId==0, which is the Id for the shared, non-unique, default AnnotationAllocator. However, the documentation for runOnEachFunctionWithUniqueAllocId states: ``` /// Perform the work on each BinaryFunction except those that are rejected /// by SkipPredicate, and create a unique annotation allocator for each /// task. This should be used whenever the work function creates annotations to /// allow thread-safe annotation creation. ``` Therefore, even when ForceSequential==true, a unique AllocId should be used, i.e. different from 0. In the current upstream BOLT this is presumably not depended on, but it is needed to reduce memory usage for analyses that use a lot of memory/annotations. Examples are the pac-ret and stack-clash analyses that currently have prototype implementations as described in https://discourse.llvm.org/t/rfc-bolt-based-binary-analysis-tool-to-verify-correctness-of-security-hardening/78148 These analyses use the DataFlowAnalysis framework to sometimes store quite a lot of information on each MCInst. They run in parallel on each function. When the dataflow analysis is finished, the annotations on each MCInst can be removed, hugely saving on memory consumption. The only annotations that need to remain are those that indicate some unexpected properties somewhere in the binary. Fixing this bug enables implementing the deletion of the memory used by those huge number of DataFlowAnalysis annotations (by invoking BC.MIB->freeValuesAllocator(AllocatorId)), even when run with --no-threads. Without this bug fixed, the invocation of BC.MIB->freeValuesAllocator(AllocatorId) results in also the memory for all other annotations to be deleted, as AllocatorId is 0. --------- Co-authored-by: Maksim Panchenko <maks@meta.com>
2024-05-04 09:26:35 +02:00
unsigned AllocId = 1;
auto EnsureAllocatorExists = [&BC](unsigned AllocId) {
if (!BC.MIB->checkAllocatorExists(AllocId)) {
MCPlusBuilder::AllocatorIdTy Id =
BC.MIB->initializeNewAnnotationAllocator();
(void)Id;
assert(AllocId == Id && "unexpected allocator id created");
}
};
if (opts::NoThreads || ForceSequential) {
[BOLT] Fix runOnEachFunctionWithUniqueAllocId (#90039) When runOnEachFunctionWithUniqueAllocId is invoked with ForceSequential=true, then the current implementation runs the function with AllocId==0, which is the Id for the shared, non-unique, default AnnotationAllocator. However, the documentation for runOnEachFunctionWithUniqueAllocId states: ``` /// Perform the work on each BinaryFunction except those that are rejected /// by SkipPredicate, and create a unique annotation allocator for each /// task. This should be used whenever the work function creates annotations to /// allow thread-safe annotation creation. ``` Therefore, even when ForceSequential==true, a unique AllocId should be used, i.e. different from 0. In the current upstream BOLT this is presumably not depended on, but it is needed to reduce memory usage for analyses that use a lot of memory/annotations. Examples are the pac-ret and stack-clash analyses that currently have prototype implementations as described in https://discourse.llvm.org/t/rfc-bolt-based-binary-analysis-tool-to-verify-correctness-of-security-hardening/78148 These analyses use the DataFlowAnalysis framework to sometimes store quite a lot of information on each MCInst. They run in parallel on each function. When the dataflow analysis is finished, the annotations on each MCInst can be removed, hugely saving on memory consumption. The only annotations that need to remain are those that indicate some unexpected properties somewhere in the binary. Fixing this bug enables implementing the deletion of the memory used by those huge number of DataFlowAnalysis annotations (by invoking BC.MIB->freeValuesAllocator(AllocatorId)), even when run with --no-threads. Without this bug fixed, the invocation of BC.MIB->freeValuesAllocator(AllocatorId) results in also the memory for all other annotations to be deleted, as AllocatorId is 0. --------- Co-authored-by: Maksim Panchenko <maks@meta.com>
2024-05-04 09:26:35 +02:00
EnsureAllocatorExists(AllocId);
runBlock(BC.getBinaryFunctions().begin(), BC.getBinaryFunctions().end(),
AllocId);
return;
}
// This lock is used to postpone task execution
std::unique_lock<llvm::sys::RWMutex> Lock(MainLock);
// Estimate the overall runtime cost using the scheduling policy
const unsigned TotalCost = estimateTotalCost(BC, SkipPredicate, SchedPolicy);
const unsigned BlocksCount = TasksPerThread * opts::ThreadCount;
const unsigned BlockCost =
TotalCost > BlocksCount ? TotalCost / BlocksCount : 1;
// Divide work into blocks of equal cost
ThreadPoolInterface &Pool = getThreadPool();
auto BlockBegin = BC.getBinaryFunctions().begin();
unsigned CurrentCost = 0;
for (auto It = BC.getBinaryFunctions().begin();
It != BC.getBinaryFunctions().end(); ++It) {
BinaryFunction &BF = It->second;
CurrentCost += computeCostFor(BF, SkipPredicate, SchedPolicy);
if (CurrentCost >= BlockCost) {
[BOLT] Fix runOnEachFunctionWithUniqueAllocId (#90039) When runOnEachFunctionWithUniqueAllocId is invoked with ForceSequential=true, then the current implementation runs the function with AllocId==0, which is the Id for the shared, non-unique, default AnnotationAllocator. However, the documentation for runOnEachFunctionWithUniqueAllocId states: ``` /// Perform the work on each BinaryFunction except those that are rejected /// by SkipPredicate, and create a unique annotation allocator for each /// task. This should be used whenever the work function creates annotations to /// allow thread-safe annotation creation. ``` Therefore, even when ForceSequential==true, a unique AllocId should be used, i.e. different from 0. In the current upstream BOLT this is presumably not depended on, but it is needed to reduce memory usage for analyses that use a lot of memory/annotations. Examples are the pac-ret and stack-clash analyses that currently have prototype implementations as described in https://discourse.llvm.org/t/rfc-bolt-based-binary-analysis-tool-to-verify-correctness-of-security-hardening/78148 These analyses use the DataFlowAnalysis framework to sometimes store quite a lot of information on each MCInst. They run in parallel on each function. When the dataflow analysis is finished, the annotations on each MCInst can be removed, hugely saving on memory consumption. The only annotations that need to remain are those that indicate some unexpected properties somewhere in the binary. Fixing this bug enables implementing the deletion of the memory used by those huge number of DataFlowAnalysis annotations (by invoking BC.MIB->freeValuesAllocator(AllocatorId)), even when run with --no-threads. Without this bug fixed, the invocation of BC.MIB->freeValuesAllocator(AllocatorId) results in also the memory for all other annotations to be deleted, as AllocatorId is 0. --------- Co-authored-by: Maksim Panchenko <maks@meta.com>
2024-05-04 09:26:35 +02:00
EnsureAllocatorExists(AllocId);
Pool.async(runBlock, BlockBegin, std::next(It), AllocId);
AllocId++;
BlockBegin = std::next(It);
CurrentCost = 0;
}
}
EnsureAllocatorExists(AllocId);
Pool.async(runBlock, BlockBegin, BC.getBinaryFunctions().end(), AllocId);
Lock.unlock();
Pool.wait();
}
} // namespace ParallelUtilities
} // namespace bolt
} // namespace llvm