Skip to content

Commit

Permalink
[RF] Avoid using static std::vector<double> buffer in RooBatchCompute
Browse files Browse the repository at this point in the history
In the RooBatchCompute CPU library, all scalar inputs have to be copied
n times into a buffer that is as long as the SIMD registers, to allow
for vectorization in all cases.

To avoid frequent memory allocations, this buffer was made a `static`
variable in the original implementation of the batchcompute library,
which of course made it non-threadsafe.

This is now hitting us, because RooFit needs to be able to do multiple
fits concurrently. This is a requirement for CMSSW, and a blocker for
ROOT master adoption in CMSSW since the new CPU backend is the default:
cms-sw/cmsdist#9034

This commit fixes the concurrency problem by doing the buffering in the
DataMaps that are used in the `RooFit::Evaluator`. Like this, multiple
computation graphs can be concurrently evaluated.

It was tested with the ATLAS benchmarks in `rootbench` that the fitting
performance remains the same.
  • Loading branch information
guitargeek committed Mar 4, 2024
1 parent 329c264 commit 3692889
Show file tree
Hide file tree
Showing 6 changed files with 46 additions and 29 deletions.
2 changes: 2 additions & 0 deletions roofit/batchcompute/res/RooBatchCompute.h
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,8 @@ typedef std::span<double> ArgSpan;
typedef double *__restrict RestrictArr;
typedef const double *__restrict InputArr;

constexpr std::size_t bufferSize = 64;

void init();

/// Minimal configuration struct to steer the evaluation of a single node with
Expand Down
2 changes: 0 additions & 2 deletions roofit/batchcompute/src/Batches.h
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,6 @@ so that they can contain data for every kind of compute function.

namespace RooBatchCompute {

constexpr std::size_t bufferSize = 64;

namespace RF_ARCH {

class Batch {
Expand Down
27 changes: 7 additions & 20 deletions roofit/batchcompute/src/RooBatchCompute.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -50,20 +50,10 @@ void fillBatches(Batches &batches, RestrictArr output, size_t nEvents, std::size
batches._output = output;
}

void fillArrays(std::vector<Batch> &arrays, VarSpan vars, double *buffer, std::size_t nEvents)
void fillArrays(std::span<Batch> arrays, VarSpan vars, std::size_t nEvents)
{

arrays.resize(vars.size());
for (size_t i = 0; i < vars.size(); i++) {
const std::span<const double> &span = vars[i];
if (!span.empty() && span.size() < nEvents) {
// In the scalar case, copy the value to each element of vector input
// buffer.
std::fill_n(&buffer[i * bufferSize], bufferSize, span.data()[0]);
arrays[i].set(&buffer[i * bufferSize], false);
} else {
arrays[i].set(span.data(), true);
}
for (std::size_t i = 0; i < vars.size(); i++) {
arrays[i].set(vars[i].data(), vars[i].empty() || vars[i].size() >= nEvents);
}
}

Expand Down Expand Up @@ -111,9 +101,6 @@ class RooBatchComputeClass : public RooBatchComputeInterface {
void compute(Config const &, Computer computer, RestrictArr output, size_t nEvents, VarSpan vars,
ArgSpan extraArgs) override
{
static std::vector<double> buffer;
buffer.resize(vars.size() * bufferSize);

if (ROOT::IsImplicitMTEnabled()) {
ROOT::Internal::TExecutor ex;
std::size_t nThreads = ex.GetPoolSize();
Expand All @@ -127,9 +114,9 @@ class RooBatchComputeClass : public RooBatchComputeInterface {
// Fill a std::vector<Batches> with the same object and with ~nEvents/nThreads
// Then advance every object but the first to split the work between threads
Batches batches;
std::vector<Batch> arrays;
std::vector<Batch> arrays(vars.size());
fillBatches(batches, output, nEventsPerThread, vars.size(), extraArgs);
fillArrays(arrays, vars, buffer.data(), nEvents);
fillArrays(arrays, vars, nEvents);
batches._arrays = arrays.data();
batches.advance(batches.getNEvents() * idx);

Expand Down Expand Up @@ -159,9 +146,9 @@ class RooBatchComputeClass : public RooBatchComputeInterface {
// Fill a std::vector<Batches> with the same object and with ~nEvents/nThreads
// Then advance every object but the first to split the work between threads
Batches batches;
std::vector<Batch> arrays;
std::vector<Batch> arrays(vars.size());
fillBatches(batches, output, nEvents, vars.size(), extraArgs);
fillArrays(arrays, vars, buffer.data(), nEvents);
fillArrays(arrays, vars, nEvents);
batches._arrays = arrays.data();

std::size_t events = batches.getNEvents();
Expand Down
11 changes: 7 additions & 4 deletions roofit/roofitcore/inc/RooFit/Detail/DataMap.h
Original file line number Diff line number Diff line change
Expand Up @@ -82,10 +82,7 @@ namespace Detail {

class DataMap {
public:
auto size() const
{
return _dataMap.size();
}
auto size() const { return _dataMap.size(); }
void resize(std::size_t n);

inline void set(RooAbsArg const *arg, std::span<const double> const &span)
Expand Down Expand Up @@ -119,8 +116,14 @@ class DataMap {

RooBatchCompute::Config config(RooAbsArg const *arg) const;

void enableVectorBuffers(bool enable) { _enableVectorBuffers = enable; }
void resetVectorBuffers() { _bufferIdx = 0; }

private:
std::vector<std::span<const double>> _dataMap;
bool _enableVectorBuffers = false;
std::vector<std::vector<double>> _buffers;
std::size_t _bufferIdx = 0;
std::vector<RooBatchCompute::Config> _cfgs;
};

Expand Down
28 changes: 25 additions & 3 deletions roofit/roofitcore/src/RooFit/Detail/DataMap.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -16,17 +16,39 @@
#include <RooBatchCompute.h>
#include <RooRealVar.h>

#include <algorithm>

namespace RooFit {
namespace Detail {

std::span<const double> DataMap::at(RooAbsArg const *arg, RooAbsArg const * /*caller*/)
{
std::span<const double> out;

if (!arg->hasDataToken()) {
auto var = static_cast<RooRealVar const *>(arg);
return {&var->_value, 1};
out = std::span<const double>{&var->_value, 1};
} else {
std::size_t idx = arg->dataToken();
out = _dataMap[idx];
}
std::size_t idx = arg->dataToken();
return _dataMap[idx];

if (!_enableVectorBuffers || out.size() != 1) {
return out;
}

if (_bufferIdx == _buffers.size()) {
_buffers.emplace_back(RooBatchCompute::bufferSize);
}

double *buffer = _buffers[_bufferIdx].data();

std::fill_n(buffer, RooBatchCompute::bufferSize, out[0]);
out = std::span<const double>{buffer, 1};

++_bufferIdx;

return out;
}

void DataMap::setConfig(RooAbsArg const *arg, RooBatchCompute::Config const &config)
Expand Down
5 changes: 5 additions & 0 deletions roofit/roofitcore/src/RooFit/Evaluator.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -384,7 +384,12 @@ void Evaluator::computeCPUNode(const RooAbsArg *node, NodeInfo &info)
buffer = info.buffer->cpuWritePtr();
}
_dataMapCPU.set(node, {buffer, nOut});
if (nOut > 1) {
_dataMapCPU.enableVectorBuffers(true);
}
nodeAbsReal->computeBatch(buffer, nOut, _dataMapCPU);
_dataMapCPU.resetVectorBuffers();
_dataMapCPU.enableVectorBuffers(false);
#ifdef ROOFIT_CUDA
if (info.copyAfterEvaluation) {
_dataMapCUDA.set(node, {info.buffer->gpuReadPtr(), nOut});
Expand Down

0 comments on commit 3692889

Please sign in to comment.