Skip to content

Commit

Permalink
[RF] Avoid using static std::vector<double> buffer in RooBatchCompute
Browse files Browse the repository at this point in the history
In the RooBatchCompute CPU library, all scalar inputs have to be copied
n times into a buffer that is as long as the SIMD registers, to allow
for vectorization in all cases.

To avoid frequent memory allocations, this buffer was made a `static`
variable in the original implementation of the batchcompute library,
which of course made it non-threadsafe.

This is now hitting us, because RooFit needs to be able to do multiple
fits concurrently. This is a requirement for CMSSW, and a blocker for
ROOT master adoption in CMSSW since the new CPU backend is the default:
cms-sw/cmsdist#9034

This commit fixes the concurrency problem by doing the buffering in the
DataMaps that are used in the `RooFit::Evaluator`. Like this, multiple
computation graphs can be concurrently evaluated.

It was tested with the ATLAS benchmarks in `rootbench` that the fitting
performance remains the same.
  • Loading branch information
guitargeek authored and lobis committed Apr 10, 2024
1 parent 0fb9685 commit 62c72f4
Show file tree
Hide file tree
Showing 7 changed files with 65 additions and 32 deletions.
2 changes: 2 additions & 0 deletions roofit/batchcompute/res/RooBatchCompute.h
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,8 @@ typedef std::span<double> ArgSpan;
typedef double *__restrict RestrictArr;
typedef const double *__restrict InputArr;

constexpr std::size_t bufferSize = 64;

void init();

/// Minimal configuration struct to steer the evaluation of a single node with
Expand Down
2 changes: 0 additions & 2 deletions roofit/batchcompute/src/Batches.h
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,6 @@ so that they can contain data for every kind of compute function.

namespace RooBatchCompute {

constexpr std::size_t bufferSize = 64;

namespace RF_ARCH {

class Batch {
Expand Down
27 changes: 7 additions & 20 deletions roofit/batchcompute/src/RooBatchCompute.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -51,20 +51,10 @@ void fillBatches(Batches &batches, RestrictArr output, size_t nEvents, std::size
batches._output = output;
}

void fillArrays(std::vector<Batch> &arrays, VarSpan vars, double *buffer, std::size_t nEvents)
void fillArrays(std::span<Batch> arrays, VarSpan vars, std::size_t nEvents)
{

arrays.resize(vars.size());
for (size_t i = 0; i < vars.size(); i++) {
const std::span<const double> &span = vars[i];
if (!span.empty() && span.size() < nEvents) {
// In the scalar case, copy the value to each element of vector input
// buffer.
std::fill_n(&buffer[i * bufferSize], bufferSize, span.data()[0]);
arrays[i].set(&buffer[i * bufferSize], false);
} else {
arrays[i].set(span.data(), true);
}
for (std::size_t i = 0; i < vars.size(); i++) {
arrays[i].set(vars[i].data(), vars[i].empty() || vars[i].size() >= nEvents);
}
}

Expand Down Expand Up @@ -112,9 +102,6 @@ class RooBatchComputeClass : public RooBatchComputeInterface {
void compute(Config const &, Computer computer, RestrictArr output, size_t nEvents, VarSpan vars,
ArgSpan extraArgs) override
{
static std::vector<double> buffer;
buffer.resize(vars.size() * bufferSize);

if (ROOT::IsImplicitMTEnabled()) {
ROOT::Internal::TExecutor ex;
std::size_t nThreads = ex.GetPoolSize();
Expand All @@ -128,9 +115,9 @@ class RooBatchComputeClass : public RooBatchComputeInterface {
// Fill a std::vector<Batches> with the same object and with ~nEvents/nThreads
// Then advance every object but the first to split the work between threads
Batches batches;
std::vector<Batch> arrays;
std::vector<Batch> arrays(vars.size());
fillBatches(batches, output, nEventsPerThread, vars.size(), extraArgs);
fillArrays(arrays, vars, buffer.data(), nEvents);
fillArrays(arrays, vars, nEvents);
batches._arrays = arrays.data();
batches.advance(batches.getNEvents() * idx);

Expand Down Expand Up @@ -160,9 +147,9 @@ class RooBatchComputeClass : public RooBatchComputeInterface {
// Fill a std::vector<Batches> with the same object and with ~nEvents/nThreads
// Then advance every object but the first to split the work between threads
Batches batches;
std::vector<Batch> arrays;
std::vector<Batch> arrays(vars.size());
fillBatches(batches, output, nEvents, vars.size(), extraArgs);
fillArrays(arrays, vars, buffer.data(), nEvents);
fillArrays(arrays, vars, nEvents);
batches._arrays = arrays.data();

std::size_t events = batches.getNEvents();
Expand Down
11 changes: 7 additions & 4 deletions roofit/roofitcore/inc/RooFit/Detail/DataMap.h
Original file line number Diff line number Diff line change
Expand Up @@ -82,10 +82,7 @@ namespace Detail {

class DataMap {
public:
auto size() const
{
return _dataMap.size();
}
auto size() const { return _dataMap.size(); }
void resize(std::size_t n);

inline void set(RooAbsArg const *arg, std::span<const double> const &span)
Expand Down Expand Up @@ -119,8 +116,14 @@ class DataMap {

RooBatchCompute::Config config(RooAbsArg const *arg) const;

void enableVectorBuffers(bool enable) { _enableVectorBuffers = enable; }
void resetVectorBuffers() { _bufferIdx = 0; }

private:
std::vector<std::span<const double>> _dataMap;
bool _enableVectorBuffers = false;
std::vector<std::vector<double>> _buffers;
std::size_t _bufferIdx = 0;
std::vector<RooBatchCompute::Config> _cfgs;
};

Expand Down
39 changes: 36 additions & 3 deletions roofit/roofitcore/src/RooFit/Detail/DataMap.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -16,17 +16,50 @@
#include <RooBatchCompute.h>
#include <RooRealVar.h>

#include <algorithm>

namespace {

// To avoid deleted move assignment.
template <class T>
void assignSpan(std::span<T> &to, std::span<T> const &from)
{
to = from;
}

} // namespace

namespace RooFit {
namespace Detail {

std::span<const double> DataMap::at(RooAbsArg const *arg, RooAbsArg const * /*caller*/)
{
std::span<const double> out;

if (!arg->hasDataToken()) {
auto var = static_cast<RooRealVar const *>(arg);
return {&var->_value, 1};
assignSpan(out, {&var->_value, 1});
} else {
std::size_t idx = arg->dataToken();
out = _dataMap[idx];
}
std::size_t idx = arg->dataToken();
return _dataMap[idx];

if (!_enableVectorBuffers || out.size() != 1) {
return out;
}

if (_bufferIdx == _buffers.size()) {
_buffers.emplace_back(RooBatchCompute::bufferSize);
}

double *buffer = _buffers[_bufferIdx].data();

std::fill_n(buffer, RooBatchCompute::bufferSize, out[0]);
assignSpan(out, {buffer, 1});

++_bufferIdx;

return out;
}

void DataMap::setConfig(RooAbsArg const *arg, RooBatchCompute::Config const &config)
Expand Down
5 changes: 5 additions & 0 deletions roofit/roofitcore/src/RooFit/Evaluator.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -384,7 +384,12 @@ void Evaluator::computeCPUNode(const RooAbsArg *node, NodeInfo &info)
buffer = info.buffer->cpuWritePtr();
}
_dataMapCPU.set(node, {buffer, nOut});
if (nOut > 1) {
_dataMapCPU.enableVectorBuffers(true);
}
nodeAbsReal->computeBatch(buffer, nOut, _dataMapCPU);
_dataMapCPU.resetVectorBuffers();
_dataMapCPU.enableVectorBuffers(false);
#ifdef ROOFIT_CUDA
if (info.copyAfterEvaluation) {
_dataMapCUDA.set(node, {info.buffer->gpuReadPtr(), nOut});
Expand Down
11 changes: 8 additions & 3 deletions roofit/roofitcore/src/RooPolyVar.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ it can define.

#include "TError.h"

#include <algorithm>
#include <array>
#include <cmath>

Expand Down Expand Up @@ -139,10 +140,14 @@ void RooPolyVar::computeBatchImpl(RooAbsArg const* caller, double *output, size_
// Fill the coefficients for the skipped orders. By a conventions started in
// RooPolynomial, if the zero-th order is skipped, it implies a coefficient
// for the constant term of one.
const double zero = 1.0;
const double one = 1.0;
std::array<double, RooBatchCompute::bufferSize> zeros;
std::array<double, RooBatchCompute::bufferSize> ones;
std::fill_n(zeros.data(), zeros.size(), 0.0);
std::fill_n(ones.data(), ones.size(), 1.0);
std::span<const double> zerosSpan{zeros.data(), 1};
std::span<const double> onesSpan{ones.data(), 1};
for (int i = lowestOrder - 1; i >= 0; --i) {
vars.push_back(i == 0 ? std::span<const double>{&one, 1} : std::span<const double>{&zero, 1});
vars.push_back(i == 0 ? onesSpan : zerosSpan);
}

for (RooAbsArg *coef : coefs) {
Expand Down

0 comments on commit 62c72f4

Please sign in to comment.