diff --git a/roofit/batchcompute/res/RooBatchCompute.h b/roofit/batchcompute/res/RooBatchCompute.h index 447e1d6704dad..c8554331aae6e 100644 --- a/roofit/batchcompute/res/RooBatchCompute.h +++ b/roofit/batchcompute/res/RooBatchCompute.h @@ -23,10 +23,8 @@ #include //for R__EXTERN, needed for windows -#include -#include +#include #include -#include /** * Namespace for dispatching RooFit computations to various backends. @@ -42,11 +40,13 @@ */ namespace RooBatchCompute { -typedef std::vector> VarVector; -typedef std::vector ArgVector; +typedef std::span> VarSpan; +typedef std::span ArgSpan; typedef double *__restrict RestrictArr; typedef const double *__restrict InputArr; +constexpr std::size_t bufferSize = 64; + void init(); /// Minimal configuration struct to steer the evaluation of a single node with @@ -140,12 +140,7 @@ struct ReduceNLLOutput { class RooBatchComputeInterface { public: virtual ~RooBatchComputeInterface() = default; - virtual void compute(Config const &cfg, Computer, RestrictArr, size_t, const VarVector &, ArgVector &) = 0; - inline void compute(Config const &cfg, Computer comp, RestrictArr output, size_t size, const VarVector &vars) - { - ArgVector extraArgs{}; - compute(cfg, comp, output, size, vars, extraArgs); - } + virtual void compute(Config const &cfg, Computer, RestrictArr, size_t, VarSpan, ArgSpan) = 0; virtual double reduceSum(Config const &cfg, InputArr input, size_t n) = 0; virtual ReduceNLLOutput reduceNLL(Config const &cfg, std::span probas, std::span weights, @@ -182,18 +177,20 @@ inline bool hasCuda() return dispatchCUDA; } -inline void -compute(Config cfg, Computer comp, RestrictArr output, size_t size, const VarVector &vars, ArgVector &extraArgs) +inline void compute(Config cfg, Computer comp, RestrictArr output, size_t size, VarSpan vars, ArgSpan extraArgs = {}) { init(); auto dispatch = cfg.useCuda() ? 
dispatchCUDA : dispatchCPU; dispatch->compute(cfg, comp, output, size, vars, extraArgs); } -inline void compute(Config cfg, Computer comp, RestrictArr output, size_t size, const VarVector &vars) +/// It is not possible to construct a std::span directly from an initializer +/// list (probably it will be with C++26). That's why we need an explicit +/// overload for this. +inline void compute(Config cfg, Computer comp, RestrictArr output, size_t size, + std::initializer_list> vars, ArgSpan extraArgs = {}) { - ArgVector extraArgs{}; - compute(cfg, comp, output, size, vars, extraArgs); + compute(cfg, comp, output, size, VarSpan{vars.begin(), vars.end()}, extraArgs); } inline double reduceSum(Config cfg, InputArr input, size_t n) diff --git a/roofit/batchcompute/src/Batches.h b/roofit/batchcompute/src/Batches.h index 69c4a7837bd53..5589199ffc677 100644 --- a/roofit/batchcompute/src/Batches.h +++ b/roofit/batchcompute/src/Batches.h @@ -23,67 +23,32 @@ so that they can contain data for every kind of compute function. #ifndef ROOFIT_BATCHCOMPUTE_BATCHES_H #define ROOFIT_BATCHCOMPUTE_BATCHES_H -#include - #include namespace RooBatchCompute { -constexpr std::size_t bufferSize = 64; - -namespace RF_ARCH { - class Batch { public: const double *__restrict _array = nullptr; bool _isVector = false; - Batch() = default; - inline Batch(InputArr array, bool isVector) : _array{array}, _isVector{isVector} {} - - __roodevice__ constexpr bool isItVector() const { return _isVector; } - inline void set(InputArr array, bool isVector) - { - _array = array; - _isVector = isVector; - } - inline void advance(std::size_t _nEvents) { _array += _isVector * _nEvents; } #ifdef __CUDACC__ - __roodevice__ constexpr double operator[](std::size_t i) const noexcept { return _isVector ? _array[i] : _array[0]; } + __device__ constexpr double operator[](std::size_t i) const noexcept { return _isVector ? 
_array[i] : _array[0]; } #else constexpr double operator[](std::size_t i) const noexcept { return _array[i]; } #endif // #ifdef __CUDACC__ }; -///////////////////////////////////////////////////////////////////////////////////////////////////////// - class Batches { public: - Batch *_arrays = nullptr; - double *_extraArgs = nullptr; - std::size_t _nEvents = 0; - std::size_t _nBatches = 0; - std::size_t _nExtraArgs = 0; - RestrictArr _output = nullptr; - - __roodevice__ std::size_t getNEvents() const { return _nEvents; } - __roodevice__ std::size_t getNExtraArgs() const { return _nExtraArgs; } - __roodevice__ double extraArg(std::size_t i) const { return _extraArgs[i]; } - __roodevice__ void setExtraArg(std::size_t i, double val) { _extraArgs[i] = val; } - __roodevice__ Batch operator[](int batchIdx) const { return _arrays[batchIdx]; } - inline void setNEvents(std::size_t n) { _nEvents = n; } - inline void advance(std::size_t nEvents) - { - for (std::size_t i = 0; i < _nBatches; i++) - _arrays[i].advance(nEvents); - _output += nEvents; - } + Batch *args = nullptr; + double *extra; + std::size_t nEvents = 0; + std::size_t nBatches = 0; + std::size_t nExtra = 0; + RestrictArr output = nullptr; }; -// Defines the actual argument type of the compute function. -using BatchesHandle = Batches &; - -} // End namespace RF_ARCH } // end namespace RooBatchCompute #endif // #ifdef ROOFIT_BATCHCOMPUTE_BATCHES_H diff --git a/roofit/batchcompute/src/ComputeFunctions.cxx b/roofit/batchcompute/src/ComputeFunctions.cxx index e112dda4c19c2..2dc2320196def 100644 --- a/roofit/batchcompute/src/ComputeFunctions.cxx +++ b/roofit/batchcompute/src/ComputeFunctions.cxx @@ -32,6 +32,8 @@ of performance, maximum memory coalescing. For more details, see #include +#include + #ifdef __CUDACC__ #define BEGIN blockDim.x *blockIdx.x + threadIdx.x #define STEP blockDim.x *gridDim.x @@ -43,64 +45,66 @@ of performance, maximum memory coalescing. 
For more details, see namespace RooBatchCompute { namespace RF_ARCH { -__rooglobal__ void computeAddPdf(BatchesHandle batches) +__rooglobal__ void computeAddPdf(Batches &batches) { - const int nPdfs = batches.getNExtraArgs(); - for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) - batches._output[i] = batches.extraArg(0) * batches[0][i]; + const int nPdfs = batches.nExtra; + for (size_t i = BEGIN; i < batches.nEvents; i += STEP) { + batches.output[i] = batches.extra[0] * batches.args[0][i]; + } for (int pdf = 1; pdf < nPdfs; pdf++) { - for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) - batches._output[i] += batches.extraArg(pdf) * batches[pdf][i]; + for (size_t i = BEGIN; i < batches.nEvents; i += STEP) { + batches.output[i] += batches.extra[pdf] * batches.args[pdf][i]; + } } } -__rooglobal__ void computeArgusBG(BatchesHandle batches) +__rooglobal__ void computeArgusBG(Batches &batches) { - Batch m = batches[0]; - Batch m0 = batches[1]; - Batch c = batches[2]; - Batch p = batches[3]; - for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) { + Batch m = batches.args[0]; + Batch m0 = batches.args[1]; + Batch c = batches.args[2]; + Batch p = batches.args[3]; + for (size_t i = BEGIN; i < batches.nEvents; i += STEP) { const double t = m[i] / m0[i]; const double u = 1 - t * t; - batches._output[i] = c[i] * u + p[i] * fast_log(u); + batches.output[i] = c[i] * u + p[i] * fast_log(u); } - for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) { + for (size_t i = BEGIN; i < batches.nEvents; i += STEP) { if (m[i] >= m0[i]) { - batches._output[i] = 0.0; + batches.output[i] = 0.0; } else { - batches._output[i] = m[i] * fast_exp(batches._output[i]); + batches.output[i] = m[i] * fast_exp(batches.output[i]); } } } -__rooglobal__ void computeBMixDecay(BatchesHandle batches) +__rooglobal__ void computeBMixDecay(Batches &batches) { - Batch coef0 = batches[0]; - Batch coef1 = batches[1]; - Batch tagFlav = batches[2]; - Batch delMistag = batches[3]; - 
Batch mixState = batches[4]; - Batch mistag = batches[5]; + Batch coef0 = batches.args[0]; + Batch coef1 = batches.args[1]; + Batch tagFlav = batches.args[2]; + Batch delMistag = batches.args[3]; + Batch mixState = batches.args[4]; + Batch mistag = batches.args[5]; - for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) { - batches._output[i] = + for (size_t i = BEGIN; i < batches.nEvents; i += STEP) { + batches.output[i] = coef0[i] * (1.0 - tagFlav[i] * delMistag[0]) + coef1[i] * (mixState[i] * (1.0 - 2.0 * mistag[0])); } } -__rooglobal__ void computeBernstein(BatchesHandle batches) +__rooglobal__ void computeBernstein(Batches &batches) { - const int nCoef = batches.getNExtraArgs() - 2; + const int nCoef = batches.nExtra - 2; const int degree = nCoef - 1; - const double xmin = batches.extraArg(nCoef); - const double xmax = batches.extraArg(nCoef + 1); - Batch xData = batches[0]; + const double xmin = batches.extra[nCoef]; + const double xmax = batches.extra[nCoef + 1]; + Batch xData = batches.args[0]; // apply binomial coefficient in-place so we don't have to allocate new memory double binomial = 1.0; for (int k = 0; k < nCoef; k++) { - batches.setExtraArg(k, batches.extraArg(k) * binomial); + batches.extra[k] = batches.extra[k] * binomial; binomial = (binomial * (degree - k)) / (k + 1); } @@ -109,31 +113,31 @@ __rooglobal__ void computeBernstein(BatchesHandle batches) double _1_X[bufferSize]; double powX[bufferSize]; double pow_1_X[bufferSize]; - for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) { + for (size_t i = BEGIN; i < batches.nEvents; i += STEP) { powX[i] = pow_1_X[i] = 1.0; X[i] = (xData[i] - xmin) / (xmax - xmin); _1_X[i] = 1 - X[i]; - batches._output[i] = 0.0; + batches.output[i] = 0.0; } // raising 1-x to the power of degree for (int k = 2; k <= degree; k += 2) { - for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) + for (size_t i = BEGIN; i < batches.nEvents; i += STEP) pow_1_X[i] *= _1_X[i] * _1_X[i]; } if (degree % 2 == 1) 
{ - for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) + for (size_t i = BEGIN; i < batches.nEvents; i += STEP) pow_1_X[i] *= _1_X[i]; } // inverting 1-x ---> 1/(1-x) - for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) + for (size_t i = BEGIN; i < batches.nEvents; i += STEP) _1_X[i] = 1 / _1_X[i]; for (int k = 0; k < nCoef; k++) { - for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) { - batches._output[i] += batches.extraArg(k) * powX[i] * pow_1_X[i]; + for (size_t i = BEGIN; i < batches.nEvents; i += STEP) { + batches.output[i] += batches.extra[k] * powX[i] * pow_1_X[i]; // calculating next power for x and 1-x powX[i] *= X[i]; @@ -141,8 +145,8 @@ __rooglobal__ void computeBernstein(BatchesHandle batches) } } } else { - for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) { - batches._output[i] = 0.0; + for (size_t i = BEGIN; i < batches.nEvents; i += STEP) { + batches.output[i] = 0.0; const double X = (xData[i] - xmin) / (xmax - xmin); double powX = 1.0; double pow_1_X = 1.0; @@ -150,7 +154,7 @@ __rooglobal__ void computeBernstein(BatchesHandle batches) pow_1_X *= 1 - X; const double _1_X = 1 / (1 - X); for (int k = 0; k < nCoef; k++) { - batches._output[i] += batches.extraArg(k) * powX * pow_1_X; + batches.output[i] += batches.extra[k] * powX * pow_1_X; powX *= X; pow_1_X *= _1_X; } @@ -160,52 +164,52 @@ __rooglobal__ void computeBernstein(BatchesHandle batches) // reset extraArgs values so we don't mutate the Batches object binomial = 1.0; for (int k = 0; k < nCoef; k++) { - batches.setExtraArg(k, batches.extraArg(k) / binomial); + batches.extra[k] = batches.extra[k] / binomial; binomial = (binomial * (degree - k)) / (k + 1); } } -__rooglobal__ void computeBifurGauss(BatchesHandle batches) +__rooglobal__ void computeBifurGauss(Batches &batches) { - Batch X = batches[0]; - Batch M = batches[1]; - Batch SL = batches[2]; - Batch SR = batches[3]; - for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) { + Batch X = 
batches.args[0]; + Batch M = batches.args[1]; + Batch SL = batches.args[2]; + Batch SR = batches.args[3]; + for (size_t i = BEGIN; i < batches.nEvents; i += STEP) { double arg = X[i] - M[i]; if (arg < 0) { arg /= SL[i]; } else { arg /= SR[i]; } - batches._output[i] = fast_exp(-0.5 * arg * arg); + batches.output[i] = fast_exp(-0.5 * arg * arg); } } -__rooglobal__ void computeBreitWigner(BatchesHandle batches) +__rooglobal__ void computeBreitWigner(Batches &batches) { - Batch X = batches[0]; - Batch M = batches[1]; - Batch W = batches[2]; - for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) { + Batch X = batches.args[0]; + Batch M = batches.args[1]; + Batch W = batches.args[2]; + for (size_t i = BEGIN; i < batches.nEvents; i += STEP) { const double arg = X[i] - M[i]; - batches._output[i] = 1 / (arg * arg + 0.25 * W[i] * W[i]); + batches.output[i] = 1 / (arg * arg + 0.25 * W[i] * W[i]); } } -__rooglobal__ void computeBukin(BatchesHandle batches) +__rooglobal__ void computeBukin(Batches &batches) { - Batch X = batches[0]; - Batch XP = batches[1]; - Batch SP = batches[2]; - Batch XI = batches[3]; - Batch R1 = batches[4]; - Batch R2 = batches[5]; + Batch X = batches.args[0]; + Batch XP = batches.args[1]; + Batch SP = batches.args[2]; + Batch XI = batches.args[3]; + Batch R1 = batches.args[4]; + Batch R2 = batches.args[5]; const double r3 = log(2.0); const double r6 = exp(-6.0); const double r7 = 2 * sqrt(2 * log(2.0)); - for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) { + for (size_t i = BEGIN; i < batches.nEvents; i += STEP) { const double r1 = XI[i] * fast_isqrt(XI[i] * XI[i] + 1); const double r4 = 1 / fast_isqrt(XI[i] * XI[i] + 1); const double hp = 1 / (SP[i] * r7); @@ -229,61 +233,61 @@ __rooglobal__ void computeBukin(BatchesHandle batches) rho = R2[i]; } - batches._output[i] = rho * y * y / Yp / Yp - r3 + factor * 4 * r3 * y * hp * r5 * r4 / yi / yi; + batches.output[i] = rho * y * y / Yp / Yp - r3 + factor * 4 * r3 * y * hp * r5 * r4 / yi / 
yi; if (X[i] >= x1 && X[i] < x2) { - batches._output[i] = + batches.output[i] = fast_log(1 + 4 * XI[i] * r4 * (X[i] - XP[i]) * hp) / fast_log(1 + 2 * XI[i] * (XI[i] - r4)); - batches._output[i] *= -batches._output[i] * r3; + batches.output[i] *= -batches.output[i] * r3; } if (X[i] >= x1 && X[i] < x2 && XI[i] < r6 && XI[i] > -r6) - batches._output[i] = -4 * r3 * (X[i] - XP[i]) * (X[i] - XP[i]) * hp * hp; + batches.output[i] = -4 * r3 * (X[i] - XP[i]) * (X[i] - XP[i]) * hp * hp; } - for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) - batches._output[i] = fast_exp(batches._output[i]); + for (size_t i = BEGIN; i < batches.nEvents; i += STEP) + batches.output[i] = fast_exp(batches.output[i]); } -__rooglobal__ void computeCBShape(BatchesHandle batches) +__rooglobal__ void computeCBShape(Batches &batches) { - Batch M = batches[0]; - Batch M0 = batches[1]; - Batch S = batches[2]; - Batch A = batches[3]; - Batch N = batches[4]; - for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) { + Batch M = batches.args[0]; + Batch M0 = batches.args[1]; + Batch S = batches.args[2]; + Batch A = batches.args[3]; + Batch N = batches.args[4]; + for (size_t i = BEGIN; i < batches.nEvents; i += STEP) { const double t = (M[i] - M0[i]) / S[i]; if ((A[i] > 0 && t >= -A[i]) || (A[i] < 0 && -t >= A[i])) { - batches._output[i] = -0.5 * t * t; + batches.output[i] = -0.5 * t * t; } else { - batches._output[i] = N[i] / (N[i] - A[i] * A[i] - A[i] * t); - batches._output[i] = fast_log(batches._output[i]); - batches._output[i] *= N[i]; - batches._output[i] -= 0.5 * A[i] * A[i]; + batches.output[i] = N[i] / (N[i] - A[i] * A[i] - A[i] * t); + batches.output[i] = fast_log(batches.output[i]); + batches.output[i] *= N[i]; + batches.output[i] -= 0.5 * A[i] * A[i]; } } - for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) - batches._output[i] = fast_exp(batches._output[i]); + for (size_t i = BEGIN; i < batches.nEvents; i += STEP) + batches.output[i] = fast_exp(batches.output[i]); } 
-__rooglobal__ void computeChebychev(BatchesHandle batches) +__rooglobal__ void computeChebychev(Batches &batches) { - Batch xData = batches[0]; - const int nCoef = batches.getNExtraArgs() - 2; - const double xmin = batches.extraArg(nCoef); - const double xmax = batches.extraArg(nCoef + 1); + Batch xData = batches.args[0]; + const int nCoef = batches.nExtra - 2; + const double xmin = batches.extra[nCoef]; + const double xmax = batches.extra[nCoef + 1]; if (STEP == 1) { double prev[bufferSize][2]; double X[bufferSize]; - for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) { + for (size_t i = BEGIN; i < batches.nEvents; i += STEP) { // set a0-->prev[i][0] and a1-->prev[i][1] // and x tranfsformed to range[-1..1]-->X[i] - prev[i][0] = batches._output[i] = 1.0; + prev[i][0] = batches.output[i] = 1.0; prev[i][1] = X[i] = 2 * (xData[i] - 0.5 * (xmax + xmin)) / (xmax - xmin); } for (int k = 0; k < nCoef; k++) { - for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) { - batches._output[i] += prev[i][1] * batches.extraArg(k); + for (size_t i = BEGIN; i < batches.nEvents; i += STEP) { + batches.output[i] += prev[i][1] * batches.extra[k]; // compute next order const double next = 2 * X[i] * prev[i][1] - prev[i][0]; @@ -292,13 +296,13 @@ __rooglobal__ void computeChebychev(BatchesHandle batches) } } } else { - for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) { + for (size_t i = BEGIN; i < batches.nEvents; i += STEP) { double prev0 = 1.0; double prev1 = 2 * (xData[i] - 0.5 * (xmax + xmin)) / (xmax - xmin); double X = prev1; - batches._output[i] = 1.0; + batches.output[i] = 1.0; for (int k = 0; k < nCoef; k++) { - batches._output[i] += prev1 * batches.extraArg(k); + batches.output[i] += prev1 * batches.extra[k]; // compute next order const double next = 2 * X * prev1 - prev0; @@ -309,127 +313,127 @@ __rooglobal__ void computeChebychev(BatchesHandle batches) } } -__rooglobal__ void computeChiSquare(BatchesHandle batches) +__rooglobal__ void 
computeChiSquare(Batches &batches) { - Batch X = batches[0]; - const double ndof = batches.extraArg(0); + Batch X = batches.args[0]; + const double ndof = batches.extra[0]; const double gamma = 1 / std::tgamma(ndof / 2.0); - for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) - batches._output[i] = gamma; + for (size_t i = BEGIN; i < batches.nEvents; i += STEP) + batches.output[i] = gamma; constexpr double ln2 = 0.693147180559945309417232121458; - for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) { + for (size_t i = BEGIN; i < batches.nEvents; i += STEP) { double arg = (ndof - 2) * fast_log(X[i]) - X[i] - ndof * ln2; - batches._output[i] *= fast_exp(0.5 * arg); + batches.output[i] *= fast_exp(0.5 * arg); } } -__rooglobal__ void computeDeltaFunction(BatchesHandle batches) +__rooglobal__ void computeDeltaFunction(Batches &batches) { - for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) { - batches._output[i] = 0.0 + (batches[0][i] == 1.0); + for (size_t i = BEGIN; i < batches.nEvents; i += STEP) { + batches.output[i] = 0.0 + (batches.args[0][i] == 1.0); } } -__rooglobal__ void computeDstD0BG(BatchesHandle batches) +__rooglobal__ void computeDstD0BG(Batches &batches) { - Batch DM = batches[0]; - Batch DM0 = batches[1]; - Batch C = batches[2]; - Batch A = batches[3]; - Batch B = batches[4]; - for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) { + Batch DM = batches.args[0]; + Batch DM0 = batches.args[1]; + Batch C = batches.args[2]; + Batch A = batches.args[3]; + Batch B = batches.args[4]; + for (size_t i = BEGIN; i < batches.nEvents; i += STEP) { const double ratio = DM[i] / DM0[i]; const double arg1 = (DM0[i] - DM[i]) / C[i]; const double arg2 = A[i] * fast_log(ratio); - batches._output[i] = (1 - fast_exp(arg1)) * fast_exp(arg2) + B[i] * (ratio - 1); + batches.output[i] = (1 - fast_exp(arg1)) * fast_exp(arg2) + B[i] * (ratio - 1); } - for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) { - if (batches._output[i] < 0) - 
batches._output[i] = 0; + for (size_t i = BEGIN; i < batches.nEvents; i += STEP) { + if (batches.output[i] < 0) + batches.output[i] = 0; } } -__rooglobal__ void computeExpPoly(BatchesHandle batches) +__rooglobal__ void computeExpPoly(Batches &batches) { - int lowestOrder = batches.extraArg(0); - int nTerms = batches.extraArg(1); - auto x = batches[0]; + int lowestOrder = batches.extra[0]; + int nTerms = batches.extra[1]; + auto x = batches.args[0]; - for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) { - batches._output[i] = 0.0; + for (size_t i = BEGIN; i < batches.nEvents; i += STEP) { + batches.output[i] = 0.0; double xTmp = std::pow(x[i], lowestOrder); for (int k = 0; k < nTerms; ++k) { - batches._output[i] += batches[k + 1][i] * xTmp; + batches.output[i] += batches.args[k + 1][i] * xTmp; xTmp *= x[i]; } - batches._output[i] = std::exp(batches._output[i]); + batches.output[i] = std::exp(batches.output[i]); } } -__rooglobal__ void computeExponential(BatchesHandle batches) +__rooglobal__ void computeExponential(Batches &batches) { - Batch x = batches[0]; - Batch c = batches[1]; - for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) { - batches._output[i] = fast_exp(x[i] * c[i]); + Batch x = batches.args[0]; + Batch c = batches.args[1]; + for (size_t i = BEGIN; i < batches.nEvents; i += STEP) { + batches.output[i] = fast_exp(x[i] * c[i]); } } -__rooglobal__ void computeExponentialNeg(BatchesHandle batches) +__rooglobal__ void computeExponentialNeg(Batches &batches) { - Batch x = batches[0]; - Batch c = batches[1]; - for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) { - batches._output[i] = fast_exp(-x[i] * c[i]); + Batch x = batches.args[0]; + Batch c = batches.args[1]; + for (size_t i = BEGIN; i < batches.nEvents; i += STEP) { + batches.output[i] = fast_exp(-x[i] * c[i]); } } -__rooglobal__ void computeGamma(BatchesHandle batches) +__rooglobal__ void computeGamma(Batches &batches) { - Batch X = batches[0]; - Batch G = batches[1]; - Batch 
B = batches[2]; - Batch M = batches[3]; + Batch X = batches.args[0]; + Batch G = batches.args[1]; + Batch B = batches.args[2]; + Batch M = batches.args[3]; double gamma = -std::lgamma(G[0]); - for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) { + for (size_t i = BEGIN; i < batches.nEvents; i += STEP) { if (X[i] == M[i]) { - batches._output[i] = (G[i] == 1.0) / B[i]; - } else if (G.isItVector()) { - batches._output[i] = -std::lgamma(G[i]); + batches.output[i] = (G[i] == 1.0) / B[i]; + } else if (G._isVector) { + batches.output[i] = -std::lgamma(G[i]); } else { - batches._output[i] = gamma; + batches.output[i] = gamma; } } - for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) { + for (size_t i = BEGIN; i < batches.nEvents; i += STEP) { if (X[i] != M[i]) { const double invBeta = 1 / B[i]; double arg = (X[i] - M[i]) * invBeta; - batches._output[i] -= arg; + batches.output[i] -= arg; arg = fast_log(arg); - batches._output[i] += arg * (G[i] - 1); - batches._output[i] = fast_exp(batches._output[i]); - batches._output[i] *= invBeta; + batches.output[i] += arg * (G[i] - 1); + batches.output[i] = fast_exp(batches.output[i]); + batches.output[i] *= invBeta; } } } -__rooglobal__ void computeGaussModelExpBasis(BatchesHandle batches) +__rooglobal__ void computeGaussModelExpBasis(Batches &batches) { const double root2 = std::sqrt(2.); const double root2pi = std::sqrt(2. 
* std::atan2(0., -1.)); - const bool isMinus = batches.extraArg(0) < 0.0; - const bool isPlus = batches.extraArg(0) > 0.0; + const bool isMinus = batches.extra[0] < 0.0; + const bool isPlus = batches.extra[0] > 0.0; - for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) { + for (size_t i = BEGIN; i < batches.nEvents; i += STEP) { - const double x = batches[0][i]; - const double mean = batches[1][i] * batches[2][i]; - const double sigma = batches[3][i] * batches[4][i]; - const double tau = batches[5][i]; + const double x = batches.args[0][i]; + const double mean = batches.args[1][i] * batches.args[2][i]; + const double sigma = batches.args[3][i] * batches.args[4][i]; + const double tau = batches.args[5][i]; if (tau == 0.0) { // Straight Gaussian, used for unconvoluted PDF or expBasis with 0 lifetime @@ -437,7 +441,7 @@ __rooglobal__ void computeGaussModelExpBasis(BatchesHandle batches) double result = std::exp(-0.5 * xprime * xprime) / (sigma * root2pi); if (!isMinus && !isPlus) result *= 2; - batches._output[i] = result; + batches.output[i] = result; } else { // Convolution with exp(-t/tau) const double xprime = (x - mean) / tau; @@ -449,52 +453,52 @@ __rooglobal__ void computeGaussModelExpBasis(BatchesHandle batches) result += RooHeterogeneousMath::evalCerf(0, -u, c).real(); if (!isPlus) result += RooHeterogeneousMath::evalCerf(0, u, c).real(); - batches._output[i] = result; + batches.output[i] = result; } } } -__rooglobal__ void computeGaussian(BatchesHandle batches) +__rooglobal__ void computeGaussian(Batches &batches) { - auto x = batches[0]; - auto mean = batches[1]; - auto sigma = batches[2]; - for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) { + auto x = batches.args[0]; + auto mean = batches.args[1]; + auto sigma = batches.args[2]; + for (size_t i = BEGIN; i < batches.nEvents; i += STEP) { const double arg = x[i] - mean[i]; const double halfBySigmaSq = -0.5 / (sigma[i] * sigma[i]); - batches._output[i] = fast_exp(arg * arg * 
halfBySigmaSq); + batches.output[i] = fast_exp(arg * arg * halfBySigmaSq); } } -__rooglobal__ void computeIdentity(BatchesHandle batches) +__rooglobal__ void computeIdentity(Batches &batches) { - for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) { - batches._output[i] = batches[0][i]; + for (size_t i = BEGIN; i < batches.nEvents; i += STEP) { + batches.output[i] = batches.args[0][i]; } } -__rooglobal__ void computeNegativeLogarithms(BatchesHandle batches) +__rooglobal__ void computeNegativeLogarithms(Batches &batches) { - for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) - batches._output[i] = -fast_log(batches[0][i]); + for (size_t i = BEGIN; i < batches.nEvents; i += STEP) + batches.output[i] = -fast_log(batches.args[0][i]); // Multiply by weights if they exist - if (batches.extraArg(0)) { - for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) - batches._output[i] *= batches[1][i]; + if (batches.extra[0]) { + for (size_t i = BEGIN; i < batches.nEvents; i += STEP) + batches.output[i] *= batches.args[1][i]; } } -__rooglobal__ void computeJohnson(BatchesHandle batches) +__rooglobal__ void computeJohnson(Batches &batches) { - Batch mass = batches[0]; - Batch mu = batches[1]; - Batch lambda = batches[2]; - Batch gamma = batches[3]; - Batch delta = batches[4]; + Batch mass = batches.args[0]; + Batch mu = batches.args[1]; + Batch lambda = batches.args[2]; + Batch gamma = batches.args[3]; + Batch delta = batches.args[4]; const double sqrtTwoPi = std::sqrt(TMath::TwoPi()); - const double massThreshold = batches.extraArg(0); + const double massThreshold = batches.extra[0]; - for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) { + for (size_t i = BEGIN; i < batches.nEvents; i += STEP) { const double arg = (mass[i] - mu[i]) / lambda[i]; #ifdef R__HAS_VDT const double asinh_arg = fast_log(arg + 1 / fast_isqrt(arg * arg + 1)); @@ -506,7 +510,7 @@ __rooglobal__ void computeJohnson(BatchesHandle batches) delta[i] * fast_exp(-0.5 * expo * expo) 
* fast_isqrt(1. + arg * arg) / (sqrtTwoPi * lambda[i]); const double passThrough = mass[i] >= massThreshold; - batches._output[i] = result * passThrough; + batches.output[i] = result * passThrough; } } @@ -514,7 +518,7 @@ __rooglobal__ void computeJohnson(BatchesHandle batches) * Code copied from function landau_pdf (math/mathcore/src/PdfFuncMathCore.cxx) * and rewritten to enable vectorization. */ -__rooglobal__ void computeLandau(BatchesHandle batches) +__rooglobal__ void computeLandau(Batches &batches) { auto case0 = [](double x) { const double a1[3] = {0.04166666667, -0.01996527778, 0.02709538966}; @@ -567,82 +571,82 @@ __rooglobal__ void computeLandau(BatchesHandle batches) return u * u * (1 + (a2[0] + a2[1] * u) * u); }; - Batch X = batches[0]; - Batch M = batches[1]; - Batch S = batches[2]; + Batch X = batches.args[0]; + Batch M = batches.args[1]; + Batch S = batches.args[2]; - for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) - batches._output[i] = (X[i] - M[i]) / S[i]; + for (size_t i = BEGIN; i < batches.nEvents; i += STEP) + batches.output[i] = (X[i] - M[i]) / S[i]; - for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) { + for (size_t i = BEGIN; i < batches.nEvents; i += STEP) { if (S[i] <= 0.0) { - batches._output[i] = 0; - } else if (batches._output[i] < -5.5) { - batches._output[i] = case0(batches._output[i]); - } else if (batches._output[i] < -1.0) { - batches._output[i] = case1(batches._output[i]); - } else if (batches._output[i] < 1.0) { - batches._output[i] = case2(batches._output[i]); - } else if (batches._output[i] < 5.0) { - batches._output[i] = case3(batches._output[i]); - } else if (batches._output[i] < 12.0) { - batches._output[i] = case4(batches._output[i]); - } else if (batches._output[i] < 50.0) { - batches._output[i] = case5(batches._output[i]); - } else if (batches._output[i] < 300.) 
{ - batches._output[i] = case6(batches._output[i]); + batches.output[i] = 0; + } else if (batches.output[i] < -5.5) { + batches.output[i] = case0(batches.output[i]); + } else if (batches.output[i] < -1.0) { + batches.output[i] = case1(batches.output[i]); + } else if (batches.output[i] < 1.0) { + batches.output[i] = case2(batches.output[i]); + } else if (batches.output[i] < 5.0) { + batches.output[i] = case3(batches.output[i]); + } else if (batches.output[i] < 12.0) { + batches.output[i] = case4(batches.output[i]); + } else if (batches.output[i] < 50.0) { + batches.output[i] = case5(batches.output[i]); + } else if (batches.output[i] < 300.) { + batches.output[i] = case6(batches.output[i]); } else { - batches._output[i] = case7(batches._output[i]); + batches.output[i] = case7(batches.output[i]); } } } -__rooglobal__ void computeLognormal(BatchesHandle batches) +__rooglobal__ void computeLognormal(Batches &batches) { - Batch X = batches[0]; - Batch M0 = batches[1]; - Batch K = batches[2]; + Batch X = batches.args[0]; + Batch M0 = batches.args[1]; + Batch K = batches.args[2]; constexpr double rootOf2pi = 2.506628274631000502415765284811; - for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) { + for (size_t i = BEGIN; i < batches.nEvents; i += STEP) { double lnxOverM0 = fast_log(X[i] / M0[i]); double lnk = fast_log(K[i]); if (lnk < 0) lnk = -lnk; double arg = lnxOverM0 / lnk; arg *= -0.5 * arg; - batches._output[i] = fast_exp(arg) / (X[i] * lnk * rootOf2pi); + batches.output[i] = fast_exp(arg) / (X[i] * lnk * rootOf2pi); } } -__rooglobal__ void computeLognormalStandard(BatchesHandle batches) +__rooglobal__ void computeLognormalStandard(Batches &batches) { - Batch X = batches[0]; - Batch M0 = batches[1]; - Batch K = batches[2]; + Batch X = batches.args[0]; + Batch M0 = batches.args[1]; + Batch K = batches.args[2]; constexpr double rootOf2pi = 2.506628274631000502415765284811; - for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) { + for (size_t i = BEGIN; 
i < batches.nEvents; i += STEP) { double lnxOverM0 = fast_log(X[i]) - M0[i]; double lnk = K[i]; if (lnk < 0) lnk = -lnk; double arg = lnxOverM0 / lnk; arg *= -0.5 * arg; - batches._output[i] = fast_exp(arg) / (X[i] * lnk * rootOf2pi); + batches.output[i] = fast_exp(arg) / (X[i] * lnk * rootOf2pi); } } -__rooglobal__ void computeNormalizedPdf(BatchesHandle batches) +__rooglobal__ void computeNormalizedPdf(Batches &batches) { - auto rawVal = batches[0]; - auto normVal = batches[1]; + auto rawVal = batches.args[0]; + auto normVal = batches.args[1]; int nEvalErrorsType0 = 0; int nEvalErrorsType1 = 0; int nEvalErrorsType2 = 0; - for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) { + for (size_t i = BEGIN; i < batches.nEvents; i += STEP) { double out = 0.0; - // batches._output[i] = rawVal[i] / normVar[i]; + // batches.output[i] = rawVal[i] / normVar[i]; if (normVal[i] < 0. || (normVal[i] == 0. && rawVal[i] != 0)) { // Unreasonable normalisations. A zero integral can be tolerated if the function vanishes, though. out = RooNaNPacker::packFloatIntoNaN(-normVal[i] + (rawVal[i] < 0. ? -rawVal[i] : 0.)); @@ -658,15 +662,15 @@ __rooglobal__ void computeNormalizedPdf(BatchesHandle batches) } else { out = (rawVal[i] == 0. && normVal[i] == 0.) ? 0. 
: rawVal[i] / normVal[i]; } - batches._output[i] = out; + batches.output[i] = out; } if (nEvalErrorsType0 > 0) - batches.setExtraArg(0, batches.extraArg(0) + nEvalErrorsType0); + batches.extra[0] = batches.extra[0] + nEvalErrorsType0; if (nEvalErrorsType1 > 1) - batches.setExtraArg(1, batches.extraArg(1) + nEvalErrorsType1); + batches.extra[1] = batches.extra[1] + nEvalErrorsType1; if (nEvalErrorsType2 > 2) - batches.setExtraArg(2, batches.extraArg(2) + nEvalErrorsType2); + batches.extra[2] = batches.extra[2] + nEvalErrorsType2; } /* TMath::ASinH(x) needs to be replaced with ln( x + sqrt(x^2+1)) @@ -677,240 +681,244 @@ __rooglobal__ void computeNormalizedPdf(BatchesHandle batches) * ln is the logarithm that was solely present in the initial * formula, that is before the asinh replacement */ -__rooglobal__ void computeNovosibirsk(BatchesHandle batches) +__rooglobal__ void computeNovosibirsk(Batches &batches) { - Batch X = batches[0]; - Batch P = batches[1]; - Batch W = batches[2]; - Batch T = batches[3]; + Batch X = batches.args[0]; + Batch P = batches.args[1]; + Batch W = batches.args[2]; + Batch T = batches.args[3]; constexpr double xi = 2.3548200450309494; // 2 Sqrt( Ln(4) ) - for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) { + for (size_t i = BEGIN; i < batches.nEvents; i += STEP) { double argasinh = 0.5 * xi * T[i]; double argln = argasinh + 1 / fast_isqrt(argasinh * argasinh + 1); double asinh = fast_log(argln); double argln2 = 1 - (X[i] - P[i]) * T[i] / W[i]; double ln = fast_log(argln2); - batches._output[i] = ln / asinh; - batches._output[i] *= -0.125 * xi * xi * batches._output[i]; - batches._output[i] -= 2.0 / xi / xi * asinh * asinh; + batches.output[i] = ln / asinh; + batches.output[i] *= -0.125 * xi * xi * batches.output[i]; + batches.output[i] -= 2.0 / xi / xi * asinh * asinh; } // faster if you exponentiate in a separate loop (dark magic!) 
- for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) - batches._output[i] = fast_exp(batches._output[i]); + for (size_t i = BEGIN; i < batches.nEvents; i += STEP) + batches.output[i] = fast_exp(batches.output[i]); } -__rooglobal__ void computePoisson(BatchesHandle batches) +__rooglobal__ void computePoisson(Batches &batches) { - Batch x = batches[0]; - Batch mean = batches[1]; - bool protectNegative = batches.extraArg(0); - bool noRounding = batches.extraArg(1); - for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) { + Batch x = batches.args[0]; + Batch mean = batches.args[1]; + bool protectNegative = batches.extra[0]; + bool noRounding = batches.extra[1]; + for (size_t i = BEGIN; i < batches.nEvents; i += STEP) { const double x_i = noRounding ? x[i] : floor(x[i]); - batches._output[i] = std::lgamma(x_i + 1.); + batches.output[i] = std::lgamma(x_i + 1.); } - for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) { + for (size_t i = BEGIN; i < batches.nEvents; i += STEP) { const double x_i = noRounding ? 
x[i] : floor(x[i]); const double logMean = fast_log(mean[i]); - const double logPoisson = x_i * logMean - mean[i] - batches._output[i]; - batches._output[i] = fast_exp(logPoisson); + const double logPoisson = x_i * logMean - mean[i] - batches.output[i]; + batches.output[i] = fast_exp(logPoisson); // Cosmetics if (x_i < 0) { - batches._output[i] = 0; + batches.output[i] = 0; } else if (x_i == 0) { - batches._output[i] = 1 / fast_exp(mean[i]); + batches.output[i] = 1 / fast_exp(mean[i]); } if (protectNegative && mean[i] < 0) - batches._output[i] = 1.E-3; + batches.output[i] = 1.E-3; } } -__rooglobal__ void computePolynomial(BatchesHandle batches) +__rooglobal__ void computePolynomial(Batches &batches) { - const int nCoef = batches.extraArg(0); - const std::size_t nEvents = batches.getNEvents(); - Batch x = batches[nCoef]; + const int nCoef = batches.extra[0]; + const std::size_t nEvents = batches.nEvents; + Batch x = batches.args[nCoef]; for (size_t i = BEGIN; i < nEvents; i += STEP) { - batches._output[i] = batches[nCoef - 1][i]; + batches.output[i] = batches.args[nCoef - 1][i]; } // Indexes are in range 0..nCoef-1 but coefList[nCoef-1] has already been // processed. 
for (int k = nCoef - 2; k >= 0; k--) { for (size_t i = BEGIN; i < nEvents; i += STEP) { - batches._output[i] = batches[k][i] + x[i] * batches._output[i]; + batches.output[i] = batches.args[k][i] + x[i] * batches.output[i]; } } } -__rooglobal__ void computePower(BatchesHandle batches) +__rooglobal__ void computePower(Batches &batches) { - const int nCoef = batches.extraArg(0); - Batch x = batches[0]; + const int nCoef = batches.extra[0]; + Batch x = batches.args[0]; - for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) { - batches._output[i] = 0.0; + for (size_t i = BEGIN; i < batches.nEvents; i += STEP) { + batches.output[i] = 0.0; for (int k = 0; k < nCoef; ++k) { - batches._output[i] += batches[2 * k + 1][i] * std::pow(x[i], batches[2 * k + 2][i]); + batches.output[i] += batches.args[2 * k + 1][i] * std::pow(x[i], batches.args[2 * k + 2][i]); } } } -__rooglobal__ void computeProdPdf(BatchesHandle batches) +__rooglobal__ void computeProdPdf(Batches &batches) { - const int nPdfs = batches.extraArg(0); - for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) { - batches._output[i] = 1.; + const int nPdfs = batches.extra[0]; + for (size_t i = BEGIN; i < batches.nEvents; i += STEP) { + batches.output[i] = 1.; } for (int pdf = 0; pdf < nPdfs; pdf++) { - for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) { - batches._output[i] *= batches[pdf][i]; + for (size_t i = BEGIN; i < batches.nEvents; i += STEP) { + batches.output[i] *= batches.args[pdf][i]; } } } -__rooglobal__ void computeRatio(BatchesHandle batches) +__rooglobal__ void computeRatio(Batches &batches) { - for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) { - batches._output[i] = batches[0][i] / batches[1][i]; + for (size_t i = BEGIN; i < batches.nEvents; i += STEP) { + batches.output[i] = batches.args[0][i] / batches.args[1][i]; } } -__rooglobal__ void computeTruthModelExpBasis(BatchesHandle batches) +__rooglobal__ void computeTruthModelExpBasis(Batches &batches) { - const bool 
isMinus = batches.extraArg(0) < 0.0; - const bool isPlus = batches.extraArg(0) > 0.0; - for (std::size_t i = BEGIN; i < batches.getNEvents(); i += STEP) { - double x = batches[0][i]; + const bool isMinus = batches.extra[0] < 0.0; + const bool isPlus = batches.extra[0] > 0.0; + for (std::size_t i = BEGIN; i < batches.nEvents; i += STEP) { + double x = batches.args[0][i]; // Enforce sign compatibility const bool isOutOfSign = (isMinus && x > 0.0) || (isPlus && x < 0.0); - batches._output[i] = isOutOfSign ? 0.0 : fast_exp(-std::abs(x) / batches[1][i]); + batches.output[i] = isOutOfSign ? 0.0 : fast_exp(-std::abs(x) / batches.args[1][i]); } } -__rooglobal__ void computeTruthModelSinBasis(BatchesHandle batches) +__rooglobal__ void computeTruthModelSinBasis(Batches &batches) { - const bool isMinus = batches.extraArg(0) < 0.0; - const bool isPlus = batches.extraArg(0) > 0.0; - for (std::size_t i = BEGIN; i < batches.getNEvents(); i += STEP) { - double x = batches[0][i]; + const bool isMinus = batches.extra[0] < 0.0; + const bool isPlus = batches.extra[0] > 0.0; + for (std::size_t i = BEGIN; i < batches.nEvents; i += STEP) { + double x = batches.args[0][i]; // Enforce sign compatibility const bool isOutOfSign = (isMinus && x > 0.0) || (isPlus && x < 0.0); - batches._output[i] = isOutOfSign ? 0.0 : fast_exp(-std::abs(x) / batches[1][i]) * fast_sin(x * batches[2][i]); + batches.output[i] = + isOutOfSign ? 
0.0 : fast_exp(-std::abs(x) / batches.args[1][i]) * fast_sin(x * batches.args[2][i]); } } -__rooglobal__ void computeTruthModelCosBasis(BatchesHandle batches) +__rooglobal__ void computeTruthModelCosBasis(Batches &batches) { - const bool isMinus = batches.extraArg(0) < 0.0; - const bool isPlus = batches.extraArg(0) > 0.0; - for (std::size_t i = BEGIN; i < batches.getNEvents(); i += STEP) { - double x = batches[0][i]; + const bool isMinus = batches.extra[0] < 0.0; + const bool isPlus = batches.extra[0] > 0.0; + for (std::size_t i = BEGIN; i < batches.nEvents; i += STEP) { + double x = batches.args[0][i]; // Enforce sign compatibility const bool isOutOfSign = (isMinus && x > 0.0) || (isPlus && x < 0.0); - batches._output[i] = isOutOfSign ? 0.0 : fast_exp(-std::abs(x) / batches[1][i]) * fast_cos(x * batches[2][i]); + batches.output[i] = + isOutOfSign ? 0.0 : fast_exp(-std::abs(x) / batches.args[1][i]) * fast_cos(x * batches.args[2][i]); } } -__rooglobal__ void computeTruthModelLinBasis(BatchesHandle batches) +__rooglobal__ void computeTruthModelLinBasis(Batches &batches) { - const bool isMinus = batches.extraArg(0) < 0.0; - const bool isPlus = batches.extraArg(0) > 0.0; - for (std::size_t i = BEGIN; i < batches.getNEvents(); i += STEP) { - double x = batches[0][i]; + const bool isMinus = batches.extra[0] < 0.0; + const bool isPlus = batches.extra[0] > 0.0; + for (std::size_t i = BEGIN; i < batches.nEvents; i += STEP) { + double x = batches.args[0][i]; // Enforce sign compatibility const bool isOutOfSign = (isMinus && x > 0.0) || (isPlus && x < 0.0); if (isOutOfSign) { - batches._output[i] = 0.0; + batches.output[i] = 0.0; } else { - const double tscaled = std::abs(x) / batches[1][i]; - batches._output[i] = fast_exp(-tscaled) * tscaled; + const double tscaled = std::abs(x) / batches.args[1][i]; + batches.output[i] = fast_exp(-tscaled) * tscaled; } } } -__rooglobal__ void computeTruthModelQuadBasis(BatchesHandle batches) +__rooglobal__ void 
computeTruthModelQuadBasis(Batches &batches) { - const bool isMinus = batches.extraArg(0) < 0.0; - const bool isPlus = batches.extraArg(0) > 0.0; - for (std::size_t i = BEGIN; i < batches.getNEvents(); i += STEP) { - double x = batches[0][i]; + const bool isMinus = batches.extra[0] < 0.0; + const bool isPlus = batches.extra[0] > 0.0; + for (std::size_t i = BEGIN; i < batches.nEvents; i += STEP) { + double x = batches.args[0][i]; // Enforce sign compatibility const bool isOutOfSign = (isMinus && x > 0.0) || (isPlus && x < 0.0); if (isOutOfSign) { - batches._output[i] = 0.0; + batches.output[i] = 0.0; } else { - const double tscaled = std::abs(x) / batches[1][i]; - batches._output[i] = fast_exp(-tscaled) * tscaled * tscaled; + const double tscaled = std::abs(x) / batches.args[1][i]; + batches.output[i] = fast_exp(-tscaled) * tscaled * tscaled; } } } -__rooglobal__ void computeTruthModelSinhBasis(BatchesHandle batches) +__rooglobal__ void computeTruthModelSinhBasis(Batches &batches) { - const bool isMinus = batches.extraArg(0) < 0.0; - const bool isPlus = batches.extraArg(0) > 0.0; - for (std::size_t i = BEGIN; i < batches.getNEvents(); i += STEP) { - double x = batches[0][i]; + const bool isMinus = batches.extra[0] < 0.0; + const bool isPlus = batches.extra[0] > 0.0; + for (std::size_t i = BEGIN; i < batches.nEvents; i += STEP) { + double x = batches.args[0][i]; // Enforce sign compatibility const bool isOutOfSign = (isMinus && x > 0.0) || (isPlus && x < 0.0); - batches._output[i] = isOutOfSign ? 0.0 : fast_exp(-std::abs(x) / batches[1][i]) * sinh(x * batches[2][i] * 0.5); + batches.output[i] = + isOutOfSign ? 
0.0 : fast_exp(-std::abs(x) / batches.args[1][i]) * sinh(x * batches.args[2][i] * 0.5); } } -__rooglobal__ void computeTruthModelCoshBasis(BatchesHandle batches) +__rooglobal__ void computeTruthModelCoshBasis(Batches &batches) { - const bool isMinus = batches.extraArg(0) < 0.0; - const bool isPlus = batches.extraArg(0) > 0.0; - for (std::size_t i = BEGIN; i < batches.getNEvents(); i += STEP) { - double x = batches[0][i]; + const bool isMinus = batches.extra[0] < 0.0; + const bool isPlus = batches.extra[0] > 0.0; + for (std::size_t i = BEGIN; i < batches.nEvents; i += STEP) { + double x = batches.args[0][i]; // Enforce sign compatibility const bool isOutOfSign = (isMinus && x > 0.0) || (isPlus && x < 0.0); - batches._output[i] = isOutOfSign ? 0.0 : fast_exp(-std::abs(x) / batches[1][i]) * cosh(x * batches[2][i] * .5); + batches.output[i] = + isOutOfSign ? 0.0 : fast_exp(-std::abs(x) / batches.args[1][i]) * cosh(x * batches.args[2][i] * .5); } } -__rooglobal__ void computeVoigtian(BatchesHandle batches) +__rooglobal__ void computeVoigtian(Batches &batches) { - Batch X = batches[0]; - Batch M = batches[1]; - Batch W = batches[2]; - Batch S = batches[3]; + Batch X = batches.args[0]; + Batch M = batches.args[1]; + Batch W = batches.args[2]; + Batch S = batches.args[3]; const double invSqrt2 = 0.707106781186547524400844362105; - for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) { + for (size_t i = BEGIN; i < batches.nEvents; i += STEP) { const double arg = (X[i] - M[i]) * (X[i] - M[i]); if (S[i] == 0.0 && W[i] == 0.0) { - batches._output[i] = 1.0; + batches.output[i] = 1.0; } else if (S[i] == 0.0) { - batches._output[i] = 1 / (arg + 0.25 * W[i] * W[i]); + batches.output[i] = 1 / (arg + 0.25 * W[i] * W[i]); } else if (W[i] == 0.0) { - batches._output[i] = fast_exp(-0.5 * arg / (S[i] * S[i])); + batches.output[i] = fast_exp(-0.5 * arg / (S[i] * S[i])); } else { - batches._output[i] = invSqrt2 / S[i]; + batches.output[i] = invSqrt2 / S[i]; } } - for (size_t i = 
BEGIN; i < batches.getNEvents(); i += STEP) { + for (size_t i = BEGIN; i < batches.nEvents; i += STEP) { if (S[i] != 0.0 && W[i] != 0.0) { - if (batches._output[i] < 0) - batches._output[i] = -batches._output[i]; + if (batches.output[i] < 0) + batches.output[i] = -batches.output[i]; const double factor = W[i] > 0.0 ? 0.5 : -0.5; - RooHeterogeneousMath::STD::complex z(batches._output[i] * (X[i] - M[i]), - factor * batches._output[i] * W[i]); - batches._output[i] *= RooHeterogeneousMath::faddeeva(z).real(); + RooHeterogeneousMath::STD::complex z(batches.output[i] * (X[i] - M[i]), + factor * batches.output[i] * W[i]); + batches.output[i] *= RooHeterogeneousMath::faddeeva(z).real(); } } } /// Returns a std::vector of pointers to the compute functions in this file. -std::vector getFunctions() +std::vector getFunctions() { return {computeAddPdf, computeArgusBG, diff --git a/roofit/batchcompute/src/RooBatchCompute.cu b/roofit/batchcompute/src/RooBatchCompute.cu index 55f3d4d457b1c..ce8b10197e8ee 100644 --- a/roofit/batchcompute/src/RooBatchCompute.cu +++ b/roofit/batchcompute/src/RooBatchCompute.cu @@ -25,6 +25,7 @@ This file contains the code for cuda computations using the RooBatchCompute libr #include #include +#include #ifndef RF_ARCH #error "RF_ARCH should always be defined" @@ -41,27 +42,28 @@ namespace { void fillBatches(Batches &batches, RestrictArr output, size_t nEvents, std::size_t nBatches, std::size_t nExtraArgs) { - batches._nEvents = nEvents; - batches._nBatches = nBatches; - batches._nExtraArgs = nExtraArgs; - batches._output = output; + batches.nEvents = nEvents; + batches.nBatches = nBatches; + batches.nExtra = nExtraArgs; + batches.output = output; } -void fillArrays(Batch *arrays, const VarVector &vars, double *buffer, double *bufferDevice, std::size_t nEvents) +void fillArrays(Batch *arrays, VarSpan vars, double *buffer, double *bufferDevice, std::size_t nEvents) { for (int i = 0; i < vars.size(); i++) { const std::span &span = vars[i]; - if 
(!span.empty() && span.size() < nEvents) { + arrays[i]._isVector = span.empty() || span.size() >= nEvents; + if (!arrays[i]._isVector) { // In the scalar case, the value is not on the GPU yet, so we have to // copy the value to the GPU buffer. buffer[i] = span[0]; - arrays[i].set(bufferDevice + i, false); + arrays[i]._array = bufferDevice + i; } else { // In the vector input cases, they are already on the GPU, so we can // fill be buffer with some dummy value and set the input span // directly. buffer[i] = 0.0; - arrays[i].set(span.data(), true); + arrays[i]._array = span.data(); } } } @@ -85,13 +87,13 @@ int getGridSize(std::size_t n) } // namespace -std::vector getFunctions(); +std::vector getFunctions(); /// This class overrides some RooBatchComputeInterface functions, for the /// purpose of providing a cuda specific implementation of the library. class RooBatchComputeClass : public RooBatchComputeInterface { private: - const std::vector _computeFunctions; + const std::vector _computeFunctions; public: RooBatchComputeClass() : _computeFunctions(getFunctions()) @@ -114,10 +116,10 @@ public: \param computer An enum specifying the compute function to be used. \param output The array where the computation results are stored. \param nEvents The number of events to be processed. - \param vars A std::vector containing pointers to the variables involved in the computation. - \param extraArgs An optional std::vector containing extra double values that may participate in the computation. **/ - void compute(RooBatchCompute::Config const &cfg, Computer computer, RestrictArr output, size_t nEvents, - const VarVector &vars, ArgVector &extraArgs) override + \param vars A std::span containing pointers to the variables involved in the computation. + \param extraArgs An optional std::span containing extra double values that may participate in the computation. 
**/ + void compute(RooBatchCompute::Config const &cfg, Computer computer, RestrictArr output, size_t nEvents, VarSpan vars, + ArgSpan extraArgs) override { using namespace RooFit::Detail::CudaInterface; @@ -138,11 +140,11 @@ public: fillBatches(*batches, output, nEvents, vars.size(), extraArgs.size()); fillArrays(arrays, vars, scalarBuffer, scalarBufferDevice, nEvents); - batches->_arrays = arraysDevice; + batches->args = arraysDevice; if (!extraArgs.empty()) { std::copy(std::cbegin(extraArgs), std::cend(extraArgs), extraArgsHost); - batches->_extraArgs = extraArgsDevice; + batches->extra = extraArgsDevice; } copyHostToDevice(hostMem.data(), deviceMem.data(), hostMem.size(), cfg.cudaStream()); diff --git a/roofit/batchcompute/src/RooBatchCompute.cxx b/roofit/batchcompute/src/RooBatchCompute.cxx index b6a22cb13ded4..d84c46b81936f 100644 --- a/roofit/batchcompute/src/RooBatchCompute.cxx +++ b/roofit/batchcompute/src/RooBatchCompute.cxx @@ -31,6 +31,7 @@ This file contains the code for cpu computations using the RooBatchCompute libra #include #include #include +#include #ifndef RF_ARCH #error "RF_ARCH should always be defined" @@ -41,41 +42,41 @@ namespace RF_ARCH { namespace { -void fillBatches(Batches &batches, RestrictArr output, size_t nEvents, std::size_t nBatches, ArgVector &extraArgs) +void fillBatches(Batches &batches, RestrictArr output, size_t nEvents, std::size_t nBatches, ArgSpan extraArgs) { - batches._extraArgs = extraArgs.data(); - batches._nEvents = nEvents; - batches._nBatches = nBatches; - batches._nExtraArgs = extraArgs.size(); - batches._output = output; + batches.extra = extraArgs.data(); + batches.nEvents = nEvents; + batches.nBatches = nBatches; + batches.nExtra = extraArgs.size(); + batches.output = output; } -void fillArrays(std::vector &arrays, const VarVector &vars, double *buffer, std::size_t nEvents) +void fillArrays(std::span arrays, VarSpan vars, std::size_t nEvents) { + for (std::size_t i = 0; i < vars.size(); i++) { + arrays[i]._array 
= vars[i].data(); + arrays[i]._isVector = vars[i].empty() || vars[i].size() >= nEvents; + } +} - arrays.resize(vars.size()); - for (size_t i = 0; i < vars.size(); i++) { - const std::span &span = vars[i]; - if (!span.empty() && span.size() < nEvents) { - // In the scalar case, copy the value to each element of vector input - // buffer. - std::fill_n(&buffer[i * bufferSize], bufferSize, span.data()[0]); - arrays[i].set(&buffer[i * bufferSize], false); - } else { - arrays[i].set(span.data(), true); - } +inline void advance(Batches &batches, std::size_t nEvents) +{ + for (std::size_t i = 0; i < batches.nBatches; i++) { + Batch &arg = batches.args[i]; + arg._array += arg._isVector * nEvents; } + batches.output += nEvents; } } // namespace -std::vector getFunctions(); +std::vector getFunctions(); /// This class overrides some RooBatchComputeInterface functions, for the /// purpose of providing a CPU specific implementation of the library. class RooBatchComputeClass : public RooBatchComputeInterface { private: - const std::vector _computeFunctions; + const std::vector _computeFunctions; public: RooBatchComputeClass() : _computeFunctions(getFunctions()) @@ -106,14 +107,11 @@ class RooBatchComputeClass : public RooBatchComputeInterface { \param computer An enum specifying the compute function to be used. \param output The array where the computation results are stored. \param nEvents The number of events to be processed. - \param vars A std::vector containing pointers to the variables involved in the computation. - \param extraArgs An optional std::vector containing extra double values that may participate in the computation. **/ - void compute(Config const &, Computer computer, RestrictArr output, size_t nEvents, const VarVector &vars, - ArgVector &extraArgs) override + \param vars A std::span containing pointers to the variables involved in the computation. + \param extraArgs An optional std::span containing extra double values that may participate in the computation. 
**/ + void compute(Config const &, Computer computer, RestrictArr output, size_t nEvents, VarSpan vars, + ArgSpan extraArgs) override { - static std::vector buffer; - buffer.resize(vars.size() * bufferSize); - if (ROOT::IsImplicitMTEnabled()) { ROOT::Internal::TExecutor ex; std::size_t nThreads = ex.GetPoolSize(); @@ -127,25 +125,25 @@ class RooBatchComputeClass : public RooBatchComputeInterface { // Fill a std::vector with the same object and with ~nEvents/nThreads // Then advance every object but the first to split the work between threads Batches batches; - std::vector arrays; + std::vector arrays(vars.size()); fillBatches(batches, output, nEventsPerThread, vars.size(), extraArgs); - fillArrays(arrays, vars, buffer.data(), nEvents); - batches._arrays = arrays.data(); - batches.advance(batches.getNEvents() * idx); + fillArrays(arrays, vars, nEvents); + batches.args = arrays.data(); + advance(batches, batches.nEvents * idx); // Set the number of events of the last Batches object as the remaining events if (idx == nThreads - 1) { - batches.setNEvents(nEvents - idx * batches.getNEvents()); + batches.nEvents = nEvents - idx * batches.nEvents; } - std::size_t events = batches.getNEvents(); - batches.setNEvents(bufferSize); + std::size_t events = batches.nEvents; + batches.nEvents = bufferSize; while (events > bufferSize) { _computeFunctions[computer](batches); - batches.advance(bufferSize); + advance(batches, bufferSize); events -= bufferSize; } - batches.setNEvents(events); + batches.nEvents = events; _computeFunctions[computer](batches); return 0; }; @@ -159,19 +157,19 @@ class RooBatchComputeClass : public RooBatchComputeInterface { // Fill a std::vector with the same object and with ~nEvents/nThreads // Then advance every object but the first to split the work between threads Batches batches; - std::vector arrays; + std::vector arrays(vars.size()); fillBatches(batches, output, nEvents, vars.size(), extraArgs); - fillArrays(arrays, vars, buffer.data(), nEvents); - 
batches._arrays = arrays.data(); + fillArrays(arrays, vars, nEvents); + batches.args = arrays.data(); - std::size_t events = batches.getNEvents(); - batches.setNEvents(bufferSize); + std::size_t events = batches.nEvents; + batches.nEvents = bufferSize; while (events > bufferSize) { _computeFunctions[computer](batches); - batches.advance(bufferSize); + advance(batches, bufferSize); events -= bufferSize; } - batches.setNEvents(events); + batches.nEvents = events; _computeFunctions[computer](batches); } } diff --git a/roofit/roofit/src/RooChebychev.cxx b/roofit/roofit/src/RooChebychev.cxx index f6ff2a7922a7b..cbb15adbd99f9 100644 --- a/roofit/roofit/src/RooChebychev.cxx +++ b/roofit/roofit/src/RooChebychev.cxx @@ -104,14 +104,17 @@ void RooChebychev::translate(RooFit::Detail::CodeSquashContext &ctx) const //////////////////////////////////////////////////////////////////////////////// /// Compute multiple values of Chebychev. -void RooChebychev::computeBatch(double* output, size_t nEvents, RooFit::Detail::DataMap const& dataMap) const +void RooChebychev::computeBatch(double *output, size_t nEvents, RooFit::Detail::DataMap const &dataMap) const { - RooBatchCompute::ArgVector extraArgs; - for (auto* coef:_coefList) - extraArgs.push_back( static_cast(coef)->getVal() ); - extraArgs.push_back( _x.min(_refRangeName?_refRangeName->GetName() : nullptr) ); - extraArgs.push_back( _x.max(_refRangeName?_refRangeName->GetName() : nullptr) ); - RooBatchCompute::compute(dataMap.config(this), RooBatchCompute::Chebychev, output, nEvents, {dataMap.at(_x)}, extraArgs); + std::vector extraArgs; + extraArgs.reserve(_coefList.size() + 2); + for (auto *coef : _coefList) { + extraArgs.push_back(static_cast(coef)->getVal()); + } + extraArgs.push_back(_x.min(_refRangeName ? _refRangeName->GetName() : nullptr)); + extraArgs.push_back(_x.max(_refRangeName ? 
_refRangeName->GetName() : nullptr)); + RooBatchCompute::compute(dataMap.config(this), RooBatchCompute::Chebychev, output, nEvents, {dataMap.at(_x)}, + extraArgs); } //////////////////////////////////////////////////////////////////////////////// diff --git a/roofit/roofit/src/RooChiSquarePdf.cxx b/roofit/roofit/src/RooChiSquarePdf.cxx index 1f1a61b161918..7368bf12bd43f 100644 --- a/roofit/roofit/src/RooChiSquarePdf.cxx +++ b/roofit/roofit/src/RooChiSquarePdf.cxx @@ -21,8 +21,8 @@ Here we also implement the analytic integral. #include "TMath.h" +#include #include -using namespace std; ClassImp(RooChiSquarePdf); @@ -55,17 +55,19 @@ RooChiSquarePdf::RooChiSquarePdf(const RooChiSquarePdf& other, const char* name) double RooChiSquarePdf::evaluate() const { - if(_x <= 0) return 0; + if (_x <= 0) + return 0; - return pow(_x,(_ndof/2.)-1.) * exp(-_x/2.) / TMath::Gamma(_ndof/2.) / pow(2.,_ndof/2.); + return pow(_x, (_ndof / 2.) - 1.) * std::exp(-_x / 2.) / TMath::Gamma(_ndof / 2.) / std::pow(2., _ndof / 2.); } //////////////////////////////////////////////////////////////////////////////// /// Compute multiple values of ChiSquare distribution. 
-void RooChiSquarePdf::computeBatch(double* output, size_t nEvents, RooFit::Detail::DataMap const& dataMap) const +void RooChiSquarePdf::computeBatch(double *output, size_t nEvents, RooFit::Detail::DataMap const &dataMap) const { - RooBatchCompute::ArgVector extraArgs{_ndof}; - RooBatchCompute::compute(dataMap.config(this), RooBatchCompute::ChiSquare, output, nEvents, {dataMap.at(_x)}, extraArgs); + std::array extraArgs{_ndof}; + RooBatchCompute::compute(dataMap.config(this), RooBatchCompute::ChiSquare, output, nEvents, {dataMap.at(_x)}, + extraArgs); } //////////////////////////////////////////////////////////////////////////////// diff --git a/roofit/roofit/src/RooExpPoly.cxx b/roofit/roofit/src/RooExpPoly.cxx index b992b992b040b..b6fcae6c0552e 100644 --- a/roofit/roofit/src/RooExpPoly.cxx +++ b/roofit/roofit/src/RooExpPoly.cxx @@ -35,10 +35,11 @@ RooExpPoly::RooExpPoly(const char*, const char*, RooAbsReal&, const RooArgList&, #include #include -#include -#include +#include #include +#include #include +#include ClassImp(RooExpPoly); @@ -128,7 +129,7 @@ double RooExpPoly::evaluateLog() const /// Compute multiple values of ExpPoly distribution. 
void RooExpPoly::computeBatch(double *output, size_t nEvents, RooFit::Detail::DataMap const &dataMap) const { - RooBatchCompute::VarVector vars; + std::vector> vars; vars.reserve(_coefList.size() + 1); vars.push_back(dataMap.at(_x)); @@ -137,9 +138,7 @@ void RooExpPoly::computeBatch(double *output, size_t nEvents, RooFit::Detail::Da vars.push_back(dataMap.at(coef)); } - RooBatchCompute::ArgVector args; - args.push_back(_lowestOrder); - args.push_back(_coefList.size()); + std::array args{static_cast(_lowestOrder), static_cast(_coefList.size())}; RooBatchCompute::compute(dataMap.config(this), RooBatchCompute::ExpPoly, output, nEvents, vars, args); } diff --git a/roofit/roofit/src/RooGaussModel.cxx b/roofit/roofit/src/RooGaussModel.cxx index 7fbc29ed8b8cd..6ff16d7ebf190 100644 --- a/roofit/roofit/src/RooGaussModel.cxx +++ b/roofit/roofit/src/RooGaussModel.cxx @@ -34,6 +34,8 @@ for analytical convolutions with classes inheriting from RooAbsAnaConvPdf #include +#include + namespace { enum RooGaussBasis { @@ -190,7 +192,7 @@ void RooGaussModel::computeBatch(double *output, size_t size, // arises, they can be implemented following this example. Remember to also // adapt RooGaussModel::canComputeBatchWithCuda(). if (basisType == expBasis) { - RooBatchCompute::ArgVector extraArgs{basisSign}; + std::array extraArgs{basisSign}; RooBatchCompute::compute(dataMap.config(this), RooBatchCompute::GaussModelExpBasis, output, size, {xVals, meanVals, meanSfVals, sigmaVals, sigmaSfVals, param1Vals}, extraArgs); return; diff --git a/roofit/roofit/src/RooJohnson.cxx b/roofit/roofit/src/RooJohnson.cxx index a770db6e0ea08..24fa2d67fb2e4 100644 --- a/roofit/roofit/src/RooJohnson.cxx +++ b/roofit/roofit/src/RooJohnson.cxx @@ -45,9 +45,11 @@ Johnson, N. L. (1949). 
*Systems of Frequency Curves Generated by Methods of Tran #include "RooHelpers.h" #include "RooBatchCompute.h" -#include #include "TMath.h" +#include +#include + ClassImp(RooJohnson); //////////////////////////////////////////////////////////////////////////////// @@ -114,7 +116,7 @@ double RooJohnson::evaluate() const /// Compute multiple values of the Johnson distribution. void RooJohnson::computeBatch(double* output, size_t nEvents, RooFit::Detail::DataMap const& dataMap) const { - RooBatchCompute::ArgVector extraArgs{_massThreshold}; + std::array extraArgs{_massThreshold}; RooBatchCompute::compute(dataMap.config(this), RooBatchCompute::Johnson, output, nEvents, {dataMap.at(_mass), dataMap.at(_mu), dataMap.at(_lambda), dataMap.at(_gamma), dataMap.at(_delta)}, extraArgs); diff --git a/roofit/roofit/src/RooPoisson.cxx b/roofit/roofit/src/RooPoisson.cxx index 0a821f6a493c3..fd803d8077a29 100644 --- a/roofit/roofit/src/RooPoisson.cxx +++ b/roofit/roofit/src/RooPoisson.cxx @@ -22,6 +22,8 @@ Poisson pdf #include "RooFit/Detail/EvaluateFuncs.h" #include "Math/ProbFuncMathCore.h" +#include + ClassImp(RooPoisson); //////////////////////////////////////////////////////////////////////////////// @@ -78,7 +80,7 @@ void RooPoisson::translate(RooFit::Detail::CodeSquashContext &ctx) const void RooPoisson::computeBatch(double *output, size_t nEvents, RooFit::Detail::DataMap const &dataMap) const { - RooBatchCompute::ArgVector extraArgs{static_cast(_protectNegative), static_cast(_noRounding)}; + std::array extraArgs{static_cast(_protectNegative), static_cast(_noRounding)}; RooBatchCompute::compute(dataMap.config(this), RooBatchCompute::Poisson, output, nEvents, {dataMap.at(x), dataMap.at(mean)}, extraArgs); } diff --git a/roofit/roofit/src/RooPower.cxx b/roofit/roofit/src/RooPower.cxx index 603ef6c4f30ce..4dba7e180ea9e 100644 --- a/roofit/roofit/src/RooPower.cxx +++ b/roofit/roofit/src/RooPower.cxx @@ -26,8 +26,9 @@ RooPower implements a power law PDF of the form #include 
-#include +#include #include +#include #include ClassImp(RooPower); @@ -84,7 +85,7 @@ RooPower::RooPower(const RooPower &other, const char *name) /// Compute multiple values of Power distribution. void RooPower::computeBatch(double *output, size_t nEvents, RooFit::Detail::DataMap const &dataMap) const { - RooBatchCompute::VarVector vars; + std::vector> vars; vars.reserve(2 * _coefList.size() + 1); vars.push_back(dataMap.at(_x)); @@ -95,8 +96,7 @@ void RooPower::computeBatch(double *output, size_t nEvents, RooFit::Detail::Data vars.push_back(dataMap.at(&_expList[i])); } - RooBatchCompute::ArgVector args; - args.push_back(_coefList.size()); + std::array args{static_cast(_coefList.size())}; RooBatchCompute::compute(dataMap.config(this), RooBatchCompute::Power, output, nEvents, vars, args); } diff --git a/roofit/roofitcore/inc/RooFit/Detail/DataMap.h b/roofit/roofitcore/inc/RooFit/Detail/DataMap.h index 007431e4726e4..2a9d933b9051a 100644 --- a/roofit/roofitcore/inc/RooFit/Detail/DataMap.h +++ b/roofit/roofitcore/inc/RooFit/Detail/DataMap.h @@ -82,10 +82,7 @@ namespace Detail { class DataMap { public: - auto size() const - { - return _dataMap.size(); - } + auto size() const { return _dataMap.size(); } void resize(std::size_t n); inline void set(RooAbsArg const *arg, std::span const &span) @@ -119,8 +116,14 @@ class DataMap { RooBatchCompute::Config config(RooAbsArg const *arg) const; + void enableVectorBuffers(bool enable) { _enableVectorBuffers = enable; } + void resetVectorBuffers() { _bufferIdx = 0; } + private: std::vector> _dataMap; + bool _enableVectorBuffers = false; + std::vector> _buffers; + std::size_t _bufferIdx = 0; std::vector _cfgs; }; diff --git a/roofit/roofitcore/src/RooAbsPdf.cxx b/roofit/roofitcore/src/RooAbsPdf.cxx index 282938551ff5b..830613defca3d 100644 --- a/roofit/roofitcore/src/RooAbsPdf.cxx +++ b/roofit/roofitcore/src/RooAbsPdf.cxx @@ -853,18 +853,19 @@ double RooAbsPdf::extendedTerm(RooAbsData const& data, bool weightSquared, bool * 
`EvalBackend(std::string const&)` Choose a likelihood evaluation backend: * *
Backend Description - *
**legacy** - *default* The original likelihood evaluation method. - * Evaluates the PDF for each single data entry at a time before summing the negative log probabilities. - * This is the default if `EvalBackend()` is not passed. - *
**cpu** New vectorized evaluation mode, using faster math functions and auto-vectorisation. - * If all RooAbsArg objects in the model support it, likelihood computations are 2 to 10 times faster, - * unless your dataset is so small that the vectorization is not worth it. - * The relative difference of the single log-likelihoods w.r.t. the legacy mode is usually better than \f$10^{-12}\f$, + *
**cpu** - *default* New vectorized evaluation mode, using faster math functions and auto-vectorisation. + * Since ROOT 6.32, this is the default if `EvalBackend()` is not passed, succeeding the **legacy** backend. + * If all RooAbsArg objects in the model support vectorized evaluation, + * likelihood computations are 2 to 10 times faster than with the **legacy** backend + * - unless your dataset is so small that the vectorization is not worth it. + * The relative difference of the single log-likelihoods with respect to the legacy mode is usually better than \f$10^{-12}\f$, * and for fit parameters it's usually better than \f$10^{-6}\f$. In past ROOT releases, this backend could be activated with the now deprecated `BatchMode()` option. *
**cuda** Evaluate the likelihood on a GPU that supports CUDA. * This backend re-uses code from the **cpu** backend, but compiled in CUDA kernels. * Hence, the results are expected to be identical, modulo some numerical differences that can arise from the different order in which the GPU is summing the log probabilities. * This backend can drastically speed up the fit if all RooAbsArg objects in the model support it. + *
**legacy** The original likelihood evaluation method. + * Evaluates the PDF for each single data entry at a time before summing the negative log probabilities. *
**codegen** **Experimental** - Generates and compiles minimal C++ code for the NLL on-the-fly and wraps it in the returned RooAbsReal. * Also generates and compiles the code for the gradient using Automatic Differentiation (AD) with [Clad](https://github.com/vgvassilev/clad). * This analytic gradient is passed to the minimizer, which can result in significant speedups for many-parameter fits, diff --git a/roofit/roofitcore/src/RooAddModel.cxx b/roofit/roofitcore/src/RooAddModel.cxx index 51d99f2fd44f5..21e659ded6d55 100644 --- a/roofit/roofitcore/src/RooAddModel.cxx +++ b/roofit/roofitcore/src/RooAddModel.cxx @@ -397,8 +397,8 @@ void RooAddModel::computeBatch(double *output, size_t nEvents, RooFit::Detail::D _coefCache[i] = coefVals[0]; } - RooBatchCompute::VarVector pdfs; - RooBatchCompute::ArgVector coefs; + std::vector> pdfs; + std::vector coefs; AddCacheElem *cache = getProjCache(nullptr); updateCoefficients(*cache, nullptr); diff --git a/roofit/roofitcore/src/RooAddPdf.cxx b/roofit/roofitcore/src/RooAddPdf.cxx index d720cb0c17156..614734b50b6b7 100644 --- a/roofit/roofitcore/src/RooAddPdf.cxx +++ b/roofit/roofitcore/src/RooAddPdf.cxx @@ -574,8 +574,8 @@ void RooAddPdf::computeBatch(double* output, size_t nEvents, RooFit::Detail::Dat _coefCache[i] = coefVals[0]; } - RooBatchCompute::VarVector pdfs; - RooBatchCompute::ArgVector coefs; + std::vector> pdfs; + std::vector coefs; AddCacheElem* cache = getProjCache(nullptr); // We don't sync the coefficient values from the _coefList to the _coefCache // because we have already done it using the dataMap. diff --git a/roofit/roofitcore/src/RooAddition.cxx b/roofit/roofitcore/src/RooAddition.cxx index f29693d50e916..c3d15be3dc365 100644 --- a/roofit/roofitcore/src/RooAddition.cxx +++ b/roofit/roofitcore/src/RooAddition.cxx @@ -143,18 +143,17 @@ double RooAddition::evaluate() const //////////////////////////////////////////////////////////////////////////////// /// Compute addition of PDFs in batches. 
-void RooAddition::computeBatch(double* output, size_t nEvents, RooFit::Detail::DataMap const& dataMap) const +void RooAddition::computeBatch(double *output, size_t nEvents, RooFit::Detail::DataMap const &dataMap) const { - RooBatchCompute::VarVector pdfs; - RooBatchCompute::ArgVector coefs; - pdfs.reserve(_set.size()); - coefs.reserve(_set.size()); - for (const auto arg : _set) - { - pdfs.push_back(dataMap.at(arg)); - coefs.push_back(1.0); - } - RooBatchCompute::compute(dataMap.config(this), RooBatchCompute::AddPdf, output, nEvents, pdfs, coefs); + std::vector> pdfs; + std::vector coefs; + pdfs.reserve(_set.size()); + coefs.reserve(_set.size()); + for (const auto arg : _set) { + pdfs.push_back(dataMap.at(arg)); + coefs.push_back(1.0); + } + RooBatchCompute::compute(dataMap.config(this), RooBatchCompute::AddPdf, output, nEvents, pdfs, coefs); } //////////////////////////////////////////////////////////////////////////////// diff --git a/roofit/roofitcore/src/RooFit/Detail/DataMap.cxx b/roofit/roofitcore/src/RooFit/Detail/DataMap.cxx index b73f767d465bf..49600ea7977f7 100644 --- a/roofit/roofitcore/src/RooFit/Detail/DataMap.cxx +++ b/roofit/roofitcore/src/RooFit/Detail/DataMap.cxx @@ -16,17 +16,50 @@ #include #include +#include + +namespace { + +// To avoid deleted move assignment. 
+template +void assignSpan(std::span &to, std::span const &from) +{ + to = from; +} + +} // namespace + namespace RooFit { namespace Detail { std::span DataMap::at(RooAbsArg const *arg, RooAbsArg const * /*caller*/) { + std::span out; + if (!arg->hasDataToken()) { auto var = static_cast(arg); - return {&var->_value, 1}; + assignSpan(out, {&var->_value, 1}); + } else { + std::size_t idx = arg->dataToken(); + out = _dataMap[idx]; } - std::size_t idx = arg->dataToken(); - return _dataMap[idx]; + + if (!_enableVectorBuffers || out.size() != 1) { + return out; + } + + if (_bufferIdx == _buffers.size()) { + _buffers.emplace_back(RooBatchCompute::bufferSize); + } + + double *buffer = _buffers[_bufferIdx].data(); + + std::fill_n(buffer, RooBatchCompute::bufferSize, out[0]); + assignSpan(out, {buffer, 1}); + + ++_bufferIdx; + + return out; } void DataMap::setConfig(RooAbsArg const *arg, RooBatchCompute::Config const &config) diff --git a/roofit/roofitcore/src/RooFit/Evaluator.cxx b/roofit/roofitcore/src/RooFit/Evaluator.cxx index d7734c6edfbfa..ff018251ff106 100644 --- a/roofit/roofitcore/src/RooFit/Evaluator.cxx +++ b/roofit/roofitcore/src/RooFit/Evaluator.cxx @@ -384,7 +384,12 @@ void Evaluator::computeCPUNode(const RooAbsArg *node, NodeInfo &info) buffer = info.buffer->cpuWritePtr(); } _dataMapCPU.set(node, {buffer, nOut}); + if (nOut > 1) { + _dataMapCPU.enableVectorBuffers(true); + } nodeAbsReal->computeBatch(buffer, nOut, _dataMapCPU); + _dataMapCPU.resetVectorBuffers(); + _dataMapCPU.enableVectorBuffers(false); #ifdef ROOFIT_CUDA if (info.copyAfterEvaluation) { _dataMapCUDA.set(node, {info.buffer->gpuReadPtr(), nOut}); diff --git a/roofit/roofitcore/src/RooNormalizedPdf.cxx b/roofit/roofitcore/src/RooNormalizedPdf.cxx index 6138f5ec06d6d..2e0461843f834 100644 --- a/roofit/roofitcore/src/RooNormalizedPdf.cxx +++ b/roofit/roofitcore/src/RooNormalizedPdf.cxx @@ -13,6 +13,8 @@ #include "RooNormalizedPdf.h" #include "RooBatchCompute.h" +#include + /** * \class 
RooNormalizedPdf * @@ -26,7 +28,7 @@ void RooNormalizedPdf::computeBatch(double *output, size_t nEvents, RooFit::Deta auto integralSpan = dataMap.at(_normIntegral); // We use the extraArgs as output parameter to count evaluation errors. - RooBatchCompute::ArgVector extraArgs{0.0, 0.0, 0.0}; + std::array extraArgs{0.0, 0.0, 0.0}; RooBatchCompute::compute(dataMap.config(this), RooBatchCompute::NormalizedPdf, output, nEvents, {nums, integralSpan}, extraArgs); diff --git a/roofit/roofitcore/src/RooPolyVar.cxx b/roofit/roofitcore/src/RooPolyVar.cxx index 1e88f195972b0..ae6a48176e871 100644 --- a/roofit/roofitcore/src/RooPolyVar.cxx +++ b/roofit/roofitcore/src/RooPolyVar.cxx @@ -26,8 +26,6 @@ Class RooPolyvar implements analytical integrals of all polynomials it can define. **/ -#include - #include "RooPolyVar.h" #include "RooArgList.h" #include "RooMsgService.h" @@ -38,6 +36,10 @@ it can define. #include "TError.h" +#include +#include +#include + ClassImp(RooPolyVar); //////////////////////////////////////////////////////////////////////////////// @@ -132,23 +134,27 @@ void RooPolyVar::computeBatchImpl(RooAbsArg const* caller, double *output, size_ return; } - RooBatchCompute::VarVector vars; + std::vector> vars; vars.reserve(coefs.size() + 2); // Fill the coefficients for the skipped orders. By a conventions started in // RooPolynomial, if the zero-th order is skipped, it implies a coefficient // for the constant term of one. - const double zero = 1.0; - const double one = 1.0; + std::array zeros; + std::array ones; + std::fill_n(zeros.data(), zeros.size(), 0.0); + std::fill_n(ones.data(), ones.size(), 1.0); + std::span zerosSpan{zeros.data(), 1}; + std::span onesSpan{ones.data(), 1}; for (int i = lowestOrder - 1; i >= 0; --i) { - vars.push_back(i == 0 ? std::span{&one, 1} : std::span{&zero, 1}); + vars.push_back(i == 0 ? 
onesSpan : zerosSpan); } for (RooAbsArg *coef : coefs) { vars.push_back(dataMap.at(coef)); } vars.push_back(dataMap.at(&x)); - RooBatchCompute::ArgVector extraArgs{double(vars.size() - 1)}; + std::array extraArgs{double(vars.size() - 1)}; RooBatchCompute::compute(dataMap.config(caller), RooBatchCompute::Polynomial, output, nEvents, vars, extraArgs); } diff --git a/roofit/roofitcore/src/RooProdPdf.cxx b/roofit/roofitcore/src/RooProdPdf.cxx index ebe61fda7c483..e8e2b323ec6ac 100644 --- a/roofit/roofitcore/src/RooProdPdf.cxx +++ b/roofit/roofitcore/src/RooProdPdf.cxx @@ -62,9 +62,10 @@ have to appear in any specific place in the list. #include "RooFitImplHelpers.h" #include "strtok.h" +#include +#include #include #include -#include #ifndef _WIN32 #include @@ -418,13 +419,13 @@ void RooProdPdf::calculateBatch(RooAbsArg const *caller, const RooProdPdf::Cache RooBatchCompute::compute(dataMap.config(caller), RooBatchCompute::Ratio, output, nEvents, {numerator, denominator}); } else { - RooBatchCompute::VarVector factors; + std::vector> factors; factors.reserve(cache._partList.size()); for (const RooAbsArg *i : cache._partList) { auto span = dataMap.at(i); factors.push_back(span); } - RooBatchCompute::ArgVector special{static_cast(factors.size())}; + std::array special{static_cast(factors.size())}; RooBatchCompute::compute(dataMap.config(caller), RooBatchCompute::ProdPdf, output, nEvents, factors, special); } } diff --git a/roofit/roofitcore/src/RooTruthModel.cxx b/roofit/roofitcore/src/RooTruthModel.cxx index 2aa7c7bef559d..acf8a505be8cb 100644 --- a/roofit/roofitcore/src/RooTruthModel.cxx +++ b/roofit/roofitcore/src/RooTruthModel.cxx @@ -38,6 +38,7 @@ functions used in D mixing have been hand coded for increased execution speed. #include #include +#include #include #include @@ -265,7 +266,7 @@ void RooTruthModel::computeBatch(double *output, size_t nEvents, RooFit::Detail: auto param2Vals = param2 ? 
dataMap.at(param2) : std::span{}; // Return desired basis function - RooBatchCompute::ArgVector extraArgs{basisSign}; + std::array extraArgs{basisSign}; switch (basisType) { case expBasis: { RooBatchCompute::compute(config, RooBatchCompute::TruthModelExpBasis, output, nEvents, {xVals, param1Vals},