diff --git a/roofit/batchcompute/res/RooBatchCompute.h b/roofit/batchcompute/res/RooBatchCompute.h
index 447e1d6704dad..c8554331aae6e 100644
--- a/roofit/batchcompute/res/RooBatchCompute.h
+++ b/roofit/batchcompute/res/RooBatchCompute.h
@@ -23,10 +23,8 @@
 
 #include <DllImport.h> //for R__EXTERN, needed for windows
 
-#include <cassert>
-#include <functional>
+#include <initializer_list>
 #include <string>
-#include <vector>
 
 /**
  * Namespace for dispatching RooFit computations to various backends.
@@ -42,11 +40,13 @@
  */
 namespace RooBatchCompute {
 
-typedef std::vector<std::span<const double>> VarVector;
-typedef std::vector<double> ArgVector;
+typedef std::span<const std::span<const double>> VarSpan;
+typedef std::span<double> ArgSpan;
 typedef double *__restrict RestrictArr;
 typedef const double *__restrict InputArr;
 
+constexpr std::size_t bufferSize = 64; // length of the fixed-size scratch arrays in the compute functions
+
 void init();
 
 /// Minimal configuration struct to steer the evaluation of a single node with
@@ -140,12 +140,7 @@ struct ReduceNLLOutput {
 class RooBatchComputeInterface {
 public:
    virtual ~RooBatchComputeInterface() = default;
-   virtual void compute(Config const &cfg, Computer, RestrictArr, size_t, const VarVector &, ArgVector &) = 0;
-   inline void compute(Config const &cfg, Computer comp, RestrictArr output, size_t size, const VarVector &vars)
-   {
-      ArgVector extraArgs{};
-      compute(cfg, comp, output, size, vars, extraArgs);
-   }
+   virtual void compute(Config const &cfg, Computer, RestrictArr, size_t, VarSpan, ArgSpan) = 0;
 
    virtual double reduceSum(Config const &cfg, InputArr input, size_t n) = 0;
    virtual ReduceNLLOutput reduceNLL(Config const &cfg, std::span<const double> probas, std::span<const double> weights,
@@ -182,18 +177,20 @@ inline bool hasCuda()
    return dispatchCUDA;
 }
 
-inline void
-compute(Config cfg, Computer comp, RestrictArr output, size_t size, const VarVector &vars, ArgVector &extraArgs)
+inline void compute(Config cfg, Computer comp, RestrictArr output, size_t size, VarSpan vars, ArgSpan extraArgs = {})
 {
    init();
    auto dispatch = cfg.useCuda() ? dispatchCUDA : dispatchCPU;
    dispatch->compute(cfg, comp, output, size, vars, extraArgs);
 }
 
-inline void compute(Config cfg, Computer comp, RestrictArr output, size_t size, const VarVector &vars)
+/// Overload taking the input spans as a braced list. Before C++26, a std::span
+/// cannot be constructed directly from an initializer list, so the list is
+/// accepted here explicitly and forwarded as a VarSpan.
+inline void compute(Config cfg, Computer comp, RestrictArr output, size_t size,
+                    std::initializer_list<std::span<const double>> vars, ArgSpan extraArgs = {})
 {
-   ArgVector extraArgs{};
-   compute(cfg, comp, output, size, vars, extraArgs);
+   compute(cfg, comp, output, size, VarSpan{vars.begin(), vars.end()}, extraArgs);
 }
 
 inline double reduceSum(Config cfg, InputArr input, size_t n)
diff --git a/roofit/batchcompute/src/Batches.h b/roofit/batchcompute/src/Batches.h
index 69c4a7837bd53..5589199ffc677 100644
--- a/roofit/batchcompute/src/Batches.h
+++ b/roofit/batchcompute/src/Batches.h
@@ -23,67 +23,32 @@ so that they can contain data for every kind of compute function.
 #ifndef ROOFIT_BATCHCOMPUTE_BATCHES_H
 #define ROOFIT_BATCHCOMPUTE_BATCHES_H
 
-#include <RooBatchComputeTypes.h>
-
 #include <cstdint>
 
 namespace RooBatchCompute {
 
-constexpr std::size_t bufferSize = 64;
-
-namespace RF_ARCH {
-
 class Batch {
 public:
    const double *__restrict _array = nullptr;
    bool _isVector = false;
 
-   Batch() = default;
-   inline Batch(InputArr array, bool isVector) : _array{array}, _isVector{isVector} {}
-
-   __roodevice__ constexpr bool isItVector() const { return _isVector; }
-   inline void set(InputArr array, bool isVector)
-   {
-      _array = array;
-      _isVector = isVector;
-   }
-   inline void advance(std::size_t _nEvents) { _array += _isVector * _nEvents; }
 #ifdef __CUDACC__
-   __roodevice__ constexpr double operator[](std::size_t i) const noexcept { return _isVector ? _array[i] : _array[0]; }
+   __device__ constexpr double operator[](std::size_t i) const noexcept { return _isVector ? _array[i] : _array[0]; }
 #else
    constexpr double operator[](std::size_t i) const noexcept { return _array[i]; }
 #endif // #ifdef __CUDACC__
 };
 
-/////////////////////////////////////////////////////////////////////////////////////////////////////////
-
 class Batches {
 public:
-   Batch *_arrays = nullptr;
-   double *_extraArgs = nullptr;
-   std::size_t _nEvents = 0;
-   std::size_t _nBatches = 0;
-   std::size_t _nExtraArgs = 0;
-   RestrictArr _output = nullptr;
-
-   __roodevice__ std::size_t getNEvents() const { return _nEvents; }
-   __roodevice__ std::size_t getNExtraArgs() const { return _nExtraArgs; }
-   __roodevice__ double extraArg(std::size_t i) const { return _extraArgs[i]; }
-   __roodevice__ void setExtraArg(std::size_t i, double val) { _extraArgs[i] = val; }
-   __roodevice__ Batch operator[](int batchIdx) const { return _arrays[batchIdx]; }
-   inline void setNEvents(std::size_t n) { _nEvents = n; }
-   inline void advance(std::size_t nEvents)
-   {
-      for (std::size_t i = 0; i < _nBatches; i++)
-         _arrays[i].advance(nEvents);
-      _output += nEvents;
-   }
+   Batch *args = nullptr;        // input batches, indexed by argument position
+   double *extra = nullptr;      // extra scalar arguments (nExtra entries); defaulted like every other member
+   std::size_t nEvents = 0;      // upper bound of the event loops in the compute functions
+   std::size_t nBatches = 0;     // presumably the number of entries in args -- confirm against the dispatcher
+   std::size_t nExtra = 0;       // number of entries in extra
+   RestrictArr output = nullptr; // destination array written by the compute functions
 };
 
-// Defines the actual argument type of the compute function.
-using BatchesHandle = Batches &;
-
-} // End namespace RF_ARCH
 } // end namespace RooBatchCompute
 
 #endif // #ifdef ROOFIT_BATCHCOMPUTE_BATCHES_H
diff --git a/roofit/batchcompute/src/ComputeFunctions.cxx b/roofit/batchcompute/src/ComputeFunctions.cxx
index e112dda4c19c2..2dc2320196def 100644
--- a/roofit/batchcompute/src/ComputeFunctions.cxx
+++ b/roofit/batchcompute/src/ComputeFunctions.cxx
@@ -32,6 +32,8 @@ of performance, maximum memory coalescing. For more details, see
 
 #include <RooHeterogeneousMath.h>
 
+#include <vector>
+
 #ifdef __CUDACC__
 #define BEGIN blockDim.x *blockIdx.x + threadIdx.x
 #define STEP blockDim.x *gridDim.x
@@ -43,64 +45,66 @@ of performance, maximum memory coalescing. For more details, see
 namespace RooBatchCompute {
 namespace RF_ARCH {
 
-__rooglobal__ void computeAddPdf(BatchesHandle batches)
+__rooglobal__ void computeAddPdf(Batches &batches) // output[i] = sum over pdf of extra[pdf] * args[pdf][i]
 {
-   const int nPdfs = batches.getNExtraArgs();
-   for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP)
-      batches._output[i] = batches.extraArg(0) * batches[0][i];
+   const int nPdfs = batches.nExtra; // one coefficient in extra[] per input batch in args[]
+   for (size_t i = BEGIN; i < batches.nEvents; i += STEP) {
+      batches.output[i] = batches.extra[0] * batches.args[0][i]; // first term overwrites output
+   }
    for (int pdf = 1; pdf < nPdfs; pdf++) {
-      for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP)
-         batches._output[i] += batches.extraArg(pdf) * batches[pdf][i];
+      for (size_t i = BEGIN; i < batches.nEvents; i += STEP) {
+         batches.output[i] += batches.extra[pdf] * batches.args[pdf][i]; // remaining terms accumulate
+      }
    }
 }
 
-__rooglobal__ void computeArgusBG(BatchesHandle batches)
+__rooglobal__ void computeArgusBG(Batches &batches)
 {
-   Batch m = batches[0];
-   Batch m0 = batches[1];
-   Batch c = batches[2];
-   Batch p = batches[3];
-   for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) {
+   Batch m = batches.args[0];
+   Batch m0 = batches.args[1];
+   Batch c = batches.args[2];
+   Batch p = batches.args[3];
+   for (size_t i = BEGIN; i < batches.nEvents; i += STEP) {
       const double t = m[i] / m0[i];
       const double u = 1 - t * t;
-      batches._output[i] = c[i] * u + p[i] * fast_log(u);
+      batches.output[i] = c[i] * u + p[i] * fast_log(u);
    }
-   for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) {
+   for (size_t i = BEGIN; i < batches.nEvents; i += STEP) {
       if (m[i] >= m0[i]) {
-         batches._output[i] = 0.0;
+         batches.output[i] = 0.0;
       } else {
-         batches._output[i] = m[i] * fast_exp(batches._output[i]);
+         batches.output[i] = m[i] * fast_exp(batches.output[i]);
       }
    }
 }
 
-__rooglobal__ void computeBMixDecay(BatchesHandle batches)
+__rooglobal__ void computeBMixDecay(Batches &batches) // fills output from six input batches, see formula below
 {
-   Batch coef0 = batches[0];
-   Batch coef1 = batches[1];
-   Batch tagFlav = batches[2];
-   Batch delMistag = batches[3];
-   Batch mixState = batches[4];
-   Batch mistag = batches[5];
+   Batch coef0 = batches.args[0];
+   Batch coef1 = batches.args[1];
+   Batch tagFlav = batches.args[2];
+   Batch delMistag = batches.args[3]; // only element 0 is read in the loop below
+   Batch mixState = batches.args[4];
+   Batch mistag = batches.args[5]; // only element 0 is read in the loop below
 
-   for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) {
-      batches._output[i] =
+   for (size_t i = BEGIN; i < batches.nEvents; i += STEP) {
+      batches.output[i] =
          coef0[i] * (1.0 - tagFlav[i] * delMistag[0]) + coef1[i] * (mixState[i] * (1.0 - 2.0 * mistag[0]));
    }
 }
 
-__rooglobal__ void computeBernstein(BatchesHandle batches)
+__rooglobal__ void computeBernstein(Batches &batches)
 {
-   const int nCoef = batches.getNExtraArgs() - 2;
+   const int nCoef = batches.nExtra - 2;
    const int degree = nCoef - 1;
-   const double xmin = batches.extraArg(nCoef);
-   const double xmax = batches.extraArg(nCoef + 1);
-   Batch xData = batches[0];
+   const double xmin = batches.extra[nCoef];
+   const double xmax = batches.extra[nCoef + 1];
+   Batch xData = batches.args[0];
 
    // apply binomial coefficient in-place so we don't have to allocate new memory
    double binomial = 1.0;
    for (int k = 0; k < nCoef; k++) {
-      batches.setExtraArg(k, batches.extraArg(k) * binomial);
+      batches.extra[k] = batches.extra[k] * binomial;
       binomial = (binomial * (degree - k)) / (k + 1);
    }
 
@@ -109,31 +113,31 @@ __rooglobal__ void computeBernstein(BatchesHandle batches)
       double _1_X[bufferSize];
       double powX[bufferSize];
       double pow_1_X[bufferSize];
-      for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) {
+      for (size_t i = BEGIN; i < batches.nEvents; i += STEP) {
          powX[i] = pow_1_X[i] = 1.0;
          X[i] = (xData[i] - xmin) / (xmax - xmin);
          _1_X[i] = 1 - X[i];
-         batches._output[i] = 0.0;
+         batches.output[i] = 0.0;
       }
 
       // raising 1-x to the power of degree
       for (int k = 2; k <= degree; k += 2) {
-         for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP)
+         for (size_t i = BEGIN; i < batches.nEvents; i += STEP)
             pow_1_X[i] *= _1_X[i] * _1_X[i];
       }
 
       if (degree % 2 == 1) {
-         for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP)
+         for (size_t i = BEGIN; i < batches.nEvents; i += STEP)
             pow_1_X[i] *= _1_X[i];
       }
 
       // inverting 1-x ---> 1/(1-x)
-      for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP)
+      for (size_t i = BEGIN; i < batches.nEvents; i += STEP)
          _1_X[i] = 1 / _1_X[i];
 
       for (int k = 0; k < nCoef; k++) {
-         for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) {
-            batches._output[i] += batches.extraArg(k) * powX[i] * pow_1_X[i];
+         for (size_t i = BEGIN; i < batches.nEvents; i += STEP) {
+            batches.output[i] += batches.extra[k] * powX[i] * pow_1_X[i];
 
             // calculating next power for x and 1-x
             powX[i] *= X[i];
@@ -141,8 +145,8 @@ __rooglobal__ void computeBernstein(BatchesHandle batches)
          }
       }
    } else {
-      for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) {
-         batches._output[i] = 0.0;
+      for (size_t i = BEGIN; i < batches.nEvents; i += STEP) {
+         batches.output[i] = 0.0;
          const double X = (xData[i] - xmin) / (xmax - xmin);
          double powX = 1.0;
          double pow_1_X = 1.0;
@@ -150,7 +154,7 @@ __rooglobal__ void computeBernstein(BatchesHandle batches)
             pow_1_X *= 1 - X;
          const double _1_X = 1 / (1 - X);
          for (int k = 0; k < nCoef; k++) {
-            batches._output[i] += batches.extraArg(k) * powX * pow_1_X;
+            batches.output[i] += batches.extra[k] * powX * pow_1_X;
             powX *= X;
             pow_1_X *= _1_X;
          }
@@ -160,52 +164,52 @@ __rooglobal__ void computeBernstein(BatchesHandle batches)
    // reset extraArgs values so we don't mutate the Batches object
    binomial = 1.0;
    for (int k = 0; k < nCoef; k++) {
-      batches.setExtraArg(k, batches.extraArg(k) / binomial);
+      batches.extra[k] = batches.extra[k] / binomial;
       binomial = (binomial * (degree - k)) / (k + 1);
    }
 }
 
-__rooglobal__ void computeBifurGauss(BatchesHandle batches)
+__rooglobal__ void computeBifurGauss(Batches &batches)
 {
-   Batch X = batches[0];
-   Batch M = batches[1];
-   Batch SL = batches[2];
-   Batch SR = batches[3];
-   for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) {
+   Batch X = batches.args[0];
+   Batch M = batches.args[1];
+   Batch SL = batches.args[2];
+   Batch SR = batches.args[3];
+   for (size_t i = BEGIN; i < batches.nEvents; i += STEP) {
       double arg = X[i] - M[i];
       if (arg < 0) {
          arg /= SL[i];
       } else {
          arg /= SR[i];
       }
-      batches._output[i] = fast_exp(-0.5 * arg * arg);
+      batches.output[i] = fast_exp(-0.5 * arg * arg);
    }
 }
 
-__rooglobal__ void computeBreitWigner(BatchesHandle batches)
+__rooglobal__ void computeBreitWigner(Batches &batches)
 {
-   Batch X = batches[0];
-   Batch M = batches[1];
-   Batch W = batches[2];
-   for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) {
+   Batch X = batches.args[0];
+   Batch M = batches.args[1];
+   Batch W = batches.args[2];
+   for (size_t i = BEGIN; i < batches.nEvents; i += STEP) {
       const double arg = X[i] - M[i];
-      batches._output[i] = 1 / (arg * arg + 0.25 * W[i] * W[i]);
+      batches.output[i] = 1 / (arg * arg + 0.25 * W[i] * W[i]);
    }
 }
 
-__rooglobal__ void computeBukin(BatchesHandle batches)
+__rooglobal__ void computeBukin(Batches &batches)
 {
-   Batch X = batches[0];
-   Batch XP = batches[1];
-   Batch SP = batches[2];
-   Batch XI = batches[3];
-   Batch R1 = batches[4];
-   Batch R2 = batches[5];
+   Batch X = batches.args[0];
+   Batch XP = batches.args[1];
+   Batch SP = batches.args[2];
+   Batch XI = batches.args[3];
+   Batch R1 = batches.args[4];
+   Batch R2 = batches.args[5];
    const double r3 = log(2.0);
    const double r6 = exp(-6.0);
    const double r7 = 2 * sqrt(2 * log(2.0));
 
-   for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) {
+   for (size_t i = BEGIN; i < batches.nEvents; i += STEP) {
       const double r1 = XI[i] * fast_isqrt(XI[i] * XI[i] + 1);
       const double r4 = 1 / fast_isqrt(XI[i] * XI[i] + 1);
       const double hp = 1 / (SP[i] * r7);
@@ -229,61 +233,61 @@ __rooglobal__ void computeBukin(BatchesHandle batches)
          rho = R2[i];
       }
 
-      batches._output[i] = rho * y * y / Yp / Yp - r3 + factor * 4 * r3 * y * hp * r5 * r4 / yi / yi;
+      batches.output[i] = rho * y * y / Yp / Yp - r3 + factor * 4 * r3 * y * hp * r5 * r4 / yi / yi;
       if (X[i] >= x1 && X[i] < x2) {
-         batches._output[i] =
+         batches.output[i] =
             fast_log(1 + 4 * XI[i] * r4 * (X[i] - XP[i]) * hp) / fast_log(1 + 2 * XI[i] * (XI[i] - r4));
-         batches._output[i] *= -batches._output[i] * r3;
+         batches.output[i] *= -batches.output[i] * r3;
       }
       if (X[i] >= x1 && X[i] < x2 && XI[i] < r6 && XI[i] > -r6)
-         batches._output[i] = -4 * r3 * (X[i] - XP[i]) * (X[i] - XP[i]) * hp * hp;
+         batches.output[i] = -4 * r3 * (X[i] - XP[i]) * (X[i] - XP[i]) * hp * hp;
    }
-   for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP)
-      batches._output[i] = fast_exp(batches._output[i]);
+   for (size_t i = BEGIN; i < batches.nEvents; i += STEP)
+      batches.output[i] = fast_exp(batches.output[i]);
 }
 
-__rooglobal__ void computeCBShape(BatchesHandle batches)
+__rooglobal__ void computeCBShape(Batches &batches)
 {
-   Batch M = batches[0];
-   Batch M0 = batches[1];
-   Batch S = batches[2];
-   Batch A = batches[3];
-   Batch N = batches[4];
-   for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) {
+   Batch M = batches.args[0];
+   Batch M0 = batches.args[1];
+   Batch S = batches.args[2];
+   Batch A = batches.args[3];
+   Batch N = batches.args[4];
+   for (size_t i = BEGIN; i < batches.nEvents; i += STEP) {
       const double t = (M[i] - M0[i]) / S[i];
       if ((A[i] > 0 && t >= -A[i]) || (A[i] < 0 && -t >= A[i])) {
-         batches._output[i] = -0.5 * t * t;
+         batches.output[i] = -0.5 * t * t;
       } else {
-         batches._output[i] = N[i] / (N[i] - A[i] * A[i] - A[i] * t);
-         batches._output[i] = fast_log(batches._output[i]);
-         batches._output[i] *= N[i];
-         batches._output[i] -= 0.5 * A[i] * A[i];
+         batches.output[i] = N[i] / (N[i] - A[i] * A[i] - A[i] * t);
+         batches.output[i] = fast_log(batches.output[i]);
+         batches.output[i] *= N[i];
+         batches.output[i] -= 0.5 * A[i] * A[i];
       }
    }
-   for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP)
-      batches._output[i] = fast_exp(batches._output[i]);
+   for (size_t i = BEGIN; i < batches.nEvents; i += STEP)
+      batches.output[i] = fast_exp(batches.output[i]);
 }
 
-__rooglobal__ void computeChebychev(BatchesHandle batches)
+__rooglobal__ void computeChebychev(Batches &batches)
 {
-   Batch xData = batches[0];
-   const int nCoef = batches.getNExtraArgs() - 2;
-   const double xmin = batches.extraArg(nCoef);
-   const double xmax = batches.extraArg(nCoef + 1);
+   Batch xData = batches.args[0];
+   const int nCoef = batches.nExtra - 2;
+   const double xmin = batches.extra[nCoef];
+   const double xmax = batches.extra[nCoef + 1];
 
    if (STEP == 1) {
       double prev[bufferSize][2];
       double X[bufferSize];
 
-      for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) {
+      for (size_t i = BEGIN; i < batches.nEvents; i += STEP) {
          // set a0-->prev[i][0] and a1-->prev[i][1]
          // and x tranfsformed to range[-1..1]-->X[i]
-         prev[i][0] = batches._output[i] = 1.0;
+         prev[i][0] = batches.output[i] = 1.0;
          prev[i][1] = X[i] = 2 * (xData[i] - 0.5 * (xmax + xmin)) / (xmax - xmin);
       }
       for (int k = 0; k < nCoef; k++) {
-         for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) {
-            batches._output[i] += prev[i][1] * batches.extraArg(k);
+         for (size_t i = BEGIN; i < batches.nEvents; i += STEP) {
+            batches.output[i] += prev[i][1] * batches.extra[k];
 
             // compute next order
             const double next = 2 * X[i] * prev[i][1] - prev[i][0];
@@ -292,13 +296,13 @@ __rooglobal__ void computeChebychev(BatchesHandle batches)
          }
       }
    } else {
-      for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) {
+      for (size_t i = BEGIN; i < batches.nEvents; i += STEP) {
          double prev0 = 1.0;
          double prev1 = 2 * (xData[i] - 0.5 * (xmax + xmin)) / (xmax - xmin);
          double X = prev1;
-         batches._output[i] = 1.0;
+         batches.output[i] = 1.0;
          for (int k = 0; k < nCoef; k++) {
-            batches._output[i] += prev1 * batches.extraArg(k);
+            batches.output[i] += prev1 * batches.extra[k];
 
             // compute next order
             const double next = 2 * X * prev1 - prev0;
@@ -309,127 +313,127 @@ __rooglobal__ void computeChebychev(BatchesHandle batches)
    }
 }
 
-__rooglobal__ void computeChiSquare(BatchesHandle batches)
+__rooglobal__ void computeChiSquare(Batches &batches)
 {
-   Batch X = batches[0];
-   const double ndof = batches.extraArg(0);
+   Batch X = batches.args[0];
+   const double ndof = batches.extra[0];
    const double gamma = 1 / std::tgamma(ndof / 2.0);
-   for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP)
-      batches._output[i] = gamma;
+   for (size_t i = BEGIN; i < batches.nEvents; i += STEP)
+      batches.output[i] = gamma;
 
    constexpr double ln2 = 0.693147180559945309417232121458;
-   for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) {
+   for (size_t i = BEGIN; i < batches.nEvents; i += STEP) {
       double arg = (ndof - 2) * fast_log(X[i]) - X[i] - ndof * ln2;
-      batches._output[i] *= fast_exp(0.5 * arg);
+      batches.output[i] *= fast_exp(0.5 * arg);
    }
 }
 
-__rooglobal__ void computeDeltaFunction(BatchesHandle batches)
+__rooglobal__ void computeDeltaFunction(Batches &batches)
 {
-   for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) {
-      batches._output[i] = 0.0 + (batches[0][i] == 1.0);
+   for (size_t i = BEGIN; i < batches.nEvents; i += STEP) {
+      batches.output[i] = 0.0 + (batches.args[0][i] == 1.0);
    }
 }
 
-__rooglobal__ void computeDstD0BG(BatchesHandle batches)
+__rooglobal__ void computeDstD0BG(Batches &batches)
 {
-   Batch DM = batches[0];
-   Batch DM0 = batches[1];
-   Batch C = batches[2];
-   Batch A = batches[3];
-   Batch B = batches[4];
-   for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) {
+   Batch DM = batches.args[0];
+   Batch DM0 = batches.args[1];
+   Batch C = batches.args[2];
+   Batch A = batches.args[3];
+   Batch B = batches.args[4];
+   for (size_t i = BEGIN; i < batches.nEvents; i += STEP) {
       const double ratio = DM[i] / DM0[i];
       const double arg1 = (DM0[i] - DM[i]) / C[i];
       const double arg2 = A[i] * fast_log(ratio);
-      batches._output[i] = (1 - fast_exp(arg1)) * fast_exp(arg2) + B[i] * (ratio - 1);
+      batches.output[i] = (1 - fast_exp(arg1)) * fast_exp(arg2) + B[i] * (ratio - 1);
    }
 
-   for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) {
-      if (batches._output[i] < 0)
-         batches._output[i] = 0;
+   for (size_t i = BEGIN; i < batches.nEvents; i += STEP) {
+      if (batches.output[i] < 0)
+         batches.output[i] = 0;
    }
 }
 
-__rooglobal__ void computeExpPoly(BatchesHandle batches)
+__rooglobal__ void computeExpPoly(Batches &batches) // output[i] = exp(sum_k args[k + 1][i] * x^(lowestOrder + k))
 {
-   int lowestOrder = batches.extraArg(0);
-   int nTerms = batches.extraArg(1);
-   auto x = batches[0];
+   int lowestOrder = batches.extra[0]; // stored as double, converted to int here
+   int nTerms = batches.extra[1]; // stored as double, converted to int here
+   auto x = batches.args[0];
 
-   for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) {
-      batches._output[i] = 0.0;
+   for (size_t i = BEGIN; i < batches.nEvents; i += STEP) {
+      batches.output[i] = 0.0; // output doubles as the accumulator for the exponent
       double xTmp = std::pow(x[i], lowestOrder);
       for (int k = 0; k < nTerms; ++k) {
-         batches._output[i] += batches[k + 1][i] * xTmp;
+         batches.output[i] += batches.args[k + 1][i] * xTmp; // coefficient k lives in args[k + 1]
          xTmp *= x[i];
       }
-      batches._output[i] = std::exp(batches._output[i]);
+      batches.output[i] = std::exp(batches.output[i]); // exponentiate the accumulated polynomial
    }
 }
 
-__rooglobal__ void computeExponential(BatchesHandle batches)
+__rooglobal__ void computeExponential(Batches &batches)
 {
-   Batch x = batches[0];
-   Batch c = batches[1];
-   for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) {
-      batches._output[i] = fast_exp(x[i] * c[i]);
+   Batch x = batches.args[0];
+   Batch c = batches.args[1];
+   for (size_t i = BEGIN; i < batches.nEvents; i += STEP) {
+      batches.output[i] = fast_exp(x[i] * c[i]);
    }
 }
 
-__rooglobal__ void computeExponentialNeg(BatchesHandle batches)
+__rooglobal__ void computeExponentialNeg(Batches &batches)
 {
-   Batch x = batches[0];
-   Batch c = batches[1];
-   for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) {
-      batches._output[i] = fast_exp(-x[i] * c[i]);
+   Batch x = batches.args[0];
+   Batch c = batches.args[1];
+   for (size_t i = BEGIN; i < batches.nEvents; i += STEP) {
+      batches.output[i] = fast_exp(-x[i] * c[i]);
    }
 }
 
-__rooglobal__ void computeGamma(BatchesHandle batches)
+__rooglobal__ void computeGamma(Batches &batches) // two passes: log-normalisation first, then the x-dependent part
 {
-   Batch X = batches[0];
-   Batch G = batches[1];
-   Batch B = batches[2];
-   Batch M = batches[3];
+   Batch X = batches.args[0];
+   Batch G = batches.args[1];
+   Batch B = batches.args[2];
+   Batch M = batches.args[3];
    double gamma = -std::lgamma(G[0]);
-   for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) {
+   for (size_t i = BEGIN; i < batches.nEvents; i += STEP) {
       if (X[i] == M[i]) {
-         batches._output[i] = (G[i] == 1.0) / B[i];
-      } else if (G.isItVector()) {
-         batches._output[i] = -std::lgamma(G[i]);
+         batches.output[i] = (G[i] == 1.0) / B[i]; // final value for X == M; the second loop skips these entries
+      } else if (G._isVector) { // G varies per event: lgamma has to be evaluated per entry
+         batches.output[i] = -std::lgamma(G[i]);
       } else {
-         batches._output[i] = gamma;
+         batches.output[i] = gamma; // scalar G: reuse the value computed once above the loop
       }
    }
 
-   for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) {
+   for (size_t i = BEGIN; i < batches.nEvents; i += STEP) {
       if (X[i] != M[i]) {
          const double invBeta = 1 / B[i];
          double arg = (X[i] - M[i]) * invBeta;
-         batches._output[i] -= arg;
+         batches.output[i] -= arg; // output still holds the log-domain value from the first loop
          arg = fast_log(arg);
-         batches._output[i] += arg * (G[i] - 1);
-         batches._output[i] = fast_exp(batches._output[i]);
-         batches._output[i] *= invBeta;
+         batches.output[i] += arg * (G[i] - 1);
+         batches.output[i] = fast_exp(batches.output[i]); // leave the log domain
+         batches.output[i] *= invBeta;
       }
    }
 }
 
-__rooglobal__ void computeGaussModelExpBasis(BatchesHandle batches)
+__rooglobal__ void computeGaussModelExpBasis(Batches &batches)
 {
    const double root2 = std::sqrt(2.);
    const double root2pi = std::sqrt(2. * std::atan2(0., -1.));
 
-   const bool isMinus = batches.extraArg(0) < 0.0;
-   const bool isPlus = batches.extraArg(0) > 0.0;
+   const bool isMinus = batches.extra[0] < 0.0;
+   const bool isPlus = batches.extra[0] > 0.0;
 
-   for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) {
+   for (size_t i = BEGIN; i < batches.nEvents; i += STEP) {
 
-      const double x = batches[0][i];
-      const double mean = batches[1][i] * batches[2][i];
-      const double sigma = batches[3][i] * batches[4][i];
-      const double tau = batches[5][i];
+      const double x = batches.args[0][i];
+      const double mean = batches.args[1][i] * batches.args[2][i];
+      const double sigma = batches.args[3][i] * batches.args[4][i];
+      const double tau = batches.args[5][i];
 
       if (tau == 0.0) {
          // Straight Gaussian, used for unconvoluted PDF or expBasis with 0 lifetime
@@ -437,7 +441,7 @@ __rooglobal__ void computeGaussModelExpBasis(BatchesHandle batches)
          double result = std::exp(-0.5 * xprime * xprime) / (sigma * root2pi);
          if (!isMinus && !isPlus)
             result *= 2;
-         batches._output[i] = result;
+         batches.output[i] = result;
       } else {
          // Convolution with exp(-t/tau)
          const double xprime = (x - mean) / tau;
@@ -449,52 +453,52 @@ __rooglobal__ void computeGaussModelExpBasis(BatchesHandle batches)
             result += RooHeterogeneousMath::evalCerf(0, -u, c).real();
          if (!isPlus)
             result += RooHeterogeneousMath::evalCerf(0, u, c).real();
-         batches._output[i] = result;
+         batches.output[i] = result;
       }
    }
 }
 
-__rooglobal__ void computeGaussian(BatchesHandle batches)
+__rooglobal__ void computeGaussian(Batches &batches)
 {
-   auto x = batches[0];
-   auto mean = batches[1];
-   auto sigma = batches[2];
-   for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) {
+   auto x = batches.args[0];
+   auto mean = batches.args[1];
+   auto sigma = batches.args[2];
+   for (size_t i = BEGIN; i < batches.nEvents; i += STEP) {
       const double arg = x[i] - mean[i];
       const double halfBySigmaSq = -0.5 / (sigma[i] * sigma[i]);
-      batches._output[i] = fast_exp(arg * arg * halfBySigmaSq);
+      batches.output[i] = fast_exp(arg * arg * halfBySigmaSq);
    }
 }
 
-__rooglobal__ void computeIdentity(BatchesHandle batches)
+__rooglobal__ void computeIdentity(Batches &batches)
 {
-   for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) {
-      batches._output[i] = batches[0][i];
+   for (size_t i = BEGIN; i < batches.nEvents; i += STEP) {
+      batches.output[i] = batches.args[0][i];
    }
 }
 
-__rooglobal__ void computeNegativeLogarithms(BatchesHandle batches)
+__rooglobal__ void computeNegativeLogarithms(Batches &batches) // output[i] = -log(args[0][i]), optionally weighted
 {
-   for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP)
-      batches._output[i] = -fast_log(batches[0][i]);
+   for (size_t i = BEGIN; i < batches.nEvents; i += STEP)
+      batches.output[i] = -fast_log(batches.args[0][i]);
    // Multiply by weights if they exist
-   if (batches.extraArg(0)) {
-      for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP)
-         batches._output[i] *= batches[1][i];
+   if (batches.extra[0]) { // nonzero extra[0] flags that args[1] carries the weights
+      for (size_t i = BEGIN; i < batches.nEvents; i += STEP)
+         batches.output[i] *= batches.args[1][i];
    }
 }
 
-__rooglobal__ void computeJohnson(BatchesHandle batches)
+__rooglobal__ void computeJohnson(Batches &batches)
 {
-   Batch mass = batches[0];
-   Batch mu = batches[1];
-   Batch lambda = batches[2];
-   Batch gamma = batches[3];
-   Batch delta = batches[4];
+   Batch mass = batches.args[0];
+   Batch mu = batches.args[1];
+   Batch lambda = batches.args[2];
+   Batch gamma = batches.args[3];
+   Batch delta = batches.args[4];
    const double sqrtTwoPi = std::sqrt(TMath::TwoPi());
-   const double massThreshold = batches.extraArg(0);
+   const double massThreshold = batches.extra[0];
 
-   for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) {
+   for (size_t i = BEGIN; i < batches.nEvents; i += STEP) {
       const double arg = (mass[i] - mu[i]) / lambda[i];
 #ifdef R__HAS_VDT
       const double asinh_arg = fast_log(arg + 1 / fast_isqrt(arg * arg + 1));
@@ -506,7 +510,7 @@ __rooglobal__ void computeJohnson(BatchesHandle batches)
          delta[i] * fast_exp(-0.5 * expo * expo) * fast_isqrt(1. + arg * arg) / (sqrtTwoPi * lambda[i]);
 
       const double passThrough = mass[i] >= massThreshold;
-      batches._output[i] = result * passThrough;
+      batches.output[i] = result * passThrough;
    }
 }
 
@@ -514,7 +518,7 @@ __rooglobal__ void computeJohnson(BatchesHandle batches)
  * Code copied from function landau_pdf (math/mathcore/src/PdfFuncMathCore.cxx)
  * and rewritten to enable vectorization.
  */
-__rooglobal__ void computeLandau(BatchesHandle batches)
+__rooglobal__ void computeLandau(Batches &batches)
 {
    auto case0 = [](double x) {
       const double a1[3] = {0.04166666667, -0.01996527778, 0.02709538966};
@@ -567,82 +571,82 @@ __rooglobal__ void computeLandau(BatchesHandle batches)
       return u * u * (1 + (a2[0] + a2[1] * u) * u);
    };
 
-   Batch X = batches[0];
-   Batch M = batches[1];
-   Batch S = batches[2];
+   Batch X = batches.args[0];
+   Batch M = batches.args[1];
+   Batch S = batches.args[2];
 
-   for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP)
-      batches._output[i] = (X[i] - M[i]) / S[i];
+   for (size_t i = BEGIN; i < batches.nEvents; i += STEP)
+      batches.output[i] = (X[i] - M[i]) / S[i];
 
-   for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) {
+   for (size_t i = BEGIN; i < batches.nEvents; i += STEP) {
       if (S[i] <= 0.0) {
-         batches._output[i] = 0;
-      } else if (batches._output[i] < -5.5) {
-         batches._output[i] = case0(batches._output[i]);
-      } else if (batches._output[i] < -1.0) {
-         batches._output[i] = case1(batches._output[i]);
-      } else if (batches._output[i] < 1.0) {
-         batches._output[i] = case2(batches._output[i]);
-      } else if (batches._output[i] < 5.0) {
-         batches._output[i] = case3(batches._output[i]);
-      } else if (batches._output[i] < 12.0) {
-         batches._output[i] = case4(batches._output[i]);
-      } else if (batches._output[i] < 50.0) {
-         batches._output[i] = case5(batches._output[i]);
-      } else if (batches._output[i] < 300.) {
-         batches._output[i] = case6(batches._output[i]);
+         batches.output[i] = 0;
+      } else if (batches.output[i] < -5.5) {
+         batches.output[i] = case0(batches.output[i]);
+      } else if (batches.output[i] < -1.0) {
+         batches.output[i] = case1(batches.output[i]);
+      } else if (batches.output[i] < 1.0) {
+         batches.output[i] = case2(batches.output[i]);
+      } else if (batches.output[i] < 5.0) {
+         batches.output[i] = case3(batches.output[i]);
+      } else if (batches.output[i] < 12.0) {
+         batches.output[i] = case4(batches.output[i]);
+      } else if (batches.output[i] < 50.0) {
+         batches.output[i] = case5(batches.output[i]);
+      } else if (batches.output[i] < 300.) {
+         batches.output[i] = case6(batches.output[i]);
       } else {
-         batches._output[i] = case7(batches._output[i]);
+         batches.output[i] = case7(batches.output[i]);
       }
    }
 }
 
-__rooglobal__ void computeLognormal(BatchesHandle batches)
+__rooglobal__ void computeLognormal(Batches &batches)
 {
-   Batch X = batches[0];
-   Batch M0 = batches[1];
-   Batch K = batches[2];
+   Batch X = batches.args[0];
+   Batch M0 = batches.args[1];
+   Batch K = batches.args[2];
    constexpr double rootOf2pi = 2.506628274631000502415765284811;
-   for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) {
+   for (size_t i = BEGIN; i < batches.nEvents; i += STEP) {
       double lnxOverM0 = fast_log(X[i] / M0[i]);
       double lnk = fast_log(K[i]);
       if (lnk < 0)
          lnk = -lnk;
       double arg = lnxOverM0 / lnk;
       arg *= -0.5 * arg;
-      batches._output[i] = fast_exp(arg) / (X[i] * lnk * rootOf2pi);
+      batches.output[i] = fast_exp(arg) / (X[i] * lnk * rootOf2pi);
    }
 }
 
-__rooglobal__ void computeLognormalStandard(BatchesHandle batches)
+__rooglobal__ void computeLognormalStandard(Batches &batches)
 {
-   Batch X = batches[0];
-   Batch M0 = batches[1];
-   Batch K = batches[2];
+   Batch X = batches.args[0];
+   Batch M0 = batches.args[1];
+   Batch K = batches.args[2];
    constexpr double rootOf2pi = 2.506628274631000502415765284811;
-   for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) {
+   for (size_t i = BEGIN; i < batches.nEvents; i += STEP) {
       double lnxOverM0 = fast_log(X[i]) - M0[i];
       double lnk = K[i];
       if (lnk < 0)
          lnk = -lnk;
       double arg = lnxOverM0 / lnk;
       arg *= -0.5 * arg;
-      batches._output[i] = fast_exp(arg) / (X[i] * lnk * rootOf2pi);
+      batches.output[i] = fast_exp(arg) / (X[i] * lnk * rootOf2pi);
    }
 }
 
-__rooglobal__ void computeNormalizedPdf(BatchesHandle batches)
+__rooglobal__ void computeNormalizedPdf(Batches &batches)
 {
-   auto rawVal = batches[0];
-   auto normVal = batches[1];
+   auto rawVal = batches.args[0];
+   auto normVal = batches.args[1];
 
    int nEvalErrorsType0 = 0;
    int nEvalErrorsType1 = 0;
    int nEvalErrorsType2 = 0;
 
-   for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) {
+   for (size_t i = BEGIN; i < batches.nEvents; i += STEP) {
       double out = 0.0;
-      // batches._output[i] = rawVal[i] / normVar[i];
+      // batches.output[i] = rawVal[i] / normVal[i];
       if (normVal[i] < 0. || (normVal[i] == 0. && rawVal[i] != 0)) {
          // Unreasonable normalisations. A zero integral can be tolerated if the function vanishes, though.
          out = RooNaNPacker::packFloatIntoNaN(-normVal[i] + (rawVal[i] < 0. ? -rawVal[i] : 0.));
@@ -658,15 +662,15 @@ __rooglobal__ void computeNormalizedPdf(BatchesHandle batches)
       } else {
          out = (rawVal[i] == 0. && normVal[i] == 0.) ? 0. : rawVal[i] / normVal[i];
       }
-      batches._output[i] = out;
+      batches.output[i] = out;
    }
 
    if (nEvalErrorsType0 > 0)
-      batches.setExtraArg(0, batches.extraArg(0) + nEvalErrorsType0);
+      batches.extra[0] = batches.extra[0] + nEvalErrorsType0;
    if (nEvalErrorsType1 > 1)
-      batches.setExtraArg(1, batches.extraArg(1) + nEvalErrorsType1);
+      batches.extra[1] = batches.extra[1] + nEvalErrorsType1;
    if (nEvalErrorsType2 > 2)
-      batches.setExtraArg(2, batches.extraArg(2) + nEvalErrorsType2);
+      batches.extra[2] = batches.extra[2] + nEvalErrorsType2;
 }
 
 /* TMath::ASinH(x) needs to be replaced with ln( x + sqrt(x^2+1))
@@ -677,240 +681,244 @@ __rooglobal__ void computeNormalizedPdf(BatchesHandle batches)
  * ln is the logarithm that was solely present in the initial
  * formula, that is before the asinh replacement
  */
-__rooglobal__ void computeNovosibirsk(BatchesHandle batches)
+__rooglobal__ void computeNovosibirsk(Batches &batches)
 {
-   Batch X = batches[0];
-   Batch P = batches[1];
-   Batch W = batches[2];
-   Batch T = batches[3];
+   Batch X = batches.args[0];
+   Batch P = batches.args[1];
+   Batch W = batches.args[2];
+   Batch T = batches.args[3];
    constexpr double xi = 2.3548200450309494; // 2 Sqrt( Ln(4) )
-   for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) {
+   for (size_t i = BEGIN; i < batches.nEvents; i += STEP) {
       double argasinh = 0.5 * xi * T[i];
       double argln = argasinh + 1 / fast_isqrt(argasinh * argasinh + 1);
       double asinh = fast_log(argln);
 
       double argln2 = 1 - (X[i] - P[i]) * T[i] / W[i];
       double ln = fast_log(argln2);
-      batches._output[i] = ln / asinh;
-      batches._output[i] *= -0.125 * xi * xi * batches._output[i];
-      batches._output[i] -= 2.0 / xi / xi * asinh * asinh;
+      batches.output[i] = ln / asinh;
+      batches.output[i] *= -0.125 * xi * xi * batches.output[i];
+      batches.output[i] -= 2.0 / xi / xi * asinh * asinh;
    }
 
    // faster if you exponentiate in a separate loop (dark magic!)
-   for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP)
-      batches._output[i] = fast_exp(batches._output[i]);
+   for (size_t i = BEGIN; i < batches.nEvents; i += STEP)
+      batches.output[i] = fast_exp(batches.output[i]);
 }
 
-__rooglobal__ void computePoisson(BatchesHandle batches)
+__rooglobal__ void computePoisson(Batches &batches)
 {
-   Batch x = batches[0];
-   Batch mean = batches[1];
-   bool protectNegative = batches.extraArg(0);
-   bool noRounding = batches.extraArg(1);
-   for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) {
+   Batch x = batches.args[0];
+   Batch mean = batches.args[1];
+   bool protectNegative = batches.extra[0];
+   bool noRounding = batches.extra[1];
+   for (size_t i = BEGIN; i < batches.nEvents; i += STEP) {
       const double x_i = noRounding ? x[i] : floor(x[i]);
-      batches._output[i] = std::lgamma(x_i + 1.);
+      batches.output[i] = std::lgamma(x_i + 1.);
    }
 
-   for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) {
+   for (size_t i = BEGIN; i < batches.nEvents; i += STEP) {
       const double x_i = noRounding ? x[i] : floor(x[i]);
       const double logMean = fast_log(mean[i]);
-      const double logPoisson = x_i * logMean - mean[i] - batches._output[i];
-      batches._output[i] = fast_exp(logPoisson);
+      const double logPoisson = x_i * logMean - mean[i] - batches.output[i];
+      batches.output[i] = fast_exp(logPoisson);
 
       // Cosmetics
       if (x_i < 0) {
-         batches._output[i] = 0;
+         batches.output[i] = 0;
       } else if (x_i == 0) {
-         batches._output[i] = 1 / fast_exp(mean[i]);
+         batches.output[i] = 1 / fast_exp(mean[i]);
       }
 
       if (protectNegative && mean[i] < 0)
-         batches._output[i] = 1.E-3;
+         batches.output[i] = 1.E-3;
    }
 }
 
-__rooglobal__ void computePolynomial(BatchesHandle batches)
+__rooglobal__ void computePolynomial(Batches &batches)
 {
-   const int nCoef = batches.extraArg(0);
-   const std::size_t nEvents = batches.getNEvents();
-   Batch x = batches[nCoef];
+   const int nCoef = batches.extra[0];
+   const std::size_t nEvents = batches.nEvents;
+   Batch x = batches.args[nCoef];
 
    for (size_t i = BEGIN; i < nEvents; i += STEP) {
-      batches._output[i] = batches[nCoef - 1][i];
+      batches.output[i] = batches.args[nCoef - 1][i];
    }
 
    // Indexes are in range 0..nCoef-1 but coefList[nCoef-1] has already been
    // processed.
    for (int k = nCoef - 2; k >= 0; k--) {
       for (size_t i = BEGIN; i < nEvents; i += STEP) {
-         batches._output[i] = batches[k][i] + x[i] * batches._output[i];
+         batches.output[i] = batches.args[k][i] + x[i] * batches.output[i];
       }
    }
 }
 
-__rooglobal__ void computePower(BatchesHandle batches)
+__rooglobal__ void computePower(Batches &batches)
 {
-   const int nCoef = batches.extraArg(0);
-   Batch x = batches[0];
+   const int nCoef = batches.extra[0];
+   Batch x = batches.args[0];
 
-   for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) {
-      batches._output[i] = 0.0;
+   for (size_t i = BEGIN; i < batches.nEvents; i += STEP) {
+      batches.output[i] = 0.0;
       for (int k = 0; k < nCoef; ++k) {
-         batches._output[i] += batches[2 * k + 1][i] * std::pow(x[i], batches[2 * k + 2][i]);
+         batches.output[i] += batches.args[2 * k + 1][i] * std::pow(x[i], batches.args[2 * k + 2][i]);
       }
    }
 }
 
-__rooglobal__ void computeProdPdf(BatchesHandle batches)
+__rooglobal__ void computeProdPdf(Batches &batches)
 {
-   const int nPdfs = batches.extraArg(0);
-   for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) {
-      batches._output[i] = 1.;
+   const int nPdfs = batches.extra[0];
+   for (size_t i = BEGIN; i < batches.nEvents; i += STEP) {
+      batches.output[i] = 1.;
    }
    for (int pdf = 0; pdf < nPdfs; pdf++) {
-      for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) {
-         batches._output[i] *= batches[pdf][i];
+      for (size_t i = BEGIN; i < batches.nEvents; i += STEP) {
+         batches.output[i] *= batches.args[pdf][i];
       }
    }
 }
 
-__rooglobal__ void computeRatio(BatchesHandle batches)
+__rooglobal__ void computeRatio(Batches &batches)
 {
-   for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) {
-      batches._output[i] = batches[0][i] / batches[1][i];
+   for (size_t i = BEGIN; i < batches.nEvents; i += STEP) {
+      batches.output[i] = batches.args[0][i] / batches.args[1][i];
    }
 }
 
-__rooglobal__ void computeTruthModelExpBasis(BatchesHandle batches)
+__rooglobal__ void computeTruthModelExpBasis(Batches &batches)
 {
 
-   const bool isMinus = batches.extraArg(0) < 0.0;
-   const bool isPlus = batches.extraArg(0) > 0.0;
-   for (std::size_t i = BEGIN; i < batches.getNEvents(); i += STEP) {
-      double x = batches[0][i];
+   const bool isMinus = batches.extra[0] < 0.0;
+   const bool isPlus = batches.extra[0] > 0.0;
+   for (std::size_t i = BEGIN; i < batches.nEvents; i += STEP) {
+      double x = batches.args[0][i];
       // Enforce sign compatibility
       const bool isOutOfSign = (isMinus && x > 0.0) || (isPlus && x < 0.0);
-      batches._output[i] = isOutOfSign ? 0.0 : fast_exp(-std::abs(x) / batches[1][i]);
+      batches.output[i] = isOutOfSign ? 0.0 : fast_exp(-std::abs(x) / batches.args[1][i]);
    }
 }
 
-__rooglobal__ void computeTruthModelSinBasis(BatchesHandle batches)
+__rooglobal__ void computeTruthModelSinBasis(Batches &batches)
 {
-   const bool isMinus = batches.extraArg(0) < 0.0;
-   const bool isPlus = batches.extraArg(0) > 0.0;
-   for (std::size_t i = BEGIN; i < batches.getNEvents(); i += STEP) {
-      double x = batches[0][i];
+   const bool isMinus = batches.extra[0] < 0.0;
+   const bool isPlus = batches.extra[0] > 0.0;
+   for (std::size_t i = BEGIN; i < batches.nEvents; i += STEP) {
+      double x = batches.args[0][i];
       // Enforce sign compatibility
       const bool isOutOfSign = (isMinus && x > 0.0) || (isPlus && x < 0.0);
-      batches._output[i] = isOutOfSign ? 0.0 : fast_exp(-std::abs(x) / batches[1][i]) * fast_sin(x * batches[2][i]);
+      batches.output[i] =
+         isOutOfSign ? 0.0 : fast_exp(-std::abs(x) / batches.args[1][i]) * fast_sin(x * batches.args[2][i]);
    }
 }
 
-__rooglobal__ void computeTruthModelCosBasis(BatchesHandle batches)
+__rooglobal__ void computeTruthModelCosBasis(Batches &batches)
 {
-   const bool isMinus = batches.extraArg(0) < 0.0;
-   const bool isPlus = batches.extraArg(0) > 0.0;
-   for (std::size_t i = BEGIN; i < batches.getNEvents(); i += STEP) {
-      double x = batches[0][i];
+   const bool isMinus = batches.extra[0] < 0.0;
+   const bool isPlus = batches.extra[0] > 0.0;
+   for (std::size_t i = BEGIN; i < batches.nEvents; i += STEP) {
+      double x = batches.args[0][i];
       // Enforce sign compatibility
       const bool isOutOfSign = (isMinus && x > 0.0) || (isPlus && x < 0.0);
-      batches._output[i] = isOutOfSign ? 0.0 : fast_exp(-std::abs(x) / batches[1][i]) * fast_cos(x * batches[2][i]);
+      batches.output[i] =
+         isOutOfSign ? 0.0 : fast_exp(-std::abs(x) / batches.args[1][i]) * fast_cos(x * batches.args[2][i]);
    }
 }
 
-__rooglobal__ void computeTruthModelLinBasis(BatchesHandle batches)
+__rooglobal__ void computeTruthModelLinBasis(Batches &batches)
 {
-   const bool isMinus = batches.extraArg(0) < 0.0;
-   const bool isPlus = batches.extraArg(0) > 0.0;
-   for (std::size_t i = BEGIN; i < batches.getNEvents(); i += STEP) {
-      double x = batches[0][i];
+   const bool isMinus = batches.extra[0] < 0.0;
+   const bool isPlus = batches.extra[0] > 0.0;
+   for (std::size_t i = BEGIN; i < batches.nEvents; i += STEP) {
+      double x = batches.args[0][i];
       // Enforce sign compatibility
       const bool isOutOfSign = (isMinus && x > 0.0) || (isPlus && x < 0.0);
       if (isOutOfSign) {
-         batches._output[i] = 0.0;
+         batches.output[i] = 0.0;
       } else {
-         const double tscaled = std::abs(x) / batches[1][i];
-         batches._output[i] = fast_exp(-tscaled) * tscaled;
+         const double tscaled = std::abs(x) / batches.args[1][i];
+         batches.output[i] = fast_exp(-tscaled) * tscaled;
       }
    }
 }
 
-__rooglobal__ void computeTruthModelQuadBasis(BatchesHandle batches)
+__rooglobal__ void computeTruthModelQuadBasis(Batches &batches)
 {
-   const bool isMinus = batches.extraArg(0) < 0.0;
-   const bool isPlus = batches.extraArg(0) > 0.0;
-   for (std::size_t i = BEGIN; i < batches.getNEvents(); i += STEP) {
-      double x = batches[0][i];
+   const bool isMinus = batches.extra[0] < 0.0;
+   const bool isPlus = batches.extra[0] > 0.0;
+   for (std::size_t i = BEGIN; i < batches.nEvents; i += STEP) {
+      double x = batches.args[0][i];
       // Enforce sign compatibility
       const bool isOutOfSign = (isMinus && x > 0.0) || (isPlus && x < 0.0);
       if (isOutOfSign) {
-         batches._output[i] = 0.0;
+         batches.output[i] = 0.0;
       } else {
-         const double tscaled = std::abs(x) / batches[1][i];
-         batches._output[i] = fast_exp(-tscaled) * tscaled * tscaled;
+         const double tscaled = std::abs(x) / batches.args[1][i];
+         batches.output[i] = fast_exp(-tscaled) * tscaled * tscaled;
       }
    }
 }
 
-__rooglobal__ void computeTruthModelSinhBasis(BatchesHandle batches)
+__rooglobal__ void computeTruthModelSinhBasis(Batches &batches)
 {
-   const bool isMinus = batches.extraArg(0) < 0.0;
-   const bool isPlus = batches.extraArg(0) > 0.0;
-   for (std::size_t i = BEGIN; i < batches.getNEvents(); i += STEP) {
-      double x = batches[0][i];
+   const bool isMinus = batches.extra[0] < 0.0;
+   const bool isPlus = batches.extra[0] > 0.0;
+   for (std::size_t i = BEGIN; i < batches.nEvents; i += STEP) {
+      double x = batches.args[0][i];
       // Enforce sign compatibility
       const bool isOutOfSign = (isMinus && x > 0.0) || (isPlus && x < 0.0);
-      batches._output[i] = isOutOfSign ? 0.0 : fast_exp(-std::abs(x) / batches[1][i]) * sinh(x * batches[2][i] * 0.5);
+      batches.output[i] =
+         isOutOfSign ? 0.0 : fast_exp(-std::abs(x) / batches.args[1][i]) * sinh(x * batches.args[2][i] * 0.5);
    }
 }
 
-__rooglobal__ void computeTruthModelCoshBasis(BatchesHandle batches)
+__rooglobal__ void computeTruthModelCoshBasis(Batches &batches)
 {
-   const bool isMinus = batches.extraArg(0) < 0.0;
-   const bool isPlus = batches.extraArg(0) > 0.0;
-   for (std::size_t i = BEGIN; i < batches.getNEvents(); i += STEP) {
-      double x = batches[0][i];
+   const bool isMinus = batches.extra[0] < 0.0;
+   const bool isPlus = batches.extra[0] > 0.0;
+   for (std::size_t i = BEGIN; i < batches.nEvents; i += STEP) {
+      double x = batches.args[0][i];
       // Enforce sign compatibility
       const bool isOutOfSign = (isMinus && x > 0.0) || (isPlus && x < 0.0);
-      batches._output[i] = isOutOfSign ? 0.0 : fast_exp(-std::abs(x) / batches[1][i]) * cosh(x * batches[2][i] * .5);
+      batches.output[i] =
+         isOutOfSign ? 0.0 : fast_exp(-std::abs(x) / batches.args[1][i]) * cosh(x * batches.args[2][i] * .5);
    }
 }
 
-__rooglobal__ void computeVoigtian(BatchesHandle batches)
+__rooglobal__ void computeVoigtian(Batches &batches)
 {
-   Batch X = batches[0];
-   Batch M = batches[1];
-   Batch W = batches[2];
-   Batch S = batches[3];
+   Batch X = batches.args[0];
+   Batch M = batches.args[1];
+   Batch W = batches.args[2];
+   Batch S = batches.args[3];
    const double invSqrt2 = 0.707106781186547524400844362105;
-   for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) {
+   for (size_t i = BEGIN; i < batches.nEvents; i += STEP) {
       const double arg = (X[i] - M[i]) * (X[i] - M[i]);
       if (S[i] == 0.0 && W[i] == 0.0) {
-         batches._output[i] = 1.0;
+         batches.output[i] = 1.0;
       } else if (S[i] == 0.0) {
-         batches._output[i] = 1 / (arg + 0.25 * W[i] * W[i]);
+         batches.output[i] = 1 / (arg + 0.25 * W[i] * W[i]);
       } else if (W[i] == 0.0) {
-         batches._output[i] = fast_exp(-0.5 * arg / (S[i] * S[i]));
+         batches.output[i] = fast_exp(-0.5 * arg / (S[i] * S[i]));
       } else {
-         batches._output[i] = invSqrt2 / S[i];
+         batches.output[i] = invSqrt2 / S[i];
       }
    }
 
-   for (size_t i = BEGIN; i < batches.getNEvents(); i += STEP) {
+   for (size_t i = BEGIN; i < batches.nEvents; i += STEP) {
       if (S[i] != 0.0 && W[i] != 0.0) {
-         if (batches._output[i] < 0)
-            batches._output[i] = -batches._output[i];
+         if (batches.output[i] < 0)
+            batches.output[i] = -batches.output[i];
          const double factor = W[i] > 0.0 ? 0.5 : -0.5;
-         RooHeterogeneousMath::STD::complex<double> z(batches._output[i] * (X[i] - M[i]),
-                                                      factor * batches._output[i] * W[i]);
-         batches._output[i] *= RooHeterogeneousMath::faddeeva(z).real();
+         RooHeterogeneousMath::STD::complex<double> z(batches.output[i] * (X[i] - M[i]),
+                                                      factor * batches.output[i] * W[i]);
+         batches.output[i] *= RooHeterogeneousMath::faddeeva(z).real();
       }
    }
 }
 
 /// Returns a std::vector of pointers to the compute functions in this file.
-std::vector<void (*)(BatchesHandle)> getFunctions()
+std::vector<void (*)(Batches &)> getFunctions()
 {
    return {computeAddPdf,
            computeArgusBG,
diff --git a/roofit/batchcompute/src/RooBatchCompute.cu b/roofit/batchcompute/src/RooBatchCompute.cu
index 55f3d4d457b1c..ce8b10197e8ee 100644
--- a/roofit/batchcompute/src/RooBatchCompute.cu
+++ b/roofit/batchcompute/src/RooBatchCompute.cu
@@ -25,6 +25,7 @@ This file contains the code for cuda computations using the RooBatchCompute libr
 #include <TError.h>
 
 #include <algorithm>
+#include <vector>
 
 #ifndef RF_ARCH
 #error "RF_ARCH should always be defined"
@@ -41,27 +42,28 @@ namespace {
 
 void fillBatches(Batches &batches, RestrictArr output, size_t nEvents, std::size_t nBatches, std::size_t nExtraArgs)
 {
-   batches._nEvents = nEvents;
-   batches._nBatches = nBatches;
-   batches._nExtraArgs = nExtraArgs;
-   batches._output = output;
+   batches.nEvents = nEvents;
+   batches.nBatches = nBatches;
+   batches.nExtra = nExtraArgs;
+   batches.output = output;
 }
 
-void fillArrays(Batch *arrays, const VarVector &vars, double *buffer, double *bufferDevice, std::size_t nEvents)
+void fillArrays(Batch *arrays, VarSpan vars, double *buffer, double *bufferDevice, std::size_t nEvents)
 {
    for (int i = 0; i < vars.size(); i++) {
       const std::span<const double> &span = vars[i];
-      if (!span.empty() && span.size() < nEvents) {
+      arrays[i]._isVector = span.empty() || span.size() >= nEvents;
+      if (!arrays[i]._isVector) {
          // In the scalar case, the value is not on the GPU yet, so we have to
          // copy the value to the GPU buffer.
          buffer[i] = span[0];
-         arrays[i].set(bufferDevice + i, false);
+         arrays[i]._array = bufferDevice + i;
       } else {
          // In the vector input cases, they are already on the GPU, so we can
          // fill be buffer with some dummy value and set the input span
          // directly.
          buffer[i] = 0.0;
-         arrays[i].set(span.data(), true);
+         arrays[i]._array = span.data();
       }
    }
 }
@@ -85,13 +87,13 @@ int getGridSize(std::size_t n)
 
 } // namespace
 
-std::vector<void (*)(BatchesHandle)> getFunctions();
+std::vector<void (*)(Batches &)> getFunctions();
 
 /// This class overrides some RooBatchComputeInterface functions, for the
 /// purpose of providing a cuda specific implementation of the library.
 class RooBatchComputeClass : public RooBatchComputeInterface {
 private:
-   const std::vector<void (*)(BatchesHandle)> _computeFunctions;
+   const std::vector<void (*)(Batches &)> _computeFunctions;
 
 public:
    RooBatchComputeClass() : _computeFunctions(getFunctions())
@@ -114,10 +116,10 @@ public:
    \param computer An enum specifying the compute function to be used.
    \param output The array where the computation results are stored.
    \param nEvents The number of events to be processed.
-   \param vars A std::vector containing pointers to the variables involved in the computation.
-   \param extraArgs An optional std::vector containing extra double values that may participate in the computation. **/
-   void compute(RooBatchCompute::Config const &cfg, Computer computer, RestrictArr output, size_t nEvents,
-                const VarVector &vars, ArgVector &extraArgs) override
+   \param vars A std::span with one std::span<const double> per input variable involved in the computation.
+   \param extraArgs An optional std::span containing extra double values that may participate in the computation. **/
+   void compute(RooBatchCompute::Config const &cfg, Computer computer, RestrictArr output, size_t nEvents, VarSpan vars,
+                ArgSpan extraArgs) override
    {
       using namespace RooFit::Detail::CudaInterface;
 
@@ -138,11 +140,11 @@ public:
 
       fillBatches(*batches, output, nEvents, vars.size(), extraArgs.size());
       fillArrays(arrays, vars, scalarBuffer, scalarBufferDevice, nEvents);
-      batches->_arrays = arraysDevice;
+      batches->args = arraysDevice;
 
       if (!extraArgs.empty()) {
          std::copy(std::cbegin(extraArgs), std::cend(extraArgs), extraArgsHost);
-         batches->_extraArgs = extraArgsDevice;
+         batches->extra = extraArgsDevice;
       }
 
       copyHostToDevice(hostMem.data(), deviceMem.data(), hostMem.size(), cfg.cudaStream());
diff --git a/roofit/batchcompute/src/RooBatchCompute.cxx b/roofit/batchcompute/src/RooBatchCompute.cxx
index b6a22cb13ded4..d84c46b81936f 100644
--- a/roofit/batchcompute/src/RooBatchCompute.cxx
+++ b/roofit/batchcompute/src/RooBatchCompute.cxx
@@ -31,6 +31,7 @@ This file contains the code for cpu computations using the RooBatchCompute libra
 #include <algorithm>
 #include <sstream>
 #include <stdexcept>
+#include <vector>
 
 #ifndef RF_ARCH
 #error "RF_ARCH should always be defined"
@@ -41,41 +42,41 @@ namespace RF_ARCH {
 
 namespace {
 
-void fillBatches(Batches &batches, RestrictArr output, size_t nEvents, std::size_t nBatches, ArgVector &extraArgs)
+void fillBatches(Batches &batches, RestrictArr output, size_t nEvents, std::size_t nBatches, ArgSpan extraArgs)
 {
-   batches._extraArgs = extraArgs.data();
-   batches._nEvents = nEvents;
-   batches._nBatches = nBatches;
-   batches._nExtraArgs = extraArgs.size();
-   batches._output = output;
+   batches.extra = extraArgs.data();
+   batches.nEvents = nEvents;
+   batches.nBatches = nBatches;
+   batches.nExtra = extraArgs.size();
+   batches.output = output;
 }
 
-void fillArrays(std::vector<Batch> &arrays, const VarVector &vars, double *buffer, std::size_t nEvents)
+void fillArrays(std::span<Batch> arrays, VarSpan vars, std::size_t nEvents)
 {
+   for (std::size_t i = 0; i < vars.size(); i++) {
+      arrays[i]._array = vars[i].data();
+      arrays[i]._isVector = vars[i].empty() || vars[i].size() >= nEvents;
+   }
+}
 
-   arrays.resize(vars.size());
-   for (size_t i = 0; i < vars.size(); i++) {
-      const std::span<const double> &span = vars[i];
-      if (!span.empty() && span.size() < nEvents) {
-         // In the scalar case, copy the value to each element of vector input
-         // buffer.
-         std::fill_n(&buffer[i * bufferSize], bufferSize, span.data()[0]);
-         arrays[i].set(&buffer[i * bufferSize], false);
-      } else {
-         arrays[i].set(span.data(), true);
-      }
+inline void advance(Batches &batches, std::size_t nEvents)
+{
+   for (std::size_t i = 0; i < batches.nBatches; i++) {
+      Batch &arg = batches.args[i];
+      arg._array += arg._isVector * nEvents;
    }
+   batches.output += nEvents;
 }
 
 } // namespace
 
-std::vector<void (*)(BatchesHandle)> getFunctions();
+std::vector<void (*)(Batches &)> getFunctions();
 
 /// This class overrides some RooBatchComputeInterface functions, for the
 /// purpose of providing a CPU specific implementation of the library.
 class RooBatchComputeClass : public RooBatchComputeInterface {
 private:
-   const std::vector<void (*)(BatchesHandle)> _computeFunctions;
+   const std::vector<void (*)(Batches &)> _computeFunctions;
 
 public:
    RooBatchComputeClass() : _computeFunctions(getFunctions())
@@ -106,14 +107,11 @@ class RooBatchComputeClass : public RooBatchComputeInterface {
    \param computer An enum specifying the compute function to be used.
    \param output The array where the computation results are stored.
    \param nEvents The number of events to be processed.
-   \param vars A std::vector containing pointers to the variables involved in the computation.
-   \param extraArgs An optional std::vector containing extra double values that may participate in the computation. **/
-   void compute(Config const &, Computer computer, RestrictArr output, size_t nEvents, const VarVector &vars,
-                ArgVector &extraArgs) override
+   \param vars A std::span with one std::span<const double> per input variable involved in the computation.
+   \param extraArgs An optional std::span containing extra double values that may participate in the computation. **/
+   void compute(Config const &, Computer computer, RestrictArr output, size_t nEvents, VarSpan vars,
+                ArgSpan extraArgs) override
    {
-      static std::vector<double> buffer;
-      buffer.resize(vars.size() * bufferSize);
-
       if (ROOT::IsImplicitMTEnabled()) {
          ROOT::Internal::TExecutor ex;
          std::size_t nThreads = ex.GetPoolSize();
@@ -127,25 +125,25 @@ class RooBatchComputeClass : public RooBatchComputeInterface {
             // Fill a std::vector<Batches> with the same object and with ~nEvents/nThreads
             // Then advance every object but the first to split the work between threads
             Batches batches;
-            std::vector<Batch> arrays;
+            std::vector<Batch> arrays(vars.size());
             fillBatches(batches, output, nEventsPerThread, vars.size(), extraArgs);
-            fillArrays(arrays, vars, buffer.data(), nEvents);
-            batches._arrays = arrays.data();
-            batches.advance(batches.getNEvents() * idx);
+            fillArrays(arrays, vars, nEvents);
+            batches.args = arrays.data();
+            advance(batches, batches.nEvents * idx);
 
             // Set the number of events of the last Batches object as the remaining events
             if (idx == nThreads - 1) {
-               batches.setNEvents(nEvents - idx * batches.getNEvents());
+               batches.nEvents = nEvents - idx * batches.nEvents;
             }
 
-            std::size_t events = batches.getNEvents();
-            batches.setNEvents(bufferSize);
+            std::size_t events = batches.nEvents;
+            batches.nEvents = bufferSize;
             while (events > bufferSize) {
                _computeFunctions[computer](batches);
-               batches.advance(bufferSize);
+               advance(batches, bufferSize);
                events -= bufferSize;
             }
-            batches.setNEvents(events);
+            batches.nEvents = events;
             _computeFunctions[computer](batches);
             return 0;
          };
@@ -159,19 +157,19 @@ class RooBatchComputeClass : public RooBatchComputeInterface {
          // Fill a std::vector<Batches> with the same object and with ~nEvents/nThreads
          // Then advance every object but the first to split the work between threads
          Batches batches;
-         std::vector<Batch> arrays;
+         std::vector<Batch> arrays(vars.size());
          fillBatches(batches, output, nEvents, vars.size(), extraArgs);
-         fillArrays(arrays, vars, buffer.data(), nEvents);
-         batches._arrays = arrays.data();
+         fillArrays(arrays, vars, nEvents);
+         batches.args = arrays.data();
 
-         std::size_t events = batches.getNEvents();
-         batches.setNEvents(bufferSize);
+         std::size_t events = batches.nEvents;
+         batches.nEvents = bufferSize;
          while (events > bufferSize) {
             _computeFunctions[computer](batches);
-            batches.advance(bufferSize);
+            advance(batches, bufferSize);
             events -= bufferSize;
          }
-         batches.setNEvents(events);
+         batches.nEvents = events;
          _computeFunctions[computer](batches);
       }
    }
diff --git a/roofit/roofit/src/RooChebychev.cxx b/roofit/roofit/src/RooChebychev.cxx
index f6ff2a7922a7b..cbb15adbd99f9 100644
--- a/roofit/roofit/src/RooChebychev.cxx
+++ b/roofit/roofit/src/RooChebychev.cxx
@@ -104,14 +104,17 @@ void RooChebychev::translate(RooFit::Detail::CodeSquashContext &ctx) const
 
 ////////////////////////////////////////////////////////////////////////////////
 /// Compute multiple values of Chebychev.
-void RooChebychev::computeBatch(double* output, size_t nEvents, RooFit::Detail::DataMap const& dataMap) const
+void RooChebychev::computeBatch(double *output, size_t nEvents, RooFit::Detail::DataMap const &dataMap) const
 {
-  RooBatchCompute::ArgVector extraArgs;
-  for (auto* coef:_coefList)
-    extraArgs.push_back( static_cast<const RooAbsReal*>(coef)->getVal() );
-  extraArgs.push_back( _x.min(_refRangeName?_refRangeName->GetName() : nullptr) );
-  extraArgs.push_back( _x.max(_refRangeName?_refRangeName->GetName() : nullptr) );
-  RooBatchCompute::compute(dataMap.config(this), RooBatchCompute::Chebychev, output, nEvents, {dataMap.at(_x)}, extraArgs);
+   std::vector<double> extraArgs;
+   extraArgs.reserve(_coefList.size() + 2);
+   for (auto *coef : _coefList) {
+      extraArgs.push_back(static_cast<const RooAbsReal *>(coef)->getVal());
+   }
+   extraArgs.push_back(_x.min(_refRangeName ? _refRangeName->GetName() : nullptr));
+   extraArgs.push_back(_x.max(_refRangeName ? _refRangeName->GetName() : nullptr));
+   RooBatchCompute::compute(dataMap.config(this), RooBatchCompute::Chebychev, output, nEvents, {dataMap.at(_x)},
+                            extraArgs);
 }
 
 ////////////////////////////////////////////////////////////////////////////////
diff --git a/roofit/roofit/src/RooChiSquarePdf.cxx b/roofit/roofit/src/RooChiSquarePdf.cxx
index 1f1a61b161918..7368bf12bd43f 100644
--- a/roofit/roofit/src/RooChiSquarePdf.cxx
+++ b/roofit/roofit/src/RooChiSquarePdf.cxx
@@ -21,8 +21,8 @@ Here we also implement the analytic integral.
 
 #include "TMath.h"
 
+#include <array>
 #include <cmath>
-using namespace std;
 
 ClassImp(RooChiSquarePdf);
 
@@ -55,17 +55,19 @@ RooChiSquarePdf::RooChiSquarePdf(const RooChiSquarePdf& other, const char* name)
 
 double RooChiSquarePdf::evaluate() const
 {
-  if(_x <= 0) return 0;
+   if (_x <= 0)
+      return 0;
 
-  return  pow(_x,(_ndof/2.)-1.) * exp(-_x/2.) / TMath::Gamma(_ndof/2.) / pow(2.,_ndof/2.);
+   return pow(_x, (_ndof / 2.) - 1.) * std::exp(-_x / 2.) / TMath::Gamma(_ndof / 2.) / std::pow(2., _ndof / 2.);
 }
 
 ////////////////////////////////////////////////////////////////////////////////
 /// Compute multiple values of ChiSquare distribution.
-void RooChiSquarePdf::computeBatch(double* output, size_t nEvents, RooFit::Detail::DataMap const& dataMap) const
+void RooChiSquarePdf::computeBatch(double *output, size_t nEvents, RooFit::Detail::DataMap const &dataMap) const
 {
-  RooBatchCompute::ArgVector extraArgs{_ndof};
-  RooBatchCompute::compute(dataMap.config(this), RooBatchCompute::ChiSquare, output, nEvents, {dataMap.at(_x)}, extraArgs);
+   std::array<double, 1> extraArgs{_ndof};
+   RooBatchCompute::compute(dataMap.config(this), RooBatchCompute::ChiSquare, output, nEvents, {dataMap.at(_x)},
+                            extraArgs);
 }
 
 ////////////////////////////////////////////////////////////////////////////////
diff --git a/roofit/roofit/src/RooExpPoly.cxx b/roofit/roofit/src/RooExpPoly.cxx
index b992b992b040b..b6fcae6c0552e 100644
--- a/roofit/roofit/src/RooExpPoly.cxx
+++ b/roofit/roofit/src/RooExpPoly.cxx
@@ -35,10 +35,11 @@ RooExpPoly::RooExpPoly(const char*, const char*, RooAbsReal&, const RooArgList&,
 #include <TMath.h>
 #include <TError.h>
 
-#include <cmath>
-#include <sstream>
+#include <array>
 #include <cassert>
+#include <cmath>
 #include <complex>
+#include <sstream>
 
 ClassImp(RooExpPoly);
 
@@ -128,7 +129,7 @@ double RooExpPoly::evaluateLog() const
 /// Compute multiple values of ExpPoly distribution.
 void RooExpPoly::computeBatch(double *output, size_t nEvents, RooFit::Detail::DataMap const &dataMap) const
 {
-   RooBatchCompute::VarVector vars;
+   std::vector<std::span<const double>> vars;
    vars.reserve(_coefList.size() + 1);
    vars.push_back(dataMap.at(_x));
 
@@ -137,9 +138,7 @@ void RooExpPoly::computeBatch(double *output, size_t nEvents, RooFit::Detail::Da
       vars.push_back(dataMap.at(coef));
    }
 
-   RooBatchCompute::ArgVector args;
-   args.push_back(_lowestOrder);
-   args.push_back(_coefList.size());
+   std::array<double, 2> args{static_cast<double>(_lowestOrder), static_cast<double>(_coefList.size())};
 
    RooBatchCompute::compute(dataMap.config(this), RooBatchCompute::ExpPoly, output, nEvents, vars, args);
 }
diff --git a/roofit/roofit/src/RooGaussModel.cxx b/roofit/roofit/src/RooGaussModel.cxx
index 7fbc29ed8b8cd..6ff16d7ebf190 100644
--- a/roofit/roofit/src/RooGaussModel.cxx
+++ b/roofit/roofit/src/RooGaussModel.cxx
@@ -34,6 +34,8 @@ for analytical convolutions with classes inheriting from RooAbsAnaConvPdf
 
 #include <RooHeterogeneousMath.h>
 
+#include <array>
+
 namespace {
 
 enum RooGaussBasis {
@@ -190,7 +192,7 @@ void RooGaussModel::computeBatch(double *output, size_t size,
    // arises, they can be implemented following this example. Remember to also
    // adapt RooGaussModel::canComputeBatchWithCuda().
    if (basisType == expBasis) {
-      RooBatchCompute::ArgVector extraArgs{basisSign};
+      std::array<double, 1> extraArgs{basisSign};
       RooBatchCompute::compute(dataMap.config(this), RooBatchCompute::GaussModelExpBasis, output, size,
                         {xVals, meanVals, meanSfVals, sigmaVals, sigmaSfVals, param1Vals}, extraArgs);
       return;
diff --git a/roofit/roofit/src/RooJohnson.cxx b/roofit/roofit/src/RooJohnson.cxx
index a770db6e0ea08..24fa2d67fb2e4 100644
--- a/roofit/roofit/src/RooJohnson.cxx
+++ b/roofit/roofit/src/RooJohnson.cxx
@@ -45,9 +45,11 @@ Johnson, N. L. (1949). *Systems of Frequency Curves Generated by Methods of Tran
 #include "RooHelpers.h"
 #include "RooBatchCompute.h"
 
-#include <cmath>
 #include "TMath.h"
 
+#include <array>
+#include <cmath>
+
 ClassImp(RooJohnson);
 
 ////////////////////////////////////////////////////////////////////////////////
@@ -114,7 +116,7 @@ double RooJohnson::evaluate() const
 /// Compute multiple values of the Johnson distribution.
 void RooJohnson::computeBatch(double* output, size_t nEvents, RooFit::Detail::DataMap const& dataMap) const
 {
-  RooBatchCompute::ArgVector extraArgs{_massThreshold};
+  std::array<double, 1> extraArgs{_massThreshold};
   RooBatchCompute::compute(dataMap.config(this), RooBatchCompute::Johnson, output, nEvents,
           {dataMap.at(_mass), dataMap.at(_mu), dataMap.at(_lambda), dataMap.at(_gamma), dataMap.at(_delta)},
           extraArgs);
diff --git a/roofit/roofit/src/RooPoisson.cxx b/roofit/roofit/src/RooPoisson.cxx
index 0a821f6a493c3..fd803d8077a29 100644
--- a/roofit/roofit/src/RooPoisson.cxx
+++ b/roofit/roofit/src/RooPoisson.cxx
@@ -22,6 +22,8 @@ Poisson pdf
 #include "RooFit/Detail/EvaluateFuncs.h"
 #include "Math/ProbFuncMathCore.h"
 
+#include <array>
+
 ClassImp(RooPoisson);
 
 ////////////////////////////////////////////////////////////////////////////////
@@ -78,7 +80,7 @@ void RooPoisson::translate(RooFit::Detail::CodeSquashContext &ctx) const
 void RooPoisson::computeBatch(double *output, size_t nEvents,
                               RooFit::Detail::DataMap const &dataMap) const
 {
-   RooBatchCompute::ArgVector extraArgs{static_cast<double>(_protectNegative), static_cast<double>(_noRounding)};
+   std::array<double, 2> extraArgs{static_cast<double>(_protectNegative), static_cast<double>(_noRounding)};
    RooBatchCompute::compute(dataMap.config(this), RooBatchCompute::Poisson, output, nEvents,
                             {dataMap.at(x), dataMap.at(mean)}, extraArgs);
 }
diff --git a/roofit/roofit/src/RooPower.cxx b/roofit/roofit/src/RooPower.cxx
index 603ef6c4f30ce..4dba7e180ea9e 100644
--- a/roofit/roofit/src/RooPower.cxx
+++ b/roofit/roofit/src/RooPower.cxx
@@ -26,8 +26,9 @@ RooPower implements a power law PDF of the form
 
 #include <TError.h>
 
-#include <cmath>
+#include <array>
 #include <cassert>
+#include <cmath>
 #include <sstream>
 
 ClassImp(RooPower);
@@ -84,7 +85,7 @@ RooPower::RooPower(const RooPower &other, const char *name)
 /// Compute multiple values of Power distribution.
 void RooPower::computeBatch(double *output, size_t nEvents, RooFit::Detail::DataMap const &dataMap) const
 {
-    RooBatchCompute::VarVector vars;
+    std::vector<std::span<const double>> vars;
     vars.reserve(2 *  _coefList.size() + 1);
     vars.push_back(dataMap.at(_x));
 
@@ -95,8 +96,7 @@ void RooPower::computeBatch(double *output, size_t nEvents, RooFit::Detail::Data
      vars.push_back(dataMap.at(&_expList[i]));
    }
 
-    RooBatchCompute::ArgVector args;
-    args.push_back(_coefList.size());
+   std::array<double, 1> args{static_cast<double>(_coefList.size())};
 
    RooBatchCompute::compute(dataMap.config(this), RooBatchCompute::Power, output, nEvents, vars, args);
 }
diff --git a/roofit/roofitcore/inc/RooFit/Detail/DataMap.h b/roofit/roofitcore/inc/RooFit/Detail/DataMap.h
index 007431e4726e4..2a9d933b9051a 100644
--- a/roofit/roofitcore/inc/RooFit/Detail/DataMap.h
+++ b/roofit/roofitcore/inc/RooFit/Detail/DataMap.h
@@ -82,10 +82,7 @@ namespace Detail {
 
 class DataMap {
 public:
-   auto size() const
-   {
-      return _dataMap.size();
-   }
+   auto size() const { return _dataMap.size(); }
    void resize(std::size_t n);
 
    inline void set(RooAbsArg const *arg, std::span<const double> const &span)
@@ -119,8 +116,14 @@ class DataMap {
 
    RooBatchCompute::Config config(RooAbsArg const *arg) const;
 
+   void enableVectorBuffers(bool enable) { _enableVectorBuffers = enable; }
+   void resetVectorBuffers() { _bufferIdx = 0; }
+
 private:
    std::vector<std::span<const double>> _dataMap;
+   bool _enableVectorBuffers = false;
+   std::vector<std::vector<double>> _buffers;
+   std::size_t _bufferIdx = 0;
    std::vector<RooBatchCompute::Config> _cfgs;
 };
 
diff --git a/roofit/roofitcore/src/RooAbsPdf.cxx b/roofit/roofitcore/src/RooAbsPdf.cxx
index 282938551ff5b..830613defca3d 100644
--- a/roofit/roofitcore/src/RooAbsPdf.cxx
+++ b/roofit/roofitcore/src/RooAbsPdf.cxx
@@ -853,18 +853,19 @@ double RooAbsPdf::extendedTerm(RooAbsData const& data, bool weightSquared, bool
  * <tr><td> `EvalBackend(std::string const&)` <td> Choose a likelihood evaluation backend:
  *   <table>
  *   <tr><th> Backend <th> Description
- *   <tr><td> **legacy** - *default* <td> The original likelihood evaluation method.
- *                                        Evaluates the PDF for each single data entry at a time before summing the negative log probabilities.
- *                                        This is the default if `EvalBackend()` is not passed.
- *   <tr><td> **cpu** <td> New vectorized evaluation mode, using faster math functions and auto-vectorisation.
- *                         If all RooAbsArg objects in the model support it, likelihood computations are 2 to 10 times faster,
- *                         unless your dataset is so small that the vectorization is not worth it.
- *                         The relative difference of the single log-likelihoods w.r.t. the legacy mode is usually better than \f$10^{-12}\f$,
+ *   <tr><td> **cpu** - *default* <td> New vectorized evaluation mode, using faster math functions and auto-vectorisation.
+ *                         Since ROOT 6.32, this is the default if `EvalBackend()` is not passed, succeeding the **legacy** backend.
+ *                         If all RooAbsArg objects in the model support vectorized evaluation,
+ *                         likelihood computations are 2 to 10 times faster than with the **legacy** backend,
+ *                         unless your dataset is so small that the vectorization is not worth it.
+ *                         The relative difference of the single log-likelihoods with respect to the legacy mode is usually better than \f$10^{-12}\f$,
  *                         and for fit parameters it's usually better than \f$10^{-6}\f$. In past ROOT releases, this backend could be activated with the now deprecated `BatchMode()` option.
  *   <tr><td> **cuda** <td> Evaluate the likelihood on a GPU that supports CUDA.
  *                          This backend re-uses code from the **cpu** backend, but compiled in CUDA kernels.
  *                          Hence, the results are expected to be identical, modulo some numerical differences that can arise from the different order in which the GPU is summing the log probabilities.
  *                          This backend can drastically speed up the fit if all RooAbsArg object in the model support it.
+ *   <tr><td> **legacy** <td> The original likelihood evaluation method.
+ *                            Evaluates the PDF for each single data entry at a time before summing the negative log probabilities.
  *   <tr><td> **codegen** <td> **Experimental** - Generates and compiles minimal C++ code for the NLL on-the-fly and wraps it in the returned RooAbsReal.
  *                             Also generates and compiles the code for the gradient using Automatic Differentiation (AD) with [Clad](https://github.com/vgvassilev/clad).
  *                             This analytic gradient is passed to the minimizer, which can result in significant speedups for many-parameter fits,
diff --git a/roofit/roofitcore/src/RooAddModel.cxx b/roofit/roofitcore/src/RooAddModel.cxx
index 51d99f2fd44f5..21e659ded6d55 100644
--- a/roofit/roofitcore/src/RooAddModel.cxx
+++ b/roofit/roofitcore/src/RooAddModel.cxx
@@ -397,8 +397,8 @@ void RooAddModel::computeBatch(double *output, size_t nEvents, RooFit::Detail::D
       _coefCache[i] = coefVals[0];
    }
 
-   RooBatchCompute::VarVector pdfs;
-   RooBatchCompute::ArgVector coefs;
+   std::vector<std::span<const double>> pdfs;
+   std::vector<double> coefs;
    AddCacheElem *cache = getProjCache(nullptr);
    updateCoefficients(*cache, nullptr);
 
diff --git a/roofit/roofitcore/src/RooAddPdf.cxx b/roofit/roofitcore/src/RooAddPdf.cxx
index d720cb0c17156..614734b50b6b7 100644
--- a/roofit/roofitcore/src/RooAddPdf.cxx
+++ b/roofit/roofitcore/src/RooAddPdf.cxx
@@ -574,8 +574,8 @@ void RooAddPdf::computeBatch(double* output, size_t nEvents, RooFit::Detail::Dat
     _coefCache[i] = coefVals[0];
   }
 
-  RooBatchCompute::VarVector pdfs;
-  RooBatchCompute::ArgVector coefs;
+  std::vector<std::span<const double>> pdfs;
+  std::vector<double> coefs;
   AddCacheElem* cache = getProjCache(nullptr);
   // We don't sync the coefficient values from the _coefList to the _coefCache
   // because we have already done it using the dataMap.
diff --git a/roofit/roofitcore/src/RooAddition.cxx b/roofit/roofitcore/src/RooAddition.cxx
index f29693d50e916..c3d15be3dc365 100644
--- a/roofit/roofitcore/src/RooAddition.cxx
+++ b/roofit/roofitcore/src/RooAddition.cxx
@@ -143,18 +143,17 @@ double RooAddition::evaluate() const
 
 ////////////////////////////////////////////////////////////////////////////////
 /// Compute addition of PDFs in batches.
-void RooAddition::computeBatch(double* output, size_t nEvents, RooFit::Detail::DataMap const& dataMap) const
+void RooAddition::computeBatch(double *output, size_t nEvents, RooFit::Detail::DataMap const &dataMap) const
 {
-  RooBatchCompute::VarVector pdfs;
-  RooBatchCompute::ArgVector coefs;
-  pdfs.reserve(_set.size());
-  coefs.reserve(_set.size());
-  for (const auto arg : _set)
-  {
-    pdfs.push_back(dataMap.at(arg));
-    coefs.push_back(1.0);
-  }
-  RooBatchCompute::compute(dataMap.config(this), RooBatchCompute::AddPdf, output, nEvents, pdfs, coefs);
+   std::vector<std::span<const double>> pdfs;
+   std::vector<double> coefs;
+   pdfs.reserve(_set.size());
+   coefs.reserve(_set.size());
+   for (const auto arg : _set) {
+      pdfs.push_back(dataMap.at(arg));
+      coefs.push_back(1.0);
+   }
+   RooBatchCompute::compute(dataMap.config(this), RooBatchCompute::AddPdf, output, nEvents, pdfs, coefs);
 }
 
 ////////////////////////////////////////////////////////////////////////////////
diff --git a/roofit/roofitcore/src/RooFit/Detail/DataMap.cxx b/roofit/roofitcore/src/RooFit/Detail/DataMap.cxx
index b73f767d465bf..49600ea7977f7 100644
--- a/roofit/roofitcore/src/RooFit/Detail/DataMap.cxx
+++ b/roofit/roofitcore/src/RooFit/Detail/DataMap.cxx
@@ -16,17 +16,50 @@
 #include <RooBatchCompute.h>
 #include <RooRealVar.h>
 
+#include <algorithm>
+
+namespace {
+
+// Assigns via a const reference so that copy assignment is selected, avoiding the deleted move assignment.
+template <class T>
+void assignSpan(std::span<T> &to, std::span<T> const &from)
+{
+   to = from;
+}
+
+} // namespace
+
 namespace RooFit {
 namespace Detail {
 
 std::span<const double> DataMap::at(RooAbsArg const *arg, RooAbsArg const * /*caller*/)
 {
+   std::span<const double> out;
+
    if (!arg->hasDataToken()) {
       auto var = static_cast<RooRealVar const *>(arg);
-      return {&var->_value, 1};
+      assignSpan(out, {&var->_value, 1});
+   } else {
+      std::size_t idx = arg->dataToken();
+      out = _dataMap[idx];
    }
-   std::size_t idx = arg->dataToken();
-   return _dataMap[idx];
+
+   if (!_enableVectorBuffers || out.size() != 1) {
+      return out;
+   }
+
+   if (_bufferIdx == _buffers.size()) {
+      _buffers.emplace_back(RooBatchCompute::bufferSize);
+   }
+
+   double *buffer = _buffers[_bufferIdx].data();
+
+   std::fill_n(buffer, RooBatchCompute::bufferSize, out[0]);
+   assignSpan(out, {buffer, 1});
+
+   ++_bufferIdx;
+
+   return out;
 }
 
 void DataMap::setConfig(RooAbsArg const *arg, RooBatchCompute::Config const &config)
diff --git a/roofit/roofitcore/src/RooFit/Evaluator.cxx b/roofit/roofitcore/src/RooFit/Evaluator.cxx
index d7734c6edfbfa..ff018251ff106 100644
--- a/roofit/roofitcore/src/RooFit/Evaluator.cxx
+++ b/roofit/roofitcore/src/RooFit/Evaluator.cxx
@@ -384,7 +384,12 @@ void Evaluator::computeCPUNode(const RooAbsArg *node, NodeInfo &info)
       buffer = info.buffer->cpuWritePtr();
    }
    _dataMapCPU.set(node, {buffer, nOut});
+   if (nOut > 1) {
+      _dataMapCPU.enableVectorBuffers(true);
+   }
    nodeAbsReal->computeBatch(buffer, nOut, _dataMapCPU);
+   _dataMapCPU.resetVectorBuffers();
+   _dataMapCPU.enableVectorBuffers(false);
 #ifdef ROOFIT_CUDA
    if (info.copyAfterEvaluation) {
       _dataMapCUDA.set(node, {info.buffer->gpuReadPtr(), nOut});
diff --git a/roofit/roofitcore/src/RooNormalizedPdf.cxx b/roofit/roofitcore/src/RooNormalizedPdf.cxx
index 6138f5ec06d6d..2e0461843f834 100644
--- a/roofit/roofitcore/src/RooNormalizedPdf.cxx
+++ b/roofit/roofitcore/src/RooNormalizedPdf.cxx
@@ -13,6 +13,8 @@
 #include "RooNormalizedPdf.h"
 #include "RooBatchCompute.h"
 
+#include <array>
+
 /**
  * \class RooNormalizedPdf
  *
@@ -26,7 +28,7 @@ void RooNormalizedPdf::computeBatch(double *output, size_t nEvents, RooFit::Deta
    auto integralSpan = dataMap.at(_normIntegral);
 
    // We use the extraArgs as output parameter to count evaluation errors.
-   RooBatchCompute::ArgVector extraArgs{0.0, 0.0, 0.0};
+   std::array<double, 3> extraArgs{0.0, 0.0, 0.0};
 
    RooBatchCompute::compute(dataMap.config(this), RooBatchCompute::NormalizedPdf, output, nEvents, {nums, integralSpan},
                             extraArgs);
diff --git a/roofit/roofitcore/src/RooPolyVar.cxx b/roofit/roofitcore/src/RooPolyVar.cxx
index 1e88f195972b0..ae6a48176e871 100644
--- a/roofit/roofitcore/src/RooPolyVar.cxx
+++ b/roofit/roofitcore/src/RooPolyVar.cxx
@@ -26,8 +26,6 @@ Class RooPolyvar implements analytical integrals of all polynomials
 it can define.
 **/
 
-#include <cmath>
-
 #include "RooPolyVar.h"
 #include "RooArgList.h"
 #include "RooMsgService.h"
@@ -38,6 +36,10 @@ it can define.
 
 #include "TError.h"
 
+#include <algorithm>
+#include <array>
+#include <cmath>
+
 ClassImp(RooPolyVar);
 
 ////////////////////////////////////////////////////////////////////////////////
@@ -132,23 +134,27 @@ void RooPolyVar::computeBatchImpl(RooAbsArg const* caller, double *output, size_
       return;
    }
 
-   RooBatchCompute::VarVector vars;
+   std::vector<std::span<const double>> vars;
    vars.reserve(coefs.size() + 2);
 
    // Fill the coefficients for the skipped orders. By a conventions started in
    // RooPolynomial, if the zero-th order is skipped, it implies a coefficient
    // for the constant term of one.
-   const double zero = 1.0;
-   const double one = 1.0;
+   std::array<double, RooBatchCompute::bufferSize> zeros;
+   std::array<double, RooBatchCompute::bufferSize> ones;
+   std::fill_n(zeros.data(), zeros.size(), 0.0);
+   std::fill_n(ones.data(), ones.size(), 1.0);
+   std::span<const double> zerosSpan{zeros.data(), 1};
+   std::span<const double> onesSpan{ones.data(), 1};
    for (int i = lowestOrder - 1; i >= 0; --i) {
-      vars.push_back(i == 0 ? std::span<const double>{&one, 1} : std::span<const double>{&zero, 1});
+      vars.push_back(i == 0 ? onesSpan : zerosSpan);
    }
 
    for (RooAbsArg *coef : coefs) {
       vars.push_back(dataMap.at(coef));
    }
    vars.push_back(dataMap.at(&x));
-   RooBatchCompute::ArgVector extraArgs{double(vars.size() - 1)};
+   std::array<double, 1> extraArgs{double(vars.size() - 1)};
    RooBatchCompute::compute(dataMap.config(caller), RooBatchCompute::Polynomial, output, nEvents, vars, extraArgs);
 }
 
diff --git a/roofit/roofitcore/src/RooProdPdf.cxx b/roofit/roofitcore/src/RooProdPdf.cxx
index ebe61fda7c483..e8e2b323ec6ac 100644
--- a/roofit/roofitcore/src/RooProdPdf.cxx
+++ b/roofit/roofitcore/src/RooProdPdf.cxx
@@ -62,9 +62,10 @@ have to appear in any specific place in the list.
 #include "RooFitImplHelpers.h"
 #include "strtok.h"
 
+#include <algorithm>
+#include <array>
 #include <cstring>
 #include <sstream>
-#include <algorithm>
 
 #ifndef _WIN32
 #include <strings.h>
@@ -418,13 +419,13 @@ void RooProdPdf::calculateBatch(RooAbsArg const *caller, const RooProdPdf::Cache
       RooBatchCompute::compute(dataMap.config(caller), RooBatchCompute::Ratio, output, nEvents,
                                {numerator, denominator});
    } else {
-      RooBatchCompute::VarVector factors;
+      std::vector<std::span<const double>> factors;
       factors.reserve(cache._partList.size());
       for (const RooAbsArg *i : cache._partList) {
          auto span = dataMap.at(i);
          factors.push_back(span);
       }
-      RooBatchCompute::ArgVector special{static_cast<double>(factors.size())};
+      std::array<double, 1> special{static_cast<double>(factors.size())};
       RooBatchCompute::compute(dataMap.config(caller), RooBatchCompute::ProdPdf, output, nEvents, factors, special);
    }
 }
diff --git a/roofit/roofitcore/src/RooTruthModel.cxx b/roofit/roofitcore/src/RooTruthModel.cxx
index 2aa7c7bef559d..acf8a505be8cb 100644
--- a/roofit/roofitcore/src/RooTruthModel.cxx
+++ b/roofit/roofitcore/src/RooTruthModel.cxx
@@ -38,6 +38,7 @@ functions used in D mixing have been hand coded for increased execution speed.
 #include <TError.h>
 
 #include <algorithm>
+#include <array>
 #include <cmath>
 #include <limits>
 
@@ -265,7 +266,7 @@ void RooTruthModel::computeBatch(double *output, size_t nEvents, RooFit::Detail:
    auto param2Vals = param2 ? dataMap.at(param2) : std::span<const double>{};
 
    // Return desired basis function
-   RooBatchCompute::ArgVector extraArgs{basisSign};
+   std::array<double, 1> extraArgs{basisSign};
    switch (basisType) {
    case expBasis: {
       RooBatchCompute::compute(config, RooBatchCompute::TruthModelExpBasis, output, nEvents, {xVals, param1Vals},

Backend	Description - *
legacy - default	The original likelihood evaluation method. - * Evaluates the PDF for each single data entry at a time before summing the negative log probabilities. - * This is the default if `EvalBackend()` is not passed. - *
cpu	New vectorized evaluation mode, using faster math functions and auto-vectorisation. - * If all RooAbsArg objects in the model support it, likelihood computations are 2 to 10 times faster, - * unless your dataset is so small that the vectorization is not worth it. - * The relative difference of the single log-likelihoods w.r.t. the legacy mode is usually better than \f$10^{-12}\f$, + *
cpu - default	New vectorized evaluation mode, using faster math functions and auto-vectorisation. + * Since ROOT 6.23, this is the default if `EvalBackend()` is not passed, succeeding the legacy backend. + * If all RooAbsArg objects in the model support vectorized evaluation, + * likelihood computations are 2 to 10 times faster than with the legacy backend + * - unless your dataset is so small that the vectorization is not worth it. + * The relative difference of the single log-likelihoods with respect to the legacy mode is usually better than \f$10^{-12}\f$, * and for fit parameters it's usually better than \f$10^{-6}\f$. In past ROOT releases, this backend could be activated with the now deprecated `BatchMode()` option. *
cuda	Evaluate the likelihood on a GPU that supports CUDA. * This backend re-uses code from the cpu backend, but compiled in CUDA kernels. * Hence, the results are expected to be identical, modulo some numerical differences that can arise from the different order in which the GPU is summing the log probabilities. * This backend can drastically speed up the fit if all RooAbsArg object in the model support it. + *
legacy	The original likelihood evaluation method. + * Evaluates the PDF for each single data entry at a time before summing the negative log probabilities. *
codegen	Experimental - Generates and compiles minimal C++ code for the NLL on-the-fly and wraps it in the returned RooAbsReal. * Also generates and compiles the code for the gradient using Automatic Differentiation (AD) with [Clad](https://github.com/vgvassilev/clad). * This analytic gradient is passed to the minimizer, which can result in significant speedups for many-parameter fits, diff --git a/roofit/roofitcore/src/RooAddModel.cxx b/roofit/roofitcore/src/RooAddModel.cxx index 51d99f2fd44f5..21e659ded6d55 100644 --- a/roofit/roofitcore/src/RooAddModel.cxx +++ b/roofit/roofitcore/src/RooAddModel.cxx @@ -397,8 +397,8 @@ void RooAddModel::computeBatch(double output, size_t nEvents, RooFit::Detail::D _coefCache[i] = coefVals[0]; } - RooBatchCompute::VarVector pdfs; - RooBatchCompute::ArgVector coefs; + std::vector> pdfs; + std::vector coefs; AddCacheElem cache = getProjCache(nullptr); updateCoefficients(cache, nullptr); diff --git a/roofit/roofitcore/src/RooAddPdf.cxx b/roofit/roofitcore/src/RooAddPdf.cxx index d720cb0c17156..614734b50b6b7 100644 --- a/roofit/roofitcore/src/RooAddPdf.cxx +++ b/roofit/roofitcore/src/RooAddPdf.cxx @@ -574,8 +574,8 @@ void RooAddPdf::computeBatch(double output, size_t nEvents, RooFit::Detail::Dat _coefCache[i] = coefVals[0]; } - RooBatchCompute::VarVector pdfs; - RooBatchCompute::ArgVector coefs; + std::vector> pdfs; + std::vector coefs; AddCacheElem* cache = getProjCache(nullptr); // We don't sync the coefficient values from the _coefList to the _coefCache // because we have already done it using the dataMap. diff --git a/roofit/roofitcore/src/RooAddition.cxx b/roofit/roofitcore/src/RooAddition.cxx index f29693d50e916..c3d15be3dc365 100644 --- a/roofit/roofitcore/src/RooAddition.cxx +++ b/roofit/roofitcore/src/RooAddition.cxx @@ -143,18 +143,17 @@ double RooAddition::evaluate() const //////////////////////////////////////////////////////////////////////////////// /// Compute addition of PDFs in batches. 
-void RooAddition::computeBatch(double* output, size_t nEvents, RooFit::Detail::DataMap const& dataMap) const +void RooAddition::computeBatch(double output, size_t nEvents, RooFit::Detail::DataMap const &dataMap) const { - RooBatchCompute::VarVector pdfs; - RooBatchCompute::ArgVector coefs; - pdfs.reserve(_set.size()); - coefs.reserve(_set.size()); - for (const auto arg : _set) - { - pdfs.push_back(dataMap.at(arg)); - coefs.push_back(1.0); - } - RooBatchCompute::compute(dataMap.config(this), RooBatchCompute::AddPdf, output, nEvents, pdfs, coefs); + std::vector> pdfs; + std::vector coefs; + pdfs.reserve(_set.size()); + coefs.reserve(_set.size()); + for (const auto arg : _set) { + pdfs.push_back(dataMap.at(arg)); + coefs.push_back(1.0); + } + RooBatchCompute::compute(dataMap.config(this), RooBatchCompute::AddPdf, output, nEvents, pdfs, coefs); } //////////////////////////////////////////////////////////////////////////////// diff --git a/roofit/roofitcore/src/RooFit/Detail/DataMap.cxx b/roofit/roofitcore/src/RooFit/Detail/DataMap.cxx index b73f767d465bf..49600ea7977f7 100644 --- a/roofit/roofitcore/src/RooFit/Detail/DataMap.cxx +++ b/roofit/roofitcore/src/RooFit/Detail/DataMap.cxx @@ -16,17 +16,50 @@ #include #include +#include + +namespace { + +// To avoid deleted move assignment. 
+template +void assignSpan(std::span &to, std::span const &from) +{ + to = from; +} + +} // namespace + namespace RooFit { namespace Detail { std::span DataMap::at(RooAbsArg const arg, RooAbsArg const * /caller/) { + std::span out; + if (!arg->hasDataToken()) { auto var = static_cast(arg); - return {&var->_value, 1}; + assignSpan(out, {&var->_value, 1}); + } else { + std::size_t idx = arg->dataToken(); + out = _dataMap[idx]; } - std::size_t idx = arg->dataToken(); - return _dataMap[idx]; + + if (!_enableVectorBuffers \|\| out.size() != 1) { + return out; + } + + if (_bufferIdx == _buffers.size()) { + _buffers.emplace_back(RooBatchCompute::bufferSize); + } + + double buffer = _buffers[_bufferIdx].data(); + + std::fill_n(buffer, RooBatchCompute::bufferSize, out[0]); + assignSpan(out, {buffer, 1}); + + ++_bufferIdx; + + return out; } void DataMap::setConfig(RooAbsArg const arg, RooBatchCompute::Config const &config) diff --git a/roofit/roofitcore/src/RooFit/Evaluator.cxx b/roofit/roofitcore/src/RooFit/Evaluator.cxx index d7734c6edfbfa..ff018251ff106 100644 --- a/roofit/roofitcore/src/RooFit/Evaluator.cxx +++ b/roofit/roofitcore/src/RooFit/Evaluator.cxx @@ -384,7 +384,12 @@ void Evaluator::computeCPUNode(const RooAbsArg node, NodeInfo &info) buffer = info.buffer->cpuWritePtr(); } _dataMapCPU.set(node, {buffer, nOut}); + if (nOut > 1) { + _dataMapCPU.enableVectorBuffers(true); + } nodeAbsReal->computeBatch(buffer, nOut, _dataMapCPU); + _dataMapCPU.resetVectorBuffers(); + _dataMapCPU.enableVectorBuffers(false); #ifdef ROOFIT_CUDA if (info.copyAfterEvaluation) { _dataMapCUDA.set(node, {info.buffer->gpuReadPtr(), nOut}); diff --git a/roofit/roofitcore/src/RooNormalizedPdf.cxx b/roofit/roofitcore/src/RooNormalizedPdf.cxx index 6138f5ec06d6d..2e0461843f834 100644 --- a/roofit/roofitcore/src/RooNormalizedPdf.cxx +++ b/roofit/roofitcore/src/RooNormalizedPdf.cxx @@ -13,6 +13,8 @@ #include "RooNormalizedPdf.h" #include "RooBatchCompute.h" +#include + /* * \class RooNormalizedPdf 
* @@ -26,7 +28,7 @@ void RooNormalizedPdf::computeBatch(double output, size_t nEvents, RooFit::Deta auto integralSpan = dataMap.at(_normIntegral); // We use the extraArgs as output parameter to count evaluation errors. - RooBatchCompute::ArgVector extraArgs{0.0, 0.0, 0.0}; + std::array extraArgs{0.0, 0.0, 0.0}; RooBatchCompute::compute(dataMap.config(this), RooBatchCompute::NormalizedPdf, output, nEvents, {nums, integralSpan}, extraArgs); diff --git a/roofit/roofitcore/src/RooPolyVar.cxx b/roofit/roofitcore/src/RooPolyVar.cxx index 1e88f195972b0..ae6a48176e871 100644 --- a/roofit/roofitcore/src/RooPolyVar.cxx +++ b/roofit/roofitcore/src/RooPolyVar.cxx @@ -26,8 +26,6 @@ Class RooPolyvar implements analytical integrals of all polynomials it can define. / -#include - #include "RooPolyVar.h" #include "RooArgList.h" #include "RooMsgService.h" @@ -38,6 +36,10 @@ it can define. #include "TError.h" +#include +#include +#include + ClassImp(RooPolyVar); //////////////////////////////////////////////////////////////////////////////// @@ -132,23 +134,27 @@ void RooPolyVar::computeBatchImpl(RooAbsArg const caller, double output, size_ return; } - RooBatchCompute::VarVector vars; + std::vector> vars; vars.reserve(coefs.size() + 2); // Fill the coefficients for the skipped orders. By a conventions started in // RooPolynomial, if the zero-th order is skipped, it implies a coefficient // for the constant term of one. - const double zero = 1.0; - const double one = 1.0; + std::array zeros; + std::array ones; + std::fill_n(zeros.data(), zeros.size(), 0.0); + std::fill_n(ones.data(), ones.size(), 1.0); + std::span zerosSpan{zeros.data(), 1}; + std::span onesSpan{ones.data(), 1}; for (int i = lowestOrder - 1; i >= 0; --i) { - vars.push_back(i == 0 ? std::span{&one, 1} : std::span{&zero, 1}); + vars.push_back(i == 0 ? 
onesSpan : zerosSpan); } for (RooAbsArg coef : coefs) { vars.push_back(dataMap.at(coef)); } vars.push_back(dataMap.at(&x)); - RooBatchCompute::ArgVector extraArgs{double(vars.size() - 1)}; + std::array extraArgs{double(vars.size() - 1)}; RooBatchCompute::compute(dataMap.config(caller), RooBatchCompute::Polynomial, output, nEvents, vars, extraArgs); } diff --git a/roofit/roofitcore/src/RooProdPdf.cxx b/roofit/roofitcore/src/RooProdPdf.cxx index ebe61fda7c483..e8e2b323ec6ac 100644 --- a/roofit/roofitcore/src/RooProdPdf.cxx +++ b/roofit/roofitcore/src/RooProdPdf.cxx @@ -62,9 +62,10 @@ have to appear in any specific place in the list. #include "RooFitImplHelpers.h" #include "strtok.h" +#include +#include #include #include -#include #ifndef _WIN32 #include @@ -418,13 +419,13 @@ void RooProdPdf::calculateBatch(RooAbsArg const caller, const RooProdPdf::Cache RooBatchCompute::compute(dataMap.config(caller), RooBatchCompute::Ratio, output, nEvents, {numerator, denominator}); } else { - RooBatchCompute::VarVector factors; + std::vector> factors; factors.reserve(cache._partList.size()); for (const RooAbsArg i : cache._partList) { auto span = dataMap.at(i); factors.push_back(span); } - RooBatchCompute::ArgVector special{static_cast(factors.size())}; + std::array special{static_cast(factors.size())}; RooBatchCompute::compute(dataMap.config(caller), RooBatchCompute::ProdPdf, output, nEvents, factors, special); } } diff --git a/roofit/roofitcore/src/RooTruthModel.cxx b/roofit/roofitcore/src/RooTruthModel.cxx index 2aa7c7bef559d..acf8a505be8cb 100644 --- a/roofit/roofitcore/src/RooTruthModel.cxx +++ b/roofit/roofitcore/src/RooTruthModel.cxx @@ -38,6 +38,7 @@ functions used in D mixing have been hand coded for increased execution speed. #include #include +#include #include #include @@ -265,7 +266,7 @@ void RooTruthModel::computeBatch(double *output, size_t nEvents, RooFit::Detail: auto param2Vals = param2 ? 
dataMap.at(param2) : std::span{}; // Return desired basis function - RooBatchCompute::ArgVector extraArgs{basisSign}; + std::array extraArgs{basisSign}; switch (basisType) { case expBasis: { RooBatchCompute::compute(config, RooBatchCompute::TruthModelExpBasis, output, nEvents, {xVals, param1Vals},