From 5509db51ac3093ce6e012c9ec1403ed628b5cb6b Mon Sep 17 00:00:00 2001
From: Jiaming Yuan
Date: Mon, 9 Sep 2024 21:54:54 +0800
Subject: [PATCH] [EM] Avoid synchronous calls and unnecessary ATS access.

- Pass context into various functions.
- Factor out some CUDA algorithms.
- Use ATS only for update position.

---
 src/common/algorithm.cuh                      | 38 +++++++++-
 src/common/device_helpers.cuh                 | 69 ++++-------------
 src/common/ranking_utils.cu                   |  4 +-
 src/common/threading_utils.cuh                | 24 +++---
 src/data/ellpack_page.cu                      | 25 +-----
 src/metric/auc.cu                             | 24 +++---
 src/metric/elementwise_metric.cu              |  8 +-
 src/tree/constraints.cu                       | 13 ++--
 src/tree/constraints.cuh                      |  2 +-
 src/tree/gpu_hist/evaluate_splits.cuh         |  8 +-
 src/tree/gpu_hist/evaluator.cu                | 19 +++--
 src/tree/gpu_hist/histogram.cu                |  4 +-
 src/tree/gpu_hist/row_partitioner.cuh         |  9 ++-
 src/tree/updater_gpu_hist.cu                  | 76 +++++++++----------
 tests/cpp/common/test_threading_utils.cu      | 16 ++--
 .../cpp/tree/gpu_hist/test_evaluate_splits.cu | 23 +++---
 16 files changed, 161 insertions(+), 201 deletions(-)

diff --git a/src/common/algorithm.cuh b/src/common/algorithm.cuh
index b0bec3488979..e88eb1f0c9b1 100644
--- a/src/common/algorithm.cuh
+++ b/src/common/algorithm.cuh
@@ -190,8 +190,7 @@ void SegmentedArgMergeSort(Context const *ctx, SegIt seg_begin, SegIt seg_end, V
 }
 
 template <typename U, typename V>
-void ArgSort(xgboost::Context const *ctx, xgboost::common::Span<U> keys,
-             xgboost::common::Span<V> sorted_idx) {
+void ArgSort(Context const *ctx, Span<U> keys, Span<V> sorted_idx) {
   std::size_t bytes = 0;
   auto cuctx = ctx->CUDACtx();
   dh::Iota(sorted_idx, cuctx->Stream());
@@ -272,5 +271,40 @@ void CopyIf(CUDAContext const *cuctx, InIt in_first, InIt in_second, OutIt out_f
     out_first = thrust::copy_if(cuctx->CTP(), begin_input, end_input, out_first, pred);
   }
 }
+
+// Go one level down into the cub::DeviceScan API to set OffsetT to 64 bits, so we don't
+// crash on n > 2^31.
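+//
+// Illustrative usage (not part of the original change; assumes `ctx` is a valid
+// `Context` bound to a CUDA device and `d_in`/`d_out` are device iterators):
+//
+//   auto n = static_cast<std::int64_t>(size);  // 64-bit offset is the point of this wrapper
+//   common::InclusiveScan(ctx, d_in, d_out, cub::Sum{}, n);
+//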
+template <typename InputIteratorT, typename OutputIteratorT, typename ScanOpT, typename OffsetT>
+void InclusiveScan(xgboost::Context const *ctx, InputIteratorT d_in, OutputIteratorT d_out,
+                   ScanOpT scan_op, OffsetT num_items) {
+  auto cuctx = ctx->CUDACtx();
+  std::size_t bytes = 0;
+#if THRUST_MAJOR_VERSION >= 2
+  dh::safe_cuda((
+      cub::DispatchScan<InputIteratorT, OutputIteratorT, ScanOpT, cub::NullType, OffsetT>::Dispatch(
+          nullptr, bytes, d_in, d_out, scan_op, cub::NullType(), num_items, nullptr)));
+#else
+  safe_cuda((
+      cub::DispatchScan<InputIteratorT, OutputIteratorT, ScanOpT, cub::NullType, OffsetT>::Dispatch(
+          nullptr, bytes, d_in, d_out, scan_op, cub::NullType(), num_items, nullptr, false)));
+#endif
+  dh::TemporaryArray<char> storage(bytes);
+#if THRUST_MAJOR_VERSION >= 2
+  dh::safe_cuda((
+      cub::DispatchScan<InputIteratorT, OutputIteratorT, ScanOpT, cub::NullType, OffsetT>::Dispatch(
+          storage.data().get(), bytes, d_in, d_out, scan_op, cub::NullType(), num_items, nullptr)));
+#else
+  safe_cuda((
+      cub::DispatchScan<InputIteratorT, OutputIteratorT, ScanOpT, cub::NullType, OffsetT>::Dispatch(
+          storage.data().get(), bytes, d_in, d_out, scan_op, cub::NullType(), num_items, nullptr,
+          false)));
+#endif
+}
+
+template <typename InputIteratorT, typename OutputIteratorT, typename OffsetT>
+void InclusiveSum(Context const *ctx, InputIteratorT d_in, OutputIteratorT d_out,
+                  OffsetT num_items) {
+  InclusiveScan(ctx, d_in, d_out, cub::Sum{}, num_items);
+}
 }  // namespace xgboost::common
 #endif  // XGBOOST_COMMON_ALGORITHM_CUH_
diff --git a/src/common/device_helpers.cuh b/src/common/device_helpers.cuh
index 2e5fb5cd91b7..d7b401f684f2 100644
--- a/src/common/device_helpers.cuh
+++ b/src/common/device_helpers.cuh
@@ -372,21 +372,6 @@ void CopyDeviceSpanToVector(std::vector *dst, xgboost::common::Span
                             cudaMemcpyDeviceToHost));
 }
 
-template <typename Src, typename Dst>
-void CopyTo(Src const &src, Dst *dst) {
-  if (src.empty()) {
-    dst->clear();
-    return;
-  }
-  dst->resize(src.size());
-  using SVT = std::remove_cv_t<typename Src::value_type>;
-  using DVT = std::remove_cv_t<typename Dst::value_type>;
-  static_assert(std::is_same_v<SVT, DVT>,
-                "Host and device containers must have same value type.");
-  dh::safe_cuda(cudaMemcpyAsync(thrust::raw_pointer_cast(dst->data()), src.data(),
-                                src.size() * sizeof(SVT), cudaMemcpyDefault));
-}
-
 // Keep track of pinned memory allocation
 struct PinnedMemory {
   void *temp_storage{nullptr};
@@ -748,45 +733,6 @@ auto Reduce(Policy policy, InputIt first, InputIt second, Init init, Func reduce
   return aggregate;
 }
 
-// wrapper to avoid integer `num_items`.
-template <typename InputIteratorT, typename OutputIteratorT, typename ScanOpT, typename OffsetT>
-void InclusiveScan(InputIteratorT d_in, OutputIteratorT d_out, ScanOpT scan_op,
-                   OffsetT num_items) {
-  size_t bytes = 0;
-#if THRUST_MAJOR_VERSION >= 2
-  safe_cuda((
-      cub::DispatchScan<InputIteratorT, OutputIteratorT, ScanOpT,
-                        cub::NullType, OffsetT>::Dispatch(nullptr, bytes, d_in, d_out, scan_op,
-                                                          cub::NullType(), num_items, nullptr)));
-#else
-  safe_cuda((
-      cub::DispatchScan<InputIteratorT, OutputIteratorT, ScanOpT,
-                        cub::NullType, OffsetT>::Dispatch(nullptr, bytes, d_in, d_out, scan_op,
-                                                          cub::NullType(), num_items, nullptr,
-                                                          false)));
-#endif
-  TemporaryArray<char> storage(bytes);
-#if THRUST_MAJOR_VERSION >= 2
-  safe_cuda((
-      cub::DispatchScan<InputIteratorT, OutputIteratorT, ScanOpT,
-                        cub::NullType, OffsetT>::Dispatch(storage.data().get(), bytes, d_in,
-                                                          d_out, scan_op, cub::NullType(),
-                                                          num_items, nullptr)));
-#else
-  safe_cuda((
-      cub::DispatchScan<InputIteratorT, OutputIteratorT, ScanOpT,
-                        cub::NullType, OffsetT>::Dispatch(storage.data().get(), bytes, d_in,
-                                                          d_out, scan_op, cub::NullType(),
-                                                          num_items, nullptr, false)));
-#endif
-}
-
-template <typename InputIteratorT, typename OutputIteratorT, typename OffsetT>
-void InclusiveSum(InputIteratorT d_in, OutputIteratorT d_out, OffsetT num_items) {
-  InclusiveScan(d_in, d_out, cub::Sum(), num_items);
-}
-
 class CUDAStreamView;
 
 class CUDAEvent {
@@ -857,8 +803,23 @@ class CUDAStream {
   [[nodiscard]] cudaStream_t Handle() const { return stream_; }
 
   void Sync() { this->View().Sync(); }
+  void Wait(CUDAEvent const &e) { this->View().Wait(e); }
 };
 
+template <typename Src, typename Dst>
+void CopyTo(Src const &src, Dst *dst, CUDAStreamView stream = DefaultStream()) {
+  if (src.empty()) {
+    dst->clear();
+    return;
+  }
+  dst->resize(src.size());
+  using SVT = std::remove_cv_t<typename Src::value_type>;
+  using DVT = std::remove_cv_t<typename Dst::value_type>;
+  static_assert(std::is_same_v<SVT, DVT>, "Host and device containers must have same value type.");
+  dh::safe_cuda(cudaMemcpyAsync(thrust::raw_pointer_cast(dst->data()), src.data(),
+                                src.size() * sizeof(SVT), cudaMemcpyDefault, stream));
+}
+
 inline auto CachingThrustPolicy() {
   XGBCachingDeviceAllocator<char> alloc;
 #if THRUST_MAJOR_VERSION >= 2 || defined(XGBOOST_USE_RMM)
diff --git a/src/common/ranking_utils.cu b/src/common/ranking_utils.cu
index 5ad8a575c468..c67af5571be1 100644
--- a/src/common/ranking_utils.cu
+++ b/src/common/ranking_utils.cu
@@ -1,5 +1,5 @@
 /**
- * Copyright 2023 by XGBoost Contributors
+ * Copyright 2023-2024, XGBoost Contributors
 */
 #include <thrust/functional.h>                  // for maximum
 #include <thrust/iterator/counting_iterator.h>  // for make_counting_iterator
@@ -158,7 +158,7 @@ void RankingCache::InitOnCUDA(Context const* ctx, MetaInfo const& info) {
   auto d_threads_group_ptr = threads_group_ptr_.DeviceSpan();
   if (param_.HasTruncation()) {
     n_cuda_threads_ =
-        common::SegmentedTrapezoidThreads(d_group_ptr, d_threads_group_ptr, Param().NumPair());
+        common::SegmentedTrapezoidThreads(ctx, d_group_ptr, d_threads_group_ptr, Param().NumPair());
   } else {
     auto n_pairs = Param().NumPair();
     dh::LaunchN(n_groups, cuctx->Stream(),
diff --git a/src/common/threading_utils.cuh b/src/common/threading_utils.cuh
index db5fe82f94ac..1a4e29f38645 100644
--- a/src/common/threading_utils.cuh
+++ b/src/common/threading_utils.cuh
@@ -1,20 +1,20 @@
 /**
- * Copyright 2021-2023 by XGBoost Contributors
+ * Copyright 2021-2024, XGBoost Contributors
 */
 #ifndef XGBOOST_COMMON_THREADING_UTILS_CUH_
 #define XGBOOST_COMMON_THREADING_UTILS_CUH_
 
-#include <algorithm>  // std::min
-#include <cstddef>    // std::size_t
+#include <algorithm>  // std::min
+#include <cstddef>    // std::size_t
 
 #include "./math.h"  // Sqr
-#include "common.h"
+#include "algorithm.cuh"       // for InclusiveSum
+#include "common.h"            // for safe_cuda
 #include "device_helpers.cuh"  // LaunchN
 #include "xgboost/base.h"      // XGBOOST_DEVICE
 #include "xgboost/span.h"      // Span
 
-namespace xgboost {
-namespace common {
+namespace xgboost::common {
 /**
  * \param n Number of items (length of the base)
 * \param h height
@@ -43,9 +43,8 @@ XGBOOST_DEVICE inline std::size_t DiscreteTrapezoidArea(std::size_t n, std::size
  * with h <= n
 */
 template <typename U>
-std::size_t SegmentedTrapezoidThreads(xgboost::common::Span<U> group_ptr,
-                                      xgboost::common::Span<std::size_t> out_group_threads_ptr,
-                                      std::size_t h) {
+std::size_t SegmentedTrapezoidThreads(Context const *ctx, Span<U> group_ptr,
+                                      Span<std::size_t> out_group_threads_ptr, std::size_t h) {
   CHECK_GE(group_ptr.size(), 1);
   CHECK_EQ(group_ptr.size(), out_group_threads_ptr.size());
   dh::LaunchN(group_ptr.size(), [=] XGBOOST_DEVICE(std::size_t idx) {
@@ -57,8 +56,8 @@ std::size_t SegmentedTrapezoidThreads(xgboost::common::Span group_ptr,
     std::size_t cnt = static_cast<std::size_t>(group_ptr[idx] - group_ptr[idx - 1]);
     out_group_threads_ptr[idx] = DiscreteTrapezoidArea(cnt, h);
   });
-  dh::InclusiveSum(out_group_threads_ptr.data(), out_group_threads_ptr.data(),
-                   out_group_threads_ptr.size());
+  InclusiveSum(ctx, out_group_threads_ptr.data(), out_group_threads_ptr.data(),
+               out_group_threads_ptr.size());
 
   std::size_t total = 0;
   dh::safe_cuda(cudaMemcpy(&total, out_group_threads_ptr.data() + out_group_threads_ptr.size() - 1,
                            sizeof(total), cudaMemcpyDeviceToHost));
@@ -82,6 +81,5 @@ XGBOOST_DEVICE inline void UnravelTrapeziodIdx(std::size_t i_idx, std::size_t n,
   j = idx - n_elems + i + 1;
 }
 
-}  // namespace common
-}  // namespace xgboost
+}  // namespace xgboost::common
 #endif  // XGBOOST_COMMON_THREADING_UTILS_CUH_
diff --git a/src/data/ellpack_page.cu b/src/data/ellpack_page.cu
index 0f4cf3a2edc8..8f8ab0af7d01 100644
--- a/src/data/ellpack_page.cu
+++ b/src/data/ellpack_page.cu
@@ -254,30 +254,7 @@ void CopyDataToEllpack(Context const* ctx, const AdapterBatchT& batch,
                        d_compressed_buffer, writer, batch, device_accessor, feature_types,
                        is_valid};
   thrust::transform_output_iterator out(discard, functor);
 
-  // Go one level down into cub::DeviceScan API to set OffsetT as 64 bit
-  // So we don't crash on n > 2^31
-  size_t temp_storage_bytes = 0;
-  using DispatchScan = cub::DispatchScan<decltype(key_value_index_iter), decltype(out), TupleScanOp<Tuple>, cub::NullType, std::int64_t>;
-#if THRUST_MAJOR_VERSION >= 2
-  dh::safe_cuda(DispatchScan::Dispatch(nullptr, temp_storage_bytes, key_value_index_iter, out,
-                                       TupleScanOp<Tuple>(), cub::NullType(), batch.Size(),
-                                       ctx->CUDACtx()->Stream()));
-#else
-  DispatchScan::Dispatch(nullptr, temp_storage_bytes, key_value_index_iter, out,
-                         TupleScanOp<Tuple>(), cub::NullType(), batch.Size(),
-                         nullptr, false);
-#endif
-  dh::TemporaryArray<char> temp_storage(temp_storage_bytes);
-#if THRUST_MAJOR_VERSION >= 2
-  dh::safe_cuda(DispatchScan::Dispatch(temp_storage.data().get(), temp_storage_bytes,
-                                       key_value_index_iter, out, TupleScanOp<Tuple>(),
-                                       cub::NullType(), batch.Size(), ctx->CUDACtx()->Stream()));
-#else
-  DispatchScan::Dispatch(temp_storage.data().get(), temp_storage_bytes,
-                         key_value_index_iter, out, TupleScanOp<Tuple>(),
-                         cub::NullType(), batch.Size(), nullptr, false);
-#endif
+  common::InclusiveScan(ctx, key_value_index_iter, out, TupleScanOp<Tuple>{}, batch.Size());
 }
 
 void WriteNullValues(Context const* ctx, EllpackPageImpl* dst, common::Span<size_t> row_counts) {
diff --git a/src/metric/auc.cu b/src/metric/auc.cu
index 4155a7084481..37f089ec6057 100644
--- a/src/metric/auc.cu
+++ b/src/metric/auc.cu
@@ -13,7 +13,7 @@
 #include
 
 #include "../collective/allreduce.h"
-#include "../common/algorithm.cuh"       // SegmentedArgSort
+#include "../common/algorithm.cuh"       // SegmentedArgSort, InclusiveScan
 #include "../common/optional_weight.h"   // OptionalWeights
 #include "../common/threading_utils.cuh" // UnravelTrapeziodIdx,SegmentedTrapezoidThreads
 #include "auc.h"
@@ -128,8 +128,8 @@ std::tuple GPUBinaryAUC(Context const *ctx,
                                                 dh::tbegin(d_unique_idx));
   d_unique_idx = d_unique_idx.subspan(0, end_unique.second - dh::tbegin(d_unique_idx));
 
-  dh::InclusiveScan(dh::tbegin(d_fptp), dh::tbegin(d_fptp),
-                    PairPlus<double, double>{}, d_fptp.size());
+  common::InclusiveScan(ctx, dh::tbegin(d_fptp), dh::tbegin(d_fptp), PairPlus<double, double>{},
+                        d_fptp.size());
 
   auto d_neg_pos = dh::ToSpan(cache->neg_pos);
   // scatter unique negative/positive values
@@ -239,7 +239,7 @@ double ScaleClasses(Context const *ctx, bool is_column_split, common::Span
 
 template <typename Fn>
-void SegmentedFPTP(common::Span<Pair> d_fptp, Fn segment_id) {
+void SegmentedFPTP(Context const *ctx, common::Span<Pair> d_fptp, Fn segment_id) {
   using Triple = thrust::tuple<uint32_t, double, double>;
   // expand to tuple to include idx
   auto fptp_it_in = dh::MakeTransformIterator<Triple>(
@@ -253,8 +253,8 @@ void SegmentedFPTP(common::Span d_fptp, Fn segment_id) {
         thrust::make_pair(thrust::get<1>(t), thrust::get<2>(t));
     return t;
   });
-  dh::InclusiveScan(
-      fptp_it_in, fptp_it_out,
+  common::InclusiveScan(
+      ctx, fptp_it_in, fptp_it_out,
       [=] XGBOOST_DEVICE(Triple const &l, Triple const &r) {
         uint32_t l_gid = segment_id(thrust::get<0>(l));
         uint32_t r_gid = segment_id(thrust::get<0>(r));
@@ -391,7 +391,7 @@ double GPUMultiClassAUCOVR(Context const *ctx, MetaInfo const &info,
   d_unique_idx = d_unique_idx.subspan(0, n_uniques);
 
   auto get_class_id = [=] XGBOOST_DEVICE(size_t idx) { return idx / n_samples; };
-  SegmentedFPTP(d_fptp, get_class_id);
+  SegmentedFPTP(ctx, d_fptp, get_class_id);
 
   // scatter unique FP_PREV/TP_PREV values
   auto d_neg_pos = dh::ToSpan(cache->neg_pos);
@@ -528,8 +528,8 @@ std::pair GPURankingAUC(Context const *ctx, common::Span<
   dh::caching_device_vector<size_t> threads_group_ptr(group_ptr.size(), 0);
   auto d_threads_group_ptr = dh::ToSpan(threads_group_ptr);
   // Use max to represent triangle
-  auto n_threads = common::SegmentedTrapezoidThreads(
-      d_group_ptr, d_threads_group_ptr, std::numeric_limits<std::size_t>::max());
+  auto n_threads = common::SegmentedTrapezoidThreads(ctx, d_group_ptr, d_threads_group_ptr,
+                                                     std::numeric_limits<std::size_t>::max());
   CHECK_LT(n_threads, std::numeric_limits<int32_t>::max());
   // get the coordinate in nested summation
   auto get_i_j = [=] XGBOOST_DEVICE(size_t idx, size_t query_group_idx) {
@@ -591,8 +591,8 @@ std::pair GPURankingAUC(Context const *ctx, common::Span<
     }
     return {};  // discard
   });
-  dh::InclusiveScan(
-      in, out,
+  common::InclusiveScan(
+      ctx, in, out,
       [] XGBOOST_DEVICE(RankScanItem const &l, RankScanItem const &r) {
         if (l.group_id != r.group_id) {
           return r;
@@ -774,7 +774,7 @@ std::pair GPURankingPRAUCImpl(Context const *ctx,
   auto get_group_id = [=] XGBOOST_DEVICE(size_t idx) {
     return dh::SegmentId(d_group_ptr, idx);
   };
-  SegmentedFPTP(d_fptp, get_group_id);
+  SegmentedFPTP(ctx, d_fptp, get_group_id);
 
   // scatter unique FP_PREV/TP_PREV values
   auto d_neg_pos = dh::ToSpan(cache->neg_pos);
diff --git a/src/metric/elementwise_metric.cu b/src/metric/elementwise_metric.cu
index ec5b9079d7d9..7b662143a89b 100644
--- a/src/metric/elementwise_metric.cu
+++ b/src/metric/elementwise_metric.cu
@@ -12,7 +12,6 @@
 #include
 #include <numeric>  // for accumulate
 
-#include "../common/common.h"  // for AssertGPUSupport
 #include "../common/math.h"
 #include "../common/optional_weight.h"  // OptionalWeights
 #include "../common/pseudo_huber.h"
@@ -28,7 +27,9 @@
 #include
 #include
 
-#include "../common/device_helpers.cuh"
+#include "../common/cuda_context.cuh"  // for CUDAContext
+#else
+#include "../common/common.h"  // for AssertGPUSupport
 #endif  // XGBOOST_USE_CUDA
 
 namespace xgboost::metric {
@@ -48,11 +49,10 @@ PackedReduceResult Reduce(Context const* ctx, MetaInfo const& info, Fn&& loss) {
   auto labels = info.labels.View(ctx->Device());
   if (ctx->IsCUDA()) {
 #if defined(XGBOOST_USE_CUDA)
-    dh::XGBCachingDeviceAllocator<char> alloc;
     thrust::counting_iterator<size_t> begin(0);
     thrust::counting_iterator<size_t> end = begin + labels.Size();
     result = thrust::transform_reduce(
-        thrust::cuda::par(alloc), begin, end,
+        ctx->CUDACtx()->CTP(), begin, end,
         [=] XGBOOST_DEVICE(size_t i) {
           auto idx = linalg::UnravelIndex(i, labels.Shape());
           auto sample_id = std::get<0>(idx);
diff --git a/src/tree/constraints.cu b/src/tree/constraints.cu
index b222402fcfce..183c609a4bef 100644
--- a/src/tree/constraints.cu
+++ b/src/tree/constraints.cu
@@ -6,14 +6,15 @@
 #include
 #include
-#include
 #include
+#include
 
-#include "xgboost/logging.h"
-#include "xgboost/span.h"
+#include "../common/cuda_context.cuh"  // for CUDAContext
+#include "../common/device_helpers.cuh"
 #include "constraints.cuh"
 #include "param.h"
-#include "../common/device_helpers.cuh"
+#include "xgboost/logging.h"
+#include "xgboost/span.h"
 
 namespace xgboost {
 
@@ -130,9 +131,9 @@ FeatureInteractionConstraintDevice::FeatureInteractionConstraintDevice(
   this->Configure(param, n_features);
 }
 
-void FeatureInteractionConstraintDevice::Reset() {
+void FeatureInteractionConstraintDevice::Reset(Context const* ctx) {
   for (auto& node : node_constraints_storage_) {
-    thrust::fill(node.begin(), node.end(), 0);
+    thrust::fill(ctx->CUDACtx()->CTP(), node.begin(), node.end(), 0);
   }
 }
 
diff --git a/src/tree/constraints.cuh b/src/tree/constraints.cuh
index 94c262240c19..dfd917277e2e 100644
--- a/src/tree/constraints.cuh
+++ b/src/tree/constraints.cuh
@@ -78,7 +78,7 @@ struct FeatureInteractionConstraintDevice {
   FeatureInteractionConstraintDevice(FeatureInteractionConstraintDevice const& that) = default;
   FeatureInteractionConstraintDevice(FeatureInteractionConstraintDevice&& that) = default;
   /*! \brief Reset before constructing a new tree. */
-  void Reset();
+  void Reset(Context const* ctx);
   /*! \brief Return a list of features given node id */
   common::Span<bst_feature_t> QueryNode(int32_t nid);
   /*!
diff --git a/src/tree/gpu_hist/evaluate_splits.cuh b/src/tree/gpu_hist/evaluate_splits.cuh
index 4be8e108f0ce..19e8f2f931d6 100644
--- a/src/tree/gpu_hist/evaluate_splits.cuh
+++ b/src/tree/gpu_hist/evaluate_splits.cuh
@@ -138,9 +138,9 @@ class GPUHistEvaluator {
   /**
    * \brief Reset the evaluator, should be called before any use.
    */
-  void Reset(common::HistogramCuts const &cuts, common::Span<FeatureType const> ft,
-             bst_feature_t n_features, TrainParam const &param, bool is_column_split,
-             DeviceOrd device);
+  void Reset(Context const *ctx, common::HistogramCuts const &cuts,
+             common::Span<FeatureType const> ft, bst_feature_t n_features, TrainParam const &param,
+             bool is_column_split);
 
   /**
    * \brief Get host category storage for nidx. Different from the internal version, this
 }
 
   [[nodiscard]] auto GetDeviceNodeCats(bst_node_t nidx) {
-    copy_stream_.View().Sync();
     if (has_categoricals_) {
+      copy_stream_.View().Sync();
       CatAccessor accessor = {dh::ToSpan(split_cats_), node_categorical_storage_size_};
       return common::KCatBitField{accessor.GetNodeCatStorage(nidx)};
     } else {
diff --git a/src/tree/gpu_hist/evaluator.cu b/src/tree/gpu_hist/evaluator.cu
index 6eed74c56e87..ee542a94a825 100644
--- a/src/tree/gpu_hist/evaluator.cu
+++ b/src/tree/gpu_hist/evaluator.cu
@@ -13,14 +13,13 @@
 #include "xgboost/data.h"
 
 namespace xgboost::tree {
-void GPUHistEvaluator::Reset(common::HistogramCuts const &cuts, common::Span<FeatureType const> ft,
-                             bst_feature_t n_features, TrainParam const &param,
-                             bool is_column_split, DeviceOrd device) {
+void GPUHistEvaluator::Reset(Context const *ctx, common::HistogramCuts const &cuts,
+                             common::Span<FeatureType const> ft, bst_feature_t n_features,
+                             TrainParam const &param, bool is_column_split) {
   param_ = param;
-  tree_evaluator_ = TreeEvaluator{param, n_features, device};
+  tree_evaluator_ = TreeEvaluator{param, n_features, ctx->Device()};
   has_categoricals_ = cuts.HasCategorical();
   if (cuts.HasCategorical()) {
-    dh::XGBCachingDeviceAllocator<char> alloc;
     auto ptrs = cuts.cut_ptrs_.ConstDeviceSpan();
     auto beg = thrust::make_counting_iterator(1ul);
     auto end = thrust::make_counting_iterator(ptrs.size());
@@ -29,7 +28,7 @@ void GPUHistEvaluator::Reset(common::HistogramCuts const &cuts, common::Span
-    thrust::for_each(thrust::cuda::par(alloc), beg, end, [=] XGBOOST_DEVICE(size_t i) {
+    thrust::for_each(ctx->CUDACtx()->CTP(), beg, end, [=] XGBOOST_DEVICE(size_t i) {
       auto idx = i - 1;
       if (common::IsCat(ft, idx)) {
         auto n_bins = ptrs[i] - ptrs[idx];
@@ -44,8 +43,8 @@ void GPUHistEvaluator::Reset(common::HistogramCuts const &cuts, common::Span
+                                  ctx->CUDACtx()->Stream()));
     cat_sorted_idx_.resize(cuts.cut_values_.Size() * 2);  // evaluate 2 nodes at a time.
     sort_input_.resize(cat_sorted_idx_.size());
@@ -57,14 +56,14 @@ void GPUHistEvaluator::Reset(common::HistogramCuts const &cuts, common::Span
-    thrust::transform(thrust::cuda::par(alloc), it, it + feature_idx_.size(), feature_idx_.begin(),
+    thrust::transform(ctx->CUDACtx()->CTP(), it, it + feature_idx_.size(), feature_idx_.begin(),
                       [=] XGBOOST_DEVICE(size_t i) {
                         auto fidx = dh::SegmentId(ptrs, i);
                         return fidx;
                       });
   }
   is_column_split_ = is_column_split;
-  device_ = device;
+  device_ = ctx->Device();
 }
 
 common::Span<bst_feature_t const> GPUHistEvaluator::SortHistogram(
diff --git a/src/tree/gpu_hist/histogram.cu b/src/tree/gpu_hist/histogram.cu
index dd89238b5d89..7f1f79dee09c 100644
--- a/src/tree/gpu_hist/histogram.cu
+++ b/src/tree/gpu_hist/histogram.cu
@@ -66,12 +66,10 @@ GradientQuantiser::GradientQuantiser(Context const* ctx, common::Span
-  dh::XGBCachingDeviceAllocator<char> alloc;
   thrust::device_ptr<GradientPair const> gpair_beg{gpair.data()};
   auto beg = thrust::make_transform_iterator(gpair_beg, Clip());
-  Pair p =
-      dh::Reduce(thrust::cuda::par(alloc), beg, beg + gpair.size(), Pair{}, thrust::plus<Pair>{});
+  Pair p = dh::Reduce(ctx->CUDACtx()->CTP(), beg, beg + gpair.size(), Pair{}, thrust::plus<Pair>{});
   // Treat pair as array of 4 primitive types to allreduce
   using ReduceT = typename decltype(p.first)::ValueT;
   static_assert(sizeof(Pair) == sizeof(ReduceT) * 4, "Expected to reduce four elements.");
diff --git a/src/tree/gpu_hist/row_partitioner.cuh b/src/tree/gpu_hist/row_partitioner.cuh
index 3c8dec58e5ea..0101be085b24 100644
--- a/src/tree/gpu_hist/row_partitioner.cuh
+++ b/src/tree/gpu_hist/row_partitioner.cuh
@@ -11,6 +11,7 @@
 #include <cstdint>  // for int32_t, uint32_t
 #include <vector>   // for vector
 
+#include "../../common/cuda_context.cuh"    // for CUDAContext
 #include "../../common/device_helpers.cuh"  // for MakeTransformIterator
 #include "xgboost/base.h"                   // for bst_idx_t
 #include "xgboost/context.h"                // for Context
@@ -356,18 +357,18 @@ class RowPartitioner {
    * argument and return the new position for this training instance.
    */
   template <typename FinalisePositionOpT>
-  void FinalisePosition(common::Span<bst_node_t> d_out_position, bst_idx_t base_ridx,
-                        FinalisePositionOpT op) const {
+  void FinalisePosition(Context const* ctx, common::Span<bst_node_t> d_out_position,
+                        bst_idx_t base_ridx, FinalisePositionOpT op) const {
     dh::TemporaryArray<NodePositionInfo> d_node_info_storage(ridx_segments_.size());
     dh::safe_cuda(cudaMemcpyAsync(d_node_info_storage.data().get(), ridx_segments_.data(),
                                   sizeof(NodePositionInfo) * ridx_segments_.size(),
-                                  cudaMemcpyDefault));
+                                  cudaMemcpyDefault, ctx->CUDACtx()->Stream()));
 
     constexpr int kBlockSize = 512;
     const int kItemsThread = 8;
     const int grid_size = xgboost::common::DivRoundUp(ridx_.size(), kBlockSize * kItemsThread);
     common::Span d_ridx{ridx_.data(), ridx_.size()};
-    FinalisePositionKernel<kBlockSize><<<grid_size, kBlockSize>>>(
+    FinalisePositionKernel<kBlockSize><<<grid_size, kBlockSize, 0, ctx->CUDACtx()->Stream()>>>(
         dh::ToSpan(d_node_info_storage), base_ridx, d_ridx, d_out_position, op);
   }
 };
diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu
index 0b6c1c1982ec..95db64f60632 100644
--- a/src/tree/updater_gpu_hist.cu
+++ b/src/tree/updater_gpu_hist.cu
@@ -64,14 +64,10 @@ struct NodeSplitData {
 };
 static_assert(std::is_trivially_copyable_v<NodeSplitData>);
 
-// To be tuned.
-constexpr double ExtMemPrefetchThresh() { return 4.0; }
-
 // Some nodes we will manually compute histograms, others we will do by subtraction
-[[nodiscard]] bool AssignNodes(RegTree const* p_tree, GradientQuantiser const* quantizer,
-                               std::vector<GPUExpandEntry> const& candidates,
-                               common::Span<bst_node_t> nodes_to_build,
-                               common::Span<bst_node_t> nodes_to_sub) {
+void AssignNodes(RegTree const* p_tree, GradientQuantiser const* quantizer,
+                 std::vector<GPUExpandEntry> const& candidates,
+                 common::Span<bst_node_t> nodes_to_build, common::Span<bst_node_t> nodes_to_sub) {
   auto const& tree = *p_tree;
   std::size_t nidx_in_set{0};
   double total{0.0}, smaller{0.0};
@@ -97,12 +93,6 @@ constexpr double ExtMemPrefetchThresh() { return 4.0; }
     }
     ++nidx_in_set;
   }
-
-  if (-kRtEps < smaller && smaller < kRtEps) {  // Too close to 0, don't prefetch.
-    return false;
-  }
-  // Prefetch if these smaller nodes are not quite small.
-  return (total / smaller) < ExtMemPrefetchThresh();
 }
 
 // GPU tree updater implementation.
@@ -201,16 +191,19 @@ struct GPUHistMakerDevice {
   // Reset values for each update iteration
   [[nodiscard]] DMatrix* Reset(HostDeviceVector<GradientPair>* dh_gpair, DMatrix* p_fmat) {
     this->monitor.Start(__func__);
+    common::SetDevice(ctx_->Ordinal());
+
     auto const& info = p_fmat->Info();
+    // backup the gradient
+    dh::CopyTo(dh_gpair->ConstDeviceSpan(), &this->d_gpair, ctx_->CUDACtx()->Stream());
 
     this->column_sampler_->Init(ctx_, p_fmat->Info().num_col_, info.feature_weights.HostVector(),
                                 param.colsample_bynode, param.colsample_bylevel,
                                 param.colsample_bytree);
-    common::SetDevice(ctx_->Ordinal());
-
-    this->interaction_constraints.Reset();
+    this->interaction_constraints.Reset(ctx_);
+    this->evaluator_.Reset(this->ctx_, *cuts_, p_fmat->Info().feature_types.ConstDeviceSpan(),
+                           p_fmat->Info().num_col_, this->param, p_fmat->Info().IsColumnSplit());
 
     // Sampling
-    dh::CopyTo(dh_gpair->ConstDeviceSpan(), &this->d_gpair);  // backup the gradient
     auto sample = this->sampler->Sample(ctx_, dh::ToSpan(d_gpair), p_fmat);
     this->gpair = sample.gpair;
     p_fmat = sample.p_fmat;  // Update p_fmat before allocating partitioners
@@ -242,10 +235,6 @@ struct GPUHistMakerDevice {
     }
 
     // Other initializations
-    this->evaluator_.Reset(*cuts_, p_fmat->Info().feature_types.ConstDeviceSpan(),
-                           p_fmat->Info().num_col_, this->param, p_fmat->Info().IsColumnSplit(),
-                           this->ctx_->Device());
-
     quantiser = std::make_unique<GradientQuantiser>(ctx_, this->gpair, p_fmat->Info());
 
     this->InitFeatureGroupsOnce(info);
@@ -488,8 +477,8 @@ struct GPUHistMakerDevice {
     // Prepare for build hist
     std::vector<bst_node_t> build_nidx(candidates.size());
    std::vector<bst_node_t> subtraction_nidx(candidates.size());
-    auto prefetch_copy =
-        AssignNodes(p_tree, this->quantiser.get(), candidates, build_nidx, subtraction_nidx);
+    AssignNodes(p_tree, this->quantiser.get(), candidates, build_nidx, subtraction_nidx);
+    auto prefetch_copy = !build_nidx.empty();
 
     this->histogram_.AllocateHistograms(ctx_, build_nidx, subtraction_nidx);
 
@@ -534,10 +523,13 @@ struct GPUHistMakerDevice {
     if (!p_fmat->SingleColBlock() && task.UpdateTreeLeaf()) {
       LOG(FATAL) << "Current objective function can not be used with external memory.";
     }
+
+    monitor.Start(__func__);
     if (static_cast<std::size_t>(p_fmat->NumBatches() + 1) != this->batch_ptr_.size()) {
       // External memory with concatenation. Not supported.
       p_out_position->Resize(0);
       positions_.clear();
+      monitor.Stop(__func__);
       return;
     }
 
@@ -557,14 +549,16 @@ struct GPUHistMakerDevice {
       CHECK_EQ(part->GetNumNodes(), p_tree->NumNodes());
       auto base_ridx = batch_ptr_[k];
       auto n_samples = batch_ptr_.at(k + 1) - base_ridx;
-      part->FinalisePosition(d_out_position.subspan(base_ridx, n_samples), base_ridx, encode_op);
+      part->FinalisePosition(ctx_, d_out_position.subspan(base_ridx, n_samples), base_ridx,
+                             encode_op);
     }
-    dh::CopyTo(d_out_position, &positions_);
+    dh::CopyTo(d_out_position, &positions_, this->ctx_->CUDACtx()->Stream());
+    monitor.Stop(__func__);
     return;
   }
 
   dh::caching_device_vector<uint32_t> categories;
-  dh::CopyTo(p_tree->GetSplitCategories(), &categories);
+  dh::CopyTo(p_tree->GetSplitCategories(), &categories, this->ctx_->CUDACtx()->Stream());
   auto const& cat_segments = p_tree->GetSplitCategoriesPtr();
   auto d_categories = dh::ToSpan(categories);
   auto ft = p_fmat->Info().feature_types.ConstDeviceSpan();
@@ -583,22 +577,24 @@ struct GPUHistMakerDevice {
     auto go_left_op = GoLeftOp{d_matrix};
 
     dh::caching_device_vector<NodeSplitData> d_split_data;
-    dh::CopyTo(split_data, &d_split_data);
+    dh::CopyTo(split_data, &d_split_data, this->ctx_->CUDACtx()->Stream());
     auto s_split_data = dh::ToSpan(d_split_data);
 
-    partitioners_.front()->FinalisePosition(
-        d_out_position, page.BaseRowId(), [=] __device__(bst_idx_t row_id, bst_node_t nidx) {
-          auto split_data = s_split_data[nidx];
-          auto node = split_data.split_node;
-          while (!node.IsLeaf()) {
-            auto go_left = go_left_op(row_id, split_data);
-            nidx = go_left ? node.LeftChild() : node.RightChild();
-            node = s_split_data[nidx].split_node;
-          }
-          return encode_op(row_id, nidx);
-        });
-    dh::CopyTo(d_out_position, &positions_);
+    partitioners_.front()->FinalisePosition(ctx_, d_out_position, page.BaseRowId(),
+                                            [=] __device__(bst_idx_t row_id, bst_node_t nidx) {
+                                              auto split_data = s_split_data[nidx];
+                                              auto node = split_data.split_node;
+                                              while (!node.IsLeaf()) {
+                                                auto go_left = go_left_op(row_id, split_data);
+                                                nidx = go_left ? node.LeftChild()
+                                                               : node.RightChild();
+                                                node = s_split_data[nidx].split_node;
+                                              }
+                                              return encode_op(row_id, nidx);
+                                            });
+    dh::CopyTo(d_out_position, &positions_, this->ctx_->CUDACtx()->Stream());
   }
+  monitor.Stop(__func__);
 }
 
 bool UpdatePredictionCache(linalg::MatrixView<float> out_preds_d, RegTree const* p_tree) {
@@ -616,7 +612,7 @@ struct GPUHistMakerDevice {
     // Use the nodes from tree, the leaf value might be changed by the objective since the
    // last update tree call.
     dh::caching_device_vector<RegTree::Node> nodes;
-    dh::CopyTo(p_tree->GetNodes(), &nodes);
+    dh::CopyTo(p_tree->GetNodes(), &nodes, this->ctx_->CUDACtx()->Stream());
     common::Span<RegTree::Node> d_nodes = dh::ToSpan(nodes);
     CHECK_EQ(out_preds_d.Shape(1), 1);
     dh::LaunchN(d_position.size(), ctx_->CUDACtx()->Stream(),
diff --git a/tests/cpp/common/test_threading_utils.cu b/tests/cpp/common/test_threading_utils.cu
index f7160b1b56f9..fc7475698be2 100644
--- a/tests/cpp/common/test_threading_utils.cu
+++ b/tests/cpp/common/test_threading_utils.cu
@@ -1,16 +1,17 @@
 /**
- * Copyright 2021-2023 by XGBoost Contributors
+ * Copyright 2021-2024, XGBoost Contributors
 */
 #include <gtest/gtest.h>
 #include <thrust/copy.h>  // thrust::copy
 
 #include "../../../src/common/device_helpers.cuh"
 #include "../../../src/common/threading_utils.cuh"
+#include "../helpers.h"  // for MakeCUDACtx
 
-namespace xgboost {
-namespace common {
+namespace xgboost::common {
 TEST(SegmentedTrapezoidThreads, Basic) {
   size_t constexpr kElements = 24, kGroups = 3;
+  auto ctx = MakeCUDACtx(0);
   dh::device_vector<size_t> offset_ptr(kGroups + 1, 0);
   offset_ptr[0] = 0;
   offset_ptr[1] = 8;
@@ -19,11 +20,11 @@ TEST(SegmentedTrapezoidThreads, Basic) {
 
   size_t h = 1;
   dh::device_vector<size_t> thread_ptr(kGroups + 1, 0);
-  size_t total = SegmentedTrapezoidThreads(dh::ToSpan(offset_ptr), dh::ToSpan(thread_ptr), h);
+  size_t total = SegmentedTrapezoidThreads(&ctx, dh::ToSpan(offset_ptr), dh::ToSpan(thread_ptr), h);
   ASSERT_EQ(total, kElements - kGroups);
 
   h = 2;
-  SegmentedTrapezoidThreads(dh::ToSpan(offset_ptr), dh::ToSpan(thread_ptr), h);
+  SegmentedTrapezoidThreads(&ctx, dh::ToSpan(offset_ptr), dh::ToSpan(thread_ptr), h);
   std::vector<size_t> h_thread_ptr(thread_ptr.size());
   thrust::copy(thread_ptr.cbegin(), thread_ptr.cend(), h_thread_ptr.begin());
   for (size_t i = 1; i < h_thread_ptr.size(); ++i) {
@@ -31,7 +32,7 @@ TEST(SegmentedTrapezoidThreads, Basic) {
   }
 
   h = 7;
-  SegmentedTrapezoidThreads(dh::ToSpan(offset_ptr), dh::ToSpan(thread_ptr), h);
+  SegmentedTrapezoidThreads(&ctx, dh::ToSpan(offset_ptr), dh::ToSpan(thread_ptr), h);
   thrust::copy(thread_ptr.cbegin(), thread_ptr.cend(), h_thread_ptr.begin());
   for (size_t i = 1; i < h_thread_ptr.size(); ++i) {
     ASSERT_EQ(h_thread_ptr[i] - h_thread_ptr[i - 1], 28);
@@ -66,5 +67,4 @@ TEST(SegmentedTrapezoidThreads, Unravel) {
   ASSERT_EQ(i, 6);
   ASSERT_EQ(j, 7);
 }
-}  // namespace common
-}  // namespace xgboost
+}  // namespace xgboost::common
diff --git a/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu b/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu
index 72a8b5449f0f..968a6a411cc0 100644
--- a/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu
+++ b/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu
@@ -60,8 +60,7 @@ TEST_F(TestCategoricalSplitWithMissing, GPUHistEvaluator) {
   GPUHistEvaluator evaluator{param_, static_cast<bst_feature_t>(feature_set.size()), ctx.Device()};
-  evaluator.Reset(cuts_, dh::ToSpan(feature_types), feature_set.size(), param_, false,
-                  ctx.Device());
+  evaluator.Reset(&ctx, cuts_, dh::ToSpan(feature_types), feature_set.size(), param_, false);
 
   DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(&ctx, input, shared_inputs).split;
   ASSERT_EQ(result.thresh, 1);
@@ -104,7 +103,7 @@ TEST(GpuHist, PartitionBasic) {
   };
 
   GPUHistEvaluator evaluator{tparam, static_cast<bst_feature_t>(feature_set.size()), ctx.Device()};
-  evaluator.Reset(cuts, dh::ToSpan(feature_types), feature_set.size(), tparam, false, ctx.Device());
+  evaluator.Reset(&ctx, cuts, dh::ToSpan(feature_types), feature_set.size(), tparam, false);
 
   {
     // -1.0s go right
@@ -217,7 +216,7 @@ TEST(GpuHist, PartitionTwoFeatures) {
                                          false};
 
   GPUHistEvaluator
       evaluator{tparam, static_cast<bst_feature_t>(feature_set.size()), ctx.Device()};
-  evaluator.Reset(cuts, dh::ToSpan(feature_types), feature_set.size(), tparam, false, ctx.Device());
+  evaluator.Reset(&ctx, cuts, dh::ToSpan(feature_types), feature_set.size(), tparam, false);
 
   {
     auto parent_sum = quantiser.ToFixedPoint(GradientPairPrecise{-6.0, 3.0});
@@ -277,10 +276,8 @@ TEST(GpuHist, PartitionTwoNodes) {
                                          cuts.min_vals_.ConstDeviceSpan(),
                                          false};
 
-  GPUHistEvaluator evaluator{tparam, static_cast<bst_feature_t>(feature_set.size()),
-                             ctx.Device()};
-  evaluator.Reset(cuts, dh::ToSpan(feature_types), feature_set.size(), tparam, false,
-                  ctx.Device());
+  GPUHistEvaluator evaluator{tparam, static_cast<bst_feature_t>(feature_set.size()), ctx.Device()};
+  evaluator.Reset(&ctx, cuts, dh::ToSpan(feature_types), feature_set.size(), tparam, false);
 
   {
     auto parent_sum = quantiser.ToFixedPoint(GradientPairPrecise{-6.0, 3.0});
@@ -336,10 +333,8 @@ void TestEvaluateSingleSplit(bool is_categorical) {
                                          cuts.min_vals_.ConstDeviceSpan(),
                                          false};
 
-  GPUHistEvaluator evaluator{tparam, static_cast<bst_feature_t>(feature_set.size()),
-                             ctx.Device()};
-  evaluator.Reset(cuts, dh::ToSpan(feature_types), feature_set.size(), tparam, false,
-                  ctx.Device());
+  GPUHistEvaluator evaluator{tparam, static_cast<bst_feature_t>(feature_set.size()), ctx.Device()};
+  evaluator.Reset(&ctx, cuts, dh::ToSpan(feature_types), feature_set.size(), tparam, false);
 
   DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(&ctx, input, shared_inputs).split;
   EXPECT_EQ(result.findex, 1);
@@ -522,7 +517,7 @@ TEST_F(TestPartitionBasedSplit, GpuHist) {
   cuts_.cut_values_.SetDevice(ctx.Device());
   cuts_.min_vals_.SetDevice(ctx.Device());
 
-  evaluator.Reset(cuts_, dh::ToSpan(ft), info_.num_col_, param_, false, ctx.Device());
+  evaluator.Reset(&ctx, cuts_, dh::ToSpan(ft), info_.num_col_, param_, false);
 
   // Convert the sample histogram to fixed point
   auto quantiser = DummyRoundingFactor(&ctx);
@@ -586,7 +581,7 @@ void VerifyColumnSplitEvaluateSingleSplit(bool is_categorical) {
                                          false};
 
   GPUHistEvaluator evaluator{tparam, static_cast<bst_feature_t>(feature_set.size()), ctx.Device()};
-  evaluator.Reset(cuts, dh::ToSpan(feature_types), feature_set.size(), tparam, true, ctx.Device());
+  evaluator.Reset(&ctx, cuts, dh::ToSpan(feature_types), feature_set.size(), tparam, true);
 
   DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(&ctx, input, shared_inputs).split;
   EXPECT_EQ(result.findex, 1);
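
--
Usage sketch (illustrative only, not part of the patch): after this change the
evaluator is constructed with the device but handed the Context up front in
Reset(), which derives the device ordinal and CUDA stream internally instead of
taking a separate DeviceOrd. A minimal caller, with names mirroring the tests
above (`cuts`, `feature_types`, `n_features`, and `param` are assumed to be set
up as in those tests):

    auto ctx = MakeCUDACtx(0);
    GPUHistEvaluator evaluator{param, static_cast<bst_feature_t>(n_features), ctx.Device()};
    evaluator.Reset(&ctx, cuts, dh::ToSpan(feature_types), n_features, param,
                    /*is_column_split=*/false);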