Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[EM] Refactor ellpack construction. #10810

Merged
merged 2 commits into from
Sep 9, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 4 additions & 6 deletions src/common/compressed_iterator.h
Original file line number Diff line number Diff line change
Expand Up @@ -77,13 +77,11 @@ class CompressedBufferWriter {
/**
 * @brief Compute the number of bytes required to store `num_elements` values
 *        compressed at `SymbolBits(num_symbols)` bits each.
 *
 * The span L40-L51 in the rendered diff contained both the pre- and post-change
 * statements (duplicate declarations of `compressed_size` and `ret`); this is the
 * resolved post-merge version.
 *
 * @param num_elements Number of values to be compressed into the buffer.
 * @param num_symbols  Size of the symbol alphabet; determines bits per symbol.
 * @return Buffer size in bytes, including padding, rounded up to a 4-byte
 *         (`std::uint32_t`) boundary.
 */
static size_t CalculateBufferSize(size_t num_elements, size_t num_symbols) {
  constexpr int kBitsPerByte = 8;
  // Total bits for all symbols, rounded up to whole bytes.
  size_t compressed_size = static_cast<size_t>(std::ceil(
      static_cast<double>(detail::SymbolBits(num_symbols) * num_elements) / kBitsPerByte));
  // Handle atomicOr where input must be unsigned int, hence 4 bytes aligned.
  size_t ret = std::ceil(static_cast<double>(compressed_size + detail::kPadding) /
                         static_cast<double>(sizeof(std::uint32_t))) *
               sizeof(std::uint32_t);
  return ret;
}

Expand Down
178 changes: 99 additions & 79 deletions src/data/ellpack_page.cu

Large diffs are not rendered by default.

45 changes: 23 additions & 22 deletions src/data/ellpack_page.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -43,20 +43,20 @@ struct EllpackDeviceAccessor {
common::Span<const FeatureType> feature_types;

EllpackDeviceAccessor() = delete;
EllpackDeviceAccessor(DeviceOrd device, std::shared_ptr<const common::HistogramCuts> cuts,
bool is_dense, size_t row_stride, size_t base_rowid, size_t n_rows,
EllpackDeviceAccessor(Context const* ctx, std::shared_ptr<const common::HistogramCuts> cuts,
bool is_dense, bst_idx_t row_stride, bst_idx_t base_rowid, bst_idx_t n_rows,
common::CompressedIterator<uint32_t> gidx_iter,
common::Span<FeatureType const> feature_types)
: is_dense(is_dense),
row_stride(row_stride),
base_rowid(base_rowid),
n_rows(n_rows),
gidx_iter(gidx_iter),
: is_dense{is_dense},
row_stride{row_stride},
base_rowid{base_rowid},
n_rows{n_rows},
gidx_iter{gidx_iter},
feature_types{feature_types} {
if (device.IsCUDA()) {
cuts->cut_values_.SetDevice(device);
cuts->cut_ptrs_.SetDevice(device);
cuts->min_vals_.SetDevice(device);
if (ctx->IsCUDA()) {
cuts->cut_values_.SetDevice(ctx->Device());
cuts->cut_ptrs_.SetDevice(ctx->Device());
cuts->min_vals_.SetDevice(ctx->Device());
gidx_fvalue_map = cuts->cut_values_.ConstDeviceSpan();
feature_segments = cuts->cut_ptrs_.ConstDeviceSpan();
min_fvalue = cuts->min_vals_.ConstDeviceSpan();
Expand Down Expand Up @@ -127,9 +127,6 @@ struct EllpackDeviceAccessor {
[[nodiscard]] __device__ bool IsInRange(size_t row_id) const {
return row_id >= base_rowid && row_id < base_rowid + n_rows;
}
/*! \brief Return the total number of symbols (total number of bins plus 1 for
* not found). */
[[nodiscard]] XGBOOST_DEVICE size_t NumSymbols() const { return gidx_fvalue_map.size() + 1; }

[[nodiscard]] XGBOOST_DEVICE size_t NullValue() const { return this->NumBins(); }

Expand Down Expand Up @@ -160,7 +157,7 @@ class EllpackPageImpl {
EllpackPageImpl(Context const* ctx, std::shared_ptr<common::HistogramCuts const> cuts,
bool is_dense, bst_idx_t row_stride, bst_idx_t n_rows);
/**
* @brief Constructor used for external memory.
* @brief Constructor used for external memory with DMatrix.
*/
EllpackPageImpl(Context const* ctx, std::shared_ptr<common::HistogramCuts const> cuts,
const SparsePage& page, bool is_dense, size_t row_stride,
Expand All @@ -173,12 +170,14 @@ class EllpackPageImpl {
* in CSR format.
*/
explicit EllpackPageImpl(Context const* ctx, DMatrix* dmat, const BatchParam& parm);

/**
* @brief Constructor for Quantile DMatrix using an adapter.
*/
template <typename AdapterBatch>
explicit EllpackPageImpl(Context const* ctx, AdapterBatch batch, float missing, bool is_dense,
common::Span<size_t> row_counts_span,
common::Span<FeatureType const> feature_types, size_t row_stride,
size_t n_rows, std::shared_ptr<common::HistogramCuts const> cuts);
bst_idx_t n_rows, std::shared_ptr<common::HistogramCuts const> cuts);
/**
* @brief Constructor from an existing CPU gradient index.
*/
Expand Down Expand Up @@ -214,7 +213,7 @@ class EllpackPageImpl {

[[nodiscard]] common::HistogramCuts const& Cuts() const { return *cuts_; }
[[nodiscard]] std::shared_ptr<common::HistogramCuts const> CutsShared() const { return cuts_; }
void SetCuts(std::shared_ptr<common::HistogramCuts const> cuts) { cuts_ = cuts; }
void SetCuts(std::shared_ptr<common::HistogramCuts const> cuts);

[[nodiscard]] bool IsDense() const { return is_dense; }
/** @return Estimation of memory cost of this page. */
Expand All @@ -224,12 +223,14 @@ class EllpackPageImpl {
* @brief Return the total number of symbols (total number of bins plus 1 for not
* found).
*/
[[nodiscard]] std::size_t NumSymbols() const { return cuts_->TotalBins() + 1; }
[[nodiscard]] std::size_t NumSymbols() const { return this->n_symbols_; }
void SetNumSymbols(bst_idx_t n_symbols) { this->n_symbols_ = n_symbols; }

/**
* @brief Get an accessor that can be passed into CUDA kernels.
*/
[[nodiscard]] EllpackDeviceAccessor GetDeviceAccessor(
DeviceOrd device, common::Span<FeatureType const> feature_types = {}) const;
Context const* ctx, common::Span<FeatureType const> feature_types = {}) const;
/**
* @brief Get an accessor for host code.
*/
Expand All @@ -246,10 +247,9 @@ class EllpackPageImpl {
/**
* @brief Compress a single page of CSR data into ELLPACK.
*
* @param device The GPU device to use.
* @param row_batch The CSR page.
*/
void CreateHistIndices(DeviceOrd device, const SparsePage& row_batch,
void CreateHistIndices(Context const* ctx, const SparsePage& row_batch,
common::Span<FeatureType const> feature_types);
/**
* @brief Initialize the buffer to store compressed features.
Expand All @@ -272,6 +272,7 @@ class EllpackPageImpl {

private:
std::shared_ptr<common::HistogramCuts const> cuts_;
bst_idx_t n_symbols_{0};
common::Monitor monitor_;
};

Expand Down
15 changes: 11 additions & 4 deletions src/data/ellpack_page_raw_format.cu
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,6 @@ template <typename T>
xgboost_NVTX_FN_RANGE();
auto* impl = page->Impl();

impl->SetCuts(this->cuts_);
RET_IF_NOT(fi->Read(&impl->n_rows));
RET_IF_NOT(fi->Read(&impl->is_dense));
RET_IF_NOT(fi->Read(&impl->row_stride));
Expand All @@ -66,6 +65,12 @@ template <typename T>
RET_IF_NOT(common::ReadVec(fi, &impl->gidx_buffer));
}
RET_IF_NOT(fi->Read(&impl->base_rowid));
bst_idx_t n_symbols{0};
RET_IF_NOT(fi->Read(&n_symbols));
impl->SetNumSymbols(n_symbols);

impl->SetCuts(this->cuts_);

dh::DefaultStream().Sync();
return true;
}
Expand All @@ -84,6 +89,8 @@ template <typename T>
[[maybe_unused]] auto h_accessor = impl->GetHostAccessor(&ctx, &h_gidx_buffer);
bytes += common::WriteVec(fo, h_gidx_buffer);
bytes += fo->Write(impl->base_rowid);
bytes += fo->Write(impl->NumSymbols());

dh::DefaultStream().Sync();
return bytes;
}
Expand All @@ -93,9 +100,10 @@ template <typename T>

auto* impl = page->Impl();
CHECK(this->cuts_->cut_values_.DeviceCanRead());
impl->SetCuts(this->cuts_);

fi->Read(page, this->param_.prefetch_copy || !this->has_hmm_ats_);
impl->SetCuts(this->cuts_);

dh::DefaultStream().Sync();

return true;
Expand All @@ -108,8 +116,7 @@ template <typename T>
fo->Write(page);
dh::DefaultStream().Sync();

auto* impl = page.Impl();
return impl->MemCostBytes();
return page.Impl()->MemCostBytes();
}

#undef RET_IF_NOT
Expand Down
2 changes: 2 additions & 0 deletions src/data/ellpack_page_source.cu
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,7 @@ class EllpackHostCacheStreamImpl {
new_impl->is_dense = impl->IsDense();
new_impl->row_stride = impl->row_stride;
new_impl->base_rowid = impl->base_rowid;
new_impl->SetNumSymbols(impl->NumSymbols());

dh::safe_cuda(cudaMemcpyAsync(new_impl->gidx_buffer.data(), impl->gidx_buffer.data(),
impl->gidx_buffer.size_bytes(), cudaMemcpyDefault));
Expand Down Expand Up @@ -108,6 +109,7 @@ class EllpackHostCacheStreamImpl {
impl->is_dense = page->IsDense();
impl->row_stride = page->row_stride;
impl->base_rowid = page->base_rowid;
impl->SetNumSymbols(page->NumSymbols());
}
};

Expand Down
11 changes: 6 additions & 5 deletions src/data/iterative_dmatrix.cu
Original file line number Diff line number Diff line change
Expand Up @@ -58,9 +58,9 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p,
/**
* Generate gradient index.
*/
size_t offset = 0;
bst_idx_t offset = 0;
iter.Reset();
size_t n_batches_for_verification = 0;
bst_idx_t n_batches_for_verification = 0;
while (iter.Next()) {
init_page();
dh::safe_cuda(cudaSetDevice(dh::GetDevice(ctx).ordinal));
Expand All @@ -75,10 +75,11 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p,
proxy->Info().feature_types.SetDevice(dh::GetDevice(ctx));
auto d_feature_types = proxy->Info().feature_types.ConstDeviceSpan();
auto new_impl = cuda_impl::Dispatch(proxy, [&](auto const& value) {
return EllpackPageImpl(&fmat_ctx_, value, missing, is_dense, row_counts_span, d_feature_types,
ext_info.row_stride, rows, cuts);
return EllpackPageImpl{
&fmat_ctx_, value, missing, is_dense, row_counts_span, d_feature_types,
ext_info.row_stride, rows, cuts};
});
std::size_t num_elements = ellpack_->Impl()->Copy(&fmat_ctx_, &new_impl, offset);
bst_idx_t num_elements = ellpack_->Impl()->Copy(&fmat_ctx_, &new_impl, offset);
offset += num_elements;

proxy->Info().num_row_ = BatchSamples(proxy);
Expand Down
11 changes: 5 additions & 6 deletions src/predictor/gpu_predictor.cu
Original file line number Diff line number Diff line change
Expand Up @@ -927,8 +927,8 @@ class GPUPredictor : public xgboost::Predictor {
for (auto const& page : dmat->GetBatches<EllpackPage>(ctx_, BatchParam{})) {
dmat->Info().feature_types.SetDevice(ctx_->Device());
auto feature_types = dmat->Info().feature_types.ConstDeviceSpan();
this->PredictInternal(page.Impl()->GetDeviceAccessor(ctx_->Device(), feature_types),
d_model, out_preds, batch_offset);
this->PredictInternal(page.Impl()->GetDeviceAccessor(ctx_, feature_types), d_model,
out_preds, batch_offset);
batch_offset += page.Size() * model.learner_model_param->OutputLength();
}
}
Expand Down Expand Up @@ -1068,7 +1068,7 @@ class GPUPredictor : public xgboost::Predictor {
}
} else {
for (auto& batch : p_fmat->GetBatches<EllpackPage>(ctx_, {})) {
EllpackDeviceAccessor acc{batch.Impl()->GetDeviceAccessor(ctx_->Device())};
EllpackDeviceAccessor acc{batch.Impl()->GetDeviceAccessor(ctx_)};
auto X = EllpackLoader{acc, true, model.learner_model_param->num_feature, batch.Size(),
std::numeric_limits<float>::quiet_NaN()};
auto begin = dh::tbegin(phis) + batch.BaseRowId() * dim_size;
Expand Down Expand Up @@ -1139,8 +1139,7 @@ class GPUPredictor : public xgboost::Predictor {
} else {
for (auto const& batch : p_fmat->GetBatches<EllpackPage>(ctx_, {})) {
auto impl = batch.Impl();
auto acc =
impl->GetDeviceAccessor(ctx_->Device(), p_fmat->Info().feature_types.ConstDeviceSpan());
auto acc = impl->GetDeviceAccessor(ctx_, p_fmat->Info().feature_types.ConstDeviceSpan());
auto begin = dh::tbegin(phis) + batch.BaseRowId() * dim_size;
auto X = EllpackLoader{acc, true, model.learner_model_param->num_feature, batch.Size(),
std::numeric_limits<float>::quiet_NaN()};
Expand Down Expand Up @@ -1225,7 +1224,7 @@ class GPUPredictor : public xgboost::Predictor {
} else {
bst_idx_t batch_offset = 0;
for (auto const& batch : p_fmat->GetBatches<EllpackPage>(ctx_, BatchParam{})) {
EllpackDeviceAccessor data{batch.Impl()->GetDeviceAccessor(ctx_->Device())};
EllpackDeviceAccessor data{batch.Impl()->GetDeviceAccessor(ctx_)};
auto grid = static_cast<std::uint32_t>(common::DivRoundUp(batch.Size(), kBlockThreads));
launch(PredictLeafKernel<EllpackLoader, EllpackDeviceAccessor>, grid, data, batch_offset);
batch_offset += batch.Size();
Expand Down
7 changes: 3 additions & 4 deletions src/tree/fit_stump.cu
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
#include <cstddef> // std::size_t

#include "../collective/aggregator.cuh" // for GlobalSum
#include "../common/cuda_context.cuh"
#include "../common/device_helpers.cuh" // dh::MakeTransformIterator
#include "fit_stump.h"
#include "xgboost/base.h" // GradientPairPrecise, GradientPair, XGBOOST_DEVICE
Expand Down Expand Up @@ -39,17 +40,15 @@ void FitStump(Context const* ctx, MetaInfo const& info,
auto d_sum = sum.View(ctx->Device());
CHECK(d_sum.CContiguous());

dh::XGBCachingDeviceAllocator<char> alloc;
auto policy = thrust::cuda::par(alloc);
thrust::reduce_by_key(policy, key_it, key_it + gpair.Size(), grad_it,
thrust::reduce_by_key(ctx->CUDACtx()->CTP(), key_it, key_it + gpair.Size(), grad_it,
thrust::make_discard_iterator(), dh::tbegin(d_sum.Values()));

auto rc = collective::GlobalSum(ctx, info,
linalg::MakeVec(reinterpret_cast<double*>(d_sum.Values().data()),
d_sum.Size() * 2, ctx->Device()));
SafeColl(rc);

thrust::for_each_n(policy, thrust::make_counting_iterator(0ul), n_targets,
thrust::for_each_n(ctx->CUDACtx()->CTP(), thrust::make_counting_iterator(0ul), n_targets,
[=] XGBOOST_DEVICE(std::size_t i) mutable {
out(i) = static_cast<float>(
CalcUnregularizedWeight(d_sum(i).GetGrad(), d_sum(i).GetHess()));
Expand Down
2 changes: 1 addition & 1 deletion src/tree/gpu_hist/histogram.cu
Original file line number Diff line number Diff line change
Expand Up @@ -186,7 +186,7 @@ class HistogramAgent {
// Increases the throughput of this kernel significantly
__device__ void ProcessFullTileShared(std::size_t offset) {
std::size_t idx[kItemsPerThread];
int ridx[kItemsPerThread];
Idx ridx[kItemsPerThread];
int gidx[kItemsPerThread];
GradientPair gpair[kItemsPerThread];
#pragma unroll
Expand Down
7 changes: 4 additions & 3 deletions src/tree/updater_gpu_hist.cu
Original file line number Diff line number Diff line change
Expand Up @@ -338,7 +338,7 @@ struct GPUHistMakerDevice {
monitor.Start(__func__);
auto d_node_hist = histogram_.GetNodeHistogram(nidx);
auto batch = page.Impl();
auto acc = batch->GetDeviceAccessor(ctx_->Device());
auto acc = batch->GetDeviceAccessor(ctx_);

auto d_ridx = partitioners_.at(k)->GetRows(nidx);
this->histogram_.BuildHistogram(ctx_->CUDACtx(), acc,
Expand Down Expand Up @@ -497,7 +497,7 @@ struct GPUHistMakerDevice {

std::int32_t k{0};
for (auto const& page : p_fmat->GetBatches<EllpackPage>(ctx_, StaticBatch(prefetch_copy))) {
auto d_matrix = page.Impl()->GetDeviceAccessor(ctx_->Device());
auto d_matrix = page.Impl()->GetDeviceAccessor(ctx_);
auto go_left = GoLeftOp{d_matrix};

// Partition histogram.
Expand Down Expand Up @@ -567,9 +567,10 @@ struct GPUHistMakerDevice {
dh::CopyTo(p_tree->GetSplitCategories(), &categories);
auto const& cat_segments = p_tree->GetSplitCategoriesPtr();
auto d_categories = dh::ToSpan(categories);
auto ft = p_fmat->Info().feature_types.ConstDeviceSpan();

for (auto const& page : p_fmat->GetBatches<EllpackPage>(ctx_, StaticBatch(true))) {
auto d_matrix = page.Impl()->GetDeviceAccessor(ctx_->Device());
auto d_matrix = page.Impl()->GetDeviceAccessor(ctx_, ft);

std::vector<NodeSplitData> split_data(p_tree->NumNodes());
auto const& tree = *p_tree;
Expand Down
3 changes: 2 additions & 1 deletion tests/cpp/collective/test_worker.h
Original file line number Diff line number Diff line change
Expand Up @@ -203,7 +203,8 @@ class BaseMGPUTest : public ::testing::Test {
* available.
*/
template <typename Fn>
auto DoTest(Fn&& fn, bool is_federated, [[maybe_unused]] bool emulate_if_single = false) const {
auto DoTest([[maybe_unused]] Fn&& fn, bool is_federated,
[[maybe_unused]] bool emulate_if_single = false) const {
auto n_gpus = common::AllVisibleGPUs();
if (is_federated) {
#if defined(XGBOOST_USE_FEDERATED)
Expand Down
16 changes: 8 additions & 8 deletions tests/cpp/data/test_ellpack_page.cu
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ TEST(EllpackPage, EmptyDMatrix) {
constexpr int kNRows = 0, kNCols = 0, kMaxBin = 256;
constexpr float kSparsity = 0;
auto dmat = RandomDataGenerator(kNRows, kNCols, kSparsity).GenerateDMatrix();
Context ctx{MakeCUDACtx(0)};
auto ctx = MakeCUDACtx(0);
auto& page = *dmat->GetBatches<EllpackPage>(
&ctx, BatchParam{kMaxBin, tree::TrainParam::DftSparseThreshold()})
.begin();
Expand Down Expand Up @@ -94,7 +94,7 @@ TEST(EllpackPage, FromCategoricalBasic) {
Context ctx{MakeCUDACtx(0)};
auto p = BatchParam{max_bins, tree::TrainParam::DftSparseThreshold()};
auto ellpack = EllpackPage(&ctx, m.get(), p);
auto accessor = ellpack.Impl()->GetDeviceAccessor(ctx.Device());
auto accessor = ellpack.Impl()->GetDeviceAccessor(&ctx);
ASSERT_EQ(kCats, accessor.NumBins());

auto x_copy = x;
Expand Down Expand Up @@ -167,11 +167,11 @@ TEST(EllpackPage, Copy) {
EXPECT_EQ(impl->base_rowid, current_row);

for (size_t i = 0; i < impl->Size(); i++) {
dh::LaunchN(kCols, ReadRowFunction(impl->GetDeviceAccessor(ctx.Device()), current_row,
row_d.data().get()));
dh::LaunchN(kCols,
ReadRowFunction(impl->GetDeviceAccessor(&ctx), current_row, row_d.data().get()));
thrust::copy(row_d.begin(), row_d.end(), row.begin());

dh::LaunchN(kCols, ReadRowFunction(result.GetDeviceAccessor(ctx.Device()), current_row,
dh::LaunchN(kCols, ReadRowFunction(result.GetDeviceAccessor(&ctx), current_row,
row_result_d.data().get()));
thrust::copy(row_result_d.begin(), row_result_d.end(), row_result.begin());

Expand Down Expand Up @@ -223,12 +223,12 @@ TEST(EllpackPage, Compact) {
continue;
}

dh::LaunchN(kCols, ReadRowFunction(impl->GetDeviceAccessor(ctx.Device()), current_row,
row_d.data().get()));
dh::LaunchN(kCols,
ReadRowFunction(impl->GetDeviceAccessor(&ctx), current_row, row_d.data().get()));
dh::safe_cuda(cudaDeviceSynchronize());
thrust::copy(row_d.begin(), row_d.end(), row.begin());

dh::LaunchN(kCols, ReadRowFunction(result.GetDeviceAccessor(ctx.Device()), compacted_row,
dh::LaunchN(kCols, ReadRowFunction(result.GetDeviceAccessor(&ctx), compacted_row,
row_result_d.data().get()));
thrust::copy(row_result_d.begin(), row_result_d.end(), row_result.begin());

Expand Down
Loading
Loading