Commit

Merge branch 'master' into ranker

trivialfis committed Sep 20, 2024
2 parents da969b7 + 2a37a88 commit b62bb8c
Showing 33 changed files with 545 additions and 336 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -33,6 +33,7 @@ ipch
*.filters
*.user
*log
rmm_log.txt
Debug
*suo
.Rhistory
11 changes: 7 additions & 4 deletions doc/tutorials/model.rst
@@ -3,7 +3,7 @@ Introduction to Boosted Trees
#############################
XGBoost stands for "Extreme Gradient Boosting", where the term "Gradient Boosting" originates from the paper *Greedy Function Approximation: A Gradient Boosting Machine*, by Friedman.

The **gradient boosted trees** has been around for a while, and there are a lot of materials on the topic.
The term **gradient boosted trees** has been around for a while, and there are a lot of materials on the topic.
This tutorial will explain boosted trees in a self-contained and principled way using the elements of supervised learning.
We think this explanation is cleaner, more formal, and motivates the model formulation used in XGBoost.

@@ -119,13 +119,16 @@ Let the following be the objective function (remember it always needs to contain

.. math::
\text{obj} = \sum_{i=1}^n l(y_i, \hat{y}_i^{(t)}) + \sum_{i=1}^t\omega(f_i)
\text{obj} = \sum_{i=1}^n l(y_i, \hat{y}_i^{(t)}) + \sum_{k=1}^t\omega(f_k)
in which :math:`t` is the number of trees in our ensemble.
(Each training step will add one new tree, so that at step :math:`t` the ensemble contains :math:`K=t` trees).
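Illustrative aside (not part of the diff): the objective above is evaluated on the additive prediction, which at step :math:`t` is simply the running sum of the trees fitted so far:

.. math::
  \hat{y}_i^{(t)} = \sum_{k=1}^{t} f_k(x_i) = \hat{y}_i^{(t-1)} + f_t(x_i)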

Additive Training
=================

The first question we want to ask: what are the **parameters** of trees?
You can find that what we need to learn are those functions :math:`f_i`, each containing the structure
You can find that what we need to learn are those functions :math:`f_k`, each containing the structure
of the tree and the leaf scores. Learning the tree structure is much harder than a traditional optimization problem where you can simply take the gradient.
It is intractable to learn all the trees at once.
Instead, we use an additive strategy: fix what we have learned, and add one new tree at a time.
@@ -150,7 +153,7 @@ If we consider using mean squared error (MSE) as our loss function, the objectiv

.. math::
\text{obj}^{(t)} & = \sum_{i=1}^n (y_i - (\hat{y}_i^{(t-1)} + f_t(x_i)))^2 + \sum_{i=1}^t\omega(f_i) \\
\text{obj}^{(t)} & = \sum_{i=1}^n (y_i - (\hat{y}_i^{(t-1)} + f_t(x_i)))^2 + \sum_{k=1}^t\omega(f_k) \\
& = \sum_{i=1}^n [2(\hat{y}_i^{(t-1)} - y_i)f_t(x_i) + f_t(x_i)^2] + \omega(f_t) + \mathrm{constant}
The form of MSE is friendly, with a first order term (usually called the residual) and a quadratic term.
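Illustrative aside (not part of the diff): for a general loss the same "first-order term plus quadratic term" structure is recovered by a second-order Taylor expansion around the previous prediction; for MSE the derivatives below reduce to :math:`g_i = 2(\hat{y}_i^{(t-1)} - y_i)` and :math:`h_i = 2`, matching the expansion above.

.. math::
  \text{obj}^{(t)} \approx \sum_{i=1}^n [l(y_i, \hat{y}_i^{(t-1)}) + g_i f_t(x_i) + \frac{1}{2} h_i f_t^2(x_i)] + \omega(f_t) + \mathrm{constant}

  g_i = \partial_{\hat{y}_i^{(t-1)}} l(y_i, \hat{y}_i^{(t-1)}), \qquad h_i = \partial^2_{\hat{y}_i^{(t-1)}} l(y_i, \hat{y}_i^{(t-1)})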
9 changes: 7 additions & 2 deletions python-package/xgboost/testing/updater.py
@@ -218,8 +218,13 @@ def check_extmem_qdm(
)

booster_it = xgb.train({"device": device}, Xy_it, num_boost_round=8)
X, y, w = it.as_arrays()
Xy = xgb.QuantileDMatrix(X, y, weight=w)
it = tm.IteratorForTest(
*tm.make_batches(
n_samples_per_batch, n_features, n_batches, use_cupy=device != "cpu"
),
cache=None,
)
Xy = xgb.QuantileDMatrix(it)
booster = xgb.train({"device": device}, Xy, num_boost_round=8)

if device == "cpu":
23 changes: 12 additions & 11 deletions src/common/cuda_pinned_allocator.h
@@ -10,6 +10,7 @@

#include <cstddef> // for size_t
#include <limits> // for numeric_limits
#include <new> // for bad_array_new_length

#include "common.h"

@@ -28,14 +29,14 @@ struct PinnedAllocPolicy {
using size_type = std::size_t; // NOLINT: The type used for the size of the allocation
using value_type = T; // NOLINT: The type of the elements in the allocator

size_type max_size() const { // NOLINT
[[nodiscard]] constexpr size_type max_size() const { // NOLINT
return std::numeric_limits<size_type>::max() / sizeof(value_type);
}

[[nodiscard]] pointer allocate(size_type cnt, const_pointer = nullptr) const { // NOLINT
if (cnt > this->max_size()) {
throw std::bad_alloc{};
} // end if
throw std::bad_array_new_length{};
}

pointer result(nullptr);
dh::safe_cuda(cudaMallocHost(reinterpret_cast<void**>(&result), cnt * sizeof(value_type)));
@@ -52,14 +53,14 @@ struct ManagedAllocPolicy {
using size_type = std::size_t; // NOLINT: The type used for the size of the allocation
using value_type = T; // NOLINT: The type of the elements in the allocator

size_type max_size() const { // NOLINT
[[nodiscard]] constexpr size_type max_size() const { // NOLINT
return std::numeric_limits<size_type>::max() / sizeof(value_type);
}

[[nodiscard]] pointer allocate(size_type cnt, const_pointer = nullptr) const { // NOLINT
if (cnt > this->max_size()) {
throw std::bad_alloc{};
} // end if
throw std::bad_array_new_length{};
}

pointer result(nullptr);
dh::safe_cuda(cudaMallocManaged(reinterpret_cast<void**>(&result), cnt * sizeof(value_type)));
@@ -78,14 +79,14 @@ struct SamAllocPolicy {
using size_type = std::size_t; // NOLINT: The type used for the size of the allocation
using value_type = T; // NOLINT: The type of the elements in the allocator

size_type max_size() const { // NOLINT
[[nodiscard]] constexpr size_type max_size() const { // NOLINT
return std::numeric_limits<size_type>::max() / sizeof(value_type);
}

[[nodiscard]] pointer allocate(size_type cnt, const_pointer = nullptr) const { // NOLINT
if (cnt > this->max_size()) {
throw std::bad_alloc{};
} // end if
throw std::bad_array_new_length{};
}

size_type n_bytes = cnt * sizeof(value_type);
pointer result = reinterpret_cast<pointer>(std::malloc(n_bytes));
@@ -139,10 +140,10 @@ class CudaHostAllocatorImpl : public Policy<T> {
};

template <typename T>
using PinnedAllocator = CudaHostAllocatorImpl<T, PinnedAllocPolicy>; // NOLINT
using PinnedAllocator = CudaHostAllocatorImpl<T, PinnedAllocPolicy>;

template <typename T>
using ManagedAllocator = CudaHostAllocatorImpl<T, ManagedAllocPolicy>; // NOLINT
using ManagedAllocator = CudaHostAllocatorImpl<T, ManagedAllocPolicy>;

template <typename T>
using SamAllocator = CudaHostAllocatorImpl<T, SamAllocPolicy>;
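Illustrative aside (not part of the diff): a minimal, host-only sketch of the allocation-policy pattern touched above, with a hypothetical HostAllocPolicy and std::malloc standing in for cudaMallocHost. It shows the length check that this commit changes to throw std::bad_array_new_length (the standard exception for an over-long array request) instead of the generic std::bad_alloc, which stays reserved for genuine out-of-memory failures.

#include <cstddef>  // for size_t
#include <cstdlib>  // for malloc, free
#include <limits>   // for numeric_limits
#include <new>      // for bad_alloc, bad_array_new_length

template <typename T>
struct HostAllocPolicy {
  using pointer = T*;
  using size_type = std::size_t;

  // Largest element count whose byte size does not overflow size_type.
  [[nodiscard]] constexpr size_type max_size() const {
    return std::numeric_limits<size_type>::max() / sizeof(T);
  }

  [[nodiscard]] pointer allocate(size_type cnt) const {
    if (cnt > this->max_size()) {
      // Over-long array request: report a length error, not an OOM.
      throw std::bad_array_new_length{};
    }
    auto* result = static_cast<pointer>(std::malloc(cnt * sizeof(T)));
    if (result == nullptr) {
      throw std::bad_alloc{};  // genuine out-of-memory
    }
    return result;
  }

  void deallocate(pointer p, size_type) const { std::free(p); }
};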
26 changes: 17 additions & 9 deletions src/common/device_vector.cuh
@@ -177,8 +177,10 @@ struct XGBCachingDeviceAllocatorImpl : XGBBaseDeviceAllocator<T> {
pointer thrust_ptr;
if (use_cub_allocator_) {
T *raw_ptr{nullptr};
// NOLINTBEGIN(clang-analyzer-unix.BlockInCriticalSection)
auto errc = GetGlobalCachingAllocator().DeviceAllocate(reinterpret_cast<void **>(&raw_ptr),
n * sizeof(T));
// NOLINTEND(clang-analyzer-unix.BlockInCriticalSection)
if (errc != cudaSuccess) {
detail::ThrowOOMError("Caching allocator", n * sizeof(T));
}
@@ -290,13 +292,13 @@ LoggingResource *GlobalLoggingResource();
/**
* @brief Container class that doesn't initialize the data when RMM is used.
*/
template <typename T>
class DeviceUVector {
template <typename T, bool is_caching>
class DeviceUVectorImpl {
private:
#if defined(XGBOOST_USE_RMM)
rmm::device_uvector<T> data_{0, rmm::cuda_stream_per_thread, GlobalLoggingResource()};
#else
::dh::device_vector<T> data_;
std::conditional_t<is_caching, ::dh::caching_device_vector<T>, ::dh::device_vector<T>> data_;
#endif // defined(XGBOOST_USE_RMM)

public:
@@ -307,12 +309,12 @@ class DeviceUVector {
using const_reference = value_type const &; // NOLINT

public:
DeviceUVector() = default;
explicit DeviceUVector(std::size_t n) { this->resize(n); }
DeviceUVector(DeviceUVector const &that) = delete;
DeviceUVector &operator=(DeviceUVector const &that) = delete;
DeviceUVector(DeviceUVector &&that) = default;
DeviceUVector &operator=(DeviceUVector &&that) = default;
DeviceUVectorImpl() = default;
explicit DeviceUVectorImpl(std::size_t n) { this->resize(n); }
DeviceUVectorImpl(DeviceUVectorImpl const &that) = delete;
DeviceUVectorImpl &operator=(DeviceUVectorImpl const &that) = delete;
DeviceUVectorImpl(DeviceUVectorImpl &&that) = default;
DeviceUVectorImpl &operator=(DeviceUVectorImpl &&that) = default;

void resize(std::size_t n) { // NOLINT
#if defined(XGBOOST_USE_RMM)
@@ -356,4 +358,10 @@ class DeviceUVector {
[[nodiscard]] auto data() { return thrust::raw_pointer_cast(data_.data()); } // NOLINT
[[nodiscard]] auto data() const { return thrust::raw_pointer_cast(data_.data()); } // NOLINT
};

template <typename T>
using DeviceUVector = DeviceUVectorImpl<T, false>;

template <typename T>
using CachingDeviceUVector = DeviceUVectorImpl<T, true>;
} // namespace dh
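Illustrative aside (not part of the diff): DeviceUVectorImpl above selects its backing container at compile time via std::conditional_t. A stripped-down, host-only sketch of that pattern, with hypothetical PlainVector/CachingVector stand-ins for the non-caching and caching device vectors:

#include <cstddef>      // for size_t
#include <type_traits>  // for conditional_t
#include <vector>

// Hypothetical stand-ins for the non-caching / caching device vectors.
template <typename T> using PlainVector   = std::vector<T>;
template <typename T> using CachingVector = std::vector<T>;  // imagine this one reuses freed blocks

template <typename T, bool is_caching>
class UVectorImpl {
 private:
  // The backing store is chosen at compile time; no runtime branch, no virtual dispatch.
  std::conditional_t<is_caching, CachingVector<T>, PlainVector<T>> data_;

 public:
  UVectorImpl() = default;
  explicit UVectorImpl(std::size_t n) { this->resize(n); }
  void resize(std::size_t n) { data_.resize(n); }
  [[nodiscard]] std::size_t size() const { return data_.size(); }
  [[nodiscard]] T* data() { return data_.data(); }
};

template <typename T> using UVector        = UVectorImpl<T, false>;
template <typename T> using CachingUVector = UVectorImpl<T, true>;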
19 changes: 9 additions & 10 deletions src/data/device_adapter.cuh
@@ -1,19 +1,18 @@
/**
* Copyright 2019-2023 by XGBoost Contributors
* Copyright 2019-2024, XGBoost Contributors
* \file device_adapter.cuh
*/
#ifndef XGBOOST_DATA_DEVICE_ADAPTER_H_
#define XGBOOST_DATA_DEVICE_ADAPTER_H_
#include <thrust/iterator/counting_iterator.h> // for make_counting_iterator
#include <thrust/logical.h> // for none_of

#include <cstddef> // for size_t
#include <cstddef> // for size_t
#include <limits>
#include <memory>
#include <string>

#include "../common/cuda_context.cuh"
#include "../common/device_helpers.cuh"
#include "../common/math.h"
#include "adapter.h"
#include "array_interface.h"

@@ -208,11 +207,12 @@ class CupyAdapter : public detail::SingleBatchDataIter<CupyAdapterBatch> {

// Returns maximum row length
template <typename AdapterBatchT>
bst_idx_t GetRowCounts(const AdapterBatchT batch, common::Span<bst_idx_t> offset, DeviceOrd device,
float missing) {
bst_idx_t GetRowCounts(Context const* ctx, const AdapterBatchT batch,
common::Span<bst_idx_t> offset, DeviceOrd device, float missing) {
dh::safe_cuda(cudaSetDevice(device.ordinal));
IsValidFunctor is_valid(missing);
dh::safe_cuda(cudaMemsetAsync(offset.data(), '\0', offset.size_bytes()));
dh::safe_cuda(
cudaMemsetAsync(offset.data(), '\0', offset.size_bytes(), ctx->CUDACtx()->Stream()));

auto n_samples = batch.NumRows();
bst_feature_t n_features = batch.NumCols();
@@ -230,7 +230,7 @@ bst_idx_t GetRowCounts(const AdapterBatchT batch, common::Span<bst_idx_t> offset
}

// Count elements per row
dh::LaunchN(n_samples * stride, [=] __device__(std::size_t idx) {
dh::LaunchN(n_samples * stride, ctx->CUDACtx()->Stream(), [=] __device__(std::size_t idx) {
bst_idx_t cnt{0};
auto [ridx, fbeg] = linalg::UnravelIndex(idx, n_samples, stride);
SPAN_CHECK(ridx < n_samples);
Expand All @@ -244,9 +244,8 @@ bst_idx_t GetRowCounts(const AdapterBatchT batch, common::Span<bst_idx_t> offset
&offset[ridx]),
static_cast<unsigned long long>(cnt)); // NOLINT
});
dh::XGBCachingDeviceAllocator<char> alloc;
bst_idx_t row_stride =
dh::Reduce(thrust::cuda::par(alloc), thrust::device_pointer_cast(offset.data()),
dh::Reduce(ctx->CUDACtx()->CTP(), thrust::device_pointer_cast(offset.data()),
thrust::device_pointer_cast(offset.data()) + offset.size(),
static_cast<bst_idx_t>(0), thrust::maximum<bst_idx_t>());
return row_stride;
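Illustrative aside (not part of the diff): the GetRowCounts change above routes the memset, the counting kernel, and the reduction through one CUDA stream taken from the Context instead of the default stream plus an ad-hoc caching allocator. A minimal sketch of the same idea using plain CUDA/Thrust calls, with a hypothetical MaxRowLength helper and the counting kernel elided:

#include <thrust/device_vector.h>
#include <thrust/functional.h>
#include <thrust/reduce.h>
#include <thrust/system/cuda/execution_policy.h>

#include <cstdint>
#include <cuda_runtime.h>

std::uint64_t MaxRowLength(thrust::device_vector<std::uint64_t>& row_counts,
                           cudaStream_t stream) {
  // Zero the per-row counters on the same stream that later work uses,
  // so no cross-stream synchronization is required.
  cudaMemsetAsync(thrust::raw_pointer_cast(row_counts.data()), 0,
                  row_counts.size() * sizeof(std::uint64_t), stream);

  // ... a per-row counting kernel would be launched on `stream` here ...

  // Reduce on the same stream; thrust::cuda::par.on(stream) keeps the
  // reduction ordered after the work queued above.
  return thrust::reduce(thrust::cuda::par.on(stream),
                        row_counts.begin(), row_counts.end(),
                        std::uint64_t{0}, thrust::maximum<std::uint64_t>());
}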