Commit

Merge branch 'master' into ranker

trivialfis committed Sep 20, 2024
2 parents da969b7 + 2a37a88 commit b62bb8c
Showing 33 changed files with 545 additions and 336 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -33,6 +33,7 @@ ipch
*.filters
*.user
*log
rmm_log.txt
Debug
*suo
.Rhistory
11 changes: 7 additions & 4 deletions doc/tutorials/model.rst
@@ -3,7 +3,7 @@ Introduction to Boosted Trees
#############################
XGBoost stands for "Extreme Gradient Boosting", where the term "Gradient Boosting" originates from the paper *Greedy Function Approximation: A Gradient Boosting Machine*, by Friedman.

The **gradient boosted trees** has been around for a while, and there are a lot of materials on the topic.
The term **gradient boosted trees** has been around for a while, and there are a lot of materials on the topic.
This tutorial will explain boosted trees in a self-contained and principled way using the elements of supervised learning.
We think this explanation is cleaner, more formal, and motivates the model formulation used in XGBoost.

@@ -119,13 +119,16 @@ Let the following be the objective function (remember it always needs to contain

.. math::
\text{obj} = \sum_{i=1}^n l(y_i, \hat{y}_i^{(t)}) + \sum_{i=1}^t\omega(f_i)
\text{obj} = \sum_{i=1}^n l(y_i, \hat{y}_i^{(t)}) + \sum_{k=1}^t\omega(f_k)
in which :math:`t` is the number of trees in our ensemble.
(Each training step will add one new tree, so that at step :math:`t` the ensemble contains :math:`K=t` trees).
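Illustrative aside (not part of the diff): the objective above is evaluated on the additive prediction, which at step :math:`t` is simply the running sum of the trees fitted so far:

.. math::
  \hat{y}_i^{(t)} = \sum_{k=1}^{t} f_k(x_i) = \hat{y}_i^{(t-1)} + f_t(x_i)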

Additive Training
=================

The first question we want to ask: what are the **parameters** of trees?
You can find that what we need to learn are those functions :math:`f_i`, each containing the structure
You can find that what we need to learn are those functions :math:`f_k`, each containing the structure
of the tree and the leaf scores. Learning the tree structure is much harder than a traditional optimization problem where you can simply take the gradient.
It is intractable to learn all the trees at once.
Instead, we use an additive strategy: fix what we have learned, and add one new tree at a time.
@@ -150,7 +153,7 @@ If we consider using mean squared error (MSE) as our loss function, the objectiv

.. math::
\text{obj}^{(t)} & = \sum_{i=1}^n (y_i - (\hat{y}_i^{(t-1)} + f_t(x_i)))^2 + \sum_{i=1}^t\omega(f_i) \\
\text{obj}^{(t)} & = \sum_{i=1}^n (y_i - (\hat{y}_i^{(t-1)} + f_t(x_i)))^2 + \sum_{k=1}^t\omega(f_k) \\
& = \sum_{i=1}^n [2(\hat{y}_i^{(t-1)} - y_i)f_t(x_i) + f_t(x_i)^2] + \omega(f_t) + \mathrm{constant}
The form of MSE is friendly, with a first order term (usually called the residual) and a quadratic term.
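Illustrative aside (not part of the diff): for a general loss the same "first-order term plus quadratic term" structure is recovered by a second-order Taylor expansion around the previous prediction; for MSE the derivatives below reduce to :math:`g_i = 2(\hat{y}_i^{(t-1)} - y_i)` and :math:`h_i = 2`, matching the expansion above.

.. math::
  \text{obj}^{(t)} \approx \sum_{i=1}^n [l(y_i, \hat{y}_i^{(t-1)}) + g_i f_t(x_i) + \frac{1}{2} h_i f_t^2(x_i)] + \omega(f_t) + \mathrm{constant}

  g_i = \partial_{\hat{y}_i^{(t-1)}} l(y_i, \hat{y}_i^{(t-1)}), \qquad h_i = \partial^2_{\hat{y}_i^{(t-1)}} l(y_i, \hat{y}_i^{(t-1)})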
9 changes: 7 additions & 2 deletions python-package/xgboost/testing/updater.py
@@ -218,8 +218,13 @@ def check_extmem_qdm(
)

booster_it = xgb.train({"device": device}, Xy_it, num_boost_round=8)
X, y, w = it.as_arrays()
Xy = xgb.QuantileDMatrix(X, y, weight=w)
it = tm.IteratorForTest(
*tm.make_batches(
n_samples_per_batch, n_features, n_batches, use_cupy=device != "cpu"
),
cache=None,
)
Xy = xgb.QuantileDMatrix(it)
booster = xgb.train({"device": device}, Xy, num_boost_round=8)

if device == "cpu":
23 changes: 12 additions & 11 deletions src/common/cuda_pinned_allocator.h
@@ -10,6 +10,7 @@

#include <cstddef> // for size_t
#include <limits> // for numeric_limits
#include <new> // for bad_array_new_length

#include "common.h"

@@ -28,14 +29,14 @@ struct PinnedAllocPolicy {
using size_type = std::size_t; // NOLINT: The type used for the size of the allocation
using value_type = T; // NOLINT: The type of the elements in the allocator

size_type max_size() const { // NOLINT
[[nodiscard]] constexpr size_type max_size() const { // NOLINT
return std::numeric_limits<size_type>::max() / sizeof(value_type);
}

[[nodiscard]] pointer allocate(size_type cnt, const_pointer = nullptr) const { // NOLINT
if (cnt > this->max_size()) {
throw std::bad_alloc{};
} // end if
throw std::bad_array_new_length{};
}

pointer result(nullptr);
dh::safe_cuda(cudaMallocHost(reinterpret_cast<void**>(&result), cnt * sizeof(value_type)));
@@ -52,14 +53,14 @@ struct ManagedAllocPolicy {
using size_type = std::size_t; // NOLINT: The type used for the size of the allocation
using value_type = T; // NOLINT: The type of the elements in the allocator

size_type max_size() const { // NOLINT
[[nodiscard]] constexpr size_type max_size() const { // NOLINT
return std::numeric_limits<size_type>::max() / sizeof(value_type);
}

[[nodiscard]] pointer allocate(size_type cnt, const_pointer = nullptr) const { // NOLINT
if (cnt > this->max_size()) {
throw std::bad_alloc{};
} // end if
throw std::bad_array_new_length{};
}

pointer result(nullptr);
dh::safe_cuda(cudaMallocManaged(reinterpret_cast<void**>(&result), cnt * sizeof(value_type)));
@@ -78,14 +79,14 @@ struct SamAllocPolicy {
using size_type = std::size_t; // NOLINT: The type used for the size of the allocation
using value_type = T; // NOLINT: The type of the elements in the allocator

size_type max_size() const { // NOLINT
[[nodiscard]] constexpr size_type max_size() const { // NOLINT
return std::numeric_limits<size_type>::max() / sizeof(value_type);
}

[[nodiscard]] pointer allocate(size_type cnt, const_pointer = nullptr) const { // NOLINT
if (cnt > this->max_size()) {
throw std::bad_alloc{};
} // end if
throw std::bad_array_new_length{};
}

size_type n_bytes = cnt * sizeof(value_type);
pointer result = reinterpret_cast<pointer>(std::malloc(n_bytes));
@@ -139,10 +140,10 @@ class CudaHostAllocatorImpl : public Policy<T> {
};

template <typename T>
using PinnedAllocator = CudaHostAllocatorImpl<T, PinnedAllocPolicy>; // NOLINT
using PinnedAllocator = CudaHostAllocatorImpl<T, PinnedAllocPolicy>;

template <typename T>
using ManagedAllocator = CudaHostAllocatorImpl<T, ManagedAllocPolicy>; // NOLINT
using ManagedAllocator = CudaHostAllocatorImpl<T, ManagedAllocPolicy>;

template <typename T>
using SamAllocator = CudaHostAllocatorImpl<T, SamAllocPolicy>;
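Illustrative aside (not part of the diff): a minimal, host-only sketch of the allocation-policy pattern touched above, with a hypothetical HostAllocPolicy and std::malloc standing in for cudaMallocHost. It shows the length check that this commit changes to throw std::bad_array_new_length (the standard exception for an over-long array request) instead of the generic std::bad_alloc, which stays reserved for genuine out-of-memory failures.

#include <cstddef>  // for size_t
#include <cstdlib>  // for malloc, free
#include <limits>   // for numeric_limits
#include <new>      // for bad_alloc, bad_array_new_length

template <typename T>
struct HostAllocPolicy {
  using pointer = T*;
  using size_type = std::size_t;

  // Largest element count whose byte size does not overflow size_type.
  [[nodiscard]] constexpr size_type max_size() const {
    return std::numeric_limits<size_type>::max() / sizeof(T);
  }

  [[nodiscard]] pointer allocate(size_type cnt) const {
    if (cnt > this->max_size()) {
      // Over-long array request: report a length error, not an OOM.
      throw std::bad_array_new_length{};
    }
    auto* result = static_cast<pointer>(std::malloc(cnt * sizeof(T)));
    if (result == nullptr) {
      throw std::bad_alloc{};  // genuine out-of-memory
    }
    return result;
  }

  void deallocate(pointer p, size_type) const { std::free(p); }
};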
26 changes: 17 additions & 9 deletions src/common/device_vector.cuh
@@ -177,8 +177,10 @@ struct XGBCachingDeviceAllocatorImpl : XGBBaseDeviceAllocator<T> {
pointer thrust_ptr;
if (use_cub_allocator_) {
T *raw_ptr{nullptr};
// NOLINTBEGIN(clang-analyzer-unix.BlockInCriticalSection)
auto errc = GetGlobalCachingAllocator().DeviceAllocate(reinterpret_cast<void **>(&raw_ptr),
n * sizeof(T));
// NOLINTEND(clang-analyzer-unix.BlockInCriticalSection)
if (errc != cudaSuccess) {
detail::ThrowOOMError("Caching allocator", n * sizeof(T));
}
@@ -290,13 +292,13 @@ LoggingResource *GlobalLoggingResource();
/**
* @brief Container class that doesn't initialize the data when RMM is used.
*/
template <typename T>
class DeviceUVector {
template <typename T, bool is_caching>
class DeviceUVectorImpl {
private:
#if defined(XGBOOST_USE_RMM)
rmm::device_uvector<T> data_{0, rmm::cuda_stream_per_thread, GlobalLoggingResource()};
#else
::dh::device_vector<T> data_;
std::conditional_t<is_caching, ::dh::caching_device_vector<T>, ::dh::device_vector<T>> data_;
#endif // defined(XGBOOST_USE_RMM)

public:
@@ -307,12 +309,12 @@ class DeviceUVector {
using const_reference = value_type const &; // NOLINT

public:
DeviceUVector() = default;
explicit DeviceUVector(std::size_t n) { this->resize(n); }
DeviceUVector(DeviceUVector const &that) = delete;
DeviceUVector &operator=(DeviceUVector const &that) = delete;
DeviceUVector(DeviceUVector &&that) = default;
DeviceUVector &operator=(DeviceUVector &&that) = default;
DeviceUVectorImpl() = default;
explicit DeviceUVectorImpl(std::size_t n) { this->resize(n); }
DeviceUVectorImpl(DeviceUVectorImpl const &that) = delete;
DeviceUVectorImpl &operator=(DeviceUVectorImpl const &that) = delete;
DeviceUVectorImpl(DeviceUVectorImpl &&that) = default;
DeviceUVectorImpl &operator=(DeviceUVectorImpl &&that) = default;

void resize(std::size_t n) { // NOLINT
#if defined(XGBOOST_USE_RMM)
@@ -356,4 +358,10 @@ class DeviceUVector {
[[nodiscard]] auto data() { return thrust::raw_pointer_cast(data_.data()); } // NOLINT
[[nodiscard]] auto data() const { return thrust::raw_pointer_cast(data_.data()); } // NOLINT
};

template <typename T>
using DeviceUVector = DeviceUVectorImpl<T, false>;

template <typename T>
using CachingDeviceUVector = DeviceUVectorImpl<T, true>;
} // namespace dh
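Illustrative aside (not part of the diff): DeviceUVectorImpl above selects its backing container at compile time via std::conditional_t. A stripped-down, host-only sketch of that pattern, with hypothetical PlainVector/CachingVector stand-ins for the non-caching and caching device vectors:

#include <cstddef>      // for size_t
#include <type_traits>  // for conditional_t
#include <vector>

// Hypothetical stand-ins for the non-caching / caching device vectors.
template <typename T> using PlainVector   = std::vector<T>;
template <typename T> using CachingVector = std::vector<T>;  // imagine this one reuses freed blocks

template <typename T, bool is_caching>
class UVectorImpl {
 private:
  // The backing store is chosen at compile time; no runtime branch, no virtual dispatch.
  std::conditional_t<is_caching, CachingVector<T>, PlainVector<T>> data_;

 public:
  UVectorImpl() = default;
  explicit UVectorImpl(std::size_t n) { this->resize(n); }
  void resize(std::size_t n) { data_.resize(n); }
  [[nodiscard]] std::size_t size() const { return data_.size(); }
  [[nodiscard]] T* data() { return data_.data(); }
};

template <typename T> using UVector        = UVectorImpl<T, false>;
template <typename T> using CachingUVector = UVectorImpl<T, true>;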
19 changes: 9 additions & 10 deletions src/data/device_adapter.cuh
@@ -1,19 +1,18 @@
/**
* Copyright 2019-2023 by XGBoost Contributors
* Copyright 2019-2024, XGBoost Contributors
* \file device_adapter.cuh
*/
#ifndef XGBOOST_DATA_DEVICE_ADAPTER_H_
#define XGBOOST_DATA_DEVICE_ADAPTER_H_
#include <thrust/iterator/counting_iterator.h> // for make_counting_iterator
#include <thrust/logical.h> // for none_of

#include <cstddef> // for size_t
#include <cstddef> // for size_t
#include <limits>
#include <memory>
#include <string>

#include "../common/cuda_context.cuh"
#include "../common/device_helpers.cuh"
#include "../common/math.h"
#include "adapter.h"
#include "array_interface.h"

@@ -208,11 +207,12 @@ class CupyAdapter : public detail::SingleBatchDataIter<CupyAdapterBatch> {

// Returns maximum row length
template <typename AdapterBatchT>
bst_idx_t GetRowCounts(const AdapterBatchT batch, common::Span<bst_idx_t> offset, DeviceOrd device,
float missing) {
bst_idx_t GetRowCounts(Context const* ctx, const AdapterBatchT batch,
common::Span<bst_idx_t> offset, DeviceOrd device, float missing) {
dh::safe_cuda(cudaSetDevice(device.ordinal));
IsValidFunctor is_valid(missing);
dh::safe_cuda(cudaMemsetAsync(offset.data(), '\0', offset.size_bytes()));
dh::safe_cuda(
cudaMemsetAsync(offset.data(), '\0', offset.size_bytes(), ctx->CUDACtx()->Stream()));

auto n_samples = batch.NumRows();
bst_feature_t n_features = batch.NumCols();
@@ -230,7 +230,7 @@ bst_idx_t GetRowCounts(const AdapterBatchT batch, common::Span<bst_idx_t> offset
}

// Count elements per row
dh::LaunchN(n_samples * stride, [=] __device__(std::size_t idx) {
dh::LaunchN(n_samples * stride, ctx->CUDACtx()->Stream(), [=] __device__(std::size_t idx) {
bst_idx_t cnt{0};
auto [ridx, fbeg] = linalg::UnravelIndex(idx, n_samples, stride);
SPAN_CHECK(ridx < n_samples);
Expand All @@ -244,9 +244,8 @@ bst_idx_t GetRowCounts(const AdapterBatchT batch, common::Span<bst_idx_t> offset
&offset[ridx]),
static_cast<unsigned long long>(cnt)); // NOLINT
});
dh::XGBCachingDeviceAllocator<char> alloc;
bst_idx_t row_stride =
dh::Reduce(thrust::cuda::par(alloc), thrust::device_pointer_cast(offset.data()),
dh::Reduce(ctx->CUDACtx()->CTP(), thrust::device_pointer_cast(offset.data()),
thrust::device_pointer_cast(offset.data()) + offset.size(),
static_cast<bst_idx_t>(0), thrust::maximum<bst_idx_t>());
return row_stride;
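Illustrative aside (not part of the diff): the GetRowCounts change above routes the memset, the counting kernel, and the reduction through one CUDA stream taken from the Context instead of the default stream plus an ad-hoc caching allocator. A minimal sketch of the same idea using plain CUDA/Thrust calls, with a hypothetical MaxRowLength helper and the counting kernel elided:

#include <thrust/device_vector.h>
#include <thrust/functional.h>
#include <thrust/reduce.h>
#include <thrust/system/cuda/execution_policy.h>

#include <cstdint>
#include <cuda_runtime.h>

std::uint64_t MaxRowLength(thrust::device_vector<std::uint64_t>& row_counts,
                           cudaStream_t stream) {
  // Zero the per-row counters on the same stream that later work uses,
  // so no cross-stream synchronization is required.
  cudaMemsetAsync(thrust::raw_pointer_cast(row_counts.data()), 0,
                  row_counts.size() * sizeof(std::uint64_t), stream);

  // ... a per-row counting kernel would be launched on `stream` here ...

  // Reduce on the same stream; thrust::cuda::par.on(stream) keeps the
  // reduction ordered after the work queued above.
  return thrust::reduce(thrust::cuda::par.on(stream),
                        row_counts.begin(), row_counts.end(),
                        std::uint64_t{0}, thrust::maximum<std::uint64_t>());
}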