[EM] Compress dense ellpack. (#10821)
This reduces the memory copying needed for dense data. In addition, it reduces memory usage even when external memory is not used.

- Decouple the number of symbols needed in the compressor from the number of features when the data is dense (see the sketch after this list).
- Remove the fetch call in the `at_end_` iteration.
- Reduce synchronization and kernel launches by using the `uvector` and the `ctx` CUDA stream.
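
To make the decoupling concrete, here is a minimal sketch (not the actual compressor code; `max_bins`, `n_features`, and the bit-width arithmetic are illustrative assumptions) of why a per-feature symbol space shrinks dense ELLPACK entries:

```cpp
#include <cmath>
#include <cstdint>
#include <iostream>

// Bits needed to store one symbol in a bit-packed buffer.
std::uint32_t BitsPerSymbol(std::uint64_t n_symbols) {
  return static_cast<std::uint32_t>(std::ceil(std::log2(static_cast<double>(n_symbols))));
}

int main() {
  std::uint64_t max_bins = 256, n_features = 100;
  // Coupled: one symbol space spanning all features' bins (plus a null symbol).
  auto coupled = BitsPerSymbol(max_bins * n_features + 1);
  // Decoupled: for dense data the feature index is implied by the entry's
  // position within the row, so only the local bin index needs a symbol.
  auto decoupled = BitsPerSymbol(max_bins + 1);
  std::cout << coupled << " vs " << decoupled << " bits per entry\n";  // 15 vs 9
}
```

With 256 bins and 100 features this drops from 15 to 9 bits per entry, which is roughly where both the copy and memory savings come from.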
trivialfis committed Sep 20, 2024
1 parent d5e1c41 commit 24241ed
Showing 28 changed files with 485 additions and 285 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -33,6 +33,7 @@ ipch
 *.filters
 *.user
 *log
+rmm_log.txt
 Debug
 *suo
 .Rhistory
23 changes: 12 additions & 11 deletions src/common/cuda_pinned_allocator.h
@@ -10,6 +10,7 @@

 #include <cstddef>  // for size_t
 #include <limits>   // for numeric_limits
+#include <new>      // for bad_array_new_length
 
 #include "common.h"

@@ -28,14 +29,14 @@ struct PinnedAllocPolicy {
   using size_type = std::size_t;  // NOLINT: The type used for the size of the allocation
   using value_type = T;           // NOLINT: The type of the elements in the allocator
 
-  size_type max_size() const {  // NOLINT
+  [[nodiscard]] constexpr size_type max_size() const {  // NOLINT
     return std::numeric_limits<size_type>::max() / sizeof(value_type);
   }
 
   [[nodiscard]] pointer allocate(size_type cnt, const_pointer = nullptr) const {  // NOLINT
     if (cnt > this->max_size()) {
-      throw std::bad_alloc{};
-    }  // end if
+      throw std::bad_array_new_length{};
+    }
 
     pointer result(nullptr);
     dh::safe_cuda(cudaMallocHost(reinterpret_cast<void**>(&result), cnt * sizeof(value_type)));
@@ -52,14 +53,14 @@ struct ManagedAllocPolicy {
   using size_type = std::size_t;  // NOLINT: The type used for the size of the allocation
   using value_type = T;           // NOLINT: The type of the elements in the allocator
 
-  size_type max_size() const {  // NOLINT
+  [[nodiscard]] constexpr size_type max_size() const {  // NOLINT
     return std::numeric_limits<size_type>::max() / sizeof(value_type);
   }
 
   [[nodiscard]] pointer allocate(size_type cnt, const_pointer = nullptr) const {  // NOLINT
     if (cnt > this->max_size()) {
-      throw std::bad_alloc{};
-    }  // end if
+      throw std::bad_array_new_length{};
+    }
 
     pointer result(nullptr);
     dh::safe_cuda(cudaMallocManaged(reinterpret_cast<void**>(&result), cnt * sizeof(value_type)));
@@ -78,14 +79,14 @@ struct SamAllocPolicy {
   using size_type = std::size_t;  // NOLINT: The type used for the size of the allocation
   using value_type = T;           // NOLINT: The type of the elements in the allocator
 
-  size_type max_size() const {  // NOLINT
+  [[nodiscard]] constexpr size_type max_size() const {  // NOLINT
     return std::numeric_limits<size_type>::max() / sizeof(value_type);
  }
 
   [[nodiscard]] pointer allocate(size_type cnt, const_pointer = nullptr) const {  // NOLINT
     if (cnt > this->max_size()) {
-      throw std::bad_alloc{};
-    }  // end if
+      throw std::bad_array_new_length{};
+    }
 
     size_type n_bytes = cnt * sizeof(value_type);
     pointer result = reinterpret_cast<pointer>(std::malloc(n_bytes));
@@ -139,10 +140,10 @@ class CudaHostAllocatorImpl : public Policy<T> {
 };
 
 template <typename T>
-using PinnedAllocator = CudaHostAllocatorImpl<T, PinnedAllocPolicy>;  // NOLINT
+using PinnedAllocator = CudaHostAllocatorImpl<T, PinnedAllocPolicy>;
 
 template <typename T>
-using ManagedAllocator = CudaHostAllocatorImpl<T, ManagedAllocPolicy>;  // NOLINT
+using ManagedAllocator = CudaHostAllocatorImpl<T, ManagedAllocPolicy>;
 
 template <typename T>
 using SamAllocator = CudaHostAllocatorImpl<T, SamAllocPolicy>;
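
For reference, these policy classes sit behind a standard allocator interface, so ordinary containers can be backed by pinned (page-locked) host memory. A minimal usage sketch, assuming the `xgboost::common::cuda_impl` namespace (the namespace is an assumption; only the aliases appear in this diff):

```cpp
#include <vector>

#include "common/cuda_pinned_allocator.h"  // the header changed above

// std::vector whose storage comes from cudaMallocHost, so host-to-device
// copies from it can run asynchronously. Namespace assumed for illustration.
using PinnedVector = std::vector<float, xgboost::common::cuda_impl::PinnedAllocator<float>>;

void Staging() {
  PinnedVector buf(1024, 0.0f);
  // allocate() first checks the request against max_size(); an impossible
  // request now throws std::bad_array_new_length, which derives from
  // std::bad_alloc, so existing catch sites keep working.
  buf.resize(4096);
}
```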
26 changes: 17 additions & 9 deletions src/common/device_vector.cuh
@@ -177,8 +177,10 @@ struct XGBCachingDeviceAllocatorImpl : XGBBaseDeviceAllocator<T> {
     pointer thrust_ptr;
     if (use_cub_allocator_) {
       T *raw_ptr{nullptr};
+      // NOLINTBEGIN(clang-analyzer-unix.BlockInCriticalSection)
       auto errc = GetGlobalCachingAllocator().DeviceAllocate(reinterpret_cast<void **>(&raw_ptr),
                                                              n * sizeof(T));
+      // NOLINTEND(clang-analyzer-unix.BlockInCriticalSection)
       if (errc != cudaSuccess) {
         detail::ThrowOOMError("Caching allocator", n * sizeof(T));
       }
@@ -290,13 +292,13 @@ LoggingResource *GlobalLoggingResource();
 /**
  * @brief Container class that doesn't initialize the data when RMM is used.
  */
-template <typename T>
-class DeviceUVector {
+template <typename T, bool is_caching>
+class DeviceUVectorImpl {
  private:
 #if defined(XGBOOST_USE_RMM)
   rmm::device_uvector<T> data_{0, rmm::cuda_stream_per_thread, GlobalLoggingResource()};
 #else
-  ::dh::device_vector<T> data_;
+  std::conditional_t<is_caching, ::dh::caching_device_vector<T>, ::dh::device_vector<T>> data_;
 #endif  // defined(XGBOOST_USE_RMM)
 
  public:
@@ -307,12 +309,12 @@ class DeviceUVector {
   using const_reference = value_type const &;  // NOLINT
 
  public:
-  DeviceUVector() = default;
-  explicit DeviceUVector(std::size_t n) { this->resize(n); }
-  DeviceUVector(DeviceUVector const &that) = delete;
-  DeviceUVector &operator=(DeviceUVector const &that) = delete;
-  DeviceUVector(DeviceUVector &&that) = default;
-  DeviceUVector &operator=(DeviceUVector &&that) = default;
+  DeviceUVectorImpl() = default;
+  explicit DeviceUVectorImpl(std::size_t n) { this->resize(n); }
+  DeviceUVectorImpl(DeviceUVectorImpl const &that) = delete;
+  DeviceUVectorImpl &operator=(DeviceUVectorImpl const &that) = delete;
+  DeviceUVectorImpl(DeviceUVectorImpl &&that) = default;
+  DeviceUVectorImpl &operator=(DeviceUVectorImpl &&that) = default;
 
   void resize(std::size_t n) {  // NOLINT
 #if defined(XGBOOST_USE_RMM)
@@ -356,4 +358,10 @@ class DeviceUVector {
   [[nodiscard]] auto data() { return thrust::raw_pointer_cast(data_.data()); }        // NOLINT
   [[nodiscard]] auto data() const { return thrust::raw_pointer_cast(data_.data()); }  // NOLINT
 };
+
+template <typename T>
+using DeviceUVector = DeviceUVectorImpl<T, false>;
+
+template <typename T>
+using CachingDeviceUVector = DeviceUVectorImpl<T, true>;
 }  // namespace dh
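
A short usage sketch of the two new aliases (assumed usage, not taken from this diff; the interface is shared via `DeviceUVectorImpl`):

```cpp
#include <cstddef>  // for size_t
#include <cstdint>  // for uint32_t

#include "common/device_vector.cuh"  // the header changed above

void AllocateBuffers(std::size_t n) {
  // Non-caching variant: dh::device_vector<T> storage in non-RMM builds.
  dh::DeviceUVector<float> values(n);
  // Caching variant: dh::caching_device_vector<T> storage, so repeated
  // per-batch scratch allocations are served from a pool rather than by a
  // fresh cudaMalloc each time.
  dh::CachingDeviceUVector<std::uint32_t> scratch(n);
  // Note: with XGBOOST_USE_RMM defined, both aliases use rmm::device_uvector,
  // so is_caching only affects the non-RMM build (see the #if above).
}
```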
19 changes: 9 additions & 10 deletions src/data/device_adapter.cuh
@@ -1,19 +1,18 @@
 /**
- * Copyright 2019-2023 by XGBoost Contributors
+ * Copyright 2019-2024, XGBoost Contributors
  * \file device_adapter.cuh
  */
 #ifndef XGBOOST_DATA_DEVICE_ADAPTER_H_
 #define XGBOOST_DATA_DEVICE_ADAPTER_H_
 #include <thrust/iterator/counting_iterator.h>  // for make_counting_iterator
 #include <thrust/logical.h>                     // for none_of
 
-#include <cstddef>  // for size_t
+#include <cstddef>  // for size_t
 #include <limits>
 #include <memory>
 #include <string>
 
+#include "../common/cuda_context.cuh"
 #include "../common/device_helpers.cuh"
-#include "../common/math.h"
 #include "adapter.h"
 #include "array_interface.h"
 
@@ -208,11 +207,12 @@ class CupyAdapter : public detail::SingleBatchDataIter<CupyAdapterBatch> {

 // Returns maximum row length
 template <typename AdapterBatchT>
-bst_idx_t GetRowCounts(const AdapterBatchT batch, common::Span<bst_idx_t> offset, DeviceOrd device,
-                       float missing) {
+bst_idx_t GetRowCounts(Context const* ctx, const AdapterBatchT batch,
+                       common::Span<bst_idx_t> offset, DeviceOrd device, float missing) {
   dh::safe_cuda(cudaSetDevice(device.ordinal));
   IsValidFunctor is_valid(missing);
-  dh::safe_cuda(cudaMemsetAsync(offset.data(), '\0', offset.size_bytes()));
+  dh::safe_cuda(
+      cudaMemsetAsync(offset.data(), '\0', offset.size_bytes(), ctx->CUDACtx()->Stream()));
 
   auto n_samples = batch.NumRows();
   bst_feature_t n_features = batch.NumCols();
@@ -230,7 +230,7 @@ bst_idx_t GetRowCounts(const AdapterBatchT batch, common::Span<bst_idx_t> offset
   }
 
   // Count elements per row
-  dh::LaunchN(n_samples * stride, [=] __device__(std::size_t idx) {
+  dh::LaunchN(n_samples * stride, ctx->CUDACtx()->Stream(), [=] __device__(std::size_t idx) {
     bst_idx_t cnt{0};
     auto [ridx, fbeg] = linalg::UnravelIndex(idx, n_samples, stride);
     SPAN_CHECK(ridx < n_samples);
@@ -244,9 +244,8 @@ bst_idx_t GetRowCounts(const AdapterBatchT batch, common::Span<bst_idx_t> offset
                              &offset[ridx]),
               static_cast<unsigned long long>(cnt));  // NOLINT
   });
-  dh::XGBCachingDeviceAllocator<char> alloc;
   bst_idx_t row_stride =
-      dh::Reduce(thrust::cuda::par(alloc), thrust::device_pointer_cast(offset.data()),
+      dh::Reduce(ctx->CUDACtx()->CTP(), thrust::device_pointer_cast(offset.data()),
                  thrust::device_pointer_cast(offset.data()) + offset.size(),
                  static_cast<bst_idx_t>(0), thrust::maximum<bst_idx_t>());
   return row_stride;
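
With the new `Context` parameter, the memset, the counting kernel, and the final max-reduction all run on the same CUDA stream instead of the legacy default stream. A hedged call-site sketch (helper names outside this diff are illustrative):

```cpp
#include "../common/device_helpers.cuh"
#include "device_adapter.cuh"

namespace xgboost::data {
// Illustrative wrapper: count valid entries per row, return the maximum.
template <typename AdapterBatchT>
bst_idx_t MaxRowLength(Context const* ctx, AdapterBatchT const& batch, float missing) {
  dh::device_vector<bst_idx_t> row_counts(batch.NumRows() + 1, 0);
  // Zeroing, counting, and the thrust::maximum reduction are all ordered on
  // ctx->CUDACtx()->Stream(); the CTP() policy replaces the ad-hoc
  // XGBCachingDeviceAllocator the old code created on every call.
  return GetRowCounts(ctx, batch, dh::ToSpan(row_counts), ctx->Device(), missing);
}
}  // namespace xgboost::data
```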
