[EM] Refactor ellpack construction.
- Remove the calculation of n_symbols in the accessor.
- Pass the context into various ctors.
- Specialization for dense data to prepare for further compression.
trivialfis committed Sep 8, 2024
1 parent c69c4ad commit 81e5c0f
Showing 15 changed files with 187 additions and 157 deletions.
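Pieced together from the headers below, the refactor moves the symbol count (the number of histogram bins plus one for the null value that marks a missing entry) off the accessor and onto the page itself: EllpackPageImpl gains NumSymbols()/SetNumSymbols(), the accessor receives the count through its constructor together with a Context pointer, and dense data gets its own construction path. Fewer symbols mean fewer bits per compressed entry, which is presumably the "further compression" the last bullet prepares for. A self-contained sketch of that arithmetic, assuming SymbolBits(n) behaves like ceil(log2(n)) as in src/common/compressed_iterator.h (illustrative code, not part of the commit):

#include <cmath>
#include <cstddef>
#include <iostream>

// Bits needed to encode one of num_symbols distinct values (assumed behaviour
// of common::detail::SymbolBits).
std::size_t SymbolBits(std::size_t num_symbols) {
  return static_cast<std::size_t>(std::ceil(std::log2(static_cast<double>(num_symbols))));
}

int main() {
  std::size_t n_bins = 256;
  // Sparse page: gidx values 0..255 are real bins and 256 is the null value,
  // so the compressed buffer must hold 257 distinct symbols.
  std::cout << SymbolBits(n_bins + 1) << " bits per entry with a null value\n";  // 9
  // Dense page: every entry is a real bin, so no null value is required.
  std::cout << SymbolBits(n_bins) << " bits per entry without a null value\n";   // 8
  return 0;
}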
10 changes: 4 additions & 6 deletions src/common/compressed_iterator.h
@@ -77,13 +77,11 @@ class CompressedBufferWriter {
static size_t CalculateBufferSize(size_t num_elements, size_t num_symbols) {
constexpr int kBitsPerByte = 8;
size_t compressed_size = static_cast<size_t>(std::ceil(
static_cast<double>(detail::SymbolBits(num_symbols) * num_elements) /
kBitsPerByte));
static_cast<double>(detail::SymbolBits(num_symbols) * num_elements) / kBitsPerByte));
// Handle atomicOr where input must be unsigned int, hence 4 bytes aligned.
size_t ret =
std::ceil(static_cast<double>(compressed_size + detail::kPadding) /
static_cast<double>(sizeof(unsigned int))) *
sizeof(unsigned int);
size_t ret = std::ceil(static_cast<double>(compressed_size + detail::kPadding) /
static_cast<double>(sizeof(std::uint32_t))) *
sizeof(std::uint32_t);
return ret;
}

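To make the rounding in CalculateBufferSize concrete (a hand-worked example; the value of detail::kPadding is assumed to be 4 bytes purely for illustration): with 1,000 elements and 257 symbols, SymbolBits(257) = 9, so the compressed payload is ceil(9 * 1000 / 8) = 1125 bytes; adding the padding gives 1129 bytes, and rounding up to a multiple of sizeof(std::uint32_t) = 4 (required because the GPU writer sets bits with atomicOr on 32-bit words) yields a 1132-byte buffer.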
179 changes: 100 additions & 79 deletions src/data/ellpack_page.cu

Large diffs are not rendered by default.

47 changes: 24 additions & 23 deletions src/data/ellpack_page.cuh
@@ -43,20 +43,20 @@ struct EllpackDeviceAccessor {
common::Span<const FeatureType> feature_types;

EllpackDeviceAccessor() = delete;
EllpackDeviceAccessor(DeviceOrd device, std::shared_ptr<const common::HistogramCuts> cuts,
bool is_dense, size_t row_stride, size_t base_rowid, size_t n_rows,
common::CompressedIterator<uint32_t> gidx_iter,
EllpackDeviceAccessor(Context const* ctx, std::shared_ptr<const common::HistogramCuts> cuts,
bool is_dense, bst_idx_t row_stride, bst_idx_t base_rowid, bst_idx_t n_rows,
common::CompressedIterator<uint32_t> gidx_iter, bst_idx_t n_symbols,
common::Span<FeatureType const> feature_types)
: is_dense(is_dense),
row_stride(row_stride),
base_rowid(base_rowid),
n_rows(n_rows),
gidx_iter(gidx_iter),
: is_dense{is_dense},
row_stride{row_stride},
base_rowid{base_rowid},
n_rows{n_rows},
gidx_iter{gidx_iter},
feature_types{feature_types} {
if (device.IsCUDA()) {
cuts->cut_values_.SetDevice(device);
cuts->cut_ptrs_.SetDevice(device);
cuts->min_vals_.SetDevice(device);
if (ctx->IsCUDA()) {
cuts->cut_values_.SetDevice(ctx->Device());
cuts->cut_ptrs_.SetDevice(ctx->Device());
cuts->min_vals_.SetDevice(ctx->Device());
gidx_fvalue_map = cuts->cut_values_.ConstDeviceSpan();
feature_segments = cuts->cut_ptrs_.ConstDeviceSpan();
min_fvalue = cuts->min_vals_.ConstDeviceSpan();
@@ -127,9 +127,6 @@ struct EllpackDeviceAccessor {
[[nodiscard]] __device__ bool IsInRange(size_t row_id) const {
return row_id >= base_rowid && row_id < base_rowid + n_rows;
}
/*! \brief Return the total number of symbols (total number of bins plus 1 for
* not found). */
[[nodiscard]] XGBOOST_DEVICE size_t NumSymbols() const { return gidx_fvalue_map.size() + 1; }

[[nodiscard]] XGBOOST_DEVICE size_t NullValue() const { return this->NumBins(); }

@@ -160,7 +157,7 @@ class EllpackPageImpl {
EllpackPageImpl(Context const* ctx, std::shared_ptr<common::HistogramCuts const> cuts,
bool is_dense, bst_idx_t row_stride, bst_idx_t n_rows);
/**
* @brief Constructor used for external memory.
* @brief Constructor used for external memory with DMatrix.
*/
EllpackPageImpl(Context const* ctx, std::shared_ptr<common::HistogramCuts const> cuts,
const SparsePage& page, bool is_dense, size_t row_stride,
@@ -173,12 +170,14 @@ class EllpackPageImpl {
* in CSR format.
*/
explicit EllpackPageImpl(Context const* ctx, DMatrix* dmat, const BatchParam& parm);

/**
* @brief Constructor for Quantile DMatrix using an adapter.
*/
template <typename AdapterBatch>
explicit EllpackPageImpl(Context const* ctx, AdapterBatch batch, float missing, bool is_dense,
common::Span<size_t> row_counts_span,
common::Span<FeatureType const> feature_types, size_t row_stride,
size_t n_rows, std::shared_ptr<common::HistogramCuts const> cuts);
bst_idx_t n_rows, std::shared_ptr<common::HistogramCuts const> cuts);
/**
* @brief Constructor from an existing CPU gradient index.
*/
@@ -214,7 +213,7 @@ class EllpackPageImpl {

[[nodiscard]] common::HistogramCuts const& Cuts() const { return *cuts_; }
[[nodiscard]] std::shared_ptr<common::HistogramCuts const> CutsShared() const { return cuts_; }
void SetCuts(std::shared_ptr<common::HistogramCuts const> cuts) { cuts_ = cuts; }
void SetCuts(std::shared_ptr<common::HistogramCuts const> cuts);

[[nodiscard]] bool IsDense() const { return is_dense; }
/** @return Estimation of memory cost of this page. */
@@ -224,12 +223,14 @@ class EllpackPageImpl {
* @brief Return the total number of symbols (total number of bins plus 1 for not
* found).
*/
[[nodiscard]] std::size_t NumSymbols() const { return cuts_->TotalBins() + 1; }
[[nodiscard]] std::size_t NumSymbols() const { return this->n_symbols_; }
void SetNumSymbols(bst_idx_t n_symbols) { this->n_symbols_ = n_symbols; }

/**
* @brief Get an accessor that can be passed into CUDA kernels.
*/
[[nodiscard]] EllpackDeviceAccessor GetDeviceAccessor(
DeviceOrd device, common::Span<FeatureType const> feature_types = {}) const;
Context const* ctx, common::Span<FeatureType const> feature_types = {}) const;
/**
* @brief Get an accessor for host code.
*/
@@ -246,10 +247,9 @@ class EllpackPageImpl {
/**
* @brief Compress a single page of CSR data into ELLPACK.
*
* @param device The GPU device to use.
* @param row_batch The CSR page.
*/
void CreateHistIndices(DeviceOrd device, const SparsePage& row_batch,
void CreateHistIndices(Context const* ctx, const SparsePage& row_batch,
common::Span<FeatureType const> feature_types);
/**
* @brief Initialize the buffer to store compressed features.
@@ -272,6 +272,7 @@ class EllpackPageImpl {

private:
std::shared_ptr<common::HistogramCuts const> cuts_;
bst_idx_t n_symbols_{0};
common::Monitor monitor_;
};

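EllpackDeviceAccessor stays a plain view over device memory so that it can be passed to CUDA kernels by value (see the GetDeviceAccessor declaration above); only its construction changes, taking a Context pointer and an explicit symbol count instead of a bare DeviceOrd. A stripped-down, self-contained illustration of the pass-a-view-by-value pattern (plain CUDA; View and Sum are invented for this sketch and are not XGBoost types):

#include <cstddef>
#include <cstdio>
#include <cuda_runtime.h>

// A trivially copyable "accessor": raw device pointers plus sizes, no ownership.
struct View {
  float const* data;
  std::size_t n;
  __device__ float Get(std::size_t i) const { return data[i]; }
};

__global__ void Sum(View v, float* out) {
  float acc = 0.0f;
  for (std::size_t i = 0; i < v.n; ++i) {
    acc += v.Get(i);
  }
  *out = acc;
}

int main() {
  float h[4] = {1.0f, 2.0f, 3.0f, 4.0f};
  float *d = nullptr, *d_out = nullptr;
  cudaMalloc(&d, sizeof(h));
  cudaMalloc(&d_out, sizeof(float));
  cudaMemcpy(d, h, sizeof(h), cudaMemcpyHostToDevice);
  Sum<<<1, 1>>>(View{d, 4}, d_out);  // the view is copied by value into the kernel
  float out = 0.0f;
  cudaMemcpy(&out, d_out, sizeof(float), cudaMemcpyDeviceToHost);
  std::printf("%f\n", out);  // 10.000000
  cudaFree(d);
  cudaFree(d_out);
  return 0;
}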
10 changes: 8 additions & 2 deletions src/data/ellpack_page_raw_format.cu
@@ -55,7 +55,6 @@ template <typename T>
xgboost_NVTX_FN_RANGE();
auto* impl = page->Impl();

impl->SetCuts(this->cuts_);
RET_IF_NOT(fi->Read(&impl->n_rows));
RET_IF_NOT(fi->Read(&impl->is_dense));
RET_IF_NOT(fi->Read(&impl->row_stride));
@@ -66,6 +65,10 @@ template <typename T>
RET_IF_NOT(common::ReadVec(fi, &impl->gidx_buffer));
}
RET_IF_NOT(fi->Read(&impl->base_rowid));
bst_idx_t n_symbols{0};
RET_IF_NOT(fi->Read(&n_symbols));

impl->SetCuts(this->cuts_);
dh::DefaultStream().Sync();
return true;
}
@@ -84,6 +87,8 @@ template <typename T>
[[maybe_unused]] auto h_accessor = impl->GetHostAccessor(&ctx, &h_gidx_buffer);
bytes += common::WriteVec(fo, h_gidx_buffer);
bytes += fo->Write(impl->base_rowid);
bytes += fo->Write(impl->NumSymbols());

dh::DefaultStream().Sync();
return bytes;
}
@@ -93,9 +98,10 @@ template <typename T>

auto* impl = page->Impl();
CHECK(this->cuts_->cut_values_.DeviceCanRead());
impl->SetCuts(this->cuts_);

fi->Read(page, this->param_.prefetch_copy || !this->has_hmm_ats_);
impl->SetCuts(this->cuts_);

dh::DefaultStream().Sync();

return true;
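The raw format now writes the page's NumSymbols() right after base_rowid and reads it back in the same position, before the cuts are attached, so the cached-page layout changes and the write and read paths have to stay in lockstep. A minimal, self-contained illustration of keeping a newly appended field symmetric on both sides (plain C++ iostreams; PageMeta and its fields are invented for this sketch, not the XGBoost stream API):

#include <cstdint>
#include <iostream>
#include <sstream>

struct PageMeta {
  std::uint64_t n_rows{0}, base_rowid{0}, n_symbols{0};  // n_symbols is the newly appended field

  void Write(std::ostream& fo) const {
    fo.write(reinterpret_cast<char const*>(&n_rows), sizeof(n_rows));
    fo.write(reinterpret_cast<char const*>(&base_rowid), sizeof(base_rowid));
    fo.write(reinterpret_cast<char const*>(&n_symbols), sizeof(n_symbols));  // appended last
  }
  void Read(std::istream& fi) {
    fi.read(reinterpret_cast<char*>(&n_rows), sizeof(n_rows));
    fi.read(reinterpret_cast<char*>(&base_rowid), sizeof(base_rowid));
    fi.read(reinterpret_cast<char*>(&n_symbols), sizeof(n_symbols));  // must mirror Write
  }
};

int main() {
  std::stringstream ss;
  PageMeta out{100, 0, 257};
  out.Write(ss);
  PageMeta in;
  in.Read(ss);
  std::cout << in.n_symbols << "\n";  // 257
  return 0;
}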
2 changes: 2 additions & 0 deletions src/data/ellpack_page_source.cu
@@ -81,6 +81,7 @@ class EllpackHostCacheStreamImpl {
new_impl->is_dense = impl->IsDense();
new_impl->row_stride = impl->row_stride;
new_impl->base_rowid = impl->base_rowid;
new_impl->SetNumSymbols(impl->NumSymbols());

dh::safe_cuda(cudaMemcpyAsync(new_impl->gidx_buffer.data(), impl->gidx_buffer.data(),
impl->gidx_buffer.size_bytes(), cudaMemcpyDefault));
@@ -108,6 +109,7 @@ class EllpackHostCacheStreamImpl {
impl->is_dense = page->IsDense();
impl->row_stride = page->row_stride;
impl->base_rowid = page->base_rowid;
impl->SetNumSymbols(page->NumSymbols());
}
};

12 changes: 7 additions & 5 deletions src/data/iterative_dmatrix.cu
@@ -58,9 +58,9 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p,
/**
* Generate gradient index.
*/
size_t offset = 0;
bst_idx_t offset = 0;
iter.Reset();
size_t n_batches_for_verification = 0;
bst_idx_t n_batches_for_verification = 0;
while (iter.Next()) {
init_page();
dh::safe_cuda(cudaSetDevice(dh::GetDevice(ctx).ordinal));
@@ -71,14 +71,16 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p,
return GetRowCounts(value, row_counts_span, dh::GetDevice(ctx), missing);
});
auto is_dense = this->IsDense();
CHECK(is_dense) << "not implemented";

proxy->Info().feature_types.SetDevice(dh::GetDevice(ctx));
auto d_feature_types = proxy->Info().feature_types.ConstDeviceSpan();
auto new_impl = cuda_impl::Dispatch(proxy, [&](auto const& value) {
return EllpackPageImpl(&fmat_ctx_, value, missing, is_dense, row_counts_span, d_feature_types,
ext_info.row_stride, rows, cuts);
return EllpackPageImpl{
&fmat_ctx_, value, missing, is_dense, row_counts_span, d_feature_types,
ext_info.row_stride, rows, cuts};
});
std::size_t num_elements = ellpack_->Impl()->Copy(&fmat_ctx_, &new_impl, offset);
bst_idx_t num_elements = ellpack_->Impl()->Copy(&fmat_ctx_, &new_impl, offset);
offset += num_elements;

proxy->Info().num_row_ = BatchSamples(proxy);
11 changes: 5 additions & 6 deletions src/predictor/gpu_predictor.cu
@@ -927,8 +927,8 @@ class GPUPredictor : public xgboost::Predictor {
for (auto const& page : dmat->GetBatches<EllpackPage>(ctx_, BatchParam{})) {
dmat->Info().feature_types.SetDevice(ctx_->Device());
auto feature_types = dmat->Info().feature_types.ConstDeviceSpan();
this->PredictInternal(page.Impl()->GetDeviceAccessor(ctx_->Device(), feature_types),
d_model, out_preds, batch_offset);
this->PredictInternal(page.Impl()->GetDeviceAccessor(ctx_, feature_types), d_model,
out_preds, batch_offset);
batch_offset += page.Size() * model.learner_model_param->OutputLength();
}
}
@@ -1068,7 +1068,7 @@ class GPUPredictor : public xgboost::Predictor {
}
} else {
for (auto& batch : p_fmat->GetBatches<EllpackPage>(ctx_, {})) {
EllpackDeviceAccessor acc{batch.Impl()->GetDeviceAccessor(ctx_->Device())};
EllpackDeviceAccessor acc{batch.Impl()->GetDeviceAccessor(ctx_)};
auto X = EllpackLoader{acc, true, model.learner_model_param->num_feature, batch.Size(),
std::numeric_limits<float>::quiet_NaN()};
auto begin = dh::tbegin(phis) + batch.BaseRowId() * dim_size;
@@ -1139,8 +1139,7 @@ class GPUPredictor : public xgboost::Predictor {
} else {
for (auto const& batch : p_fmat->GetBatches<EllpackPage>(ctx_, {})) {
auto impl = batch.Impl();
auto acc =
impl->GetDeviceAccessor(ctx_->Device(), p_fmat->Info().feature_types.ConstDeviceSpan());
auto acc = impl->GetDeviceAccessor(ctx_, p_fmat->Info().feature_types.ConstDeviceSpan());
auto begin = dh::tbegin(phis) + batch.BaseRowId() * dim_size;
auto X = EllpackLoader{acc, true, model.learner_model_param->num_feature, batch.Size(),
std::numeric_limits<float>::quiet_NaN()};
@@ -1225,7 +1224,7 @@ class GPUPredictor : public xgboost::Predictor {
} else {
bst_idx_t batch_offset = 0;
for (auto const& batch : p_fmat->GetBatches<EllpackPage>(ctx_, BatchParam{})) {
EllpackDeviceAccessor data{batch.Impl()->GetDeviceAccessor(ctx_->Device())};
EllpackDeviceAccessor data{batch.Impl()->GetDeviceAccessor(ctx_)};
auto grid = static_cast<std::uint32_t>(common::DivRoundUp(batch.Size(), kBlockThreads));
launch(PredictLeafKernel<EllpackLoader, EllpackDeviceAccessor>, grid, data, batch_offset);
batch_offset += batch.Size();
7 changes: 3 additions & 4 deletions src/tree/fit_stump.cu
@@ -9,6 +9,7 @@
#include <cstddef> // std::size_t

#include "../collective/aggregator.cuh" // for GlobalSum
#include "../common/cuda_context.cuh"
#include "../common/device_helpers.cuh" // dh::MakeTransformIterator
#include "fit_stump.h"
#include "xgboost/base.h" // GradientPairPrecise, GradientPair, XGBOOST_DEVICE
@@ -39,17 +40,15 @@ void FitStump(Context const* ctx, MetaInfo const& info,
auto d_sum = sum.View(ctx->Device());
CHECK(d_sum.CContiguous());

dh::XGBCachingDeviceAllocator<char> alloc;
auto policy = thrust::cuda::par(alloc);
thrust::reduce_by_key(policy, key_it, key_it + gpair.Size(), grad_it,
thrust::reduce_by_key(ctx->CUDACtx()->CTP(), key_it, key_it + gpair.Size(), grad_it,
thrust::make_discard_iterator(), dh::tbegin(d_sum.Values()));

auto rc = collective::GlobalSum(ctx, info,
linalg::MakeVec(reinterpret_cast<double*>(d_sum.Values().data()),
d_sum.Size() * 2, ctx->Device()));
SafeColl(rc);

thrust::for_each_n(policy, thrust::make_counting_iterator(0ul), n_targets,
thrust::for_each_n(ctx->CUDACtx()->CTP(), thrust::make_counting_iterator(0ul), n_targets,
[=] XGBOOST_DEVICE(std::size_t i) mutable {
out(i) = static_cast<float>(
CalcUnregularizedWeight(d_sum(i).GetGrad(), d_sum(i).GetHess()));
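The two Thrust calls above now share the caching execution policy owned by the CUDA context (ctx->CUDACtx()->CTP()) instead of a function-local allocator. The reduction itself is unchanged: a segmented sum of gradients per target via reduce_by_key. A self-contained sketch of that pattern (plain Thrust, not XGBoost code; the target-major layout and the index-to-key mapping are assumptions made for the illustration):

#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/discard_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/reduce.h>

#include <cstdio>
#include <vector>

// Maps a flat element index to its target index, assuming a target-major
// layout: all samples of target 0 first, then all samples of target 1, ...
struct TargetIndex {
  std::size_t n_samples;
  __host__ __device__ std::size_t operator()(std::size_t i) const { return i / n_samples; }
};

int main() {
  std::size_t const n_samples = 4, n_targets = 2;
  std::vector<float> h_grad{1, 2, 3, 4, 10, 20, 30, 40};
  thrust::device_vector<float> grad(h_grad.begin(), h_grad.end());
  thrust::device_vector<float> sums(n_targets);

  auto key_it = thrust::make_transform_iterator(thrust::make_counting_iterator<std::size_t>(0),
                                                TargetIndex{n_samples});
  // One segmented sum per target; the output keys are discarded.
  thrust::reduce_by_key(key_it, key_it + grad.size(), grad.begin(),
                        thrust::make_discard_iterator(), sums.begin());

  thrust::host_vector<float> h_sums = sums;
  std::printf("%.0f %.0f\n", h_sums[0], h_sums[1]);  // 10 100
  return 0;
}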
2 changes: 1 addition & 1 deletion src/tree/gpu_hist/histogram.cu
@@ -186,7 +186,7 @@ class HistogramAgent {
// Increases the throughput of this kernel significantly
__device__ void ProcessFullTileShared(std::size_t offset) {
std::size_t idx[kItemsPerThread];
int ridx[kItemsPerThread];
Idx ridx[kItemsPerThread];
int gidx[kItemsPerThread];
GradientPair gpair[kItemsPerThread];
#pragma unroll
7 changes: 4 additions & 3 deletions src/tree/updater_gpu_hist.cu
@@ -338,7 +338,7 @@ struct GPUHistMakerDevice {
monitor.Start(__func__);
auto d_node_hist = histogram_.GetNodeHistogram(nidx);
auto batch = page.Impl();
auto acc = batch->GetDeviceAccessor(ctx_->Device());
auto acc = batch->GetDeviceAccessor(ctx_);

auto d_ridx = partitioners_.at(k)->GetRows(nidx);
this->histogram_.BuildHistogram(ctx_->CUDACtx(), acc,
@@ -497,7 +497,7 @@ struct GPUHistMakerDevice {

std::int32_t k{0};
for (auto const& page : p_fmat->GetBatches<EllpackPage>(ctx_, StaticBatch(prefetch_copy))) {
auto d_matrix = page.Impl()->GetDeviceAccessor(ctx_->Device());
auto d_matrix = page.Impl()->GetDeviceAccessor(ctx_);
auto go_left = GoLeftOp{d_matrix};

// Partition histogram.
@@ -567,9 +567,10 @@ struct GPUHistMakerDevice {
dh::CopyTo(p_tree->GetSplitCategories(), &categories);
auto const& cat_segments = p_tree->GetSplitCategoriesPtr();
auto d_categories = dh::ToSpan(categories);
auto ft = p_fmat->Info().feature_types.ConstDeviceSpan();

for (auto const& page : p_fmat->GetBatches<EllpackPage>(ctx_, StaticBatch(true))) {
auto d_matrix = page.Impl()->GetDeviceAccessor(ctx_->Device());
auto d_matrix = page.Impl()->GetDeviceAccessor(ctx_, ft);

std::vector<NodeSplitData> split_data(p_tree->NumNodes());
auto const& tree = *p_tree;
3 changes: 2 additions & 1 deletion tests/cpp/collective/test_worker.h
@@ -203,7 +203,8 @@ class BaseMGPUTest : public ::testing::Test {
* available.
*/
template <typename Fn>
auto DoTest(Fn&& fn, bool is_federated, [[maybe_unused]] bool emulate_if_single = false) const {
auto DoTest([[maybe_unused]] Fn&& fn, bool is_federated,
[[maybe_unused]] bool emulate_if_single = false) const {
auto n_gpus = common::AllVisibleGPUs();
if (is_federated) {
#if defined(XGBOOST_USE_FEDERATED)
16 changes: 8 additions & 8 deletions tests/cpp/data/test_ellpack_page.cu
@@ -19,7 +19,7 @@ TEST(EllpackPage, EmptyDMatrix) {
constexpr int kNRows = 0, kNCols = 0, kMaxBin = 256;
constexpr float kSparsity = 0;
auto dmat = RandomDataGenerator(kNRows, kNCols, kSparsity).GenerateDMatrix();
Context ctx{MakeCUDACtx(0)};
auto ctx = MakeCUDACtx(0);
auto& page = *dmat->GetBatches<EllpackPage>(
&ctx, BatchParam{kMaxBin, tree::TrainParam::DftSparseThreshold()})
.begin();
@@ -94,7 +94,7 @@ TEST(EllpackPage, FromCategoricalBasic) {
Context ctx{MakeCUDACtx(0)};
auto p = BatchParam{max_bins, tree::TrainParam::DftSparseThreshold()};
auto ellpack = EllpackPage(&ctx, m.get(), p);
auto accessor = ellpack.Impl()->GetDeviceAccessor(ctx.Device());
auto accessor = ellpack.Impl()->GetDeviceAccessor(&ctx);
ASSERT_EQ(kCats, accessor.NumBins());

auto x_copy = x;
@@ -167,11 +167,11 @@ TEST(EllpackPage, Copy) {
EXPECT_EQ(impl->base_rowid, current_row);

for (size_t i = 0; i < impl->Size(); i++) {
dh::LaunchN(kCols, ReadRowFunction(impl->GetDeviceAccessor(ctx.Device()), current_row,
row_d.data().get()));
dh::LaunchN(kCols,
ReadRowFunction(impl->GetDeviceAccessor(&ctx), current_row, row_d.data().get()));
thrust::copy(row_d.begin(), row_d.end(), row.begin());

dh::LaunchN(kCols, ReadRowFunction(result.GetDeviceAccessor(ctx.Device()), current_row,
dh::LaunchN(kCols, ReadRowFunction(result.GetDeviceAccessor(&ctx), current_row,
row_result_d.data().get()));
thrust::copy(row_result_d.begin(), row_result_d.end(), row_result.begin());

@@ -223,12 +223,12 @@ TEST(EllpackPage, Compact) {
continue;
}

dh::LaunchN(kCols, ReadRowFunction(impl->GetDeviceAccessor(ctx.Device()), current_row,
row_d.data().get()));
dh::LaunchN(kCols,
ReadRowFunction(impl->GetDeviceAccessor(&ctx), current_row, row_d.data().get()));
dh::safe_cuda(cudaDeviceSynchronize());
thrust::copy(row_d.begin(), row_d.end(), row.begin());

dh::LaunchN(kCols, ReadRowFunction(result.GetDeviceAccessor(ctx.Device()), compacted_row,
dh::LaunchN(kCols, ReadRowFunction(result.GetDeviceAccessor(&ctx), compacted_row,
row_result_d.data().get()));
thrust::copy(row_result_d.begin(), row_result_d.end(), row_result.begin());
