Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[EM] Refactor ellpack construction. #10810

Merged
merged 2 commits into from
Sep 9, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 4 additions & 6 deletions src/common/compressed_iterator.h
Original file line number Diff line number Diff line change
Expand Up @@ -77,13 +77,11 @@ class CompressedBufferWriter {
/**
 * @brief Compute the number of bytes required to store `num_elements` values
 *        compressed at `SymbolBits(num_symbols)` bits each.
 *
 * The span L40-L51 in the rendered diff contained both the pre- and post-change
 * statements (duplicate declarations of `compressed_size` and `ret`); this is the
 * resolved post-merge version.
 *
 * @param num_elements Number of values to be compressed into the buffer.
 * @param num_symbols  Size of the symbol alphabet; determines bits per symbol.
 * @return Buffer size in bytes, including padding, rounded up to a 4-byte
 *         (`std::uint32_t`) boundary.
 */
static size_t CalculateBufferSize(size_t num_elements, size_t num_symbols) {
  constexpr int kBitsPerByte = 8;
  // Total bits for all symbols, rounded up to whole bytes.
  size_t compressed_size = static_cast<size_t>(std::ceil(
      static_cast<double>(detail::SymbolBits(num_symbols) * num_elements) / kBitsPerByte));
  // Handle atomicOr where input must be unsigned int, hence 4 bytes aligned.
  size_t ret = std::ceil(static_cast<double>(compressed_size + detail::kPadding) /
                         static_cast<double>(sizeof(std::uint32_t))) *
               sizeof(std::uint32_t);
  return ret;
}

Expand Down
178 changes: 99 additions & 79 deletions src/data/ellpack_page.cu

Large diffs are not rendered by default.

45 changes: 23 additions & 22 deletions src/data/ellpack_page.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -43,20 +43,20 @@ struct EllpackDeviceAccessor {
common::Span<const FeatureType> feature_types;

EllpackDeviceAccessor() = delete;
EllpackDeviceAccessor(DeviceOrd device, std::shared_ptr<const common::HistogramCuts> cuts,
bool is_dense, size_t row_stride, size_t base_rowid, size_t n_rows,
EllpackDeviceAccessor(Context const* ctx, std::shared_ptr<const common::HistogramCuts> cuts,
bool is_dense, bst_idx_t row_stride, bst_idx_t base_rowid, bst_idx_t n_rows,
common::CompressedIterator<uint32_t> gidx_iter,
common::Span<FeatureType const> feature_types)
: is_dense(is_dense),
row_stride(row_stride),
base_rowid(base_rowid),
n_rows(n_rows),
gidx_iter(gidx_iter),
: is_dense{is_dense},
row_stride{row_stride},
base_rowid{base_rowid},
n_rows{n_rows},
gidx_iter{gidx_iter},
feature_types{feature_types} {
if (device.IsCUDA()) {
cuts->cut_values_.SetDevice(device);
cuts->cut_ptrs_.SetDevice(device);
cuts->min_vals_.SetDevice(device);
if (ctx->IsCUDA()) {
cuts->cut_values_.SetDevice(ctx->Device());
cuts->cut_ptrs_.SetDevice(ctx->Device());
cuts->min_vals_.SetDevice(ctx->Device());
gidx_fvalue_map = cuts->cut_values_.ConstDeviceSpan();
feature_segments = cuts->cut_ptrs_.ConstDeviceSpan();
min_fvalue = cuts->min_vals_.ConstDeviceSpan();
Expand Down Expand Up @@ -127,9 +127,6 @@ struct EllpackDeviceAccessor {
[[nodiscard]] __device__ bool IsInRange(size_t row_id) const {
return row_id >= base_rowid && row_id < base_rowid + n_rows;
}
/*! \brief Return the total number of symbols (total number of bins plus 1 for
* not found). */
[[nodiscard]] XGBOOST_DEVICE size_t NumSymbols() const { return gidx_fvalue_map.size() + 1; }

[[nodiscard]] XGBOOST_DEVICE size_t NullValue() const { return this->NumBins(); }

Expand Down Expand Up @@ -160,7 +157,7 @@ class EllpackPageImpl {
EllpackPageImpl(Context const* ctx, std::shared_ptr<common::HistogramCuts const> cuts,
bool is_dense, bst_idx_t row_stride, bst_idx_t n_rows);
/**
* @brief Constructor used for external memory.
* @brief Constructor used for external memory with DMatrix.
*/
EllpackPageImpl(Context const* ctx, std::shared_ptr<common::HistogramCuts const> cuts,
const SparsePage& page, bool is_dense, size_t row_stride,
Expand All @@ -173,12 +170,14 @@ class EllpackPageImpl {
* in CSR format.
*/
explicit EllpackPageImpl(Context const* ctx, DMatrix* dmat, const BatchParam& parm);

/**
* @brief Constructor for Quantile DMatrix using an adapter.
*/
template <typename AdapterBatch>
explicit EllpackPageImpl(Context const* ctx, AdapterBatch batch, float missing, bool is_dense,
common::Span<size_t> row_counts_span,
common::Span<FeatureType const> feature_types, size_t row_stride,
size_t n_rows, std::shared_ptr<common::HistogramCuts const> cuts);
bst_idx_t n_rows, std::shared_ptr<common::HistogramCuts const> cuts);
/**
* @brief Constructor from an existing CPU gradient index.
*/
Expand Down Expand Up @@ -214,7 +213,7 @@ class EllpackPageImpl {

[[nodiscard]] common::HistogramCuts const& Cuts() const { return *cuts_; }
[[nodiscard]] std::shared_ptr<common::HistogramCuts const> CutsShared() const { return cuts_; }
void SetCuts(std::shared_ptr<common::HistogramCuts const> cuts) { cuts_ = cuts; }
void SetCuts(std::shared_ptr<common::HistogramCuts const> cuts);

[[nodiscard]] bool IsDense() const { return is_dense; }
/** @return Estimation of memory cost of this page. */
Expand All @@ -224,12 +223,14 @@ class EllpackPageImpl {
* @brief Return the total number of symbols (total number of bins plus 1 for not
* found).
*/
[[nodiscard]] std::size_t NumSymbols() const { return cuts_->TotalBins() + 1; }
[[nodiscard]] std::size_t NumSymbols() const { return this->n_symbols_; }
void SetNumSymbols(bst_idx_t n_symbols) { this->n_symbols_ = n_symbols; }

/**
* @brief Get an accessor that can be passed into CUDA kernels.
*/
[[nodiscard]] EllpackDeviceAccessor GetDeviceAccessor(
DeviceOrd device, common::Span<FeatureType const> feature_types = {}) const;
Context const* ctx, common::Span<FeatureType const> feature_types = {}) const;
/**
* @brief Get an accessor for host code.
*/
Expand All @@ -246,10 +247,9 @@ class EllpackPageImpl {
/**
* @brief Compress a single page of CSR data into ELLPACK.
*
* @param device The GPU device to use.
* @param row_batch The CSR page.
*/
void CreateHistIndices(DeviceOrd device, const SparsePage& row_batch,
void CreateHistIndices(Context const* ctx, const SparsePage& row_batch,
common::Span<FeatureType const> feature_types);
/**
* @brief Initialize the buffer to store compressed features.
Expand All @@ -272,6 +272,7 @@ class EllpackPageImpl {

private:
std::shared_ptr<common::HistogramCuts const> cuts_;
bst_idx_t n_symbols_{0};
common::Monitor monitor_;
};

Expand Down
15 changes: 11 additions & 4 deletions src/data/ellpack_page_raw_format.cu
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,6 @@ template <typename T>
xgboost_NVTX_FN_RANGE();
auto* impl = page->Impl();

impl->SetCuts(this->cuts_);
RET_IF_NOT(fi->Read(&impl->n_rows));
RET_IF_NOT(fi->Read(&impl->is_dense));
RET_IF_NOT(fi->Read(&impl->row_stride));
Expand All @@ -66,6 +65,12 @@ template <typename T>
RET_IF_NOT(common::ReadVec(fi, &impl->gidx_buffer));
}
RET_IF_NOT(fi->Read(&impl->base_rowid));
bst_idx_t n_symbols{0};
RET_IF_NOT(fi->Read(&n_symbols));
impl->SetNumSymbols(n_symbols);

impl->SetCuts(this->cuts_);

dh::DefaultStream().Sync();
return true;
}
Expand All @@ -84,6 +89,8 @@ template <typename T>
[[maybe_unused]] auto h_accessor = impl->GetHostAccessor(&ctx, &h_gidx_buffer);
bytes += common::WriteVec(fo, h_gidx_buffer);
bytes += fo->Write(impl->base_rowid);
bytes += fo->Write(impl->NumSymbols());

dh::DefaultStream().Sync();
return bytes;
}
Expand All @@ -93,9 +100,10 @@ template <typename T>

auto* impl = page->Impl();
CHECK(this->cuts_->cut_values_.DeviceCanRead());
impl->SetCuts(this->cuts_);

fi->Read(page, this->param_.prefetch_copy || !this->has_hmm_ats_);
impl->SetCuts(this->cuts_);

dh::DefaultStream().Sync();

return true;
Expand All @@ -108,8 +116,7 @@ template <typename T>
fo->Write(page);
dh::DefaultStream().Sync();

auto* impl = page.Impl();
return impl->MemCostBytes();
return page.Impl()->MemCostBytes();
}

#undef RET_IF_NOT
Expand Down
2 changes: 2 additions & 0 deletions src/data/ellpack_page_source.cu
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,7 @@ class EllpackHostCacheStreamImpl {
new_impl->is_dense = impl->IsDense();
new_impl->row_stride = impl->row_stride;
new_impl->base_rowid = impl->base_rowid;
new_impl->SetNumSymbols(impl->NumSymbols());

dh::safe_cuda(cudaMemcpyAsync(new_impl->gidx_buffer.data(), impl->gidx_buffer.data(),
impl->gidx_buffer.size_bytes(), cudaMemcpyDefault));
Expand Down Expand Up @@ -108,6 +109,7 @@ class EllpackHostCacheStreamImpl {
impl->is_dense = page->IsDense();
impl->row_stride = page->row_stride;
impl->base_rowid = page->base_rowid;
impl->SetNumSymbols(page->NumSymbols());
}
};

Expand Down
11 changes: 6 additions & 5 deletions src/data/iterative_dmatrix.cu
Original file line number Diff line number Diff line change
Expand Up @@ -58,9 +58,9 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p,
/**
* Generate gradient index.
*/
size_t offset = 0;
bst_idx_t offset = 0;
iter.Reset();
size_t n_batches_for_verification = 0;
bst_idx_t n_batches_for_verification = 0;
while (iter.Next()) {
init_page();
dh::safe_cuda(cudaSetDevice(dh::GetDevice(ctx).ordinal));
Expand All @@ -75,10 +75,11 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p,
proxy->Info().feature_types.SetDevice(dh::GetDevice(ctx));
auto d_feature_types = proxy->Info().feature_types.ConstDeviceSpan();
auto new_impl = cuda_impl::Dispatch(proxy, [&](auto const& value) {
return EllpackPageImpl(&fmat_ctx_, value, missing, is_dense, row_counts_span, d_feature_types,
ext_info.row_stride, rows, cuts);
return EllpackPageImpl{
&fmat_ctx_, value, missing, is_dense, row_counts_span, d_feature_types,
ext_info.row_stride, rows, cuts};
});
std::size_t num_elements = ellpack_->Impl()->Copy(&fmat_ctx_, &new_impl, offset);
bst_idx_t num_elements = ellpack_->Impl()->Copy(&fmat_ctx_, &new_impl, offset);
offset += num_elements;

proxy->Info().num_row_ = BatchSamples(proxy);
Expand Down
11 changes: 5 additions & 6 deletions src/predictor/gpu_predictor.cu
Original file line number Diff line number Diff line change
Expand Up @@ -927,8 +927,8 @@ class GPUPredictor : public xgboost::Predictor {
for (auto const& page : dmat->GetBatches<EllpackPage>(ctx_, BatchParam{})) {
dmat->Info().feature_types.SetDevice(ctx_->Device());
auto feature_types = dmat->Info().feature_types.ConstDeviceSpan();
this->PredictInternal(page.Impl()->GetDeviceAccessor(ctx_->Device(), feature_types),
d_model, out_preds, batch_offset);
this->PredictInternal(page.Impl()->GetDeviceAccessor(ctx_, feature_types), d_model,
out_preds, batch_offset);
batch_offset += page.Size() * model.learner_model_param->OutputLength();
}
}
Expand Down Expand Up @@ -1068,7 +1068,7 @@ class GPUPredictor : public xgboost::Predictor {
}
} else {
for (auto& batch : p_fmat->GetBatches<EllpackPage>(ctx_, {})) {
EllpackDeviceAccessor acc{batch.Impl()->GetDeviceAccessor(ctx_->Device())};
EllpackDeviceAccessor acc{batch.Impl()->GetDeviceAccessor(ctx_)};
auto X = EllpackLoader{acc, true, model.learner_model_param->num_feature, batch.Size(),
std::numeric_limits<float>::quiet_NaN()};
auto begin = dh::tbegin(phis) + batch.BaseRowId() * dim_size;
Expand Down Expand Up @@ -1139,8 +1139,7 @@ class GPUPredictor : public xgboost::Predictor {
} else {
for (auto const& batch : p_fmat->GetBatches<EllpackPage>(ctx_, {})) {
auto impl = batch.Impl();
auto acc =
impl->GetDeviceAccessor(ctx_->Device(), p_fmat->Info().feature_types.ConstDeviceSpan());
auto acc = impl->GetDeviceAccessor(ctx_, p_fmat->Info().feature_types.ConstDeviceSpan());
auto begin = dh::tbegin(phis) + batch.BaseRowId() * dim_size;
auto X = EllpackLoader{acc, true, model.learner_model_param->num_feature, batch.Size(),
std::numeric_limits<float>::quiet_NaN()};
Expand Down Expand Up @@ -1225,7 +1224,7 @@ class GPUPredictor : public xgboost::Predictor {
} else {
bst_idx_t batch_offset = 0;
for (auto const& batch : p_fmat->GetBatches<EllpackPage>(ctx_, BatchParam{})) {
EllpackDeviceAccessor data{batch.Impl()->GetDeviceAccessor(ctx_->Device())};
EllpackDeviceAccessor data{batch.Impl()->GetDeviceAccessor(ctx_)};
auto grid = static_cast<std::uint32_t>(common::DivRoundUp(batch.Size(), kBlockThreads));
launch(PredictLeafKernel<EllpackLoader, EllpackDeviceAccessor>, grid, data, batch_offset);
batch_offset += batch.Size();
Expand Down
7 changes: 3 additions & 4 deletions src/tree/fit_stump.cu
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
#include <cstddef> // std::size_t

#include "../collective/aggregator.cuh" // for GlobalSum
#include "../common/cuda_context.cuh"
#include "../common/device_helpers.cuh" // dh::MakeTransformIterator
#include "fit_stump.h"
#include "xgboost/base.h" // GradientPairPrecise, GradientPair, XGBOOST_DEVICE
Expand Down Expand Up @@ -39,17 +40,15 @@ void FitStump(Context const* ctx, MetaInfo const& info,
auto d_sum = sum.View(ctx->Device());
CHECK(d_sum.CContiguous());

dh::XGBCachingDeviceAllocator<char> alloc;
auto policy = thrust::cuda::par(alloc);
thrust::reduce_by_key(policy, key_it, key_it + gpair.Size(), grad_it,
thrust::reduce_by_key(ctx->CUDACtx()->CTP(), key_it, key_it + gpair.Size(), grad_it,
thrust::make_discard_iterator(), dh::tbegin(d_sum.Values()));

auto rc = collective::GlobalSum(ctx, info,
linalg::MakeVec(reinterpret_cast<double*>(d_sum.Values().data()),
d_sum.Size() * 2, ctx->Device()));
SafeColl(rc);

thrust::for_each_n(policy, thrust::make_counting_iterator(0ul), n_targets,
thrust::for_each_n(ctx->CUDACtx()->CTP(), thrust::make_counting_iterator(0ul), n_targets,
[=] XGBOOST_DEVICE(std::size_t i) mutable {
out(i) = static_cast<float>(
CalcUnregularizedWeight(d_sum(i).GetGrad(), d_sum(i).GetHess()));
Expand Down
2 changes: 1 addition & 1 deletion src/tree/gpu_hist/histogram.cu
Original file line number Diff line number Diff line change
Expand Up @@ -186,7 +186,7 @@ class HistogramAgent {
// Increases the throughput of this kernel significantly
__device__ void ProcessFullTileShared(std::size_t offset) {
std::size_t idx[kItemsPerThread];
int ridx[kItemsPerThread];
Idx ridx[kItemsPerThread];
int gidx[kItemsPerThread];
GradientPair gpair[kItemsPerThread];
#pragma unroll
Expand Down
7 changes: 4 additions & 3 deletions src/tree/updater_gpu_hist.cu
Original file line number Diff line number Diff line change
Expand Up @@ -338,7 +338,7 @@ struct GPUHistMakerDevice {
monitor.Start(__func__);
auto d_node_hist = histogram_.GetNodeHistogram(nidx);
auto batch = page.Impl();
auto acc = batch->GetDeviceAccessor(ctx_->Device());
auto acc = batch->GetDeviceAccessor(ctx_);

auto d_ridx = partitioners_.at(k)->GetRows(nidx);
this->histogram_.BuildHistogram(ctx_->CUDACtx(), acc,
Expand Down Expand Up @@ -497,7 +497,7 @@ struct GPUHistMakerDevice {

std::int32_t k{0};
for (auto const& page : p_fmat->GetBatches<EllpackPage>(ctx_, StaticBatch(prefetch_copy))) {
auto d_matrix = page.Impl()->GetDeviceAccessor(ctx_->Device());
auto d_matrix = page.Impl()->GetDeviceAccessor(ctx_);
auto go_left = GoLeftOp{d_matrix};

// Partition histogram.
Expand Down Expand Up @@ -567,9 +567,10 @@ struct GPUHistMakerDevice {
dh::CopyTo(p_tree->GetSplitCategories(), &categories);
auto const& cat_segments = p_tree->GetSplitCategoriesPtr();
auto d_categories = dh::ToSpan(categories);
auto ft = p_fmat->Info().feature_types.ConstDeviceSpan();

for (auto const& page : p_fmat->GetBatches<EllpackPage>(ctx_, StaticBatch(true))) {
auto d_matrix = page.Impl()->GetDeviceAccessor(ctx_->Device());
auto d_matrix = page.Impl()->GetDeviceAccessor(ctx_, ft);

std::vector<NodeSplitData> split_data(p_tree->NumNodes());
auto const& tree = *p_tree;
Expand Down
3 changes: 2 additions & 1 deletion tests/cpp/collective/test_worker.h
Original file line number Diff line number Diff line change
Expand Up @@ -203,7 +203,8 @@ class BaseMGPUTest : public ::testing::Test {
* available.
*/
template <typename Fn>
auto DoTest(Fn&& fn, bool is_federated, [[maybe_unused]] bool emulate_if_single = false) const {
auto DoTest([[maybe_unused]] Fn&& fn, bool is_federated,
[[maybe_unused]] bool emulate_if_single = false) const {
auto n_gpus = common::AllVisibleGPUs();
if (is_federated) {
#if defined(XGBOOST_USE_FEDERATED)
Expand Down
16 changes: 8 additions & 8 deletions tests/cpp/data/test_ellpack_page.cu
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ TEST(EllpackPage, EmptyDMatrix) {
constexpr int kNRows = 0, kNCols = 0, kMaxBin = 256;
constexpr float kSparsity = 0;
auto dmat = RandomDataGenerator(kNRows, kNCols, kSparsity).GenerateDMatrix();
Context ctx{MakeCUDACtx(0)};
auto ctx = MakeCUDACtx(0);
auto& page = *dmat->GetBatches<EllpackPage>(
&ctx, BatchParam{kMaxBin, tree::TrainParam::DftSparseThreshold()})
.begin();
Expand Down Expand Up @@ -94,7 +94,7 @@ TEST(EllpackPage, FromCategoricalBasic) {
Context ctx{MakeCUDACtx(0)};
auto p = BatchParam{max_bins, tree::TrainParam::DftSparseThreshold()};
auto ellpack = EllpackPage(&ctx, m.get(), p);
auto accessor = ellpack.Impl()->GetDeviceAccessor(ctx.Device());
auto accessor = ellpack.Impl()->GetDeviceAccessor(&ctx);
ASSERT_EQ(kCats, accessor.NumBins());

auto x_copy = x;
Expand Down Expand Up @@ -167,11 +167,11 @@ TEST(EllpackPage, Copy) {
EXPECT_EQ(impl->base_rowid, current_row);

for (size_t i = 0; i < impl->Size(); i++) {
dh::LaunchN(kCols, ReadRowFunction(impl->GetDeviceAccessor(ctx.Device()), current_row,
row_d.data().get()));
dh::LaunchN(kCols,
ReadRowFunction(impl->GetDeviceAccessor(&ctx), current_row, row_d.data().get()));
thrust::copy(row_d.begin(), row_d.end(), row.begin());

dh::LaunchN(kCols, ReadRowFunction(result.GetDeviceAccessor(ctx.Device()), current_row,
dh::LaunchN(kCols, ReadRowFunction(result.GetDeviceAccessor(&ctx), current_row,
row_result_d.data().get()));
thrust::copy(row_result_d.begin(), row_result_d.end(), row_result.begin());

Expand Down Expand Up @@ -223,12 +223,12 @@ TEST(EllpackPage, Compact) {
continue;
}

dh::LaunchN(kCols, ReadRowFunction(impl->GetDeviceAccessor(ctx.Device()), current_row,
row_d.data().get()));
dh::LaunchN(kCols,
ReadRowFunction(impl->GetDeviceAccessor(&ctx), current_row, row_d.data().get()));
dh::safe_cuda(cudaDeviceSynchronize());
thrust::copy(row_d.begin(), row_d.end(), row.begin());

dh::LaunchN(kCols, ReadRowFunction(result.GetDeviceAccessor(ctx.Device()), compacted_row,
dh::LaunchN(kCols, ReadRowFunction(result.GetDeviceAccessor(&ctx), compacted_row,
row_result_d.data().get()));
thrust::copy(row_result_d.begin(), row_result_d.end(), row_result.begin());

Expand Down
Loading
Loading