From 6b8b8592b4f845aa9da6025e9fbb5a4b5a25905f Mon Sep 17 00:00:00 2001
From: Jiaming Yuan
Date: Fri, 13 Sep 2024 18:06:01 +0800
Subject: [PATCH 1/2] [EM] Support quantile objectives for GPU-based external
 memory.

- Improved error message for memory usage.
- Support quantile-based objectives for GPU external memory.
---
 src/common/common.cc                          | 18 ++++
 src/common/common.h                           |  3 +
 src/common/device_helpers.cuh                 | 41 ++++----
 src/common/device_vector.cu                   |  8 +-
 src/common/device_vector.cuh                  |  7 +-
 src/gbm/gbtree.cc                             |  4 -
 src/tree/common_row_partitioner.h             |  5 +-
 src/tree/updater_gpu_hist.cu                  |  7 +-
 .../cpp/tree/gpu_hist/test_row_partitioner.cu | 96 ++++++++++++++++++-
 tests/python-gpu/test_gpu_data_iterator.py    | 25 +++++
 10 files changed, 177 insertions(+), 37 deletions(-)

diff --git a/src/common/common.cc b/src/common/common.cc
index 10a667070da9..ee5c07754427 100644
--- a/src/common/common.cc
+++ b/src/common/common.cc
@@ -5,9 +5,11 @@
 #include <dmlc/thread_local.h>  // for ThreadLocalStore
 
+#include <cmath>    // for pow
 #include <cstdint>  // for uint8_t
 #include <cstdio>   // for snprintf, size_t
 #include <string>   // for string
+#include <utility>  // for pair
 
 #include "./random.h"  // for GlobalRandomEngine, GlobalRandom
@@ -54,4 +56,20 @@ void EscapeU8(std::string const &string, std::string *p_buffer) {
     }
   }
 }
+
+std::string HumanMemUnit(std::size_t n_bytes) {
+  auto n_bytes_f64 = static_cast<double>(n_bytes);
+  double constexpr k1024 = 1024.0;
+  using P = std::pair<std::int32_t, StringView>;
+  std::stringstream ss;
+  for (auto pu : {P{3, "GB"}, P{2, "MB"}, P{1, "KB"}}) {
+    auto const [power, unit] = pu;
+    if (n_bytes_f64 >= (std::pow(k1024, power))) {
+      ss << (n_bytes_f64 / std::pow(k1024, power)) << unit;
+      return ss.str();
+    }
+  }
+  ss << n_bytes_f64 << "B";
+  return ss.str();
+}
 }  // namespace xgboost::common

diff --git a/src/common/common.h b/src/common/common.h
index 93151670b7be..7cd131e1159a 100644
--- a/src/common/common.h
+++ b/src/common/common.h
@@ -188,5 +188,8 @@ template <typename Indexable>
 XGBOOST_DEVICE size_t LastOf(size_t group, Indexable const &indptr) {
   return indptr[group + 1] - 1;
 }
+
+// Convert the number of bytes to a human readable unit.
+std::string HumanMemUnit(std::size_t n_bytes);
 }  // namespace xgboost::common
 #endif  // XGBOOST_COMMON_COMMON_H_

diff --git a/src/common/device_helpers.cuh b/src/common/device_helpers.cuh
index d7b401f684f2..d3515b5b192e 100644
--- a/src/common/device_helpers.cuh
+++ b/src/common/device_helpers.cuh
@@ -15,8 +15,7 @@
 #include
 #include <cstddef>  // for size_t
 #include
-#include <cub/util_type.cuh>  // for UnitWord
-#include
+#include <cub/util_type.cuh>  // for UnitWord, DoubleBuffer
 #include
 
 #include "common.h"
@@ -635,7 +634,7 @@ size_t SegmentedUnique(const thrust::detail::execution_policy_base<DerivedPolicy>
@@ ... @@
 class CUDAEvent {
-  cudaEvent_t event_;
+  std::unique_ptr<cudaEvent_t, void (*)(cudaEvent_t *)> event_;
 
  public:
-  CUDAEvent() { dh::safe_cuda(cudaEventCreateWithFlags(&event_, cudaEventDisableTiming)); }
-  ~CUDAEvent() {
-    if (event_) {
-      dh::safe_cuda(cudaEventDestroy(event_));
-    }
+  CUDAEvent()
+      : event_{[] {
+                 auto e = new cudaEvent_t;
+                 dh::safe_cuda(cudaEventCreateWithFlags(e, cudaEventDisableTiming));
+                 return e;
+               }(),
+               [](cudaEvent_t *e) {
+                 if (e) {
+                   dh::safe_cuda(cudaEventDestroy(*e));
+                   delete e;
+                 }
+               }} {}
+
+  inline void Record(CUDAStreamView stream);  // NOLINT
+  // Define swap-based ctor to make sure an event is always valid.
+  CUDAEvent(CUDAEvent &&e) : CUDAEvent() { std::swap(this->event_, e.event_); }
+  CUDAEvent &operator=(CUDAEvent &&e) {
+    std::swap(this->event_, e.event_);
+    return *this;
   }
 
-  CUDAEvent(CUDAEvent const &that) = delete;
-  CUDAEvent &operator=(CUDAEvent const &that) = delete;
-
-  inline void Record(CUDAStreamView stream);  // NOLINT
-
-  operator cudaEvent_t() const { return event_; }  // NOLINT
+  operator cudaEvent_t() const { return *event_; }  // NOLINT
+  cudaEvent_t const *data() const { return this->event_.get(); }  // NOLINT
 };
 
 class CUDAStreamView {
@@ -785,7 +794,7 @@ class CUDAStreamView {
 };
 
 inline void CUDAEvent::Record(CUDAStreamView stream) {  // NOLINT
-  dh::safe_cuda(cudaEventRecord(event_, cudaStream_t{stream}));
+  dh::safe_cuda(cudaEventRecord(*event_, cudaStream_t{stream}));
 }
 
 // Changing this has effect on prediction return, where we need to pass the pointer to

diff --git a/src/common/device_vector.cu b/src/common/device_vector.cu
index 50922d8f978e..0cfa947ba2ac 100644
--- a/src/common/device_vector.cu
+++ b/src/common/device_vector.cu
@@ -2,18 +2,20 @@
  * Copyright 2017-2024, XGBoost contributors
  */
 #include "../collective/communicator-inl.h"  // for GetRank
+#include "common.h"                          // for HumanMemUnit
 #include "device_helpers.cuh"                // for CurrentDevice
 #include "device_vector.cuh"
 
 namespace dh {
 namespace detail {
-void ThrowOOMError(std::string const &err, size_t bytes) {
+void ThrowOOMError(std::string const &err, std::size_t bytes) {
   auto device = CurrentDevice();
   auto rank = xgboost::collective::GetRank();
+  using xgboost::common::HumanMemUnit;
   std::stringstream ss;
   ss << "Memory allocation error on worker " << rank << ": " << err << "\n"
-     << "- Free memory: " << dh::AvailableMemory(device) << "\n"
-     << "- Requested memory: " << bytes << std::endl;
+     << "- Free memory: " << HumanMemUnit(dh::AvailableMemory(device)) << "\n"
+     << "- Requested memory: " << HumanMemUnit(bytes) << std::endl;
   LOG(FATAL) << ss.str();
 }
 }  // namespace detail

diff --git a/src/common/device_vector.cuh b/src/common/device_vector.cuh
index 9abcbb1d1a8b..46265c765491 100644
--- a/src/common/device_vector.cuh
+++ b/src/common/device_vector.cuh
@@ -31,7 +31,7 @@
 #include <map>     // for map
 #include <memory>  // for unique_ptr
 
-#include "common.h"  // for safe_cuda
+#include "common.h"  // for safe_cuda, HumanMemUnit
 #include "xgboost/logging.h"
 
 namespace dh {
@@ -97,12 +97,13 @@ class MemoryLogger {
     dh::safe_cuda(cudaGetDevice(&current_device));
     LOG(CONSOLE) << "======== Device " << current_device << " Memory Allocations: "
                  << " ========";
-    LOG(CONSOLE) << "Peak memory usage: " << stats_.peak_allocated_bytes / 1048576 << "MiB";
+    LOG(CONSOLE) << "Peak memory usage: "
+                 << xgboost::common::HumanMemUnit(stats_.peak_allocated_bytes);
     LOG(CONSOLE) << "Number of allocations: " << stats_.num_allocations;
   }
 };
 
-void ThrowOOMError(std::string const &err, size_t bytes);
+void ThrowOOMError(std::string const &err, std::size_t bytes);
 }  // namespace detail
 
 inline detail::MemoryLogger &GlobalMemoryLogger() {

diff --git a/src/gbm/gbtree.cc b/src/gbm/gbtree.cc
index 9ada1ff01eb2..80f319f46e0f 100644
--- a/src/gbm/gbtree.cc
+++ b/src/gbm/gbtree.cc
@@ -218,10 +218,6 @@ void GBTree::DoBoost(DMatrix* p_fmat, linalg::Matrix<GradientPair>* in_gpair,
                      model_.learner_model_param->OutputLength());
   CHECK_NE(n_groups, 0);
 
-  if (!p_fmat->SingleColBlock() && obj->Task().UpdateTreeLeaf() && this->ctx_->IsCUDA()) {
-    LOG(FATAL) << "Current objective doesn't support external memory.";
-  }
-
   // The node position for each row, 1 HDV for each tree in the forest. Note that the
   // position is negated if the row is sampled out.
   std::vector<HostDeviceVector<bst_node_t>> node_position;

diff --git a/src/tree/common_row_partitioner.h b/src/tree/common_row_partitioner.h
index 3e7c1123f46c..281861a367a1 100644
--- a/src/tree/common_row_partitioner.h
+++ b/src/tree/common_row_partitioner.h
@@ -148,9 +148,10 @@ class CommonRowPartitioner {
   template <typename ExpandEntry, typename GHistIndexMatrixT>
   static void FindSplitConditions(const std::vector<ExpandEntry>& nodes, const RegTree& tree,
                                   GHistIndexMatrixT const& gmat,
-                                  std::vector<int32_t>* split_conditions) {
+                                  std::vector<int32_t>* p_split_conditions) {
     auto const& ptrs = gmat.cut.Ptrs();
     auto const& vals = gmat.cut.Values();
+    auto& split_conditions = *p_split_conditions;
 
     for (std::size_t i = 0; i < nodes.size(); ++i) {
       bst_node_t const nidx = nodes[i].nid;
@@ -167,7 +168,7 @@ class CommonRowPartitioner {
           split_cond = static_cast<int32_t>(bound);
         }
       }
-      (*split_conditions)[i] = split_cond;
+      split_conditions[i] = split_cond;
     }
   }

diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu
index 95db64f60632..283a8af1b62a 100644
--- a/src/tree/updater_gpu_hist.cu
+++ b/src/tree/updater_gpu_hist.cu
@@ -520,12 +520,11 @@ struct GPUHistMakerDevice {
   // prediction cache
   void FinalisePosition(DMatrix* p_fmat, RegTree const* p_tree, ObjInfo task,
                         HostDeviceVector<bst_node_t>* p_out_position) {
-    if (!p_fmat->SingleColBlock() && task.UpdateTreeLeaf()) {
-      LOG(FATAL) << "Current objective function can not be used with external memory.";
-    }
-
     monitor.Start(__func__);
     if (static_cast<std::size_t>(p_fmat->NumBatches() + 1) != this->batch_ptr_.size()) {
+      if (task.UpdateTreeLeaf()) {
+        LOG(FATAL) << "Current objective function can not be used with concatenated pages.";
+      }
       // External memory with concatenation. Not supported.
       p_out_position->Resize(0);
       positions_.clear();

diff --git a/tests/cpp/tree/gpu_hist/test_row_partitioner.cu b/tests/cpp/tree/gpu_hist/test_row_partitioner.cu
index ec8372815a7c..48e916efb53e 100644
--- a/tests/cpp/tree/gpu_hist/test_row_partitioner.cu
+++ b/tests/cpp/tree/gpu_hist/test_row_partitioner.cu
@@ -3,14 +3,21 @@
  */
 #include
 #include
+#include <thrust/sort.h>    // for sort
+#include <thrust/unique.h>  // for unique
+#include <xgboost/base.h>
+#include <xgboost/tree_model.h>  // for RegTree
 
-#include <cstddef>  // for size_t
-#include <cstdint>  // for uint32_t
-#include <vector>   // for vector
+#include <cstddef>   // for size_t
+#include <cstdint>   // for uint32_t
+#include <iterator>  // for distance
+#include <vector>    // for vector
 
+#include "../../../../src/data/ellpack_page.cuh"
+#include "../../../../src/tree/gpu_hist/expand_entry.cuh"  // for GPUExpandEntry
 #include "../../../../src/tree/gpu_hist/row_partitioner.cuh"
-#include "../../helpers.h"
-#include "xgboost/base.h"
+#include "../../../../src/tree/param.h"  // for TrainParam
+#include "../../helpers.h"               // for RandomDataGenerator
 
 namespace xgboost::tree {
 void TestUpdatePositionBatch() {
@@ -91,4 +98,83 @@ TEST(GpuHist, SortPositionBatch) {
   TestSortPositionBatch({0, 1, 2, 3, 4, 5}, {{0, 6}});
   TestSortPositionBatch({0, 1, 2, 3, 4, 5}, {{3, 6}, {0, 2}});
 }
+
+namespace {
+void GetSplit(RegTree* tree, float split_value, std::vector<GPUExpandEntry>* candidates) {
+  CHECK(!tree->IsMultiTarget());
+  tree->ExpandNode(
+      /*nid=*/RegTree::kRoot, /*split_index=*/0, /*split_value=*/split_value,
+      /*default_left=*/true, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
+      /*left_sum=*/0.0f,
+      /*right_sum=*/0.0f);
+  candidates->front().nid = 0;
+  candidates->front().depth = 0;
+  candidates->front().split.fvalue = split_value;
+  candidates->front().split.findex = 0;
+}
+
+void TestExternalMemory() {
+  auto ctx = MakeCUDACtx(0);
+
+  bst_bin_t max_bin = 32;
+  auto p_fmat =
+      RandomDataGenerator{256, 16, 0.0f}.Batches(4).GenerateSparsePageDMatrix("temp", true);
+
+  std::vector<std::unique_ptr<RowPartitioner>> partitioners;
+  RegTree tree;
+  std::vector<GPUExpandEntry> candidates(1);
+
+  auto param = BatchParam{max_bin, TrainParam::DftSparseThreshold()};
+  float split_value{0.0f};
+  bst_feature_t const split_ind = 0;
+  dh::device_vector<bst_node_t> position(p_fmat->Info().num_row_, 0);
+
+  auto encode_op = [=] __device__(bst_idx_t, bst_node_t nidx) {
+    return nidx;
+  };  // NOLINT
+
+  for (auto const& page : p_fmat->GetBatches<EllpackPage>(&ctx, param)) {
+    if (partitioners.empty()) {
+      auto ptr = page.Impl()->Cuts().Ptrs()[split_ind + 1];
+      split_value = page.Impl()->Cuts().Values().at(ptr / 2);
+      GetSplit(&tree, split_value, &candidates);
+    }
+
+    partitioners.emplace_back(std::make_unique<RowPartitioner>());
+    partitioners.back()->Reset(&ctx, page.Size(), page.BaseRowId());
+    std::vector<RegTree::Node> splits{tree[0]};
+    auto acc = page.Impl()->GetDeviceAccessor(&ctx);
+    partitioners.back()->UpdatePositionBatch(
+        {0}, {1}, {2}, splits,
+        [=] __device__(bst_idx_t ridx, std::int32_t nidx_in_batch, RegTree::Node const& node) {
+          auto fvalue = acc.GetFvalue(ridx, node.SplitIndex());
+          return fvalue <= node.SplitCond();
+        });
+    partitioners.back()->FinalisePosition(
+        &ctx, dh::ToSpan(position).subspan(page.BaseRowId(), page.Size()), page.BaseRowId(),
+        encode_op);
+  }
+
+  bst_idx_t n_left{0};
+  for (auto const& page : p_fmat->GetBatches<SparsePage>()) {
+    auto batch = page.GetView();
+    for (size_t i = 0; i < batch.Size(); ++i) {
+      if (batch[i][split_ind].fvalue < split_value) {
+        n_left++;
+      }
+    }
+  }
+
+  RegTree::Node node = tree[RegTree::kRoot];
+  auto n_left_pos =
+      thrust::count_if(position.cbegin(), position.cend(),
+                       [=] XGBOOST_DEVICE(bst_node_t v) { return v == node.LeftChild(); });
+  ASSERT_EQ(n_left, n_left_pos);
+  thrust::sort(position.begin(), position.end());
+  auto end_it = thrust::unique(position.begin(), position.end());
+  ASSERT_EQ(std::distance(position.begin(), end_it), 2);
+}
+}  // anonymous namespace
+
+TEST(RowPartitioner, LeafPartitionExternalMemory) { TestExternalMemory(); }
 }  // namespace xgboost::tree

diff --git a/tests/python-gpu/test_gpu_data_iterator.py b/tests/python-gpu/test_gpu_data_iterator.py
index e039e0348c3a..76811675b682 100644
--- a/tests/python-gpu/test_gpu_data_iterator.py
+++ b/tests/python-gpu/test_gpu_data_iterator.py
@@ -70,3 +70,28 @@ def test_extmem_qdm(
     n_samples_per_batch: int, n_features: int, n_batches: int, on_host: bool
 ) -> None:
     check_extmem_qdm(n_samples_per_batch, n_features, n_batches, "cuda", on_host)
+
+
+@given(
+    strategies.integers(1, 64),
+    strategies.integers(1, 8),
+    strategies.integers(1, 4),
+)
+@settings(deadline=None, max_examples=10, print_blob=True)
+def test_quantile_objective(
+    n_samples_per_batch: int, n_features: int, n_batches: int
+) -> None:
+    check_quantile_loss_extmem(
+        n_samples_per_batch,
+        n_features,
+        n_batches,
+        "hist",
+        "cuda",
+    )
+    check_quantile_loss_extmem(
+        n_samples_per_batch,
+        n_features,
+        n_batches,
+        "approx",
+        "cuda",
+    )

From 57d4a92a207317b9c3cefb00a549748b1278f85f Mon Sep 17 00:00:00 2001
From: Jiaming Yuan
Date: Fri, 13 Sep 2024 21:10:19 +0800
Subject: [PATCH 2/2] cpplint.

---
 src/common/common.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/common/common.cc b/src/common/common.cc
index ee5c07754427..4609a93528e2 100644
--- a/src/common/common.cc
+++ b/src/common/common.cc
@@ -63,7 +63,7 @@ std::string HumanMemUnit(std::size_t n_bytes) {
   using P = std::pair<std::int32_t, StringView>;
   std::stringstream ss;
   for (auto pu : {P{3, "GB"}, P{2, "MB"}, P{1, "KB"}}) {
-    auto const [power, unit] = pu;
+    auto const [power, unit] = pu;  // NOLINT
     if (n_bytes_f64 >= (std::pow(k1024, power))) {
      ss << (n_bytes_f64 / std::pow(k1024, power)) << unit;
       return ss.str();
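
The series itself carries no usage example, so here is a minimal standalone sketch (illustrative only, not part of the patches) of the formatting the new `HumanMemUnit` helper gives the OOM report: it walks the units from largest to smallest and prints the first one the byte count reaches. The `char const *` unit alias and the `main` harness are stand-ins for this demo; the in-tree version lives in `src/common/common.cc` as shown above.

```cpp
#include <cmath>     // for pow
#include <cstddef>   // for size_t
#include <iostream>  // for cout
#include <sstream>   // for stringstream
#include <string>    // for string
#include <utility>   // for pair

// Mirrors the HumanMemUnit added in this patch.
std::string HumanMemUnit(std::size_t n_bytes) {
  auto n_bytes_f64 = static_cast<double>(n_bytes);
  double constexpr k1024 = 1024.0;
  using P = std::pair<int, char const *>;  // {power of 1024, unit suffix}
  std::stringstream ss;
  for (auto [power, unit] : {P{3, "GB"}, P{2, "MB"}, P{1, "KB"}}) {
    if (n_bytes_f64 >= std::pow(k1024, power)) {
      ss << (n_bytes_f64 / std::pow(k1024, power)) << unit;
      return ss.str();
    }
  }
  ss << n_bytes_f64 << "B";  // below 1KB, report raw bytes
  return ss.str();
}

int main() {
  std::cout << HumanMemUnit(512) << "\n";         // prints: 512B
  std::cout << HumanMemUnit(2048) << "\n";        // prints: 2KB
  std::cout << HumanMemUnit(3u << 20) << "\n";    // prints: 3MB
  std::cout << HumanMemUnit(5ull << 30) << "\n";  // prints: 5GB (64-bit size_t assumed)
}
```

With this, `ThrowOOMError` reports e.g. "- Requested memory: 1.5GB" instead of a raw byte count.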