From 5509db51ac3093ce6e012c9ec1403ed628b5cb6b Mon Sep 17 00:00:00 2001
From: Jiaming Yuan
Date: Mon, 9 Sep 2024 21:54:54 +0800
Subject: [PATCH] [EM] Avoid synchronous calls and unnecessary ATS access.

- Pass context into various functions.
- Factor out some CUDA algorithms.
- Use ATS only for update position.

---
 src/common/algorithm.cuh                      | 38 +++++++++-
 src/common/device_helpers.cuh                 | 69 ++++-------------
 src/common/ranking_utils.cu                   |  4 +-
 src/common/threading_utils.cuh                | 24 +++---
 src/data/ellpack_page.cu                      | 25 +-----
 src/metric/auc.cu                             | 24 +++---
 src/metric/elementwise_metric.cu              |  8 +-
 src/tree/constraints.cu                       | 13 ++--
 src/tree/constraints.cuh                      |  2 +-
 src/tree/gpu_hist/evaluate_splits.cuh         |  8 +-
 src/tree/gpu_hist/evaluator.cu                | 19 +++--
 src/tree/gpu_hist/histogram.cu                |  4 +-
 src/tree/gpu_hist/row_partitioner.cuh         |  9 ++-
 src/tree/updater_gpu_hist.cu                  | 76 +++++++++----------
 tests/cpp/common/test_threading_utils.cu      | 16 ++--
 .../cpp/tree/gpu_hist/test_evaluate_splits.cu | 23 +++---
 16 files changed, 161 insertions(+), 201 deletions(-)

diff --git a/src/common/algorithm.cuh b/src/common/algorithm.cuh
index b0bec3488979..e88eb1f0c9b1 100644
--- a/src/common/algorithm.cuh
+++ b/src/common/algorithm.cuh
@@ -190,8 +190,7 @@ void SegmentedArgMergeSort(Context const *ctx, SegIt seg_begin, SegIt seg_end, V
 }
 
 template <typename U, typename V>
-void ArgSort(xgboost::Context const *ctx, xgboost::common::Span<U> keys,
-             xgboost::common::Span<V> sorted_idx) {
+void ArgSort(Context const *ctx, Span<U> keys, Span<V> sorted_idx) {
   std::size_t bytes = 0;
   auto cuctx = ctx->CUDACtx();
   dh::Iota(sorted_idx, cuctx->Stream());
@@ -272,5 +271,40 @@ void CopyIf(CUDAContext const *cuctx, InIt in_first, InIt in_second, OutIt out_f
     out_first = thrust::copy_if(cuctx->CTP(), begin_input, end_input, out_first, pred);
   }
 }
+
+// Go one level down into the cub::DeviceScan API to set OffsetT to 64 bits, so we don't
+// crash on n > 2^31.
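+//
+// Illustrative usage (not part of the original change; assumes `ctx` is a valid
+// `Context` bound to a CUDA device and `d_in`/`d_out` are device iterators):
+//
+//   auto n = static_cast<std::int64_t>(size);  // 64-bit offset is the point of this wrapper
+//   common::InclusiveScan(ctx, d_in, d_out, cub::Sum{}, n);
+//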
+template <typename InputIteratorT, typename OutputIteratorT, typename ScanOpT, typename OffsetT>
+void InclusiveScan(xgboost::Context const *ctx, InputIteratorT d_in, OutputIteratorT d_out,
+                   ScanOpT scan_op, OffsetT num_items) {
+  auto cuctx = ctx->CUDACtx();
+  std::size_t bytes = 0;
+#if THRUST_MAJOR_VERSION >= 2
+  dh::safe_cuda((
+      cub::DispatchScan<InputIteratorT, OutputIteratorT, ScanOpT, cub::NullType, OffsetT>::Dispatch(
+          nullptr, bytes, d_in, d_out, scan_op, cub::NullType(), num_items, nullptr)));
+#else
+  safe_cuda((
+      cub::DispatchScan<InputIteratorT, OutputIteratorT, ScanOpT, cub::NullType, OffsetT>::Dispatch(
+          nullptr, bytes, d_in, d_out, scan_op, cub::NullType(), num_items, nullptr, false)));
+#endif
+  dh::TemporaryArray<char> storage(bytes);
+#if THRUST_MAJOR_VERSION >= 2
+  dh::safe_cuda((
+      cub::DispatchScan<InputIteratorT, OutputIteratorT, ScanOpT, cub::NullType, OffsetT>::Dispatch(
+          storage.data().get(), bytes, d_in, d_out, scan_op, cub::NullType(), num_items, nullptr)));
+#else
+  safe_cuda((
+      cub::DispatchScan<InputIteratorT, OutputIteratorT, ScanOpT, cub::NullType, OffsetT>::Dispatch(
+          storage.data().get(), bytes, d_in, d_out, scan_op, cub::NullType(), num_items, nullptr,
+          false)));
+#endif
+}
+
+template <typename InputIteratorT, typename OutputIteratorT, typename OffsetT>
+void InclusiveSum(Context const *ctx, InputIteratorT d_in, OutputIteratorT d_out,
+                  OffsetT num_items) {
+  InclusiveScan(ctx, d_in, d_out, cub::Sum{}, num_items);
+}
 }  // namespace xgboost::common
 #endif  // XGBOOST_COMMON_ALGORITHM_CUH_
diff --git a/src/common/device_helpers.cuh b/src/common/device_helpers.cuh
index 2e5fb5cd91b7..d7b401f684f2 100644
--- a/src/common/device_helpers.cuh
+++ b/src/common/device_helpers.cuh
@@ -372,21 +372,6 @@ void CopyDeviceSpanToVector(std::vector *dst, xgboost::common::Span
                             cudaMemcpyDeviceToHost));
 }
 
-template <typename Src, typename Dst>
-void CopyTo(Src const &src, Dst *dst) {
-  if (src.empty()) {
-    dst->clear();
-    return;
-  }
-  dst->resize(src.size());
-  using SVT = std::remove_cv_t<typename Src::value_type>;
-  using DVT = std::remove_cv_t<typename Dst::value_type>;
-  static_assert(std::is_same_v<SVT, DVT>,
-                "Host and device containers must have same value type.");
-  dh::safe_cuda(cudaMemcpyAsync(thrust::raw_pointer_cast(dst->data()), src.data(),
-                                src.size() * sizeof(SVT), cudaMemcpyDefault));
-}
-
 // Keep track of pinned memory allocation
 struct PinnedMemory {
   void *temp_storage{nullptr};
@@ -748,45 +733,6 @@ auto Reduce(Policy policy, InputIt first, InputIt second, Init init, Func reduce
   return aggregate;
 }
 
-// wrapper to avoid integer `num_items`.
-template <typename InputIteratorT, typename OutputIteratorT, typename ScanOpT, typename OffsetT>
-void InclusiveScan(InputIteratorT d_in, OutputIteratorT d_out, ScanOpT scan_op,
-                   OffsetT num_items) {
-  size_t bytes = 0;
-#if THRUST_MAJOR_VERSION >= 2
-  safe_cuda((
-      cub::DispatchScan<InputIteratorT, OutputIteratorT, ScanOpT,
-                        cub::NullType, OffsetT>::Dispatch(nullptr, bytes, d_in, d_out, scan_op,
-                                                          cub::NullType(), num_items, nullptr)));
-#else
-  safe_cuda((
-      cub::DispatchScan<InputIteratorT, OutputIteratorT, ScanOpT,
-                        cub::NullType, OffsetT>::Dispatch(nullptr, bytes, d_in, d_out, scan_op,
-                                                          cub::NullType(), num_items, nullptr,
-                                                          false)));
-#endif
-  TemporaryArray<char> storage(bytes);
-#if THRUST_MAJOR_VERSION >= 2
-  safe_cuda((
-      cub::DispatchScan<InputIteratorT, OutputIteratorT, ScanOpT,
-                        cub::NullType, OffsetT>::Dispatch(storage.data().get(), bytes, d_in,
-                                                          d_out, scan_op, cub::NullType(),
-                                                          num_items, nullptr)));
-#else
-  safe_cuda((
-      cub::DispatchScan<InputIteratorT, OutputIteratorT, ScanOpT,
-                        cub::NullType, OffsetT>::Dispatch(storage.data().get(), bytes, d_in,
-                                                          d_out, scan_op, cub::NullType(),
-                                                          num_items, nullptr, false)));
-#endif
-}
-
-template <typename InputIteratorT, typename OutputIteratorT, typename OffsetT>
-void InclusiveSum(InputIteratorT d_in, OutputIteratorT d_out, OffsetT num_items) {
-  InclusiveScan(d_in, d_out, cub::Sum(), num_items);
-}
-
 class CUDAStreamView;
 
 class CUDAEvent {
@@ -857,8 +803,23 @@ class CUDAStream {
   [[nodiscard]] cudaStream_t Handle() const { return stream_; }
 
   void Sync() { this->View().Sync(); }
+  void Wait(CUDAEvent const &e) { this->View().Wait(e); }
 };
 
+template <typename Src, typename Dst>
+void CopyTo(Src const &src, Dst *dst, CUDAStreamView stream = DefaultStream()) {
+  if (src.empty()) {
+    dst->clear();
+    return;
+  }
+  dst->resize(src.size());
+  using SVT = std::remove_cv_t<typename Src::value_type>;
+  using DVT = std::remove_cv_t<typename Dst::value_type>;
+  static_assert(std::is_same_v<SVT, DVT>, "Host and device containers must have same value type.");
+  dh::safe_cuda(cudaMemcpyAsync(thrust::raw_pointer_cast(dst->data()), src.data(),
+                                src.size() * sizeof(SVT), cudaMemcpyDefault, stream));
+}
+
 inline auto CachingThrustPolicy() {
   XGBCachingDeviceAllocator<char> alloc;
 #if THRUST_MAJOR_VERSION >= 2 || defined(XGBOOST_USE_RMM)
diff --git a/src/common/ranking_utils.cu b/src/common/ranking_utils.cu
index 5ad8a575c468..c67af5571be1 100644
--- a/src/common/ranking_utils.cu
+++ b/src/common/ranking_utils.cu
@@ -1,5 +1,5 @@
 /**
- * Copyright 2023 by XGBoost Contributors
+ * Copyright 2023-2024, XGBoost Contributors
 */
 #include <thrust/functional.h>                  // for maximum
 #include <thrust/iterator/counting_iterator.h>  // for make_counting_iterator
@@ -158,7 +158,7 @@ void RankingCache::InitOnCUDA(Context const* ctx, MetaInfo const& info) {
   auto d_threads_group_ptr = threads_group_ptr_.DeviceSpan();
   if (param_.HasTruncation()) {
     n_cuda_threads_ =
-        common::SegmentedTrapezoidThreads(d_group_ptr, d_threads_group_ptr, Param().NumPair());
+        common::SegmentedTrapezoidThreads(ctx, d_group_ptr, d_threads_group_ptr, Param().NumPair());
   } else {
     auto n_pairs = Param().NumPair();
     dh::LaunchN(n_groups, cuctx->Stream(),
diff --git a/src/common/threading_utils.cuh b/src/common/threading_utils.cuh
index db5fe82f94ac..1a4e29f38645 100644
--- a/src/common/threading_utils.cuh
+++ b/src/common/threading_utils.cuh
@@ -1,20 +1,20 @@
 /**
- * Copyright 2021-2023 by XGBoost Contributors
+ * Copyright 2021-2024, XGBoost Contributors
 */
 #ifndef XGBOOST_COMMON_THREADING_UTILS_CUH_
 #define XGBOOST_COMMON_THREADING_UTILS_CUH_
 
-#include <algorithm>  // std::min
-#include <cstddef>    // std::size_t
+#include <algorithm>  // std::min
+#include <cstddef>    // std::size_t
 
 #include "./math.h"  // Sqr
-#include "common.h"
+#include "algorithm.cuh"       // for InclusiveSum
+#include "common.h"            // for safe_cuda
 #include "device_helpers.cuh"  // LaunchN
 #include "xgboost/base.h"      // XGBOOST_DEVICE
 #include "xgboost/span.h"      // Span
 
-namespace xgboost {
-namespace common {
+namespace xgboost::common {
 /**
  * \param n Number of items (length of the base)
 * \param h height
@@ -43,9 +43,8 @@ XGBOOST_DEVICE inline std::size_t DiscreteTrapezoidArea(std::size_t n, std::size
  * with h <= n
 */
 template <typename U>
-std::size_t SegmentedTrapezoidThreads(xgboost::common::Span<U> group_ptr,
-                                      xgboost::common::Span<std::size_t> out_group_threads_ptr,
-                                      std::size_t h) {
+std::size_t SegmentedTrapezoidThreads(Context const *ctx, Span<U> group_ptr,
+                                      Span<std::size_t> out_group_threads_ptr, std::size_t h) {
   CHECK_GE(group_ptr.size(), 1);
   CHECK_EQ(group_ptr.size(), out_group_threads_ptr.size());
   dh::LaunchN(group_ptr.size(), [=] XGBOOST_DEVICE(std::size_t idx) {
@@ -57,8 +56,8 @@ std::size_t SegmentedTrapezoidThreads(xgboost::common::Span group_ptr,
     std::size_t cnt = static_cast<std::size_t>(group_ptr[idx] - group_ptr[idx - 1]);
     out_group_threads_ptr[idx] = DiscreteTrapezoidArea(cnt, h);
   });
-  dh::InclusiveSum(out_group_threads_ptr.data(), out_group_threads_ptr.data(),
-                   out_group_threads_ptr.size());
+  InclusiveSum(ctx, out_group_threads_ptr.data(), out_group_threads_ptr.data(),
+               out_group_threads_ptr.size());
 
   std::size_t total = 0;
   dh::safe_cuda(cudaMemcpy(&total, out_group_threads_ptr.data() + out_group_threads_ptr.size() - 1,
                            sizeof(total), cudaMemcpyDeviceToHost));
@@ -82,6 +81,5 @@ XGBOOST_DEVICE inline void UnravelTrapeziodIdx(std::size_t i_idx, std::size_t n,
   j = idx - n_elems + i + 1;
 }
 
-}  // namespace common
-}  // namespace xgboost
+}  // namespace xgboost::common
 #endif  // XGBOOST_COMMON_THREADING_UTILS_CUH_
diff --git a/src/data/ellpack_page.cu b/src/data/ellpack_page.cu
index 0f4cf3a2edc8..8f8ab0af7d01 100644
--- a/src/data/ellpack_page.cu
+++ b/src/data/ellpack_page.cu
@@ -254,30 +254,7 @@ void CopyDataToEllpack(Context const* ctx, const AdapterBatchT& batch,
                        d_compressed_buffer, writer, batch, device_accessor, feature_types,
                        is_valid};
   thrust::transform_output_iterator out(discard, functor);
 
-  // Go one level down into cub::DeviceScan API to set OffsetT as 64 bit
-  // So we don't crash on n > 2^31
-  size_t temp_storage_bytes = 0;
-  using DispatchScan = cub::DispatchScan<decltype(key_value_index_iter), decltype(out), TupleScanOp<Tuple>, cub::NullType, std::int64_t>;
-#if THRUST_MAJOR_VERSION >= 2
-  dh::safe_cuda(DispatchScan::Dispatch(nullptr, temp_storage_bytes, key_value_index_iter, out,
-                                       TupleScanOp<Tuple>(), cub::NullType(), batch.Size(),
-                                       ctx->CUDACtx()->Stream()));
-#else
-  DispatchScan::Dispatch(nullptr, temp_storage_bytes, key_value_index_iter, out,
-                         TupleScanOp<Tuple>(), cub::NullType(), batch.Size(),
-                         nullptr, false);
-#endif
-  dh::TemporaryArray<char> temp_storage(temp_storage_bytes);
-#if THRUST_MAJOR_VERSION >= 2
-  dh::safe_cuda(DispatchScan::Dispatch(temp_storage.data().get(), temp_storage_bytes,
-                                       key_value_index_iter, out, TupleScanOp<Tuple>(),
-                                       cub::NullType(), batch.Size(), ctx->CUDACtx()->Stream()));
-#else
-  DispatchScan::Dispatch(temp_storage.data().get(), temp_storage_bytes,
-                         key_value_index_iter, out, TupleScanOp<Tuple>(),
-                         cub::NullType(), batch.Size(), nullptr, false);
-#endif
+  common::InclusiveScan(ctx, key_value_index_iter, out, TupleScanOp<Tuple>{}, batch.Size());
 }
 
 void WriteNullValues(Context const* ctx, EllpackPageImpl* dst, common::Span<size_t> row_counts) {
diff --git a/src/metric/auc.cu b/src/metric/auc.cu
index 4155a7084481..37f089ec6057 100644
--- a/src/metric/auc.cu
+++ b/src/metric/auc.cu
@@ -13,7 +13,7 @@
 #include
 
 #include "../collective/allreduce.h"
-#include "../common/algorithm.cuh"       // SegmentedArgSort
+#include "../common/algorithm.cuh"       // SegmentedArgSort, InclusiveScan
 #include "../common/optional_weight.h"   // OptionalWeights
 #include "../common/threading_utils.cuh" // UnravelTrapeziodIdx,SegmentedTrapezoidThreads
 #include "auc.h"
@@ -128,8 +128,8 @@ std::tuple GPUBinaryAUC(Context const *ctx,
                                                 dh::tbegin(d_unique_idx));
   d_unique_idx = d_unique_idx.subspan(0, end_unique.second - dh::tbegin(d_unique_idx));
 
-  dh::InclusiveScan(dh::tbegin(d_fptp), dh::tbegin(d_fptp),
-                    PairPlus<double, double>{}, d_fptp.size());
+  common::InclusiveScan(ctx, dh::tbegin(d_fptp), dh::tbegin(d_fptp), PairPlus<double, double>{},
+                        d_fptp.size());
 
   auto d_neg_pos = dh::ToSpan(cache->neg_pos);
   // scatter unique negative/positive values
@@ -239,7 +239,7 @@ double ScaleClasses(Context const *ctx, bool is_column_split, common::Span
 
 template <typename Fn>
-void SegmentedFPTP(common::Span<Pair> d_fptp, Fn segment_id) {
+void SegmentedFPTP(Context const *ctx, common::Span<Pair> d_fptp, Fn segment_id) {
   using Triple = thrust::tuple<uint32_t, double, double>;
   // expand to tuple to include idx
   auto fptp_it_in = dh::MakeTransformIterator<Triple>(
@@ -253,8 +253,8 @@ void SegmentedFPTP(common::Span d_fptp, Fn segment_id) {
         thrust::make_pair(thrust::get<1>(t), thrust::get<2>(t));
     return t;
   });
-  dh::InclusiveScan(
-      fptp_it_in, fptp_it_out,
+  common::InclusiveScan(
+      ctx, fptp_it_in, fptp_it_out,
       [=] XGBOOST_DEVICE(Triple const &l, Triple const &r) {
         uint32_t l_gid = segment_id(thrust::get<0>(l));
         uint32_t r_gid = segment_id(thrust::get<0>(r));
@@ -391,7 +391,7 @@ double GPUMultiClassAUCOVR(Context const *ctx, MetaInfo const &info,
   d_unique_idx = d_unique_idx.subspan(0, n_uniques);
 
   auto get_class_id = [=] XGBOOST_DEVICE(size_t idx) { return idx / n_samples; };
-  SegmentedFPTP(d_fptp, get_class_id);
+  SegmentedFPTP(ctx, d_fptp, get_class_id);
 
   // scatter unique FP_PREV/TP_PREV values
   auto d_neg_pos = dh::ToSpan(cache->neg_pos);
@@ -528,8 +528,8 @@ std::pair GPURankingAUC(Context const *ctx, common::Span<
   dh::caching_device_vector<size_t> threads_group_ptr(group_ptr.size(), 0);
   auto d_threads_group_ptr = dh::ToSpan(threads_group_ptr);
   // Use max to represent triangle
-  auto n_threads = common::SegmentedTrapezoidThreads(
-      d_group_ptr, d_threads_group_ptr, std::numeric_limits<std::size_t>::max());
+  auto n_threads = common::SegmentedTrapezoidThreads(ctx, d_group_ptr, d_threads_group_ptr,
+                                                     std::numeric_limits<std::size_t>::max());
   CHECK_LT(n_threads, std::numeric_limits<int32_t>::max());
   // get the coordinate in nested summation
   auto get_i_j = [=] XGBOOST_DEVICE(size_t idx, size_t query_group_idx) {
@@ -591,8 +591,8 @@ std::pair GPURankingAUC(Context const *ctx, common::Span<
     }
     return {};  // discard
   });
-  dh::InclusiveScan(
-      in, out,
+  common::InclusiveScan(
+      ctx, in, out,
       [] XGBOOST_DEVICE(RankScanItem const &l, RankScanItem const &r) {
         if (l.group_id != r.group_id) {
           return r;
@@ -774,7 +774,7 @@ std::pair GPURankingPRAUCImpl(Context const *ctx,
   auto get_group_id = [=] XGBOOST_DEVICE(size_t idx) {
     return dh::SegmentId(d_group_ptr, idx);
   };
-  SegmentedFPTP(d_fptp, get_group_id);
+  SegmentedFPTP(ctx, d_fptp, get_group_id);
 
   // scatter unique FP_PREV/TP_PREV values
   auto d_neg_pos = dh::ToSpan(cache->neg_pos);
diff --git a/src/metric/elementwise_metric.cu b/src/metric/elementwise_metric.cu
index ec5b9079d7d9..7b662143a89b 100644
--- a/src/metric/elementwise_metric.cu
+++ b/src/metric/elementwise_metric.cu
@@ -12,7 +12,6 @@
 #include
 #include <numeric>  // for accumulate
 
-#include "../common/common.h"  // for AssertGPUSupport
 #include "../common/math.h"
 #include "../common/optional_weight.h"  // OptionalWeights
 #include "../common/pseudo_huber.h"
@@ -28,7 +27,9 @@
 #include
 #include
 
-#include "../common/device_helpers.cuh"
+#include "../common/cuda_context.cuh"  // for CUDAContext
+#else
+#include "../common/common.h"  // for AssertGPUSupport
 #endif  // XGBOOST_USE_CUDA
 
 namespace xgboost::metric {
@@ -48,11 +49,10 @@ PackedReduceResult Reduce(Context const* ctx, MetaInfo const& info, Fn&& loss) {
   auto labels = info.labels.View(ctx->Device());
   if (ctx->IsCUDA()) {
 #if defined(XGBOOST_USE_CUDA)
-    dh::XGBCachingDeviceAllocator<char> alloc;
     thrust::counting_iterator<size_t> begin(0);
     thrust::counting_iterator<size_t> end = begin + labels.Size();
     result = thrust::transform_reduce(
-        thrust::cuda::par(alloc), begin, end,
+        ctx->CUDACtx()->CTP(), begin, end,
         [=] XGBOOST_DEVICE(size_t i) {
           auto idx = linalg::UnravelIndex(i, labels.Shape());
           auto sample_id = std::get<0>(idx);
diff --git a/src/tree/constraints.cu b/src/tree/constraints.cu
index b222402fcfce..183c609a4bef 100644
--- a/src/tree/constraints.cu
+++ b/src/tree/constraints.cu
@@ -6,14 +6,15 @@
 #include
 #include
-#include
 #include
+#include
 
-#include "xgboost/logging.h"
-#include "xgboost/span.h"
+#include "../common/cuda_context.cuh"  // for CUDAContext
+#include "../common/device_helpers.cuh"
 #include "constraints.cuh"
 #include "param.h"
-#include "../common/device_helpers.cuh"
+#include "xgboost/logging.h"
+#include "xgboost/span.h"
 
 namespace xgboost {
 
@@ -130,9 +131,9 @@ FeatureInteractionConstraintDevice::FeatureInteractionConstraintDevice(
   this->Configure(param, n_features);
 }
 
-void FeatureInteractionConstraintDevice::Reset() {
+void FeatureInteractionConstraintDevice::Reset(Context const* ctx) {
   for (auto& node : node_constraints_storage_) {
-    thrust::fill(node.begin(), node.end(), 0);
+    thrust::fill(ctx->CUDACtx()->CTP(), node.begin(), node.end(), 0);
   }
 }
 
diff --git a/src/tree/constraints.cuh b/src/tree/constraints.cuh
index 94c262240c19..dfd917277e2e 100644
--- a/src/tree/constraints.cuh
+++ b/src/tree/constraints.cuh
@@ -78,7 +78,7 @@ struct FeatureInteractionConstraintDevice {
   FeatureInteractionConstraintDevice(FeatureInteractionConstraintDevice const& that) = default;
   FeatureInteractionConstraintDevice(FeatureInteractionConstraintDevice&& that) = default;
   /*! \brief Reset before constructing a new tree. */
-  void Reset();
+  void Reset(Context const* ctx);
   /*! \brief Return a list of features given node id */
   common::Span<bst_feature_t> QueryNode(int32_t nid);
   /*!
diff --git a/src/tree/gpu_hist/evaluate_splits.cuh b/src/tree/gpu_hist/evaluate_splits.cuh
index 4be8e108f0ce..19e8f2f931d6 100644
--- a/src/tree/gpu_hist/evaluate_splits.cuh
+++ b/src/tree/gpu_hist/evaluate_splits.cuh
@@ -138,9 +138,9 @@ class GPUHistEvaluator {
   /**
    * \brief Reset the evaluator, should be called before any use.
    */
-  void Reset(common::HistogramCuts const &cuts, common::Span<FeatureType const> ft,
-             bst_feature_t n_features, TrainParam const &param, bool is_column_split,
-             DeviceOrd device);
+  void Reset(Context const *ctx, common::HistogramCuts const &cuts,
+             common::Span<FeatureType const> ft, bst_feature_t n_features, TrainParam const &param,
+             bool is_column_split);
 
   /**
    * \brief Get host category storage for nidx. Different from the internal version, this
 }
 
   [[nodiscard]] auto GetDeviceNodeCats(bst_node_t nidx) {
-    copy_stream_.View().Sync();
     if (has_categoricals_) {
+      copy_stream_.View().Sync();
       CatAccessor accessor = {dh::ToSpan(split_cats_), node_categorical_storage_size_};
       return common::KCatBitField{accessor.GetNodeCatStorage(nidx)};
     } else {
diff --git a/src/tree/gpu_hist/evaluator.cu b/src/tree/gpu_hist/evaluator.cu
index 6eed74c56e87..ee542a94a825 100644
--- a/src/tree/gpu_hist/evaluator.cu
+++ b/src/tree/gpu_hist/evaluator.cu
@@ -13,14 +13,13 @@
 #include "xgboost/data.h"
 
 namespace xgboost::tree {
-void GPUHistEvaluator::Reset(common::HistogramCuts const &cuts, common::Span<FeatureType const> ft,
-                             bst_feature_t n_features, TrainParam const &param,
-                             bool is_column_split, DeviceOrd device) {
+void GPUHistEvaluator::Reset(Context const *ctx, common::HistogramCuts const &cuts,
+                             common::Span<FeatureType const> ft, bst_feature_t n_features,
+                             TrainParam const &param, bool is_column_split) {
   param_ = param;
-  tree_evaluator_ = TreeEvaluator{param, n_features, device};
+  tree_evaluator_ = TreeEvaluator{param, n_features, ctx->Device()};
   has_categoricals_ = cuts.HasCategorical();
   if (cuts.HasCategorical()) {
-    dh::XGBCachingDeviceAllocator<char> alloc;
     auto ptrs = cuts.cut_ptrs_.ConstDeviceSpan();
     auto beg = thrust::make_counting_iterator(1ul);
     auto end = thrust::make_counting_iterator(ptrs.size());
@@ -29,7 +28,7 @@ void GPUHistEvaluator::Reset(common::HistogramCuts const &cuts, common::Span
-    thrust::for_each(thrust::cuda::par(alloc), beg, end, [=] XGBOOST_DEVICE(size_t i) {
+    thrust::for_each(ctx->CUDACtx()->CTP(), beg, end, [=] XGBOOST_DEVICE(size_t i) {
       auto idx = i - 1;
       if (common::IsCat(ft, idx)) {
         auto n_bins = ptrs[i] - ptrs[idx];
@@ -44,8 +43,8 @@ void GPUHistEvaluator::Reset(common::HistogramCuts const &cuts, common::Span
+                                  ctx->CUDACtx()->Stream()));
     cat_sorted_idx_.resize(cuts.cut_values_.Size() * 2);  // evaluate 2 nodes at a time.
     sort_input_.resize(cat_sorted_idx_.size());
@@ -57,14 +56,14 @@ void GPUHistEvaluator::Reset(common::HistogramCuts const &cuts, common::Span
-    thrust::transform(thrust::cuda::par(alloc), it, it + feature_idx_.size(), feature_idx_.begin(),
+    thrust::transform(ctx->CUDACtx()->CTP(), it, it + feature_idx_.size(), feature_idx_.begin(),
                       [=] XGBOOST_DEVICE(size_t i) {
                         auto fidx = dh::SegmentId(ptrs, i);
                         return fidx;
                       });
   }
   is_column_split_ = is_column_split;
-  device_ = device;
+  device_ = ctx->Device();
 }
 
 common::Span<bst_feature_t const> GPUHistEvaluator::SortHistogram(
diff --git a/src/tree/gpu_hist/histogram.cu b/src/tree/gpu_hist/histogram.cu
index dd89238b5d89..7f1f79dee09c 100644
--- a/src/tree/gpu_hist/histogram.cu
+++ b/src/tree/gpu_hist/histogram.cu
@@ -66,12 +66,10 @@ GradientQuantiser::GradientQuantiser(Context const* ctx, common::Span
-  dh::XGBCachingDeviceAllocator<char> alloc;
   thrust::device_ptr<GradientPair const> gpair_beg{gpair.data()};
   auto beg = thrust::make_transform_iterator(gpair_beg, Clip());
-  Pair p =
-      dh::Reduce(thrust::cuda::par(alloc), beg, beg + gpair.size(), Pair{}, thrust::plus<Pair>{});
+  Pair p = dh::Reduce(ctx->CUDACtx()->CTP(), beg, beg + gpair.size(), Pair{}, thrust::plus<Pair>{});
   // Treat pair as array of 4 primitive types to allreduce
   using ReduceT = typename decltype(p.first)::ValueT;
   static_assert(sizeof(Pair) == sizeof(ReduceT) * 4, "Expected to reduce four elements.");
diff --git a/src/tree/gpu_hist/row_partitioner.cuh b/src/tree/gpu_hist/row_partitioner.cuh
index 3c8dec58e5ea..0101be085b24 100644
--- a/src/tree/gpu_hist/row_partitioner.cuh
+++ b/src/tree/gpu_hist/row_partitioner.cuh
@@ -11,6 +11,7 @@
 #include <cstdint>  // for int32_t, uint32_t
 #include <vector>   // for vector
 
+#include "../../common/cuda_context.cuh"    // for CUDAContext
 #include "../../common/device_helpers.cuh"  // for MakeTransformIterator
 #include "xgboost/base.h"                   // for bst_idx_t
 #include "xgboost/context.h"                // for Context
@@ -356,18 +357,18 @@ class RowPartitioner {
    * argument and return the new position for this training instance.
    */
   template <typename FinalisePositionOpT>
-  void FinalisePosition(common::Span<bst_node_t> d_out_position, bst_idx_t base_ridx,
-                        FinalisePositionOpT op) const {
+  void FinalisePosition(Context const* ctx, common::Span<bst_node_t> d_out_position,
+                        bst_idx_t base_ridx, FinalisePositionOpT op) const {
     dh::TemporaryArray<NodePositionInfo> d_node_info_storage(ridx_segments_.size());
     dh::safe_cuda(cudaMemcpyAsync(d_node_info_storage.data().get(), ridx_segments_.data(),
                                   sizeof(NodePositionInfo) * ridx_segments_.size(),
-                                  cudaMemcpyDefault));
+                                  cudaMemcpyDefault, ctx->CUDACtx()->Stream()));
 
     constexpr int kBlockSize = 512;
     const int kItemsThread = 8;
     const int grid_size = xgboost::common::DivRoundUp(ridx_.size(), kBlockSize * kItemsThread);
     common::Span d_ridx{ridx_.data(), ridx_.size()};
-    FinalisePositionKernel<kBlockSize><<<grid_size, kBlockSize>>>(
+    FinalisePositionKernel<kBlockSize><<<grid_size, kBlockSize, 0, ctx->CUDACtx()->Stream()>>>(
         dh::ToSpan(d_node_info_storage), base_ridx, d_ridx, d_out_position, op);
   }
 };
diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu
index 0b6c1c1982ec..95db64f60632 100644
--- a/src/tree/updater_gpu_hist.cu
+++ b/src/tree/updater_gpu_hist.cu
@@ -64,14 +64,10 @@ struct NodeSplitData {
 };
 static_assert(std::is_trivially_copyable_v<NodeSplitData>);
 
-// To be tuned.
-constexpr double ExtMemPrefetchThresh() { return 4.0; }
-
 // Some nodes we will manually compute histograms, others we will do by subtraction
-[[nodiscard]] bool AssignNodes(RegTree const* p_tree, GradientQuantiser const* quantizer,
-                               std::vector<GPUExpandEntry> const& candidates,
-                               common::Span<bst_node_t> nodes_to_build,
-                               common::Span<bst_node_t> nodes_to_sub) {
+void AssignNodes(RegTree const* p_tree, GradientQuantiser const* quantizer,
+                 std::vector<GPUExpandEntry> const& candidates,
+                 common::Span<bst_node_t> nodes_to_build, common::Span<bst_node_t> nodes_to_sub) {
   auto const& tree = *p_tree;
   std::size_t nidx_in_set{0};
   double total{0.0}, smaller{0.0};
@@ -97,12 +93,6 @@ constexpr double ExtMemPrefetchThresh() { return 4.0; }
     }
     ++nidx_in_set;
   }
-
-  if (-kRtEps < smaller && smaller < kRtEps) {  // Too close to 0, don't prefetch.
-    return false;
-  }
-  // Prefetch if these smaller nodes are not quite small.
-  return (total / smaller) < ExtMemPrefetchThresh();
 }
 
 // GPU tree updater implementation.
@@ -201,16 +191,19 @@ struct GPUHistMakerDevice {
   // Reset values for each update iteration
   [[nodiscard]] DMatrix* Reset(HostDeviceVector<GradientPair>* dh_gpair, DMatrix* p_fmat) {
     this->monitor.Start(__func__);
+    common::SetDevice(ctx_->Ordinal());
+
     auto const& info = p_fmat->Info();
+    // backup the gradient
+    dh::CopyTo(dh_gpair->ConstDeviceSpan(), &this->d_gpair, ctx_->CUDACtx()->Stream());
 
     this->column_sampler_->Init(ctx_, p_fmat->Info().num_col_, info.feature_weights.HostVector(),
                                 param.colsample_bynode, param.colsample_bylevel,
                                 param.colsample_bytree);
-    common::SetDevice(ctx_->Ordinal());
-
-    this->interaction_constraints.Reset();
+    this->interaction_constraints.Reset(ctx_);
+    this->evaluator_.Reset(this->ctx_, *cuts_, p_fmat->Info().feature_types.ConstDeviceSpan(),
+                           p_fmat->Info().num_col_, this->param, p_fmat->Info().IsColumnSplit());
 
     // Sampling
-    dh::CopyTo(dh_gpair->ConstDeviceSpan(), &this->d_gpair);  // backup the gradient
     auto sample = this->sampler->Sample(ctx_, dh::ToSpan(d_gpair), p_fmat);
     this->gpair = sample.gpair;
     p_fmat = sample.p_fmat;  // Update p_fmat before allocating partitioners
@@ -242,10 +235,6 @@ struct GPUHistMakerDevice {
     }
 
     // Other initializations
-    this->evaluator_.Reset(*cuts_, p_fmat->Info().feature_types.ConstDeviceSpan(),
-                           p_fmat->Info().num_col_, this->param, p_fmat->Info().IsColumnSplit(),
-                           this->ctx_->Device());
-
     quantiser = std::make_unique<GradientQuantiser>(ctx_, this->gpair, p_fmat->Info());
 
     this->InitFeatureGroupsOnce(info);
@@ -488,8 +477,8 @@ struct GPUHistMakerDevice {
     // Prepare for build hist
     std::vector<bst_node_t> build_nidx(candidates.size());
    std::vector<bst_node_t> subtraction_nidx(candidates.size());
-    auto prefetch_copy =
-        AssignNodes(p_tree, this->quantiser.get(), candidates, build_nidx, subtraction_nidx);
+    AssignNodes(p_tree, this->quantiser.get(), candidates, build_nidx, subtraction_nidx);
+    auto prefetch_copy = !build_nidx.empty();
 
     this->histogram_.AllocateHistograms(ctx_, build_nidx, subtraction_nidx);
 
@@ -534,10 +523,13 @@ struct GPUHistMakerDevice {
     if (!p_fmat->SingleColBlock() && task.UpdateTreeLeaf()) {
       LOG(FATAL) << "Current objective function can not be used with external memory.";
     }
+
+    monitor.Start(__func__);
     if (static_cast<std::size_t>(p_fmat->NumBatches() + 1) != this->batch_ptr_.size()) {
       // External memory with concatenation. Not supported.
       p_out_position->Resize(0);
       positions_.clear();
+      monitor.Stop(__func__);
       return;
     }
 
@@ -557,14 +549,16 @@ struct GPUHistMakerDevice {
       CHECK_EQ(part->GetNumNodes(), p_tree->NumNodes());
       auto base_ridx = batch_ptr_[k];
       auto n_samples = batch_ptr_.at(k + 1) - base_ridx;
-      part->FinalisePosition(d_out_position.subspan(base_ridx, n_samples), base_ridx, encode_op);
+      part->FinalisePosition(ctx_, d_out_position.subspan(base_ridx, n_samples), base_ridx,
+                             encode_op);
     }
-    dh::CopyTo(d_out_position, &positions_);
+    dh::CopyTo(d_out_position, &positions_, this->ctx_->CUDACtx()->Stream());
+    monitor.Stop(__func__);
     return;
   }
 
   dh::caching_device_vector<uint32_t> categories;
-  dh::CopyTo(p_tree->GetSplitCategories(), &categories);
+  dh::CopyTo(p_tree->GetSplitCategories(), &categories, this->ctx_->CUDACtx()->Stream());
   auto const& cat_segments = p_tree->GetSplitCategoriesPtr();
   auto d_categories = dh::ToSpan(categories);
   auto ft = p_fmat->Info().feature_types.ConstDeviceSpan();
@@ -583,22 +577,24 @@ struct GPUHistMakerDevice {
     auto go_left_op = GoLeftOp{d_matrix};
 
     dh::caching_device_vector<NodeSplitData> d_split_data;
-    dh::CopyTo(split_data, &d_split_data);
+    dh::CopyTo(split_data, &d_split_data, this->ctx_->CUDACtx()->Stream());
     auto s_split_data = dh::ToSpan(d_split_data);
 
-    partitioners_.front()->FinalisePosition(
-        d_out_position, page.BaseRowId(), [=] __device__(bst_idx_t row_id, bst_node_t nidx) {
-          auto split_data = s_split_data[nidx];
-          auto node = split_data.split_node;
-          while (!node.IsLeaf()) {
-            auto go_left = go_left_op(row_id, split_data);
-            nidx = go_left ? node.LeftChild() : node.RightChild();
-            node = s_split_data[nidx].split_node;
-          }
-          return encode_op(row_id, nidx);
-        });
-    dh::CopyTo(d_out_position, &positions_);
+    partitioners_.front()->FinalisePosition(ctx_, d_out_position, page.BaseRowId(),
+                                            [=] __device__(bst_idx_t row_id, bst_node_t nidx) {
+                                              auto split_data = s_split_data[nidx];
+                                              auto node = split_data.split_node;
+                                              while (!node.IsLeaf()) {
+                                                auto go_left = go_left_op(row_id, split_data);
+                                                nidx = go_left ? node.LeftChild()
+                                                               : node.RightChild();
+                                                node = s_split_data[nidx].split_node;
+                                              }
+                                              return encode_op(row_id, nidx);
+                                            });
+    dh::CopyTo(d_out_position, &positions_, this->ctx_->CUDACtx()->Stream());
   }
+  monitor.Stop(__func__);
 }
 
 bool UpdatePredictionCache(linalg::MatrixView<float> out_preds_d, RegTree const* p_tree) {
@@ -616,7 +612,7 @@ struct GPUHistMakerDevice {
     // Use the nodes from tree, the leaf value might be changed by the objective since the
    // last update tree call.
     dh::caching_device_vector<RegTree::Node> nodes;
-    dh::CopyTo(p_tree->GetNodes(), &nodes);
+    dh::CopyTo(p_tree->GetNodes(), &nodes, this->ctx_->CUDACtx()->Stream());
     common::Span<RegTree::Node> d_nodes = dh::ToSpan(nodes);
     CHECK_EQ(out_preds_d.Shape(1), 1);
     dh::LaunchN(d_position.size(), ctx_->CUDACtx()->Stream(),
diff --git a/tests/cpp/common/test_threading_utils.cu b/tests/cpp/common/test_threading_utils.cu
index f7160b1b56f9..fc7475698be2 100644
--- a/tests/cpp/common/test_threading_utils.cu
+++ b/tests/cpp/common/test_threading_utils.cu
@@ -1,16 +1,17 @@
 /**
- * Copyright 2021-2023 by XGBoost Contributors
+ * Copyright 2021-2024, XGBoost Contributors
 */
 #include <gtest/gtest.h>
 #include <thrust/copy.h>  // thrust::copy
 
 #include "../../../src/common/device_helpers.cuh"
 #include "../../../src/common/threading_utils.cuh"
+#include "../helpers.h"  // for MakeCUDACtx
 
-namespace xgboost {
-namespace common {
+namespace xgboost::common {
 TEST(SegmentedTrapezoidThreads, Basic) {
   size_t constexpr kElements = 24, kGroups = 3;
+  auto ctx = MakeCUDACtx(0);
   dh::device_vector<size_t> offset_ptr(kGroups + 1, 0);
   offset_ptr[0] = 0;
   offset_ptr[1] = 8;
@@ -19,11 +20,11 @@ TEST(SegmentedTrapezoidThreads, Basic) {
 
   size_t h = 1;
   dh::device_vector<size_t> thread_ptr(kGroups + 1, 0);
-  size_t total = SegmentedTrapezoidThreads(dh::ToSpan(offset_ptr), dh::ToSpan(thread_ptr), h);
+  size_t total = SegmentedTrapezoidThreads(&ctx, dh::ToSpan(offset_ptr), dh::ToSpan(thread_ptr), h);
   ASSERT_EQ(total, kElements - kGroups);
 
   h = 2;
-  SegmentedTrapezoidThreads(dh::ToSpan(offset_ptr), dh::ToSpan(thread_ptr), h);
+  SegmentedTrapezoidThreads(&ctx, dh::ToSpan(offset_ptr), dh::ToSpan(thread_ptr), h);
   std::vector<size_t> h_thread_ptr(thread_ptr.size());
   thrust::copy(thread_ptr.cbegin(), thread_ptr.cend(), h_thread_ptr.begin());
   for (size_t i = 1; i < h_thread_ptr.size(); ++i) {
@@ -31,7 +32,7 @@ TEST(SegmentedTrapezoidThreads, Basic) {
   }
 
   h = 7;
-  SegmentedTrapezoidThreads(dh::ToSpan(offset_ptr), dh::ToSpan(thread_ptr), h);
+  SegmentedTrapezoidThreads(&ctx, dh::ToSpan(offset_ptr), dh::ToSpan(thread_ptr), h);
   thrust::copy(thread_ptr.cbegin(), thread_ptr.cend(), h_thread_ptr.begin());
   for (size_t i = 1; i < h_thread_ptr.size(); ++i) {
     ASSERT_EQ(h_thread_ptr[i] - h_thread_ptr[i - 1], 28);
@@ -66,5 +67,4 @@ TEST(SegmentedTrapezoidThreads, Unravel) {
   ASSERT_EQ(i, 6);
   ASSERT_EQ(j, 7);
 }
-}  // namespace common
-}  // namespace xgboost
+}  // namespace xgboost::common
diff --git a/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu b/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu
index 72a8b5449f0f..968a6a411cc0 100644
--- a/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu
+++ b/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu
@@ -60,8 +60,7 @@ TEST_F(TestCategoricalSplitWithMissing, GPUHistEvaluator) {
   GPUHistEvaluator evaluator{param_, static_cast<bst_feature_t>(feature_set.size()), ctx.Device()};
-  evaluator.Reset(cuts_, dh::ToSpan(feature_types), feature_set.size(), param_, false,
-                  ctx.Device());
+  evaluator.Reset(&ctx, cuts_, dh::ToSpan(feature_types), feature_set.size(), param_, false);
 
   DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(&ctx, input, shared_inputs).split;
   ASSERT_EQ(result.thresh, 1);
@@ -104,7 +103,7 @@ TEST(GpuHist, PartitionBasic) {
   };
 
   GPUHistEvaluator evaluator{tparam, static_cast<bst_feature_t>(feature_set.size()), ctx.Device()};
-  evaluator.Reset(cuts, dh::ToSpan(feature_types), feature_set.size(), tparam, false, ctx.Device());
+  evaluator.Reset(&ctx, cuts, dh::ToSpan(feature_types), feature_set.size(), tparam, false);
 
   {
     // -1.0s go right
@@ -217,7 +216,7 @@ TEST(GpuHist, PartitionTwoFeatures) {
                                          false};
 
   GPUHistEvaluator
       evaluator{tparam, static_cast<bst_feature_t>(feature_set.size()), ctx.Device()};
-  evaluator.Reset(cuts, dh::ToSpan(feature_types), feature_set.size(), tparam, false, ctx.Device());
+  evaluator.Reset(&ctx, cuts, dh::ToSpan(feature_types), feature_set.size(), tparam, false);
 
   {
     auto parent_sum = quantiser.ToFixedPoint(GradientPairPrecise{-6.0, 3.0});
@@ -277,10 +276,8 @@ TEST(GpuHist, PartitionTwoNodes) {
                                          cuts.min_vals_.ConstDeviceSpan(),
                                          false};
 
-  GPUHistEvaluator evaluator{tparam, static_cast<bst_feature_t>(feature_set.size()),
-                             ctx.Device()};
-  evaluator.Reset(cuts, dh::ToSpan(feature_types), feature_set.size(), tparam, false,
-                  ctx.Device());
+  GPUHistEvaluator evaluator{tparam, static_cast<bst_feature_t>(feature_set.size()), ctx.Device()};
+  evaluator.Reset(&ctx, cuts, dh::ToSpan(feature_types), feature_set.size(), tparam, false);
 
   {
     auto parent_sum = quantiser.ToFixedPoint(GradientPairPrecise{-6.0, 3.0});
@@ -336,10 +333,8 @@ void TestEvaluateSingleSplit(bool is_categorical) {
                                          cuts.min_vals_.ConstDeviceSpan(),
                                          false};
 
-  GPUHistEvaluator evaluator{tparam, static_cast<bst_feature_t>(feature_set.size()),
-                             ctx.Device()};
-  evaluator.Reset(cuts, dh::ToSpan(feature_types), feature_set.size(), tparam, false,
-                  ctx.Device());
+  GPUHistEvaluator evaluator{tparam, static_cast<bst_feature_t>(feature_set.size()), ctx.Device()};
+  evaluator.Reset(&ctx, cuts, dh::ToSpan(feature_types), feature_set.size(), tparam, false);
 
   DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(&ctx, input, shared_inputs).split;
   EXPECT_EQ(result.findex, 1);
@@ -522,7 +517,7 @@ TEST_F(TestPartitionBasedSplit, GpuHist) {
   cuts_.cut_values_.SetDevice(ctx.Device());
   cuts_.min_vals_.SetDevice(ctx.Device());
 
-  evaluator.Reset(cuts_, dh::ToSpan(ft), info_.num_col_, param_, false, ctx.Device());
+  evaluator.Reset(&ctx, cuts_, dh::ToSpan(ft), info_.num_col_, param_, false);
 
   // Convert the sample histogram to fixed point
   auto quantiser = DummyRoundingFactor(&ctx);
@@ -586,7 +581,7 @@ void VerifyColumnSplitEvaluateSingleSplit(bool is_categorical) {
                                          false};
 
   GPUHistEvaluator evaluator{tparam, static_cast<bst_feature_t>(feature_set.size()), ctx.Device()};
-  evaluator.Reset(cuts, dh::ToSpan(feature_types), feature_set.size(), tparam, true, ctx.Device());
+  evaluator.Reset(&ctx, cuts, dh::ToSpan(feature_types), feature_set.size(), tparam, true);
 
   DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(&ctx, input, shared_inputs).split;
   EXPECT_EQ(result.findex, 1);
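
--
Usage sketch (illustrative only, not part of the patch): after this change the
evaluator is constructed with the device but handed the Context up front in
Reset(), which derives the device ordinal and CUDA stream internally instead of
taking a separate DeviceOrd. A minimal caller, with names mirroring the tests
above (`cuts`, `feature_types`, `n_features`, and `param` are assumed to be set
up as in those tests):

    auto ctx = MakeCUDACtx(0);
    GPUHistEvaluator evaluator{param, static_cast<bst_feature_t>(n_features), ctx.Device()};
    evaluator.Reset(&ctx, cuts, dh::ToSpan(feature_types), n_features, param,
                    /*is_column_split=*/false);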