diff --git a/src/common/device_helpers.cuh b/src/common/device_helpers.cuh index 2d481cded622..8e932cdaf8ba 100644 --- a/src/common/device_helpers.cuh +++ b/src/common/device_helpers.cuh @@ -453,10 +453,15 @@ xgboost::common::Span ToSpan(VectorT &vec, IndexT offset = 0, } template -xgboost::common::Span ToSpan(thrust::device_vector &vec, size_t offset, size_t size) { +xgboost::common::Span ToSpan(device_vector &vec, size_t offset, size_t size) { return ToSpan(vec, offset, size); } +template +xgboost::common::Span> ToSpan(device_vector const &vec) { + return {thrust::raw_pointer_cast(vec.data()), vec.size()}; +} + template xgboost::common::Span ToSpan(DeviceUVector &vec) { return {vec.data(), vec.size()}; diff --git a/src/data/cat_container.cuh b/src/data/cat_container.cuh new file mode 100644 index 000000000000..a16d1e0c6f02 --- /dev/null +++ b/src/data/cat_container.cuh @@ -0,0 +1,51 @@ +/** + * Copyright 2024, XGBoost Contributors + */ +#pragma once +#include "../common/device_helpers.cuh" // for ToSpan +#include "../common/device_vector.cuh" // for device_vector, XGBDeviceAllocator +#include "../encoder/ordinal.h" // for CatCharT + +namespace xgboost::cuda_impl { +struct CatStrArray { + dh::device_vector offsets; + dh::device_vector values; + + CatStrArray() = default; + CatStrArray(CatStrArray const& that) = delete; + CatStrArray& operator=(CatStrArray const& that) = delete; + + CatStrArray(CatStrArray&& that) = default; + CatStrArray& operator=(CatStrArray&& that) = default; + + [[nodiscard]] explicit operator enc::CatStrArrayView() const { + return {dh::ToSpan(offsets), dh::ToSpan(values)}; + } + [[nodiscard]] std::size_t size() const { // NOLINT + return enc::CatStrArrayView(*this).size(); + } +}; + +template +struct ViewToStorageImpl; + +template <> +struct ViewToStorageImpl { + using Type = CatStrArray; +}; + +template +struct ViewToStorageImpl> { + using Type = dh::device_vector; +}; + +template +struct ViewToStorage; + +template +struct ViewToStorage> { 
+ using Type = std::tuple::Type...>; +}; + +using CatIndexTypes = ViewToStorage::Type; +} // namespace xgboost::cuda_impl diff --git a/src/data/cat_container.h b/src/data/cat_container.h new file mode 100644 index 000000000000..60e9aebc7bc6 --- /dev/null +++ b/src/data/cat_container.h @@ -0,0 +1,48 @@ +/** + * Copyright 2025, XGBoost Contributors + */ +#pragma once + +#include // for int32_t, int8_t +#include // for tuple +#include // for vector + +#include "../encoder/ordinal.h" // for DictionaryView + +namespace xgboost { +namespace cpu_impl { +struct CatStrArray { + std::vector offsets; + std::vector values; + + [[nodiscard]] explicit operator enc::CatStrArrayView() const { return {offsets, values}; } + [[nodiscard]] std::size_t size() const { // NOLINT + return enc::CatStrArrayView(*this).size(); + } +}; + +template +struct ViewToStorageImpl; + +template <> +struct ViewToStorageImpl { + using Type = CatStrArray; +}; + +template +struct ViewToStorageImpl> { + using Type = std::vector; +}; + +template +struct ViewToStorage; + +template +struct ViewToStorage> { + using Type = std::tuple::Type...>; +}; + +using CatIndexTypes = ViewToStorage::Type; +using ColumnType = enc::cpu_impl::TupToVarT; +} // namespace cpu_impl +} // namespace xgboost diff --git a/src/encoder/ordinal.cuh b/src/encoder/ordinal.cuh new file mode 100644 index 000000000000..282441d4a0d3 --- /dev/null +++ b/src/encoder/ordinal.cuh @@ -0,0 +1,303 @@ +/** + * Copyright 2025, XGBoost contributors + */ +#pragma once + +#include // for lower_bound +#include // for copy +#include // for device_vector +#include // for find_if +#include // for for_each_n +#include // for make_counting_iterator +#include // for make_transform_iterator +#include // for sort + +#include // for size_t +#include // for int32_t, int8_t +#include // for proclaim_return_type +#include // for make_pair, pair +#include // for get +#include // for stringstream + +#include "../common/device_helpers.cuh" +#include "ordinal.h" 
+#include "types.h" // for Overloaded + +namespace enc { +namespace cuda_impl { +struct SegmentedSearchSortedStrOp { + DeviceColumnsView haystack_v; // The training set + Span ref_sorted_idx; // Sorted index for the training set + DeviceColumnsView needles_v; // Keys + std::size_t f_idx; // Feature (segment) index + + [[nodiscard]] __device__ std::int32_t operator()(std::int32_t i) const { + using detail::SearchKey; + auto haystack = cuda::std::get(haystack_v.columns[f_idx]); + auto needles = cuda::std::get(needles_v.columns[f_idx]); + // Get the search key + auto idx = i - needles_v.feature_segments[f_idx]; // index local to the feature + auto begin = needles.offsets[idx]; + auto end = needles.offsets[idx + 1]; + auto needle = needles.values.subspan(begin, end - begin); + + // Search the key from the training set + auto it = thrust::make_counting_iterator(0); + auto f_sorted_idx = ref_sorted_idx.subspan( + haystack_v.feature_segments[f_idx], + haystack_v.feature_segments[f_idx + 1] - haystack_v.feature_segments[f_idx]); + auto end_it = it + f_sorted_idx.size(); + auto ret_it = thrust::lower_bound(thrust::seq, it, end_it, SearchKey(), [&](auto l, auto r) { + Span l_str; + if (l == SearchKey()) { + l_str = needle; + } else { + auto l_idx = f_sorted_idx[l]; + auto l_beg = haystack.offsets[l_idx]; + auto l_end = haystack.offsets[l_idx + 1]; + l_str = haystack.values.subspan(l_beg, l_end - l_beg); + } + + Span r_str; + if (r == SearchKey()) { + r_str = needle; + } else { + auto r_idx = f_sorted_idx[r]; + auto r_beg = haystack.offsets[r_idx]; + auto r_end = haystack.offsets[r_idx + 1]; + r_str = haystack.values.subspan(r_beg, r_end - r_beg); + } + + return l_str < r_str; + }); + if (ret_it == it + f_sorted_idx.size()) { + return detail::NotFound(); + } + return *ret_it; + } +}; + +template +struct SegmentedSearchSortedNumOp { + DeviceColumnsView haystack_v; // The training set + Span ref_sorted_idx; // Sorted index for the training set + DeviceColumnsView needles_v; // 
Keys + std::size_t f_idx;  // Feature (segment) index + + [[nodiscard]] __device__ std::int32_t operator()(std::int32_t i) const { + using detail::SearchKey; + auto haystack = cuda::std::get>(haystack_v.columns[f_idx]); + auto needles = cuda::std::get>(needles_v.columns[f_idx]); + // Get the search key + auto idx = i - needles_v.feature_segments[f_idx]; // index local to the feature + auto needle = needles[idx]; + // Search the key from the training set + auto it = thrust::make_counting_iterator(0); + auto f_sorted_idx = ref_sorted_idx.subspan( + haystack_v.feature_segments[f_idx], + haystack_v.feature_segments[f_idx + 1] - haystack_v.feature_segments[f_idx]); + auto end_it = it + f_sorted_idx.size(); + auto ret_it = thrust::lower_bound(thrust::seq, it, end_it, SearchKey(), [&](auto l, auto r) { + T l_value = l == SearchKey() ? needle : haystack[ref_sorted_idx[l]]; + T r_value = r == SearchKey() ? needle : haystack[ref_sorted_idx[r]]; + return l_value < r_value; + }); + if (ret_it == it + f_sorted_idx.size()) { + return detail::NotFound(); + } + return *ret_it; + } +}; + +template +void SegmentedIota(ThrustExec const& policy, Span d_offset_ptr, Span out_sequence) { + thrust::for_each_n(policy, thrust::make_counting_iterator(0ul), out_sequence.size(), + [out_sequence, d_offset_ptr] __device__(std::size_t idx) { + auto group = dh::SegmentId(d_offset_ptr, idx); + out_sequence[idx] = idx - d_offset_ptr[group]; + }); +} + +struct DftThrustPolicy { + template + using ThrustAllocator = thrust::device_allocator; + + auto ThrustPolicy() const { return thrust::cuda::par_nosync; } +}; +} // namespace cuda_impl + +/** + * @brief Default execution policy for the device implementation. Users are expected to + * customize it. + */ +using DftDevicePolicy = Policy; + +/** + * @brief Sort the categories for the training set. Returns a list of sorted index. + * + * @tparam ExecPolicy The @ref Policy class, accepts an error policy and a thrust exec policy. 
+ * + * @param policy The execution policy. + * @param orig_enc The encoding scheme of the training set. + * @param sorted_idx The output sorted index. + */ +template +void SortNames(ExecPolicy const& policy, DeviceColumnsView orig_enc, + Span sorted_idx) { + auto n_total_cats = orig_enc.n_total_cats; + if (static_cast(sorted_idx.size()) != orig_enc.n_total_cats) { + policy.Error("`sorted_idx` should have the same size as `n_total_cats`."); + } + auto d_sorted_idx = dh::ToSpan(sorted_idx); + cuda_impl::SegmentedIota(policy.ThrustPolicy(), orig_enc.feature_segments, d_sorted_idx); + + // + using Pair = cuda::std::pair; + using Alloc = typename ExecPolicy::template ThrustAllocator; + thrust::device_vector keys(n_total_cats); + auto key_it = thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + cuda::proclaim_return_type([=] __device__(std::int32_t i) { + auto seg = dh::SegmentId(orig_enc.feature_segments, i); + auto idx = d_sorted_idx[i]; + return cuda::std::make_pair(static_cast(seg), idx); + })); + thrust::copy(policy.ThrustPolicy(), key_it, key_it + n_total_cats, keys.begin()); + + thrust::sort(policy.ThrustPolicy(), keys.begin(), keys.end(), + cuda::proclaim_return_type([=] __device__(Pair const& l, Pair const& r) { + if (l.first == r.first) { // same feature + auto const& col = orig_enc.columns[l.first]; + return cuda::std::visit( + Overloaded{[&l, &r](CatStrArrayView const& str) -> bool { + auto l_beg = str.offsets[l.second]; + auto l_end = str.offsets[l.second + 1]; + auto l_str = str.values.subspan(l_beg, l_end - l_beg); + + auto r_beg = str.offsets[r.second]; + auto r_end = str.offsets[r.second + 1]; + auto r_str = str.values.subspan(r_beg, r_end - r_beg); + return l_str < r_str; + }, + [&](auto&& values) { + return values[l.second] < values[r.second]; + }}, + col); + } + return l.first < r.first; + })); + + // Extract the sorted index out from sorted keys. 
+ auto s_keys = dh::ToSpan(keys); + auto it = thrust::make_transform_iterator( + thrust::make_counting_iterator(0), + cuda::proclaim_return_type( + [=] __device__(std::int32_t i) { return s_keys[i].second; })); + thrust::copy(policy.ThrustPolicy(), it, it + sorted_idx.size(), dh::tbegin(sorted_idx)); +} + +/** + * @brief Calculate a mapping for recoding the data given old and new encoding. + * + * @tparam ExecPolicy The @ref Policy class, accepts an error policy and a thrust exec policy + * + * @param policy The execution policy. + * @param orig_enc The encoding scheme of the training set. + * @param sorted_idx The sorted index of the training set encoding scheme, produced by + * @ref SortNames . + * @param new_enc The scheme that needs to be recoded. + * @param mapping The output mapping. + */ +template +void Recode(ExecPolicy const& policy, DeviceColumnsView orig_enc, + Span sorted_idx, DeviceColumnsView new_enc, + Span mapping) { + auto exec = policy.ThrustPolicy(); + detail::BasicChecks(policy, orig_enc, sorted_idx, new_enc, mapping); + + /** + * search the index for the new encoding + */ + thrust::for_each_n( + exec, thrust::make_counting_iterator(0), new_enc.n_total_cats, + [=] __device__(std::int32_t i) { + auto f_idx = dh::SegmentId(new_enc.feature_segments, i); + std::int32_t searched_idx{-1}; + auto const& col = orig_enc.columns[f_idx]; + cuda::std::visit(Overloaded{[&](CatStrArrayView const& str) { + auto op = cuda_impl::SegmentedSearchSortedStrOp{ + orig_enc, sorted_idx, new_enc, f_idx}; + searched_idx = op(i); + }, + [&](auto&& values) { + using T = typename std::decay_t::value_type; + auto op = cuda_impl::SegmentedSearchSortedNumOp{ + orig_enc, sorted_idx, new_enc, f_idx}; + searched_idx = op(i); + }}, + col); + + auto f_sorted_idx = sorted_idx.subspan( + orig_enc.feature_segments[f_idx], + orig_enc.feature_segments[f_idx + 1] - orig_enc.feature_segments[f_idx]); + + std::int32_t idx = -1; + if (searched_idx != detail::NotFound()) { + idx = 
f_sorted_idx[searched_idx]; + } + + auto f_beg = new_enc.feature_segments[f_idx]; + auto f_end = new_enc.feature_segments[f_idx + 1]; + auto f_mapping = mapping.subspan(f_beg, f_end - f_beg); + f_mapping[i - f_beg] = idx; + }); + + auto err_it = thrust::find_if(exec, dh::tcbegin(mapping), dh::tcend(mapping), + cuda::proclaim_return_type([=] __device__(std::int32_t v) { + return v == detail::NotFound(); + })); + + if (err_it != dh::tcend(mapping)) { + // Report missing cat. + std::vector h_mapping(mapping.size()); + thrust::copy_n(dh::tcbegin(mapping), mapping.size(), h_mapping.begin()); + std::vector h_feature_segments( + new_enc.feature_segments.size()); + thrust::copy(dh::tcbegin(new_enc.feature_segments), dh::tcend(new_enc.feature_segments), + h_feature_segments.begin()); + auto h_idx = std::distance(dh::tcbegin(mapping), err_it); + auto f_idx = dh::SegmentId(Span{h_feature_segments}, h_idx); + auto f_beg = h_feature_segments[f_idx]; + auto f_local_idx = h_idx - f_beg; + + std::vector h_columns(new_enc.columns.size()); + thrust::copy_n(dh::tcbegin(new_enc.columns), new_enc.columns.size(), h_columns.begin()); + + std::stringstream name; + auto const& col = h_columns[f_idx]; + cuda::std::visit( + Overloaded{[&](CatStrArrayView const& str) { + std::vector values(str.values.size()); + std::vector offsets(str.offsets.size()); + thrust::copy_n(dh::tcbegin(str.values), str.values.size(), values.data()); + thrust::copy_n(dh::tcbegin(str.offsets), str.offsets.size(), offsets.data()); + + auto cat = Span{values}.subspan( + offsets[f_local_idx], offsets[f_local_idx + 1] - offsets[f_local_idx]); + for (auto v : cat) { + name.put(v); + } + }, + [&](auto&& values) { + using T = typename std::decay_t::value_type; + std::vector> h_values(values.size()); + thrust::copy_n(dh::tcbegin(values), values.size(), h_values.data()); + auto cat = h_values[f_local_idx]; + name << cat; + }}, + col); + + detail::ReportMissing(policy, name.str(), f_idx); + } +} +} // namespace enc diff --git 
a/src/encoder/ordinal.h b/src/encoder/ordinal.h new file mode 100644 index 000000000000..83269d3c913f --- /dev/null +++ b/src/encoder/ordinal.h @@ -0,0 +1,414 @@ +/** + * Copyright 2025, XGBoost contributors + * + * @brief Ordinal re-coder for categorical features. + * + * For training with dataframes, we use the default encoding provided by the dataframe + * implementation. However, we need a way to ensure the encoding is consistent at test + * time, which is often not the case. This module re-codes the test data given the train + * time encoding (mapping between categories to dense discrete integers starting from 0). + * + * The algorithm proceeds as follows: + * + * Given the categories used for training [c, b, d, a], the ordering of this list is the + * encoding, c maps to 0, b maps to 1, so on and so forth. At test time, we receive an + * encoding [c, a, b], which differs from the encoding used for training and we need to + * re-code the data. + * + * First, we perform an `argsort` on the training categories in the increasing order, + * obtaining a list of index: [3, 1, 0, 2], which corresponds to [a, b, c, d] as a sorted + * list. Then we perform binary search for each category in the test time encoding [c, a, + * b] with the training encoding as the sorted haystack. Since c is the third item of + * sorted training encoding, we have an index 2 (0-based) for c, index 0 for a, and index + * 1 for b. After the binary search, we obtain a new list of index [2, 0, 1]. Using this + * index list, we can recover the training encoding for the test dataset [0, 3, 1]. This + * has O(NlogN) complexity with N as the number of categories (assuming the length of the + * strings as constant). Originally, the encoding for test data set is [0, 1, 2] for [c, + * a, b], now we have a mapping {0 -> 0, 1 -> 3, 2 -> 1} for re-coding the data. 
+ * + * This module exposes 2 functions and an execution policy: + * - @ref Recode + * - @ref SortNames + * Each of them has a device counterpart. + */ + +#pragma once +#include // for stable_sort, lower_bound +#include // for size_t +#include // for int32_t, int8_t +#include // for iterator_traits, distance +#include // for accumulate, iota +#include // for stringstream +#include // for logic_error +#include // for string +#include // for tuple +#include // for decay_t +#include // for forward +#include // for variant, visit +#include // for vector + +#include "../common/transform_iterator.h" // for MakeIndexTransformIter +#include "types.h" // for Overloaded, TupToVarT +#include "xgboost/span.h" // for Span + +namespace enc { +using xgboost::common::MakeIndexTransformIter; +using xgboost::common::Span; + +using CatCharT = std::int8_t; + +/** + * @brief String names of categorical data. Represented in the arrow StringArray format. + */ +struct CatStrArrayView { + Span offsets; + Span values; + + [[nodiscard]] ENC_DEVICE bool empty() const { return offsets.empty(); } // NOLINT + [[nodiscard]] ENC_DEVICE std::size_t size() const { // NOLINT + return this->empty() ? 0 : this->offsets.size() - 1; + } + + [[nodiscard]] std::size_t SizeBytes() const { + return this->offsets.size_bytes() + values.size_bytes(); + } +}; + +// We keep a single type list here for supported types and use various transformations to +// add specializations. This way we can modify the type list with ease. + +/** + * @brief All the primitive types supported by the encoder. + */ +using CatPrimIndexTypes = + std::tuple; + +/** + * @brief All the column types supported by the encoder. + */ +using CatIndexViewTypes = + decltype(std::tuple_cat(std::tuple{}, PrimToSpan::Type{})); + +/** + * @brief Host categories view for a single column. + */ +using HostCatIndexView = cpu_impl::TupToVarT; + +#if defined(XGBOOST_USE_CUDA) +/** + * @brief Device categories view for a single column. 
+ */ +using DeviceCatIndexView = cuda_impl::TupToVarT; +#endif // defined(XGBOOST_USE_CUDA) + +/** + * @brief Container for the execution policies used by the encoder. + * + * Accepted policies: + * + * - A class with a `ThrustPolicy` method that returns a thrust execution policy, along with a + * `ThrustAllocator` template type. This is only used for the GPU implementation. + * + * - An error handling policy that exposes a single `Error` method, which takes a single + * string parameter for error message. + */ +template +struct Policy : public Derived... {}; + +namespace detail { +constexpr std::int32_t SearchKey() { return -1; } +constexpr std::int32_t NotFound() { return -1; } + +template +struct ColumnsViewImpl { + using VariantT = Variant; + + Span columns; + + // Segment pointer for features, each segment represents the number of categories in a feature. + Span feature_segments; + // The total number of cats in all features, equals feature_segments.back() + std::int32_t n_total_cats{0}; + + [[nodiscard]] std::size_t Size() const { return columns.size(); } + [[nodiscard]] bool Empty() const { return this->Size() == 0; } + [[nodiscard]] auto operator[](std::size_t i) const { return columns[i]; } +}; + +struct DftErrorHandler { + void Error(std::string &&msg) const { throw std::logic_error{std::forward(msg)}; } +}; + +template +void ReportMissing(ExecPolicy const &policy, std::string const &name, std::size_t f_idx) { + std::stringstream ss; + ss << "Found a category not in the training set for the " << f_idx << "th (0-based) column: `" + << name << "`"; + policy.Error(ss.str()); +} +} // namespace detail + +/** + * @brief Host view of the encoding scheme for all columns. + */ +using HostColumnsView = detail::ColumnsViewImpl; +#if defined(XGBOOST_USE_CUDA) +/** + * @brief Device view of the encoding scheme for all columns. 
+ */ +using DeviceColumnsView = detail::ColumnsViewImpl; +#endif // defined(XGBOOST_USE_CUDA) + +namespace detail { +template +void BasicChecks(ExecPolicy const &policy, detail::ColumnsViewImpl orig_enc, + Span sorted_idx, detail::ColumnsViewImpl new_enc, + Span mapping) { + if (orig_enc.Size() != new_enc.Size()) { + policy.Error("New and old encoding should have the same number of columns."); + } + if (static_cast(mapping.size()) != new_enc.n_total_cats) { + policy.Error("`mapping` should have the same size as `new_enc.n_total_cats`."); + } + if (static_cast(sorted_idx.size()) != orig_enc.n_total_cats) { + policy.Error("`sorted_idx` should have the same size as `orig_enc.n_total_cats`."); + } + if (orig_enc.feature_segments.size() != orig_enc.columns.size() + 1) { + policy.Error("Invalid original encoding."); + } + if (new_enc.feature_segments.size() != new_enc.columns.size() + 1) { + policy.Error("Invalid new encoding."); + } +} +} // namespace detail + +/** + * @brief The result encoding. User needs to construct it from the offsets from the new + * dictionary along with the mapping returned by the recode function. + */ +struct MappingView { + Span offsets; + Span mapping; + + /** + * @brief Get the encoding for a specific feature. 
+ */ + [[nodiscard]] ENC_DEVICE auto operator[](std::size_t f_idx) const { + return mapping.subspan(offsets[f_idx], offsets[f_idx + 1] - offsets[f_idx]); + } + [[nodiscard]] ENC_DEVICE bool Empty() const { return offsets.empty(); } +}; + +namespace cpu_impl { +template +void ArgSort(InIt in_first, InIt in_last, OutIt out_first, Comp comp = std::less{}) { + auto n = std::distance(in_first, in_last); + using Idx = typename std::iterator_traits::value_type; + + auto out_last = out_first + n; + std::iota(out_first, out_last, 0); + auto op = [&](Idx const &l, Idx const &r) { + return comp(in_first[l], in_first[r]); + }; + std::stable_sort(out_first, out_last, op); +} + +[[nodiscard]] inline std::int32_t SearchSorted(CatStrArrayView haystack, + Span ref_sorted_idx, + Span needle) { + auto it = MakeIndexTransformIter([](auto i) { return static_cast(i); }); + auto const h_off = haystack.offsets; + auto const h_data = haystack.values; + using detail::SearchKey; + auto ret_it = std::lower_bound(it, it + haystack.size(), SearchKey(), [&](auto l, auto r) { + Span l_str; + if (l == SearchKey()) { + l_str = needle; + } else { + auto l_idx = ref_sorted_idx[l]; + auto l_beg = h_off[l_idx]; + auto l_end = h_off[l_idx + 1]; + l_str = h_data.subspan(l_beg, l_end - l_beg); + } + + Span r_str; + if (r == SearchKey()) { + r_str = needle; + } else { + auto r_idx = ref_sorted_idx[r]; + auto r_beg = h_off[r_idx]; + auto r_end = h_off[r_idx + 1]; + r_str = h_data.subspan(r_beg, r_end - r_beg); + } + + return l_str < r_str; + }); + if (ret_it == it + haystack.size()) { + return detail::NotFound(); + } + return *ret_it; +} + +template +[[nodiscard]] std::enable_if_t || std::is_floating_point_v, std::int32_t> +SearchSorted(Span haystack, Span ref_sorted_idx, T needle) { + using detail::SearchKey; + auto it = MakeIndexTransformIter([](auto i) { return static_cast(i); }); + auto ret_it = std::lower_bound(it, it + haystack.size(), SearchKey(), [&](auto l, auto r) { + T l_value = l == SearchKey() 
? needle : haystack[ref_sorted_idx[l]]; + T r_value = r == SearchKey() ? needle : haystack[ref_sorted_idx[r]]; + return l_value < r_value; + }); + if (ret_it == it + haystack.size()) { + return detail::NotFound(); + } + return *ret_it; +} + +template +void SortNames(ExecPolicy const &policy, HostCatIndexView const &cats, + Span sorted_idx) { + auto it = MakeIndexTransformIter([](auto i) { return i; }); + using T = typename std::iterator_traits::value_type; + auto n_categories = std::visit([](auto &&arg) { return arg.size(); }, cats); + if (sorted_idx.size() != n_categories) { + policy.Error("Invalid size of sorted index."); + } + std::visit(Overloaded{[&](CatStrArrayView const &str) { + cpu_impl::ArgSort(it, it + str.size(), sorted_idx.begin(), [&](T l, T r) { + auto l_beg = str.offsets[l]; + auto l_str = str.values.subspan(l_beg, str.offsets[l + 1] - l_beg); + + auto r_beg = str.offsets[r]; + auto r_str = str.values.subspan(r_beg, str.offsets[r + 1] - r_beg); + + return l_str < r_str; + }); + }, + [&](auto &&values) { + cpu_impl::ArgSort(it, it + values.size(), sorted_idx.begin(), + [&](T l, T r) { return values[l] < values[r]; }); + }}, + cats); +} +} // namespace cpu_impl + +/** + * @brief Sort the categories for the training set. Returns a list of sorted index. + * + * @tparam ExecPolicy The @ref Policy class, only an error policy is needed for the CPU + * implementation. + * + * @param policy The execution policy. + * @param orig_enc The encoding scheme of the training set. + * @param sorted_idx The output sorted index. 
+ */ +template +void SortNames(ExecPolicy const &policy, HostColumnsView orig_enc, Span sorted_idx) { + if (static_cast(sorted_idx.size()) != orig_enc.n_total_cats) { + policy.Error("`sorted_idx` should have the same size as `n_total_cats`."); + } + for (std::size_t f_idx = 0, n = orig_enc.Size(); f_idx < n; ++f_idx) { + auto beg = orig_enc.feature_segments[f_idx]; + auto f_sorted_idx = sorted_idx.subspan(beg, orig_enc.feature_segments[f_idx + 1] - beg); + cpu_impl::SortNames(policy, orig_enc.columns[f_idx], f_sorted_idx); + } +} + +/** + * @brief Default execution policy for the host implementation. Users are expected to + * customize it. + */ +using DftHostPolicy = Policy; + +/** + * @brief Calculate a mapping for recoding the data given old and new encoding. + * + * @tparam ExecPolicy The @ref Policy class, only an error policy is needed for the CPU + * implementation. + * + * @param policy The execution policy. + * @param orig_enc The encoding scheme of the training set. + * @param sorted_idx The sorted index of the training set encoding scheme, produced by + * @ref SortNames . + * @param new_enc The scheme that needs to be recoded. + * @param mapping The output mapping. 
+ */ +template +void Recode(ExecPolicy const &policy, HostColumnsView orig_enc, Span sorted_idx, + HostColumnsView new_enc, Span mapping) { + detail::BasicChecks(policy, orig_enc, sorted_idx, new_enc, mapping); + + std::size_t out_idx = 0; + for (std::size_t f_idx = 0, n_features = orig_enc.Size(); f_idx < n_features; f_idx++) { + bool is_empty = std::visit([](auto &&arg) { return arg.empty(); }, orig_enc.columns[f_idx]); + if (is_empty) { + continue; + } + + auto f_beg = orig_enc.feature_segments[f_idx]; + auto ref_sorted_idx = sorted_idx.subspan(f_beg, orig_enc.feature_segments[f_idx + 1] - f_beg); + + auto n_new_categories = + std::visit([](auto &&arg) { return arg.size(); }, new_enc.columns[f_idx]); + std::vector searched_idx(n_new_categories, -1); + auto const &col = new_enc.columns[f_idx]; + std::visit(Overloaded{[&](CatStrArrayView const &str) { + for (std::size_t j = 1, m = n_new_categories + 1; j < m; ++j) { + auto begin = str.offsets[j - 1]; + auto end = str.offsets[j]; + auto needle = str.values.subspan(begin, end - begin); + searched_idx[j - 1] = cpu_impl::SearchSorted( + std::get(orig_enc.columns[f_idx]), + ref_sorted_idx, needle); + if (searched_idx[j - 1] == detail::NotFound()) { + std::stringstream ss; + for (auto c : needle) { + ss.put(c); + } + detail::ReportMissing(policy, ss.str(), f_idx); + } + } + }, + [&](auto &&values) { + using T = typename std::decay_t::value_type; + for (std::size_t j = 0; j < n_new_categories; ++j) { + auto needle = values[j]; + searched_idx[j] = cpu_impl::SearchSorted( + std::get>>(orig_enc.columns[f_idx]), + ref_sorted_idx, needle); + if (searched_idx[j] == detail::NotFound()) { + std::stringstream ss; + ss << needle; + detail::ReportMissing(policy, ss.str(), f_idx); + } + } + }}, + col); + + for (auto i : searched_idx) { + auto idx = ref_sorted_idx[i]; + mapping[out_idx++] = idx; + } + } +} + +inline std::ostream &operator<<(std::ostream &os, CatStrArrayView const &strings) { + auto const &offset = strings.offsets; + 
auto const &data = strings.values; + os << "["; + for (std::size_t i = 1, n = offset.size(); i < n; ++i) { + auto begin = offset[i - 1]; + auto end = offset[i]; + auto str = data.subspan(begin, end - begin); + for (auto v : str) { + os.put(v); + } + if (i != n - 1) { + os << ", "; + } + } + os << "]"; + return os; +} +} // namespace enc diff --git a/src/encoder/types.h b/src/encoder/types.h new file mode 100644 index 000000000000..b9d9b00647a8 --- /dev/null +++ b/src/encoder/types.h @@ -0,0 +1,77 @@ +/** + * Copyright 2024, XGBoost contributors + */ +#pragma once + +#if defined(__CUDA__) || defined(__NVCC__) +#define ENC_DEVICE __host__ __device__ +#else +#define ENC_DEVICE +#endif // defined (__CUDA__) || defined(__NVCC__) + +#include // for tuple +#include // for variant + +#include "xgboost/span.h" // for Span + +#if defined(XGBOOST_USE_CUDA) + +#include // for variant + +#endif // defined(XGBOOST_USE_CUDA) + +namespace enc { +template +struct Overloaded : Ts... { + using Ts::operator()...; +}; + +template +ENC_DEVICE Overloaded(Ts...) -> Overloaded; + +// Whether a type is a member of a type list (a.k.a tuple). +template +struct MemberOf; + +template +struct MemberOf> : public std::disjunction...> {}; + +// Convert primitive types to span types. +template +struct PrimToSpan; + +template +struct PrimToSpan> { + using Type = std::tuple>...>; +}; + +namespace cpu_impl { +// Convert tuple of types to variant of types. +template +struct TupToVar; + +template +struct TupToVar> { + using Type = std::variant; +}; + +template +using TupToVarT = typename TupToVar::Type; +} // namespace cpu_impl + +#if defined(XGBOOST_USE_CUDA) +namespace cuda_impl { +// Convert tuple of types to CUDA variant of types. 
+template +struct TupToVar {}; + +template +struct TupToVar> { + using Type = cuda::std::variant; +}; + +template +using TupToVarT = typename TupToVar::Type; +} // namespace cuda_impl +#endif // defined(XGBOOST_USE_CUDA) +} // namespace enc diff --git a/tests/cpp/encoder/test_ordinal.cc b/tests/cpp/encoder/test_ordinal.cc new file mode 100644 index 000000000000..51583a87c380 --- /dev/null +++ b/tests/cpp/encoder/test_ordinal.cc @@ -0,0 +1,102 @@ +/** + * Copyright 2025, XGBoost contributors + */ +#include "test_ordinal.h" + +#include +#include + +#include // for partial_sum +#include + +#include "../../../src/encoder/ordinal.h" + +namespace enc { +namespace { +using ColumnType = xgboost::cpu_impl::ColumnType; + +class DfTest { + private: + std::vector columns_; + std::vector columns_v_; + std::vector segments_; + + std::vector mapping_; + + template + static auto MakeImpl(std::vector* p_out, std::vector* p_sizes, + Head&& col) { + p_out->emplace_back(col); + p_sizes->push_back(col.size()); + p_sizes->insert(p_sizes->begin(), 0); + std::partial_sum(p_sizes->cbegin(), p_sizes->cend(), p_sizes->begin()); + } + + template + static void MakeImpl(std::vector* p_out, std::vector* p_sizes, + Head&& col, Col&&... columns) { + p_out->emplace_back(col); + p_sizes->push_back(col.size()); + + MakeImpl(p_out, p_sizes, columns...); + } + + public: + template + static DfTest Make(Col&&... columns) { + DfTest df; + MakeImpl(&df.columns_, &df.segments_, std::forward(columns)...); + for (std::size_t i = 0; i < df.columns_.size(); ++i) { + auto const& col = df.columns_[i]; + std::visit(Overloaded{[&](xgboost::cpu_impl::CatStrArray const& str) { + df.columns_v_.emplace_back(enc::CatStrArrayView(str)); + }, + [&](auto&& args) { + df.columns_v_.emplace_back(Span{args}); + }}, + col); + } + auto check = [&] { + // the macro needs to return void. 
+ ASSERT_EQ(df.columns_v_.size(), sizeof...(columns)); + }; + check(); + df.mapping_.resize(df.segments_.back()); + return df; + } + + template + static auto MakeStrs(Strs&&... strs) { + return MakeStrArrayImpl(std::forward(strs)...); + } + + template + static auto MakeInts(Ints&&... names) { + return std::vector{names...}; + } + + auto View() const { return enc::HostColumnsView{Span{columns_v_}, segments_, segments_.back()}; } + + auto Segment() const { return Span{segments_}; } + auto MappingView() { return Span{mapping_}; } + auto const& Mapping() { return mapping_; } +}; + +class OrdRecoderTest { + public: + void Recode(HostColumnsView orig_enc, HostColumnsView new_enc, Span mapping) { + std::vector sorted_idx(orig_enc.n_total_cats); + SortNames(DftHostPolicy{}, orig_enc, sorted_idx); + ::enc::Recode(DftHostPolicy{}, orig_enc, sorted_idx, new_enc, mapping); + } +}; +} // namespace + +TEST(CategoricalEncoder, Str) { TestOrdinalEncoderStrs(); } + +TEST(CategoricalEncoder, Int) { TestOrdinalEncoderInts(); } + +TEST(CategoricalEncoder, Mixed) { TestOrdinalEncoderMixed(); } + +TEST(CategoricalEncoder, Empty) { TestOrdinalEncoderEmpty(); } +} // namespace enc diff --git a/tests/cpp/encoder/test_ordinal.cu b/tests/cpp/encoder/test_ordinal.cu new file mode 100644 index 000000000000..0242dfa47853 --- /dev/null +++ b/tests/cpp/encoder/test_ordinal.cu @@ -0,0 +1,109 @@ +/** + * Copyright 2025, XGBoost contributors + */ +#include +#include + +#include "../../src/data/cat_container.cuh" // for CatIndexTypes +#include "../../src/encoder/ordinal.cuh" +#include "test_ordinal.h" + +namespace enc::cuda_impl { +namespace { +using CatIndexTypes = ::xgboost::cuda_impl::CatIndexTypes; +using ColumnType = enc::cpu_impl::TupToVarT; + +class DfTest { + public: + template + using Vector = dh::device_vector; + + private: + std::vector columns_; + dh::device_vector columns_v_; + dh::device_vector segments_; + std::vector h_segments_; + + dh::device_vector mapping_; + + template + static 
void MakeImpl(std::vector* p_out, dh::device_vector* p_sizes, + Head&& col) { + p_sizes->push_back(col.size()); + p_out->emplace_back(std::forward(col)); + + p_sizes->insert(p_sizes->begin(), 0); + thrust::inclusive_scan(p_sizes->cbegin(), p_sizes->cend(), p_sizes->begin()); + } + + template + static void MakeImpl(std::vector* p_out, dh::device_vector* p_sizes, + Head&& col, Col&&... columns) { + p_sizes->push_back(col.size()); + p_out->emplace_back(std::forward(col)); + MakeImpl(p_out, p_sizes, std::forward(columns)...); + } + + public: + template + static DfTest Make(Col&&... columns) { + DfTest df; + MakeImpl(&df.columns_, &df.segments_, std::forward(columns)...); + for (std::size_t i = 0; i < df.columns_.size(); ++i) { + auto const& col = df.columns_[i]; + std::visit(Overloaded{[&](xgboost::cuda_impl::CatStrArray const& str) { + df.columns_v_.push_back(enc::CatStrArrayView(str)); + }, + [&](auto&& args) { + df.columns_v_.push_back(dh::ToSpan(args)); + }}, + col); + } + CHECK_EQ(df.columns_v_.size(), sizeof...(columns)); + df.h_segments_.resize(df.segments_.size()); + thrust::copy_n(df.segments_.cbegin(), df.segments_.size(), df.h_segments_.begin()); + df.mapping_.resize(df.h_segments_.back()); + return df; + } + + template + static auto MakeStrs(Strs&&... strs) { + auto array = MakeStrArrayImpl(std::forward(strs)...); + return xgboost::cuda_impl::CatStrArray{array.offsets, array.values}; + } + + template + static auto MakeInts(Ints&&... 
names) { + return dh::device_vector{names...}; + } + + auto View() const { + return enc::DeviceColumnsView{dh::ToSpan(this->columns_v_), dh::ToSpan(segments_), + h_segments_.back()}; + } + auto Segment() const { return Span{h_segments_}; } + + auto MappingView() { return dh::ToSpan(mapping_); } + auto const& Mapping() { return mapping_; } +}; + +class OrdRecoderTest { + public: + void Recode(DeviceColumnsView orig_enc, DeviceColumnsView new_enc, Span mapping) { + auto policy = DftDevicePolicy{}; + thrust::device_vector ref_sorted_idx(orig_enc.n_total_cats); + SortNames(policy, orig_enc, dh::ToSpan(ref_sorted_idx)); + auto d_sorted_idx = dh::ToSpan(ref_sorted_idx); + ::enc::Recode(policy, orig_enc, d_sorted_idx, new_enc, mapping); + } +}; +} // namespace + +TEST(CategoricalEncoder, StrGpu) { TestOrdinalEncoderStrs(); } + +TEST(CategoricalEncoder, IntGpu) { TestOrdinalEncoderInts(); } + +TEST(CategoricalEncoder, MixedGpu) { TestOrdinalEncoderMixed(); } + +TEST(CategoricalEncoder, EmptyGpu) { TestOrdinalEncoderEmpty(); } +} // namespace enc::cuda_impl diff --git a/tests/cpp/encoder/test_ordinal.h b/tests/cpp/encoder/test_ordinal.h new file mode 100644 index 000000000000..a8169af8e8e4 --- /dev/null +++ b/tests/cpp/encoder/test_ordinal.h @@ -0,0 +1,201 @@ +/** + * Copyright 2024, XGBoost contributors + */ +#pragma once + +#include +#include + +#include // for int8_t, int32_t +#include // for partial_sum +#include // for string +#include // for vector + +#include "../../../src/data/cat_container.h" + +namespace enc { +template +auto MakeStrArrayImpl(Strs&&... 
strs) { + std::vector names{strs...}; + std::vector values; + std::vector offsets{0}; + + for (const auto& name : names) { + for (char c : name) { + values.push_back(c); + } + offsets.push_back(name.size()); + } + std::partial_sum(offsets.cbegin(), offsets.cend(), offsets.begin()); + return xgboost::cpu_impl::CatStrArray{offsets, values}; +} + +template +void TestOrdinalEncoderStrs() { + Encoder encoder; + auto sol = std::vector{0, 3, 1}; + + { + auto df = DfTest::Make(DfTest::MakeStrs("c", "b", "d", "a")); + auto orig_dict = df.View(); + ASSERT_EQ(orig_dict.Size(), 1); + + auto new_df = DfTest::Make(DfTest::MakeStrs("c", "a", "b")); + auto new_dict = new_df.View(); + + encoder.Recode(orig_dict, new_dict, new_df.MappingView()); + ASSERT_EQ(new_df.Mapping().size(), 3); + + ASSERT_EQ(new_df.Mapping(), sol); + } + { + // longer strings + auto df = DfTest::Make(DfTest::MakeStrs("cbd", "bbd", "dbd", "ab")); + auto orig_dict = df.View(); + + auto new_df = DfTest::Make(DfTest::MakeStrs("cbd", "ab", "bbd")); + auto new_dict = new_df.View(); + + encoder.Recode(orig_dict, new_dict, new_df.MappingView()); + ASSERT_EQ(new_df.Mapping().size(), 3); + ASSERT_EQ(new_df.Mapping(), sol); + } + { + // Test error message. 
+ auto df = DfTest::Make(DfTest::MakeStrs("cbd", "bbd", "dbd", "ab")); + auto orig_dict = df.View(); + + auto new_df = DfTest::Make(DfTest::MakeStrs("oops", "ab", "bbd")); + auto new_dict = new_df.View(); + ASSERT_THAT([&] { encoder.Recode(orig_dict, new_dict, new_df.MappingView()); }, + ::testing::ThrowsMessage(::testing::HasSubstr("`oops`"))); + } + { + // Multi-columns + auto df = DfTest::Make(DfTest::MakeStrs("cbd", "bbd", "dbd", "ab"), + DfTest::MakeStrs("b", "c", "a", "d")); + auto orig_dict = df.View(); + + auto new_df = + DfTest::Make(DfTest::MakeStrs("cbd", "ab", "bbd"), DfTest::MakeStrs("d", "a", "b")); + auto new_dict = new_df.View(); + + encoder.Recode(orig_dict, new_dict, new_df.MappingView()); + auto segs = new_df.Segment(); + auto beg = segs[0]; + auto end = segs[1]; + + auto sol0 = sol; + for (auto i = beg, k = 0; i < end; ++i, ++k) { + ASSERT_EQ(sol0[k], new_df.Mapping()[i]); + } + + beg = end; + end = segs[2]; + auto sol1 = std::vector{3, 2, 0}; + for (auto i = beg, k = 0; i < end; ++i, ++k) { + ASSERT_EQ(sol1[k], new_df.Mapping()[i]); + } + } +} + +template +void TestOrdinalEncoderInts() { + Encoder encoder; + auto sol = std::vector{0, 3, 1}; + + { + auto df = DfTest::Make(DfTest::MakeInts(2, 1, 3, 0)); + auto orig_dict = df.View(); + + auto new_df = DfTest::Make(DfTest::MakeInts(2, 0, 1)); + auto new_dict = new_df.View(); + + encoder.Recode(orig_dict, new_dict, new_df.MappingView()); + ASSERT_EQ(new_df.Mapping(), sol); + } + { + // Test error message. 
+ auto df = DfTest::Make(DfTest::MakeInts(2, 1, 3, 0)); + auto orig_dict = df.View(); + + auto new_df = DfTest::Make(DfTest::MakeInts(2, 0, 5)); + auto new_dict = new_df.View(); + ASSERT_THAT([&] { encoder.Recode(orig_dict, new_dict, new_df.MappingView()); }, + ::testing::ThrowsMessage(::testing::HasSubstr("`5`"))); + } +} + +template +void TestOrdinalEncoderMixed() { + Encoder encoder; + auto sol = std::vector{0, 3, 1}; + + { + auto df = + DfTest::Make(DfTest::MakeInts(2, 1, 3, 0), DfTest::MakeStrs("cbd", "bbd", "dbd", "ab")); + auto orig_dict = df.View(); + + auto new_df = DfTest::Make(DfTest::MakeInts(2, 0, 1), DfTest::MakeStrs("cbd", "ab", "bbd")); + auto new_dict = new_df.View(); + + encoder.Recode(orig_dict, new_dict, new_df.MappingView()); + ASSERT_EQ(new_df.Mapping().size(), 6); + for (std::size_t i = 0; i < new_df.Mapping().size(); ++i) { + ASSERT_EQ(new_df.Mapping()[i], sol[i % sol.size()]); + } + } + { + auto df = + DfTest::Make(DfTest::MakeStrs("cbd", "bbd", "dbd", "ab"), DfTest::MakeInts(2, 1, 3, 0)); + auto orig_dict = df.View(); + + auto new_df = DfTest::Make(DfTest::MakeStrs("cbd", "ab", "bbd"), DfTest::MakeInts(2, 0, 1)); + auto new_dict = new_df.View(); + + encoder.Recode(orig_dict, new_dict, new_df.MappingView()); + ASSERT_EQ(new_df.Mapping().size(), 6); + for (std::size_t i = 0; i < new_df.Mapping().size(); ++i) { + ASSERT_EQ(new_df.Mapping()[i], sol[i % sol.size()]); + } + } + { + auto df = + DfTest::Make(DfTest::MakeStrs("cbd", "bbd", "dbd", "ab"), DfTest::MakeInts(2, 1, 3, 0), + DfTest::MakeStrs("cbd", "bbd", "dbd", "ab")); + auto orig_dict = df.View(); + + auto new_df = DfTest::Make(DfTest::MakeStrs("cbd", "ab", "bbd"), DfTest::MakeInts(2, 0), + DfTest::MakeStrs("cbd", "ab", "bbd")); + auto new_dict = new_df.View(); + + encoder.Recode(orig_dict, new_dict, new_df.MappingView()); + ASSERT_EQ(new_df.Mapping().size(), 8); + for (std::size_t i = 0; i < 3; ++i) { + ASSERT_EQ(new_df.Mapping()[i], sol[i]); + } + for (std::size_t i = 3, k = 0; i < 5; 
++i, ++k) { + ASSERT_EQ(new_df.Mapping()[i], sol[k]); + } + for (std::size_t i = 5, k = 0; i < 8; ++i, ++k) { + ASSERT_EQ(new_df.Mapping()[i], sol[k]); + } + } +} + +template +void TestOrdinalEncoderEmpty() { + auto sol = std::vector{0, 3, 1}; + Encoder encoder; + auto df = DfTest::Make(DfTest::MakeInts(), DfTest::MakeStrs("cbd", "bbd", "dbd", "ab"), + DfTest::MakeInts()); + auto orig_dict = df.View(); + + auto new_df = + DfTest::Make(DfTest::MakeInts(), DfTest::MakeStrs("cbd", "ab", "bbd"), DfTest::MakeInts()); + auto new_dict = new_df.View(); + encoder.Recode(orig_dict, new_dict, new_df.MappingView()); + ASSERT_EQ(new_df.Mapping().size(), 3); + ASSERT_EQ(new_df.Mapping(), sol); +} +} // namespace enc