diff --git a/cpp/benchmarks/common/generate_input.cu b/cpp/benchmarks/common/generate_input.cu index 8d6aacd2ef1..f1af62eaa87 100644 --- a/cpp/benchmarks/common/generate_input.cu +++ b/cpp/benchmarks/common/generate_input.cu @@ -580,7 +580,7 @@ std::unique_ptr create_random_utf8_string_column(data_profile cons null_mask.begin(), lengths.begin(), cuda::proclaim_return_type([] __device__(auto) { return 0; }), - thrust::logical_not{}); + cuda::std::logical_not{}); auto valid_lengths = thrust::make_transform_iterator( thrust::make_zip_iterator(thrust::make_tuple(lengths.begin(), null_mask.begin())), valid_or_zero{}); diff --git a/cpp/include/cudf/detail/utilities/functional.hpp b/cpp/include/cudf/detail/utilities/functional.hpp new file mode 100644 index 00000000000..114c69bbe46 --- /dev/null +++ b/cpp/include/cudf/detail/utilities/functional.hpp @@ -0,0 +1,31 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include + +namespace cudf::detail { + +#if CCCL_MAJOR_VERSION >= 3 +using cuda::maximum; +using cuda::minimum; +#else +using thrust::maximum; +using thrust::minimum; +#endif + +} // namespace cudf::detail diff --git a/cpp/include/cudf/table/experimental/row_operators.cuh b/cpp/include/cudf/table/experimental/row_operators.cuh index 8214ea6e83b..6ace930c1fe 100644 --- a/cpp/include/cudf/table/experimental/row_operators.cuh +++ b/cpp/include/cudf/table/experimental/row_operators.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * Copyright (c) 2022-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -33,6 +33,7 @@ #include #include +#include #include #include #include @@ -1466,9 +1467,9 @@ class device_row_comparator { auto rvalid = detail::make_validity_iterator(rcol); if (nulls_are_equal == null_equality::UNEQUAL) { if (thrust::any_of( - thrust::seq, lvalid, lvalid + lcol.size(), thrust::logical_not()) or + thrust::seq, lvalid, lvalid + lcol.size(), cuda::std::logical_not()) or thrust::any_of( - thrust::seq, rvalid, rvalid + rcol.size(), thrust::logical_not())) { + thrust::seq, rvalid, rvalid + rcol.size(), cuda::std::logical_not())) { return false; } } else { diff --git a/cpp/src/binaryop/compiled/binary_ops.cu b/cpp/src/binaryop/compiled/binary_ops.cu index 3c558f1e264..70e26ae4285 100644 --- a/cpp/src/binaryop/compiled/binary_ops.cu +++ b/cpp/src/binaryop/compiled/binary_ops.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -241,8 +242,8 @@ struct null_considering_binop { return invalid_str; else if (lhs_valid && rhs_valid) { return (op == binary_operator::NULL_MAX) - ? thrust::maximum()(lhs_value, rhs_value) - : thrust::minimum()(lhs_value, rhs_value); + ? cudf::detail::maximum()(lhs_value, rhs_value) + : cudf::detail::minimum()(lhs_value, rhs_value); } else if (lhs_valid) return lhs_value; else diff --git a/cpp/src/filling/repeat.cu b/cpp/src/filling/repeat.cu index 2e78954d78a..2695288af64 100644 --- a/cpp/src/filling/repeat.cu +++ b/cpp/src/filling/repeat.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -81,7 +82,7 @@ struct count_checker { if (static_cast(std::numeric_limits::max()) > std::numeric_limits::max()) { auto max = thrust::reduce( - rmm::exec_policy(stream), count.begin(), count.end(), 0, thrust::maximum()); + rmm::exec_policy(stream), count.begin(), count.end(), 0, cudf::detail::maximum()); CUDF_EXPECTS(max <= std::numeric_limits::max(), "count exceeds the column size limit", std::overflow_error); diff --git a/cpp/src/groupby/sort/group_rank_scan.cu b/cpp/src/groupby/sort/group_rank_scan.cu index 583357d9090..a0ba81bccb2 100644 --- a/cpp/src/groupby/sort/group_rank_scan.cu +++ b/cpp/src/groupby/sort/group_rank_scan.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2024, NVIDIA CORPORATION. + * Copyright (c) 2021-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -29,6 +29,7 @@ #include #include +#include #include #include #include @@ -146,7 +147,7 @@ std::unique_ptr rank_generator(column_view const& grouped_values, group_labels_begin + group_labels.size(), mutable_rank_begin, mutable_rank_begin, - thrust::equal_to{}, + cuda::std::equal_to{}, scan_op); return ranks; } diff --git a/cpp/src/groupby/sort/group_replace_nulls.cu b/cpp/src/groupby/sort/group_replace_nulls.cu index 088ed05e5eb..f94ae71a23c 100644 --- a/cpp/src/groupby/sort/group_replace_nulls.cu +++ b/cpp/src/groupby/sort/group_replace_nulls.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2024, NVIDIA CORPORATION. + * Copyright (c) 2021-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -23,6 +23,7 @@ #include +#include #include #include #include @@ -55,7 +56,7 @@ std::unique_ptr group_replace_nulls(cudf::column_view const& grouped_val thrust::make_tuple(gather_map.begin(), thrust::make_discard_iterator())); auto func = cudf::detail::replace_policy_functor(); - thrust::equal_to eq; + cuda::std::equal_to eq; if (replace_policy == cudf::replace_policy::PRECEDING) { thrust::inclusive_scan_by_key(rmm::exec_policy(stream), group_labels.begin(), diff --git a/cpp/src/groupby/sort/group_scan_util.cuh b/cpp/src/groupby/sort/group_scan_util.cuh index a90445fabe1..160d0a3b276 100644 --- a/cpp/src/groupby/sort/group_scan_util.cuh +++ b/cpp/src/groupby/sort/group_scan_util.cuh @@ -37,6 +37,7 @@ #include #include +#include #include #include #include @@ -122,7 +123,7 @@ struct group_scan_functor() group_labels.end(), inp_iter, out_iter, - thrust::equal_to{}, + cuda::std::equal_to{}, binop); }; @@ -167,7 +168,7 @@ struct group_scan_functor(0), gather_map.begin(), - thrust::equal_to{}, + cuda::std::equal_to{}, binop_generator.binop()); // diff --git a/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh b/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh index 662c380eff5..9dba468bf14 100644 --- a/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh +++ b/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh @@ -175,7 +175,7 @@ struct group_reduction_functor< inp_iter, thrust::make_discard_iterator(), out_iter, - thrust::equal_to{}, + cuda::std::equal_to{}, binop); }; @@ -201,7 +201,7 @@ struct group_reduction_functor< rmm::device_uvector validity(num_groups, stream); do_reduction(cudf::detail::make_validity_iterator(*d_values_ptr), validity.begin(), - thrust::logical_or{}); + cuda::std::logical_or{}); auto [null_mask, null_count] = cudf::detail::valid_if(validity.begin(), validity.end(), cuda::std::identity{}, stream, mr); @@ -238,7 +238,7 @@ struct group_reduction_functor< inp_iter, thrust::make_discard_iterator(), out_iter, - thrust::equal_to{}, + cuda::std::equal_to{}, binop); }; @@ -254,7 +254,7 @@ struct group_reduction_functor< auto validity = rmm::device_uvector(num_groups, stream); do_reduction(cudf::detail::make_validity_iterator(*d_values_ptr), validity.begin(), - thrust::logical_or{}); + cuda::std::logical_or{}); auto [null_mask, null_count] = cudf::detail::valid_if(validity.begin(), validity.end(), cuda::std::identity{}, stream, mr); diff --git a/cpp/src/io/avro/reader_impl.cu b/cpp/src/io/avro/reader_impl.cu index 11d5749ee38..2be2e42c2b3 100644 --- a/cpp/src/io/avro/reader_impl.cu +++ b/cpp/src/io/avro/reader_impl.cu @@ -21,6 +21,7 @@ #include "io/utilities/hostdevice_vector.hpp" #include +#include #include #include #include @@ -300,8 +301,10 @@ rmm::device_buffer decompress_data(datasource& source, size_t const uncompressed_data_size = std::reduce(uncompressed_data_sizes.begin(), uncompressed_data_sizes.end()); - size_t const max_uncomp_block_size = std::reduce( - uncompressed_data_sizes.begin(), uncompressed_data_sizes.end(), 0, thrust::maximum()); + size_t const max_uncomp_block_size = std::reduce(uncompressed_data_sizes.begin(), + uncompressed_data_sizes.end(), + 0, + cudf::detail::maximum()); size_t temp_size = 0; status = diff --git a/cpp/src/io/comp/nvcomp_adapter.cu b/cpp/src/io/comp/nvcomp_adapter.cu index cf5996dfd93..30501c3f2e2 100644 --- a/cpp/src/io/comp/nvcomp_adapter.cu +++ b/cpp/src/io/comp/nvcomp_adapter.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * Copyright (c) 2022-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,6 +15,7 @@ */ #include "nvcomp_adapter.cuh" +#include #include #include @@ -122,7 +123,7 @@ std::pair max_chunk_and_total_input_size(device_span()); + cudf::detail::maximum()); auto const sum = thrust::reduce(rmm::exec_policy(stream), input_sizes.begin(), input_sizes.end()); return {max, sum}; } diff --git a/cpp/src/io/fst/logical_stack.cuh b/cpp/src/io/fst/logical_stack.cuh index 7b217d08da3..4b80b981030 100644 --- a/cpp/src/io/fst/logical_stack.cuh +++ b/cpp/src/io/fst/logical_stack.cuh @@ -27,6 +27,7 @@ #include #include +#include #include #include #include @@ -400,7 +401,7 @@ void sparse_stack_op_to_top_of_stack(StackSymbolItT d_symbols, d_kv_operations.Current(), detail::AddStackLevelFromStackOp{symbol_to_stack_op}, num_symbols_in, - cub::Equality{}, + cuda::std::equal_to{}, stream)); stack_level_scan_bytes = std::max(gen_segments_scan_bytes, scan_by_key_bytes); } else { @@ -499,7 +500,7 @@ void sparse_stack_op_to_top_of_stack(StackSymbolItT d_symbols, d_kv_operations.Current(), detail::AddStackLevelFromStackOp{symbol_to_stack_op}, num_symbols_in, - cub::Equality{}, + cuda::std::equal_to{}, stream)); } else { CUDF_CUDA_TRY(cub::DeviceScan::InclusiveScan( diff --git a/cpp/src/io/json/column_tree_construction.cu b/cpp/src/io/json/column_tree_construction.cu index c4fe7926706..13d1751e03d 100644 --- a/cpp/src/io/json/column_tree_construction.cu +++ b/cpp/src/io/json/column_tree_construction.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,6 +17,7 @@ #include "nested_json.hpp" #include +#include #include #include #include @@ -208,7 +209,7 @@ std::tuple reduce_to_column_tree( thrust::make_constant_iterator(1), non_leaf_nodes.begin(), non_leaf_nodes_children.begin(), - thrust::equal_to()); + cuda::std::equal_to()); thrust::scatter(rmm::exec_policy_nosync(stream), non_leaf_nodes_children.begin(), diff --git a/cpp/src/io/json/host_tree_algorithms.cu b/cpp/src/io/json/host_tree_algorithms.cu index e506d60a2be..712d280c11f 100644 --- a/cpp/src/io/json/host_tree_algorithms.cu +++ b/cpp/src/io/json/host_tree_algorithms.cu @@ -20,6 +20,7 @@ #include #include +#include #include #include #include @@ -1007,13 +1008,13 @@ void scatter_offsets(tree_meta_t const& tree, col.string_offsets.begin(), col.string_offsets.end(), col.string_offsets.begin(), - thrust::maximum{}); + cudf::detail::maximum{}); } else if (col.type == json_col_t::ListColumn) { thrust::inclusive_scan(rmm::exec_policy_nosync(stream), col.child_offsets.begin(), col.child_offsets.end(), col.child_offsets.begin(), - thrust::maximum{}); + cudf::detail::maximum{}); } } stream.synchronize(); diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu index 1fe58a0449f..c0790c2f73d 100644 --- a/cpp/src/io/json/json_column.cu +++ b/cpp/src/io/json/json_column.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * Copyright (c) 2022-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -130,8 +131,8 @@ reduce_to_column_tree(tree_meta_t const& tree, ordered_row_offsets, unique_col_ids.begin(), max_row_offsets.begin(), - thrust::equal_to(), - thrust::maximum()); + cuda::std::equal_to(), + cudf::detail::maximum()); // 3. reduce_by_key {col_id}, {node_categories} - custom opp (*+v=*, v+v=v, *+#=E) rmm::device_uvector column_categories(num_columns, stream); @@ -142,7 +143,7 @@ reduce_to_column_tree(tree_meta_t const& tree, thrust::make_permutation_iterator(tree.node_categories.begin(), ordered_node_ids.begin()), unique_col_ids.begin(), column_categories.begin(), - thrust::equal_to(), + cuda::std::equal_to(), [] __device__(NodeT type_a, NodeT type_b) -> NodeT { auto is_a_leaf = (type_a == NC_VAL || type_a == NC_STR); auto is_b_leaf = (type_b == NC_VAL || type_b == NC_STR); diff --git a/cpp/src/io/json/json_tree.cu b/cpp/src/io/json/json_tree.cu index e2fe926ea19..e0d6f51aad9 100644 --- a/cpp/src/io/json/json_tree.cu +++ b/cpp/src/io/json/json_tree.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * Copyright (c) 2022-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -213,8 +214,8 @@ void propagate_first_sibling_to_other(cudf::device_span node_l sorted_node_levels.end(), thrust::make_permutation_iterator(parent_node_ids.begin(), sorted_order.begin()), thrust::make_permutation_iterator(parent_node_ids.begin(), sorted_order.begin()), - thrust::equal_to{}, - thrust::maximum{}); + cuda::std::equal_to{}, + cudf::detail::maximum{}); } // Generates a tree representation of the given tokens, token_indices. diff --git a/cpp/src/io/json/write_json.cu b/cpp/src/io/json/write_json.cu index 1587c4da9c8..b8f0fe7cb07 100644 --- a/cpp/src/io/json/write_json.cu +++ b/cpp/src/io/json/write_json.cu @@ -333,8 +333,8 @@ std::unique_ptr struct_to_strings(table_view const& strings_columns, validity_iterator, d_str_separator.begin(), false, - thrust::equal_to{}, - thrust::logical_or{}); + cuda::std::equal_to{}, + cuda::std::logical_or{}); thrust::for_each(rmm::exec_policy_nosync(stream), thrust::make_counting_iterator(0), thrust::make_counting_iterator(total_rows), diff --git a/cpp/src/io/orc/stripe_data.cu b/cpp/src/io/orc/stripe_data.cu index c0887304db9..426e470a151 100644 --- a/cpp/src/io/orc/stripe_data.cu +++ b/cpp/src/io/orc/stripe_data.cu @@ -18,6 +18,7 @@ #include "io/utilities/column_buffer.hpp" #include "orc_gpu.hpp" +#include #include #include @@ -1511,10 +1512,11 @@ static __device__ void DecodeRowPositions(orcdec_state_s* s, } if (t == nrows - 1) { s->u.rowdec.nz_count = min(nz_count, s->top.data.max_vals); } __syncthreads(); + // TBD: Brute-forcing this, there might be a more efficient way to find the thread with the // last row last_row = (nz_count == s->u.rowdec.nz_count) ? row_plus1 : 0; - last_row = block_reduce(temp_storage).Reduce(last_row, cub::Max()); + last_row = block_reduce(temp_storage).Reduce(last_row, cudf::detail::maximum{}); nz_pos = (valid) ? nz_count : 0; if (t == 0) { s->top.data.nrows = last_row; } if (valid && nz_pos - 1 < s->u.rowdec.nz_count) { s->u.rowdec.row[nz_pos - 1] = row_plus1; } diff --git a/cpp/src/io/orc/stripe_enc.cu b/cpp/src/io/orc/stripe_enc.cu index 3a1f3a88da4..2ccf3f5d284 100644 --- a/cpp/src/io/orc/stripe_enc.cu +++ b/cpp/src/io/orc/stripe_enc.cu @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -366,8 +367,9 @@ static __device__ uint32_t IntegerRLE( orcenc_state_s* s, T const* inbuf, uint32_t inpos, uint32_t numvals, int t, Storage& temp_storage) { using block_reduce = cub::BlockReduce; - uint8_t* dst = s->stream.data_ptrs[cid] + s->strm_pos[cid]; - uint32_t out_cnt = 0; + + uint8_t* dst = s->stream.data_ptrs[cid] + s->strm_pos[cid]; + uint32_t out_cnt = 0; __shared__ uint64_t block_vmin; while (numvals > 0) { @@ -413,9 +415,9 @@ static __device__ uint32_t IntegerRLE( T vmin = (t < literal_run) ? v0 : cuda::std::numeric_limits::max(); T vmax = (t < literal_run) ? v0 : cuda::std::numeric_limits::min(); uint32_t literal_mode, literal_w; - vmin = block_reduce(temp_storage).Reduce(vmin, cub::Min()); + vmin = block_reduce(temp_storage).Reduce(vmin, cudf::detail::minimum{}); __syncthreads(); - vmax = block_reduce(temp_storage).Reduce(vmax, cub::Max()); + vmax = block_reduce(temp_storage).Reduce(vmax, cudf::detail::maximum{}); if (t == 0) { uint32_t mode1_w, mode2_w; typename std::make_unsigned::type vrange_mode1, vrange_mode2; diff --git a/cpp/src/io/parquet/delta_enc.cuh b/cpp/src/io/parquet/delta_enc.cuh index 56b7c8065ee..8dba755b73a 100644 --- a/cpp/src/io/parquet/delta_enc.cuh +++ b/cpp/src/io/parquet/delta_enc.cuh @@ -19,6 +19,7 @@ #include "parquet_gpu.hpp" #include +#include #include #include @@ -221,6 +222,7 @@ class delta_binary_packer { inline __device__ uint8_t* flush() { using cudf::detail::warp_size; + __shared__ T block_min; int const t = threadIdx.x; @@ -240,7 +242,7 @@ class delta_binary_packer { : cuda::std::numeric_limits::max(); // Find min delta for the block. - auto const min_delta = block_reduce(*_block_tmp).Reduce(delta, cub::Min()); + auto const min_delta = block_reduce(*_block_tmp).Reduce(delta, cudf::detail::minimum{}); if (t == 0) { block_min = min_delta; } __syncthreads(); @@ -250,7 +252,7 @@ class delta_binary_packer { // Get max normalized delta for each warp, and use that to determine how many bits to use // for the bitpacking of this warp. - U const warp_max = warp_reduce(_warp_tmp[warp_id]).Reduce(norm_delta, cub::Max()); + U const warp_max = warp_reduce(_warp_tmp[warp_id]).Reduce(norm_delta, cudf::detail::maximum{}); __syncwarp(); if (lane_id == 0) { _mb_bits[warp_id] = sizeof(long long) * 8 - __clzll(warp_max); } diff --git a/cpp/src/io/parquet/page_string_decode.cu b/cpp/src/io/parquet/page_string_decode.cu index 7d670057cf9..fe9b05c8054 100644 --- a/cpp/src/io/parquet/page_string_decode.cu +++ b/cpp/src/io/parquet/page_string_decode.cu @@ -21,6 +21,7 @@ #include "rle_stream.cuh" #include +#include #include #include @@ -498,6 +499,7 @@ __device__ thrust::pair totalDeltaByteArraySize(uint8_t const* d { using cudf::detail::warp_size; using WarpReduce = cub::WarpReduce; + __shared__ typename WarpReduce::TempStorage temp_storage[2]; __shared__ __align__(16) delta_binary_decoder prefixes; @@ -550,7 +552,8 @@ __device__ thrust::pair totalDeltaByteArraySize(uint8_t const* d // note: warp_sum will only be valid on lane 0. auto const warp_sum = WarpReduce(temp_storage[warp_id]).Sum(lane_sum); __syncwarp(); - auto const warp_max = WarpReduce(temp_storage[warp_id]).Reduce(lane_max, cub::Max()); + auto const warp_max = + WarpReduce(temp_storage[warp_id]).Reduce(lane_max, cudf::detail::maximum{}); if (lane_id == 0) { total_bytes += warp_sum; diff --git a/cpp/src/io/parquet/reader_impl_chunking.cu b/cpp/src/io/parquet/reader_impl_chunking.cu index be1e7d38fff..5242b18b574 100644 --- a/cpp/src/io/parquet/reader_impl_chunking.cu +++ b/cpp/src/io/parquet/reader_impl_chunking.cu @@ -1149,7 +1149,7 @@ void include_decompression_scratch_size(device_span chunk page_keys + pages.size(), decomp_iter, decomp_info.begin(), - thrust::equal_to{}, + cuda::std::equal_to{}, decomp_sum{}); // retrieve to host so we can call nvcomp to get compression scratch sizes @@ -1388,7 +1388,7 @@ void reader::impl::setup_next_subpass(read_mode mode) page_keys + pass.pages.size(), page_size, c_info.begin(), - thrust::equal_to{}, + cuda::std::equal_to{}, cumulative_page_sum{}); // include scratch space needed for decompression. for certain codecs (eg ZSTD) this @@ -1703,7 +1703,7 @@ void reader::impl::compute_output_chunks_for_subpass() page_keys + subpass.pages.size(), page_input, c_info.begin(), - thrust::equal_to{}, + cuda::std::equal_to{}, cumulative_page_sum{}); auto iter = thrust::make_counting_iterator(0); // cap the max row in all pages by the max row we expect in the subpass. input chunking diff --git a/cpp/src/io/parquet/reader_impl_preprocess.cu b/cpp/src/io/parquet/reader_impl_preprocess.cu index e1e9bac5a07..052ed80bc14 100644 --- a/cpp/src/io/parquet/reader_impl_preprocess.cu +++ b/cpp/src/io/parquet/reader_impl_preprocess.cu @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -608,7 +609,7 @@ void decode_page_headers(pass_intermediate_data& pass, level_bit_size, level_bit_size + pass.chunks.size(), 0, - thrust::maximum()); + cudf::detail::maximum()); pass.level_type_size = std::max(1, cudf::util::div_rounding_up_safe(max_level_bits, 8)); // sort the pages in chunk/schema order. diff --git a/cpp/src/io/statistics/typed_statistics_chunk.cuh b/cpp/src/io/statistics/typed_statistics_chunk.cuh index dc023e69423..34e663447e3 100644 --- a/cpp/src/io/statistics/typed_statistics_chunk.cuh +++ b/cpp/src/io/statistics/typed_statistics_chunk.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2024, NVIDIA CORPORATION. + * Copyright (c) 2021-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -27,6 +27,7 @@ #include "statistics_type_identification.cuh" #include "temp_storage_wrapper.cuh" +#include #include #include @@ -202,11 +203,12 @@ __inline__ __device__ typed_statistics_chunk block_reduce( using E = typename detail::extrema_type::type; using extrema_reduce = cub::BlockReduce; using count_reduce = cub::BlockReduce; - output_chunk.minimum_value = - extrema_reduce(storage.template get()).Reduce(output_chunk.minimum_value, cub::Min()); + + output_chunk.minimum_value = extrema_reduce(storage.template get()) + .Reduce(output_chunk.minimum_value, cudf::detail::minimum{}); __syncthreads(); - output_chunk.maximum_value = - extrema_reduce(storage.template get()).Reduce(output_chunk.maximum_value, cub::Max()); + output_chunk.maximum_value = extrema_reduce(storage.template get()) + .Reduce(output_chunk.maximum_value, cudf::detail::maximum{}); __syncthreads(); output_chunk.non_nulls = count_reduce(storage.template get()).Sum(output_chunk.non_nulls); diff --git a/cpp/src/io/utilities/data_casting.cu b/cpp/src/io/utilities/data_casting.cu index 2750a17d328..c6391d49294 100644 --- a/cpp/src/io/utilities/data_casting.cu +++ b/cpp/src/io/utilities/data_casting.cu @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -814,7 +815,7 @@ static std::unique_ptr parse_string(string_view_pair_it str_tuples, str_tuples + col_size, cuda::proclaim_return_type([] __device__(auto t) { return t.second; }), size_type{0}, - thrust::maximum{}); + cudf::detail::maximum{}); auto sizes = rmm::device_uvector(col_size, stream); auto d_sizes = sizes.data(); diff --git a/cpp/src/lists/set_operations.cu b/cpp/src/lists/set_operations.cu index 6f2acbb0712..0ed4b5193b7 100644 --- a/cpp/src/lists/set_operations.cu +++ b/cpp/src/lists/set_operations.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * Copyright (c) 2022-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -34,6 +34,7 @@ #include #include +#include #include #include #include @@ -103,8 +104,8 @@ std::unique_ptr have_overlap(lists_column_view const& lhs, contained.begin(), // values to reduce list_indices.begin(), // out keys overlap_results.begin(), // out values - thrust::equal_to{}, // comp for keys - thrust::logical_or{}); // reduction op for values + cuda::std::equal_to{}, // comp for keys + cuda::std::logical_or{}); // reduction op for values auto const num_non_empty_segments = thrust::distance(overlap_results.begin(), end.second); auto [null_mask, null_count] = diff --git a/cpp/src/quantiles/tdigest/tdigest.cu b/cpp/src/quantiles/tdigest/tdigest.cu index 3a365477366..83423649507 100644 --- a/cpp/src/quantiles/tdigest/tdigest.cu +++ b/cpp/src/quantiles/tdigest/tdigest.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2024, NVIDIA CORPORATION. + * Copyright (c) 2021-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -395,7 +396,7 @@ std::unique_ptr percentile_approx(tdigest_column_view const& input, return std::pair{rmm::device_buffer{}, null_count}; } return cudf::detail::valid_if( - tdigest_is_empty, tdigest_is_empty + tdv.size(), thrust::logical_not{}, stream, mr); + tdigest_is_empty, tdigest_is_empty + tdv.size(), cuda::std::logical_not{}, stream, mr); }(); return cudf::make_lists_column(input.size(), diff --git a/cpp/src/quantiles/tdigest/tdigest_aggregation.cu b/cpp/src/quantiles/tdigest/tdigest_aggregation.cu index fd98d262154..f07b8695024 100644 --- a/cpp/src/quantiles/tdigest/tdigest_aggregation.cu +++ b/cpp/src/quantiles/tdigest/tdigest_aggregation.cu @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include @@ -793,7 +794,7 @@ std::unique_ptr compute_tdigests(int delta, centroids_begin, // values thrust::make_discard_iterator(), // key output output, // output - thrust::equal_to{}, // key equality check + cuda::std::equal_to{}, // key equality check merge_centroids{}); // create final tdigest column @@ -1161,8 +1162,8 @@ std::unique_ptr merge_tdigests(tdigest_column_view const& tdv, min_iter, thrust::make_discard_iterator(), merged_min_col->mutable_view().begin(), - thrust::equal_to{}, // key equality check - thrust::minimum{}); + cuda::std::equal_to{}, // key equality check + cudf::detail::minimum{}); auto merged_max_col = cudf::make_numeric_column( data_type{type_id::FLOAT64}, num_groups, mask_state::UNALLOCATED, stream, mr); @@ -1176,8 +1177,8 @@ std::unique_ptr merge_tdigests(tdigest_column_view const& tdv, max_iter, thrust::make_discard_iterator(), merged_max_col->mutable_view().begin(), - thrust::equal_to{}, // key equality check - thrust::maximum{}); + cuda::std::equal_to{}, // key equality check + cudf::detail::maximum{}); auto tdigest_offsets = tdv.centroids().offsets(); diff --git a/cpp/src/reductions/segmented/simple.cuh b/cpp/src/reductions/segmented/simple.cuh index 6c35e750e6b..d9b1fefe09a 100644 --- a/cpp/src/reductions/segmented/simple.cuh +++ b/cpp/src/reductions/segmented/simple.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * Copyright (c) 2022-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -249,7 +250,7 @@ std::unique_ptr fixed_point_segmented_reduction( counts.begin(), counts.end(), size_type{0}, - thrust::maximum{}); + cudf::detail::maximum{}); auto const new_scale = numeric::scale_type{col.type().scale() * max_count}; diff --git a/cpp/src/rolling/detail/rolling_collect_list.cu b/cpp/src/rolling/detail/rolling_collect_list.cu index 8a98b65b406..d189b397afd 100644 --- a/cpp/src/rolling/detail/rolling_collect_list.cu +++ b/cpp/src/rolling/detail/rolling_collect_list.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2024, NVIDIA CORPORATION. + * Copyright (c) 2021-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -18,6 +18,7 @@ #include #include +#include #include #include @@ -53,6 +54,7 @@ std::unique_ptr get_list_child_to_list_row_mapping(cudf::column_view con // offsets == [0, 2, 5, 5, 8, 11, 13] // scatter result == [0, 0, 1, 0, 0, 2, 0, 0, 1, 0, 0, 1, 0] // + auto const num_child_rows{ cudf::detail::get_value(offsets, offsets.size() - 1, stream)}; auto per_row_mapping = make_fixed_width_column( @@ -83,7 +85,7 @@ std::unique_ptr get_list_child_to_list_row_mapping(cudf::column_view con per_row_mapping_begin, per_row_mapping_begin + num_child_rows, per_row_mapping_begin, - thrust::maximum{}); + cudf::detail::maximum{}); return per_row_mapping; } diff --git a/cpp/src/sort/rank.cu b/cpp/src/sort/rank.cu index e7dca2277ec..35a9a3ec38d 100644 --- a/cpp/src/sort/rank.cu +++ b/cpp/src/sort/rank.cu @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -145,7 +146,7 @@ void tie_break_ranks_transform(cudf::device_span dense_rank_sor tie_iter, thrust::make_discard_iterator(), tie_sorted.begin(), - thrust::equal_to{}, + cuda::std::equal_to{}, tie_breaker); using TransformerReturnType = cuda::std::decay_t>; @@ -202,7 +203,7 @@ void rank_min(cudf::device_span group_keys, thrust::make_counting_iterator(1), sorted_order_view, rank_mutable_view.begin(), - thrust::minimum{}, + cudf::detail::minimum{}, cuda::std::identity{}, stream); } @@ -220,7 +221,7 @@ void rank_max(cudf::device_span group_keys, thrust::make_counting_iterator(1), sorted_order_view, rank_mutable_view.begin(), - thrust::maximum{}, + cudf::detail::maximum{}, cuda::std::identity{}, stream); } diff --git a/cpp/src/strings/split/split.cu b/cpp/src/strings/split/split.cu index 352ca83c8b2..9d30e3d0026 100644 --- a/cpp/src/strings/split/split.cu +++ b/cpp/src/strings/split/split.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -135,7 +136,7 @@ std::unique_ptr split_fn(strings_column_view const& input, return static_cast(d_offsets[idx + 1] - d_offsets[idx]); }), 0, - thrust::maximum{}); + cudf::detail::maximum{}); // build strings columns for each token position for (size_type col = 0; col < columns_count; ++col) { @@ -346,7 +347,7 @@ std::unique_ptr
whitespace_split_fn(size_type strings_count, // column count is the maximum number of tokens for any string size_type const columns_count = thrust::reduce( - rmm::exec_policy(stream), token_counts.begin(), token_counts.end(), 0, thrust::maximum{}); + rmm::exec_policy(stream), token_counts.begin(), token_counts.end(), 0, cudf::detail::maximum{}); std::vector> results; // boundary case: if no columns, return one null column (issue #119) diff --git a/cpp/src/strings/split/split_re.cu b/cpp/src/strings/split/split_re.cu index ef96b9d3f36..68b610bcb93 100644 --- a/cpp/src/strings/split/split_re.cu +++ b/cpp/src/strings/split/split_re.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * Copyright (c) 2022-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -227,7 +228,7 @@ std::unique_ptr
split_re(strings_column_view const& input, return static_cast(d_offsets[idx + 1] - d_offsets[idx]); }), 0, - thrust::maximum{}); + cudf::detail::maximum{}); // boundary case: if no columns, return one all-null column (custrings issue #119) if (columns_count == 0) { diff --git a/cpp/src/text/bpe/byte_pair_encoding.cu b/cpp/src/text/bpe/byte_pair_encoding.cu index 0aacfd16f67..972bcc32077 100644 --- a/cpp/src/text/bpe/byte_pair_encoding.cu +++ b/cpp/src/text/bpe/byte_pair_encoding.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * Copyright (c) 2022-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -212,7 +213,8 @@ CUDF_KERNEL void bpe_parallel_fn(cudf::column_device_view const d_strings, } } // compute the min rank across the block - auto const reduce_rank = block_reduce(temp_storage).Reduce(min_rank, cub::Min(), num_valid); + auto const reduce_rank = + block_reduce(temp_storage).Reduce(min_rank, cudf::detail::minimum{}, num_valid); if (lane_idx == 0) { block_min_rank = reduce_rank; } __syncthreads(); @@ -277,7 +279,8 @@ CUDF_KERNEL void bpe_parallel_fn(cudf::column_device_view const d_strings, } // re-compute the minimum rank across the block (since new pairs are created above) - auto const reduce_rank = block_reduce(temp_storage).Reduce(min_rank, cub::Min(), num_valid); + auto const reduce_rank = + block_reduce(temp_storage).Reduce(min_rank, cudf::detail::minimum{}, num_valid); if (lane_idx == 0) { block_min_rank = reduce_rank; } __syncthreads(); } // if no min ranks are found we are done, otherwise start again diff --git a/cpp/src/text/minhash.cu b/cpp/src/text/minhash.cu index 663595af5df..61a7375772b 100644 --- a/cpp/src/text/minhash.cu +++ b/cpp/src/text/minhash.cu @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -368,7 +369,7 @@ CUDF_KERNEL void minhash_kernel(offsets_type offsets_itr, auto const values = block_values + (lane_idx * block_size); // cooperative groups does not have a min function and cub::BlockReduce was slower auto const minv = - thrust::reduce(thrust::seq, values, values + block_size, init, thrust::minimum{}); + thrust::reduce(thrust::seq, values, values + block_size, init, cudf::detail::minimum{}); if constexpr (blocks_per_row > 1) { // accumulates mins for each block into d_output cuda::atomic_ref ref{d_output[lane_idx + i]}; diff --git a/cpp/tests/iterator/iterator_tests.cuh b/cpp/tests/iterator/iterator_tests.cuh index 119d8e7b138..d6a991f675c 100644 --- a/cpp/tests/iterator/iterator_tests.cuh +++ b/cpp/tests/iterator/iterator_tests.cuh @@ -19,6 +19,7 @@ #include #include +#include #include // for meanvar #include #include @@ -28,7 +29,7 @@ #include #include -#include +#include #include #include #include @@ -59,7 +60,7 @@ struct IteratorTest : public cudf::test::BaseFixture { d_in, dev_result.begin(), num_items, - thrust::minimum{}, + cudf::detail::minimum{}, init, cudf::get_default_stream().value()); @@ -72,7 +73,7 @@ struct IteratorTest : public cudf::test::BaseFixture { d_in, dev_result.begin(), num_items, - thrust::minimum{}, + cudf::detail::minimum{}, init, cudf::get_default_stream().value()); @@ -98,7 +99,7 @@ struct IteratorTest : public cudf::test::BaseFixture { d_in_last, dev_expected.begin(), dev_results.begin(), - thrust::equal_to{}); + cuda::std::equal_to{}); auto result = thrust::all_of(rmm::exec_policy(cudf::get_default_stream()), dev_results.begin(), dev_results.end(),