From 163e27b4785a0890323bc0ff9c7a27657d23e0d3 Mon Sep 17 00:00:00 2001
From: Michael Schellenberger Costa <miscco@nvidia.com>
Date: Thu, 20 Feb 2025 22:59:49 +0100
Subject: [PATCH] Replace deprecated CCCL features (#18036)

CCCL has deprecated a set of features that will be removed in an upcoming release.

Replace them with the suggested alternative.

NOTE: We have some facilities like `cub::AliasTemporaries` that have been internalized in the 2.8 release, where there is still an alias available in the `cub` namespace.

However that alias will be removed in the CCCL 3.0 release that we are testing in our CI. I added a conditional compilation there to ensure we are still able to build against CCCL 3.0. We can remove this once rapids switches to CCCL 2.8

Authors:
  - Michael Schellenberger Costa (https://github.com/miscco)
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - David Wendt (https://github.com/davidwendt)
  - Muhammad Haseeb (https://github.com/mhaseeb123)

URL: https://github.com/rapidsai/cudf/pull/18036
---
 cpp/src/io/fst/dispatch_dfa.cuh            | 39 +++++++++++-----------
 cpp/src/io/fst/logical_stack.cuh           | 19 +++++------
 cpp/src/io/parquet/writer_impl_helpers.cpp |  3 ++
 3 files changed, 32 insertions(+), 29 deletions(-)
diff --git a/cpp/src/io/fst/dispatch_dfa.cuh b/cpp/src/io/fst/dispatch_dfa.cuh
index ef5e9c8a78f..d8be747d93d 100644
--- a/cpp/src/io/fst/dispatch_dfa.cuh
+++ b/cpp/src/io/fst/dispatch_dfa.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -209,29 +209,25 @@ struct DispatchFSM : DeviceFSMPolicy {
                             FstScanTileStateT fst_tile_state)
 
   {
-    cudaError_t error = cudaSuccess;
-    cub::KernelConfig dfa_simulation_config;
-
     using PolicyT = typename ActivePolicyT::AgentDFAPolicy;
-    if (CubDebug(error = dfa_simulation_config.Init<PolicyT>(dfa_kernel))) return error;
 
     // Kernel invocation
     uint32_t grid_size = std::max(
       1u, CUB_QUOTIENT_CEILING(num_chars, PolicyT::BLOCK_THREADS * PolicyT::ITEMS_PER_THREAD));
-    uint32_t block_threads = dfa_simulation_config.block_threads;
-
-    dfa_kernel<<<grid_size, block_threads, 0, stream>>>(dfa,
-                                                        d_chars_in,
-                                                        num_chars,
-                                                        seed_state,
-                                                        d_thread_state_transition,
-                                                        tile_state,
-                                                        fst_tile_state,
-                                                        transduced_out_it,
-                                                        transduced_out_idx_it,
-                                                        d_num_transduced_out_it);
+
+    dfa_kernel<<<grid_size, PolicyT::BLOCK_THREADS, 0, stream>>>(dfa,
+                                                                 d_chars_in,
+                                                                 num_chars,
+                                                                 seed_state,
+                                                                 d_thread_state_transition,
+                                                                 tile_state,
+                                                                 fst_tile_state,
+                                                                 transduced_out_it,
+                                                                 transduced_out_idx_it,
+                                                                 d_num_transduced_out_it);
 
     // Check for errors
+    cudaError_t error = cudaSuccess;
     if (CubDebug(error = cudaPeekAtLastError())) return error;
 
     return error;
@@ -394,8 +390,13 @@ struct DispatchFSM : DeviceFSMPolicy {
 
     // Alias the temporary allocations from the single storage blob (or compute the necessary size
     // of the blob)
-    error =
-      cub::AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes);
+    // TODO (@miscco): remove this once rapids moves to CCCL 2.8
+#if CCCL_VERSION_MAJOR >= 3
+    error = cub::detail::AliasTemporaries(
+#else   // ^^^ CCCL 3.x ^^^ / vvv CCCL 2.x vvv
+    error = cub::AliasTemporaries(
+#endif  // CCCL 2.x
+      d_temp_storage, temp_storage_bytes, allocations, allocation_sizes);
     if (error != cudaSuccess) return error;
 
     // Return if the caller is simply requesting the size of the storage allocation
diff --git a/cpp/src/io/fst/logical_stack.cuh b/cpp/src/io/fst/logical_stack.cuh
index 98641f2c893..7b217d08da3 100644
--- a/cpp/src/io/fst/logical_stack.cuh
+++ b/cpp/src/io/fst/logical_stack.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -332,9 +332,8 @@ void sparse_stack_op_to_top_of_stack(StackSymbolItT d_symbols,
   // Transforming sequence of stack symbols to stack operations
   using StackSymbolToStackOpT = detail::StackSymbolToStackOp<StackOpT, StackSymbolToStackOpTypeT>;
 
-  // TransformInputIterator converting stack symbols to stack operations
-  using TransformInputItT =
-    cub::TransformInputIterator<StackOpT, StackSymbolToStackOpT, StackSymbolItT>;
+  // transform_iterator converting stack symbols to stack operations
+  using TransformInputItT = thrust::transform_iterator<StackSymbolToStackOpT, StackSymbolItT>;
 
   constexpr bool supports_reset_op = SupportResetOperation == stack_op_support::WITH_RESET_SUPPORT;
 
@@ -365,8 +364,8 @@ void sparse_stack_op_to_top_of_stack(StackSymbolItT d_symbols,
   // with the empty_stack_symbol
   StackOpT const empty_stack{0, empty_stack_symbol};
 
-  cub::TransformInputIterator<StackOpT, detail::RemapEmptyStack<StackOpT>, StackOpT*>
-    kv_ops_scan_in(nullptr, detail::RemapEmptyStack<StackOpT>{empty_stack});
+  thrust::transform_iterator<detail::RemapEmptyStack<StackOpT>, StackOpT*> kv_ops_scan_in(
+    nullptr, detail::RemapEmptyStack<StackOpT>{empty_stack});
   StackOpT* kv_ops_scan_out = nullptr;
 
   std::size_t stack_level_scan_bytes      = 0;
@@ -532,7 +531,7 @@ void sparse_stack_op_to_top_of_stack(StackSymbolItT d_symbols,
                                                 end_bit,
                                                 stream));
 
-  // TransformInputIterator that remaps all operations on stack level 0 to the empty stack symbol
+  // transform_iterator that remaps all operations on stack level 0 to the empty stack symbol
   kv_ops_scan_in  = {reinterpret_cast<StackOpT*>(d_kv_operations_unsigned.Current()),
                      detail::RemapEmptyStack<StackOpT>{empty_stack}};
   kv_ops_scan_out = reinterpret_cast<StackOpT*>(d_kv_operations_unsigned.Alternate());
@@ -553,9 +552,9 @@ void sparse_stack_op_to_top_of_stack(StackSymbolItT d_symbols,
                thrust::device_ptr<StackSymbolT>{d_top_of_stack + num_symbols_out},
                read_symbol);
 
-  // Transform the stack operations to the stack symbol they represent
-  cub::TransformInputIterator<StackSymbolT, detail::StackOpToStackSymbol, StackOpT*>
-    kv_op_to_stack_sym_it(kv_ops_scan_out, detail::StackOpToStackSymbol{});
+  // transform_iterator the stack operations to the stack symbol they represent
+  thrust::transform_iterator<detail::StackOpToStackSymbol, StackOpT*> kv_op_to_stack_sym_it(
+    kv_ops_scan_out, detail::StackOpToStackSymbol{});
 
   // Scatter the stack symbols to the output tape (spots that are not scattered to have been
   // pre-filled with the read-symbol)
diff --git a/cpp/src/io/parquet/writer_impl_helpers.cpp b/cpp/src/io/parquet/writer_impl_helpers.cpp
index ede788c97c2..dee1a3615ef 100644
--- a/cpp/src/io/parquet/writer_impl_helpers.cpp
+++ b/cpp/src/io/parquet/writer_impl_helpers.cpp
@@ -26,6 +26,9 @@
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/structs/structs_column_view.hpp>
 
+#include <functional>
+#include <string>
+
 namespace cudf::io::parquet::detail {
 
 using namespace cudf::io::detail;