From 54c15b2a1a61f4d88437ab0433eecf27241bda77 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Tue, 25 Feb 2025 16:21:01 -0600 Subject: [PATCH 01/42] Use conda-build instead of conda-mambabuild (#18092) This changes from `conda mambabuild` to `conda build`. Conda now uses the mamba solver so no performance regressions are expected. This is a temporary change as we plan to migrate to `rattler-build` in the near future. However, this is needed sooner to drop `boa` and unblock Python 3.13 migrations. xref: https://github.com/rapidsai/build-planning/issues/149 Authors: - Bradley Dice (https://github.com/bdice) Approvers: - James Lamb (https://github.com/jameslamb) - Jake Awe (https://github.com/AyodeAwe) URL: https://github.com/rapidsai/cudf/pull/18092 --- ci/build_cpp.sh | 4 ++-- ci/build_python.sh | 14 +++++++------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/ci/build_cpp.sh b/ci/build_cpp.sh index 3d06eacf9ff..0c324d01cdf 100755 --- a/ci/build_cpp.sh +++ b/ci/build_cpp.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright (c) 2022-2024, NVIDIA CORPORATION. +# Copyright (c) 2022-2025, NVIDIA CORPORATION. set -euo pipefail @@ -18,7 +18,7 @@ rapids-logger "Begin cpp build" sccache --zero-stats # With boa installed conda build forward to boa -RAPIDS_PACKAGE_VERSION=$(rapids-generate-version) rapids-conda-retry mambabuild \ +RAPIDS_PACKAGE_VERSION=$(rapids-generate-version) rapids-conda-retry build \ conda/recipes/libcudf sccache --show-adv-stats diff --git a/ci/build_python.sh b/ci/build_python.sh index ed90041cc77..abbdc3f3a3b 100755 --- a/ci/build_python.sh +++ b/ci/build_python.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright (c) 2022-2024, NVIDIA CORPORATION. +# Copyright (c) 2022-2025, NVIDIA CORPORATION. 
set -euo pipefail @@ -25,7 +25,7 @@ sccache --zero-stats # node works correctly # With boa installed conda build forwards to the boa builder -RAPIDS_PACKAGE_VERSION=$(head -1 ./VERSION) rapids-conda-retry mambabuild \ +RAPIDS_PACKAGE_VERSION=$(head -1 ./VERSION) rapids-conda-retry build \ --no-test \ --channel "${CPP_CHANNEL}" \ conda/recipes/pylibcudf @@ -33,7 +33,7 @@ RAPIDS_PACKAGE_VERSION=$(head -1 ./VERSION) rapids-conda-retry mambabuild \ sccache --show-adv-stats sccache --zero-stats -RAPIDS_PACKAGE_VERSION=$(head -1 ./VERSION) rapids-conda-retry mambabuild \ +RAPIDS_PACKAGE_VERSION=$(head -1 ./VERSION) rapids-conda-retry build \ --no-test \ --channel "${CPP_CHANNEL}" \ --channel "${RAPIDS_CONDA_BLD_OUTPUT_DIR}" \ @@ -42,13 +42,13 @@ RAPIDS_PACKAGE_VERSION=$(head -1 ./VERSION) rapids-conda-retry mambabuild \ sccache --show-adv-stats sccache --zero-stats -RAPIDS_PACKAGE_VERSION=$(head -1 ./VERSION) rapids-conda-retry mambabuild \ +RAPIDS_PACKAGE_VERSION=$(head -1 ./VERSION) rapids-conda-retry build \ --no-test \ --channel "${CPP_CHANNEL}" \ --channel "${RAPIDS_CONDA_BLD_OUTPUT_DIR}" \ conda/recipes/dask-cudf -RAPIDS_PACKAGE_VERSION=$(head -1 ./VERSION) rapids-conda-retry mambabuild \ +RAPIDS_PACKAGE_VERSION=$(head -1 ./VERSION) rapids-conda-retry build \ --no-test \ --channel "${CPP_CHANNEL}" \ --channel "${RAPIDS_CONDA_BLD_OUTPUT_DIR}" \ @@ -56,13 +56,13 @@ RAPIDS_PACKAGE_VERSION=$(head -1 ./VERSION) rapids-conda-retry mambabuild \ sccache --show-adv-stats -RAPIDS_PACKAGE_VERSION=$(head -1 ./VERSION) rapids-conda-retry mambabuild \ +RAPIDS_PACKAGE_VERSION=$(head -1 ./VERSION) rapids-conda-retry build \ --no-test \ --channel "${CPP_CHANNEL}" \ --channel "${RAPIDS_CONDA_BLD_OUTPUT_DIR}" \ conda/recipes/custreamz -RAPIDS_PACKAGE_VERSION=$(head -1 ./VERSION) rapids-conda-retry mambabuild \ +RAPIDS_PACKAGE_VERSION=$(head -1 ./VERSION) rapids-conda-retry build \ --no-test \ --channel "${CPP_CHANNEL}" \ --channel "${RAPIDS_CONDA_BLD_OUTPUT_DIR}" \ From 
0f7a17f8767dfe5c00ea31feb894cf38a9fc1b6d Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 25 Feb 2025 15:17:40 -0800 Subject: [PATCH 02/42] Update numba dep and upper-bound numpy (#18078) This PR updates to numba-cuda 0.4 and numba 0.61. A numpy upper-bound is added since it looks like numpy 2.1 made some changes with which we are currently incompatible. Previously numba provided that upper bound for us. Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Bradley Dice (https://github.com/bdice) - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/18078 --- conda/environments/all_cuda-118_arch-x86_64.yaml | 6 +++--- conda/environments/all_cuda-128_arch-x86_64.yaml | 6 +++--- conda/recipes/cudf/meta.yaml | 6 +++--- conda/recipes/pylibcudf/meta.yaml | 2 +- dependencies.yaml | 9 +++++---- python/cudf/pyproject.toml | 6 +++--- python/cudf_polars/pyproject.toml | 2 +- python/dask_cudf/pyproject.toml | 6 +++--- python/pylibcudf/pyproject.toml | 2 +- 9 files changed, 23 insertions(+), 22 deletions(-) diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index cc674732ba4..e7dbb765099 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -54,9 +54,9 @@ dependencies: - nbsphinx - ninja - notebook -- numba-cuda>=0.2.0,<0.3.0a0 -- numba>=0.59.1,<0.61.0a0 -- numpy>=1.23,<3.0a0 +- numba-cuda>=0.4.0,<0.5.0a0 +- numba>=0.59.1,<0.62.0a0 +- numpy>=1.23,<2.1 - numpydoc - nvcc_linux-64=11.8 - nvcomp==4.2.0.11 diff --git a/conda/environments/all_cuda-128_arch-x86_64.yaml b/conda/environments/all_cuda-128_arch-x86_64.yaml index 7593a72cc68..342ec8d4b59 100644 --- a/conda/environments/all_cuda-128_arch-x86_64.yaml +++ b/conda/environments/all_cuda-128_arch-x86_64.yaml @@ -53,9 +53,9 @@ dependencies: - nbsphinx - ninja - notebook -- numba-cuda>=0.2.0,<0.3.0a0 -- numba>=0.59.1,<0.61.0a0 -- 
numpy>=1.23,<3.0a0 +- numba-cuda>=0.4.0,<0.5.0a0 +- numba>=0.59.1,<0.62.0a0 +- numpy>=1.23,<2.1 - numpydoc - nvcomp==4.2.0.11 - nvtx>=0.2.1 diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml index f817bc12c5b..43060ef1c87 100644 --- a/conda/recipes/cudf/meta.yaml +++ b/conda/recipes/cudf/meta.yaml @@ -75,9 +75,9 @@ requirements: - typing_extensions >=4.0.0 - pandas >=2.0,<2.2.4dev0 - cupy >=12.0.0 - - numba-cuda >=0.2.0,<0.3.0a0 - - numba >=0.59.1,<0.61.0a0 - - numpy >=1.23,<3.0a0 + - numba-cuda >=0.4.0,<0.5.0a0 + - numba >=0.59.1,<0.62.0a0 + - numpy >=1.23,<2.1 - pyarrow>=14.0.0,<20.0.0a0 - libcudf ={{ version }} - pylibcudf ={{ version }} diff --git a/conda/recipes/pylibcudf/meta.yaml b/conda/recipes/pylibcudf/meta.yaml index 14e2f31a5a5..ae02cf8d4e5 100644 --- a/conda/recipes/pylibcudf/meta.yaml +++ b/conda/recipes/pylibcudf/meta.yaml @@ -73,7 +73,7 @@ requirements: - python - typing_extensions >=4.0.0 - pandas >=2.0,<2.2.4dev0 - - numpy >=1.23,<3.0a0 + - numpy >=1.23,<2.1 - pyarrow>=14.0.0,<20.0.0a0 - libcudf ={{ version }} - {{ pin_compatible('rmm', max_pin='x.x') }} diff --git a/dependencies.yaml b/dependencies.yaml index e7840d56880..c7869eee922 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -723,7 +723,7 @@ dependencies: - output_types: [conda, requirements, pyproject] packages: - fsspec>=0.6.0 - - &numpy numpy>=1.23,<3.0a0 + - &numpy numpy>=1.23,<2.1 - pandas>=2.0,<2.2.4dev0 run_pylibcudf: common: @@ -753,8 +753,8 @@ dependencies: - output_types: [conda, requirements, pyproject] packages: - cachetools - - &numba-cuda-dep numba-cuda>=0.2.0,<0.3.0a0 - - &numba-dep numba>=0.59.1,<0.61.0a0 + - &numba-cuda-dep numba-cuda>=0.4.0,<0.5.0a0 + - &numba-dep numba>=0.59.1,<0.62.0a0 - nvtx>=0.2.1 - packaging - rich @@ -885,7 +885,8 @@ dependencies: matrices: - matrix: {dependencies: "oldest"} packages: - - numba-cuda==0.2.0 + - numba-cuda==0.4.0 + - numba==0.59.1 - pandas==2.0.* - matrix: {dependencies: "latest"} packages: diff --git 
a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml index 16cd97677ef..8b8abe90ac9 100644 --- a/python/cudf/pyproject.toml +++ b/python/cudf/pyproject.toml @@ -24,9 +24,9 @@ dependencies = [ "cupy-cuda11x>=12.0.0", "fsspec>=0.6.0", "libcudf==25.4.*,>=0.0.0a0", - "numba-cuda>=0.2.0,<0.3.0a0", - "numba>=0.59.1,<0.61.0a0", - "numpy>=1.23,<3.0a0", + "numba-cuda>=0.4.0,<0.5.0a0", + "numba>=0.59.1,<0.62.0a0", + "numpy>=1.23,<2.1", "nvtx>=0.2.1", "packaging", "pandas>=2.0,<2.2.4dev0", diff --git a/python/cudf_polars/pyproject.toml b/python/cudf_polars/pyproject.toml index 872c08a66f9..9026a0c29ca 100644 --- a/python/cudf_polars/pyproject.toml +++ b/python/cudf_polars/pyproject.toml @@ -35,7 +35,7 @@ classifiers = [ [project.optional-dependencies] test = [ - "numpy>=1.23,<3.0a0", + "numpy>=1.23,<2.1", "pytest-cov", "pytest-xdist", "pytest<8", diff --git a/python/dask_cudf/pyproject.toml b/python/dask_cudf/pyproject.toml index 87bf282f376..83493d7f2a4 100644 --- a/python/dask_cudf/pyproject.toml +++ b/python/dask_cudf/pyproject.toml @@ -22,7 +22,7 @@ dependencies = [ "cudf==25.4.*,>=0.0.0a0", "cupy-cuda11x>=12.0.0", "fsspec>=0.6.0", - "numpy>=1.23,<3.0a0", + "numpy>=1.23,<2.1", "pandas>=2.0,<2.2.4dev0", "pynvml>=12.0.0,<13.0.0a0", "rapids-dask-dependency==25.4.*,>=0.0.0a0", @@ -47,8 +47,8 @@ cudf = "dask_cudf.backends:CudfBackendEntrypoint" [project.optional-dependencies] test = [ "dask-cuda==25.4.*,>=0.0.0a0", - "numba-cuda>=0.2.0,<0.3.0a0", - "numba>=0.59.1,<0.61.0a0", + "numba-cuda>=0.4.0,<0.5.0a0", + "numba>=0.59.1,<0.62.0a0", "pytest-cov", "pytest-xdist", "pytest<8", diff --git a/python/pylibcudf/pyproject.toml b/python/pylibcudf/pyproject.toml index 939da65c1ec..e12d1ffdb39 100644 --- a/python/pylibcudf/pyproject.toml +++ b/python/pylibcudf/pyproject.toml @@ -42,7 +42,7 @@ classifiers = [ test = [ "fastavro>=0.22.9", "hypothesis", - "numpy>=1.23,<3.0a0", + "numpy>=1.23,<2.1", "pandas", "pytest-cov", "pytest-xdist", From 8d6bdc34c4b2d0d6be614c04af16b8064d2c723d 
Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 25 Feb 2025 15:21:19 -0800 Subject: [PATCH 03/42] Remove static configure step (#18091) This check has been superseded by #17781. Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - James Lamb (https://github.com/jameslamb) URL: https://github.com/rapidsai/cudf/pull/18091 --- .github/workflows/pr.yaml | 11 ----------- ci/configure_cpp_static.sh | 21 --------------------- 2 files changed, 32 deletions(-) delete mode 100755 ci/configure_cpp_static.sh diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 38b890893d0..2c583598f54 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -24,7 +24,6 @@ jobs: - conda-python-cudf-tests - conda-python-other-tests - conda-java-tests - - static-configure - conda-notebook-tests - docs-build - wheel-build-libcudf @@ -192,16 +191,6 @@ jobs: arch: "amd64" container_image: "rapidsai/ci-conda:latest" run_script: "ci/test_java.sh" - static-configure: - needs: checks - secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.04 - with: - build_type: pull-request - # Use the wheel container so we can skip conda solves and since our - # primary static consumers (Spark) are not in conda anyway. - container_image: "rapidsai/ci-wheel:latest" - run_script: "ci/configure_cpp_static.sh" conda-notebook-tests: needs: [conda-python-build, changed-files] secrets: inherit diff --git a/ci/configure_cpp_static.sh b/ci/configure_cpp_static.sh deleted file mode 100755 index 3d0647a96f6..00000000000 --- a/ci/configure_cpp_static.sh +++ /dev/null @@ -1,21 +0,0 @@ -#!/bin/bash -# Copyright (c) 2024-2025, NVIDIA CORPORATION. 
- -set -euo pipefail - -source rapids-date-string - -rapids-logger "Configure static cpp build" - -ENV_YAML_DIR="$(mktemp -d)" -REQUIREMENTS_FILE="${ENV_YAML_DIR}/requirements.txt" - -rapids-dependency-file-generator \ - --output requirements \ - --file-key test_static_build \ - --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch)" | tee "${REQUIREMENTS_FILE}" - -rapids-pip-retry install -r "${REQUIREMENTS_FILE}" -pyenv rehash - -cmake -S cpp -B build_static -GNinja -DBUILD_SHARED_LIBS=OFF -DCUDF_USE_ARROW_STATIC=ON -DBUILD_TESTS=OFF From e365986cf886fe3a9531952fe5b91a34ca466c45 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Tue, 25 Feb 2025 17:32:23 -0600 Subject: [PATCH 04/42] Run narwhals tests nightly. (#18093) This enables narwhals tests in nightly CI. Authors: - Bradley Dice (https://github.com/bdice) Approvers: - Matthew Murray (https://github.com/Matt711) - Gil Forsyth (https://github.com/gforsyth) URL: https://github.com/rapidsai/cudf/pull/18093 --- .github/workflows/test.yaml | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 12f6d751493..7046fd0e5dc 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -168,3 +168,14 @@ jobs: date: ${{ inputs.date }} sha: ${{ inputs.sha }} script: "ci/test_cudf_polars_polars_tests.sh" + narwhals-tests: + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.04 + with: + build_type: ${{ inputs.build_type }} + branch: ${{ inputs.branch }} + date: ${{ inputs.date }} + sha: ${{ inputs.sha }} + node_type: "gpu-l4-latest-1" + container_image: "rapidsai/ci-conda:latest" + run_script: ci/test_narwhals.sh From 18a5412ced238630bb1a6f5b15e6f319dd388090 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Tue, 25 Feb 2025 18:57:00 -0500 Subject: [PATCH 05/42] Add new nvtext::normalize_characters API (#17818) Adds new normalizer APIs as part of the 
rework for the subword-tokenizer. The new API is split into 2 parts. First a normalizer object is created with appropriate state: lower-case and special-tokens. The normalizing tables are currently hardcoded inside libcudf. Future versions of the this may load these tables from some other source. The 2nd API is given the input strings column and the normalizer object and returns a normalized strings column. The normalizer object can be reused on all subsequent `normalize_characters` calls. The current `nvtext::normalize_characters` loads the normalizing tables on each call which can be significant overhead. This API will be deprecated and replaced by these 2 new ones. Some utility functions from that implementation have been refactored to be used by both until the old one is removed. The first API creates the normalizer object. ```cpp std::unique_ptr create_character_normalizer( bool do_lower_case, cudf::strings_column_view const& special_tokens, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); ``` The 2nd API uses the normalizer on a strings column: ```cpp std::unique_ptr normalize_characters( cudf::strings_column_view const& input, character_normalizer const& normalizer, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); ``` Using the python interface: ```python import cudf from cudf.core.character_normalizer import CharacterNormalizer cn = CharacterNormalizer(do_lower=False) sn = cn.normalize(input_strings) ``` Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Tianyu Liu (https://github.com/kingcrimsontianyu) - Karthikeyan (https://github.com/karthikeyann) - Matthew Murray (https://github.com/Matt711) URL: https://github.com/rapidsai/cudf/pull/17818 --- cpp/benchmarks/text/normalize.cpp | 9 +- cpp/include/cudf/strings/detail/utilities.hpp | 14 +- cpp/include/nvtext/normalize.hpp | 111 ++++- cpp/src/strings/utilities.cu | 14 +- cpp/src/text/normalize.cu | 395 +++++++++++++++++- cpp/src/text/normalize.cuh | 100 
+++++ cpp/src/text/subword/data_normalizer.cu | 76 +--- cpp/tests/text/normalize_tests.cpp | 165 +++++++- python/cudf/cudf/core/character_normalizer.py | 46 ++ python/cudf/cudf/core/column/string.py | 28 +- .../cudf/cudf/tests/text/test_text_methods.py | 8 +- .../pylibcudf/libcudf/nvtext/normalize.pxd | 15 +- .../pylibcudf/pylibcudf/nvtext/normalize.pxd | 13 +- .../pylibcudf/pylibcudf/nvtext/normalize.pyi | 10 +- .../pylibcudf/pylibcudf/nvtext/normalize.pyx | 71 +++- .../pylibcudf/tests/test_nvtext_normalize.py | 97 ++++- 16 files changed, 1018 insertions(+), 154 deletions(-) create mode 100644 cpp/src/text/normalize.cuh create mode 100644 python/cudf/cudf/core/character_normalizer.py diff --git a/cpp/benchmarks/text/normalize.cpp b/cpp/benchmarks/text/normalize.cpp index 594dc0de28a..494d5722ae4 100644 --- a/cpp/benchmarks/text/normalize.cpp +++ b/cpp/benchmarks/text/normalize.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2024, NVIDIA CORPORATION. + * Copyright (c) 2021-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -48,8 +48,11 @@ static void bench_normalize(nvbench::state& state) [&](nvbench::launch& launch) { auto result = nvtext::normalize_spaces(input); }); } else { bool const to_lower = (normalize_type == "to_lower"); + // we expect the normalizer to be created once and re-used + // so creating it is not measured + auto normalizer = nvtext::create_character_normalizer(to_lower); state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { - auto result = nvtext::normalize_characters(input, to_lower); + auto result = nvtext::normalize_characters(input, *normalizer); }); } } @@ -57,6 +60,6 @@ static void bench_normalize(nvbench::state& state) NVBENCH_BENCH(bench_normalize) .set_name("normalize") .add_int64_axis("min_width", {0}) - .add_int64_axis("max_width", {32, 64, 128, 256}) + .add_int64_axis("max_width", {128, 256}) .add_int64_axis("num_rows", {32768, 262144, 2097152}) .add_string_axis("type", {"spaces", "characters", "to_lower"}); diff --git a/cpp/include/cudf/strings/detail/utilities.hpp b/cpp/include/cudf/strings/detail/utilities.hpp index d276c5df7dc..8fb1f30f961 100644 --- a/cpp/include/cudf/strings/detail/utilities.hpp +++ b/cpp/include/cudf/strings/detail/utilities.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -96,5 +96,17 @@ int64_t get_offset_value(cudf::column_view const& offsets, size_type index, rmm::cuda_stream_view stream); +/** + * @brief Return the first and last offset in the given strings column + * + * This accounts for sliced input columns as well. 
+ * + * @param input Strings column + * @param stream CUDA stream used for device memory operations and kernel launches + * @return First and last offset values + */ +std::pair get_first_and_last_offset(cudf::strings_column_view const& input, + rmm::cuda_stream_view stream); + } // namespace strings::detail } // namespace CUDF_EXPORT cudf diff --git a/cpp/include/nvtext/normalize.hpp b/cpp/include/nvtext/normalize.hpp index 74325f4a406..70ee7891ad7 100644 --- a/cpp/include/nvtext/normalize.hpp +++ b/cpp/include/nvtext/normalize.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * Copyright (c) 2020-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,6 +16,7 @@ #pragma once #include +#include #include #include #include @@ -107,5 +108,113 @@ std::unique_ptr normalize_characters( rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); +/** + * @brief Normalizer object to be used with nvtext::normalize_characters + * + * Use nvtext::create_normalizer to create this object. + * + * This normalizer includes: + * + * - adding padding around punctuation (unicode category starts with "P") + * as well as certain ASCII symbols like "^" and "$" + * - adding padding around the [CJK Unicode block + * characters](https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)) + * - changing whitespace (e.g. `"\t", "\n", "\r"`) to just space `" "` + * - removing control characters (unicode categories "Cc" and "Cf") + * + * The padding process adds a single space before and after the character. + * Details on _unicode category_ can be found here: + * https://unicodebook.readthedocs.io/unicode.html#categories + * + * If `do_lower_case = true`, lower-casing also removes any accents. 
The + * accents cannot be removed from upper-case characters without lower-casing + * and lower-casing cannot be performed without also removing accents. + * However, if the accented character is already lower-case, then only the + * accent is removed. + * + * If `special_tokens` are included the padding after `[` and before `]` is not + * inserted if the characters between them match one of the given tokens. + * Also, the `special_tokens` are expected to include the `[]` characters + * at the beginning of and end of each string appropriately. + */ +struct character_normalizer { + /** + * @brief Normalizer object constructor + * + * This initializes and holds the character normalizing tables and settings. + * + * @param do_lower_case If true, upper-case characters are converted to + * lower-case and accents are stripped from those characters. + * If false, accented and upper-case characters are not transformed. + * @param special_tokens Each row is a token including the `[]` brackets. + * For example: `[BOS]`, `[EOS]`, `[UNK]`, `[SEP]`, `[PAD]`, `[CLS]`, `[MASK]` + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + */ + character_normalizer(bool do_lower_case, + cudf::strings_column_view const& special_tokens, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); + ~character_normalizer(); + + struct character_normalizer_impl; + std::unique_ptr _impl; +}; + +/** + * @brief Create a normalizer object + * + * Creates a normalizer object which can be reused on multiple calls to + * nvtext::normalize_characters + * + * @see nvtext::character_normalizer + * + * @param do_lower_case If true, upper-case characters are converted to + * lower-case and accents are stripped from those characters. + * If false, accented and upper-case characters are not transformed. 
+ * @param special_tokens Individual tokens including `[]` brackets. + * Default is no special tokens. + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return Object to be used with nvtext::normalize_characters + */ +std::unique_ptr create_character_normalizer( + bool do_lower_case, + cudf::strings_column_view const& special_tokens = cudf::strings_column_view(cudf::column_view{ + cudf::data_type{cudf::type_id::STRING}, 0, nullptr, nullptr, 0}), + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); + +/** + * @brief Normalizes the text in input strings column + * + * @see nvtext::character_normalizer for details on the normalizer behavior + * + * @code{.pseudo} + * cn = create_character_normalizer(true) + * s = ["éâîô\teaio", "ĂĆĖÑÜ", "ACENU", "$24.08", "[a,bb]"] + * s1 = normalize_characters(s,cn) + * s1 is now ["eaio eaio", "acenu", "acenu", " $ 24 . 08", " [ a , bb ] "] + * + * cn = create_character_normalizer(false) + * s2 = normalize_characters(s,cn) + * s2 is now ["éâîô eaio", "ĂĆĖÑÜ", "ACENU", " $ 24 . 08", " [ a , bb ] "] + * @endcode + * + * A null input element at row `i` produces a corresponding null entry + * for row `i` in the output column. 
+ * + * @param input The input strings to normalize + * @param normalizer Normalizer to use for this function + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Memory resource to allocate any returned objects + * @return Normalized strings column + */ +std::unique_ptr normalize_characters( + cudf::strings_column_view const& input, + character_normalizer const& normalizer, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); + /** @} */ // end of group } // namespace CUDF_EXPORT nvtext diff --git a/cpp/src/strings/utilities.cu b/cpp/src/strings/utilities.cu index 45bd4615435..c5d46598d4a 100644 --- a/cpp/src/strings/utilities.cu +++ b/cpp/src/strings/utilities.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -180,6 +180,18 @@ int64_t get_offset_value(cudf::column_view const& offsets, : cudf::detail::get_value(offsets, index, stream); } +std::pair get_first_and_last_offset(cudf::strings_column_view const& input, + rmm::cuda_stream_view stream) +{ + if (input.is_empty()) { return {0L, 0L}; } + auto const first_offset = (input.offset() == 0) ? 0 + : cudf::strings::detail::get_offset_value( + input.offsets(), input.offset(), stream); + auto const last_offset = + cudf::strings::detail::get_offset_value(input.offsets(), input.size() + input.offset(), stream); + return {first_offset, last_offset}; +} + } // namespace detail rmm::device_uvector create_string_vector_from_column( diff --git a/cpp/src/text/normalize.cu b/cpp/src/text/normalize.cu index 7e2b766862d..0e680e98ec5 100644 --- a/cpp/src/text/normalize.cu +++ b/cpp/src/text/normalize.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. 
+ * Copyright (c) 2020-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,6 +14,7 @@ * limitations under the License. */ +#include "text/normalize.cuh" #include "text/subword/detail/data_normalizer.hpp" #include "text/subword/detail/tokenizer_utils.cuh" #include "text/utilities/tokenize_ops.cuh" @@ -22,10 +23,11 @@ #include #include #include -#include #include #include #include +#include +#include #include #include #include @@ -38,9 +40,13 @@ #include +#include +#include +#include #include #include #include +#include #include #include @@ -103,6 +109,12 @@ constexpr uint32_t UTF8_1BYTE = 0x0080; constexpr uint32_t UTF8_2BYTE = 0x0800; constexpr uint32_t UTF8_3BYTE = 0x01'0000; +__device__ int8_t cp_to_utf8(uint32_t codepoint, char* out) +{ + auto utf8 = cudf::strings::detail::codepoint_to_utf8(codepoint); + return cudf::strings::detail::from_char_utf8(utf8, out); +} + /** * @brief Convert code-point arrays into UTF-8 bytes for each string. 
*/ @@ -148,26 +160,8 @@ struct codepoint_to_utf8_fn { // convert each code-point to 1-4 UTF-8 encoded bytes char* out_ptr = d_chars + d_offsets[idx]; for (uint32_t jdx = 0; jdx < count; ++jdx) { - uint32_t code_point = *str_cps++; - if (code_point < UTF8_1BYTE) // ASCII range - *out_ptr++ = static_cast(code_point); - else if (code_point < UTF8_2BYTE) { // create two-byte UTF-8 - // b00001xxx:byyyyyyyy => b110xxxyy:b10yyyyyy - *out_ptr++ = static_cast((((code_point << 2) & 0x00'1F00) | 0x00'C000) >> 8); - *out_ptr++ = static_cast((code_point & 0x3F) | 0x0080); - } else if (code_point < UTF8_3BYTE) { // create three-byte UTF-8 - // bxxxxxxxx:byyyyyyyy => b1110xxxx:b10xxxxyy:b10yyyyyy - *out_ptr++ = static_cast((((code_point << 4) & 0x0F'0000) | 0x00E0'0000) >> 16); - *out_ptr++ = static_cast((((code_point << 2) & 0x00'3F00) | 0x00'8000) >> 8); - *out_ptr++ = static_cast((code_point & 0x3F) | 0x0080); - } else { // create four-byte UTF-8 - // maximum code-point value is 0x0011'0000 - // b000xxxxx:byyyyyyyy:bzzzzzzzz => b11110xxx:b10xxyyyy:b10yyyyzz:b10zzzzzz - *out_ptr++ = static_cast((((code_point << 6) & 0x0700'0000u) | 0xF000'0000u) >> 24); - *out_ptr++ = static_cast((((code_point << 4) & 0x003F'0000u) | 0x0080'0000u) >> 16); - *out_ptr++ = static_cast((((code_point << 2) & 0x00'3F00u) | 0x00'8000u) >> 8); - *out_ptr++ = static_cast((code_point & 0x3F) | 0x0080); - } + uint32_t codepoint = *str_cps++; + out_ptr += cp_to_utf8(codepoint, out_ptr); } } }; @@ -261,4 +255,361 @@ std::unique_ptr normalize_characters(cudf::strings_column_view con return detail::normalize_characters(input, do_lower_case, stream, mr); } +struct character_normalizer::character_normalizer_impl { + rmm::device_uvector cp_metadata; + rmm::device_uvector aux_table; + bool do_lower_case; + std::unique_ptr special_tokens; + rmm::device_uvector special_tokens_view; + + cudf::device_span get_special_tokens() const + { + return special_tokens_view; + } + + 
character_normalizer_impl(rmm::device_uvector&& cp_metadata, + rmm::device_uvector&& aux_table, + bool do_lower_case, + std::unique_ptr&& special_tokens, + rmm::device_uvector&& special_tokens_view) + : cp_metadata(std::move(cp_metadata)), + aux_table(std::move(aux_table)), + do_lower_case{do_lower_case}, + special_tokens{std::move(special_tokens)}, + special_tokens_view{std::move(special_tokens_view)} + { + } +}; + +character_normalizer::character_normalizer(bool do_lower_case, + cudf::strings_column_view const& special_tokens, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref) +{ + auto cp_metadata = nvtext::detail::get_codepoint_metadata(stream); + auto aux_table = nvtext::detail::get_aux_codepoint_data(stream); + CUDF_EXPECTS( + !special_tokens.has_nulls(), "special tokens should not have nulls", std::invalid_argument); + + auto sorted = std::move( + cudf::sort(cudf::table_view({special_tokens.parent()}), {}, {}, stream)->release().front()); + if (do_lower_case) { + // lower-case the tokens so they will match the normalized input + sorted = cudf::strings::to_lower(cudf::strings_column_view(sorted->view()), stream); + } + + auto tokens_view = cudf::strings::detail::create_string_vector_from_column( + cudf::strings_column_view(sorted->view()), stream, cudf::get_current_device_resource_ref()); + + _impl = std::make_unique(std::move(cp_metadata), + std::move(aux_table), + do_lower_case, + std::move(sorted), + std::move(tokens_view)); +} + +character_normalizer::~character_normalizer() {} + +std::unique_ptr create_character_normalizer( + bool do_lower_case, + cudf::strings_column_view const& special_tokens, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + CUDF_FUNC_RANGE(); + return std::make_unique(do_lower_case, special_tokens, stream, mr); +} + +namespace detail { +namespace { + +/** + * @brief Kernel handles fixing up the normalized data to account for any special tokens + * + * This undoes the padding added around the 
`[]` for patterns matching the strings in the + * special_tokens array. + * + * Launched as a thread per input byte (total_count). + * + * @param d_normalized The normalized set of UTF-8 characters; 3 uints per input byte + * @param total_count Number of bytes represented by d_normalized; len(d_normalized)/3 + * @param special_tokens Tokens to check against + */ +CUDF_KERNEL void special_tokens_kernel(uint32_t* d_normalized, + int64_t total_count, + cudf::device_span special_tokens) +{ + auto const idx = cudf::detail::grid_1d::global_thread_id(); + if (idx >= total_count) { return; } + auto const begin = d_normalized + (idx * MAX_NEW_CHARS) + 1; + if (*begin != '[') { return; } + auto const end = begin + cuda::std::min(6L, total_count - idx) * MAX_NEW_CHARS; + auto const match = thrust::find(thrust::seq, begin, end, static_cast(']')); + if (match == end) { return; } + char candidate[8]; + auto const ch_begin = + thrust::transform_iterator(begin, [](auto v) { return static_cast(v); }); + auto const ch_end = ch_begin + thrust::distance(begin, match + 1); + auto last = thrust::copy_if( + thrust::seq, ch_begin, ch_end, candidate, [](auto c) { return c != 0 && c != ' '; }); + *last = 0; // only needed for debug + + auto const size = static_cast(thrust::distance(candidate, last)); + auto const token = cudf::string_view(candidate, size); + // the binary_search expects the special_tokens to be sorted + if (!thrust::binary_search(thrust::seq, special_tokens.begin(), special_tokens.end(), token)) { + return; + } + + // fix up chars to remove the extra spaces + *(begin + 1) = 0; // removes space after '[' + *(match - 1) = 0; // removes space before ']' +} + +/** + * @brief The normalizer kernel + * + * Launched as a thread per input byte (total_bytes). + * + * Converts the input d_chars into codepoints to lookup in the provided tables. + * Once processed, the d_output contains 3 uints per input byte each encoded + * as output UTF-8. 
Any zero values are to be removed by a subsequent kernel call. + * + * @param d_chars The characters for the input strings column to normalize + * @param total_bytes The number of bytes in the d_chars + * @param cp_metadata First lookup table for codepoint metadata + * @param aux_table Second lookup table containing possible replacement characters + * @param do_lower_case True if the normalization includes lower-casing characters + * @param d_output The output of the normalization (UTF-8 encoded) + */ +CUDF_KERNEL void data_normalizer_kernel(char const* d_chars, + int64_t total_bytes, + codepoint_metadata_type const* cp_metadata, + aux_codepoint_data_type const* aux_table, + bool do_lower_case, + uint32_t* d_output) +{ + uint32_t replacement[MAX_NEW_CHARS] = {0}; + + auto const idx = cudf::detail::grid_1d::global_thread_id(); + + if ((idx < total_bytes) && cudf::strings::detail::is_begin_utf8_char(d_chars[idx])) { + auto const cp = [utf8 = d_chars + idx] { + cudf::char_utf8 ch_utf8 = *utf8; + if (ch_utf8 > 0x7F) { cudf::strings::detail::to_char_utf8(utf8, ch_utf8); } + return cudf::strings::detail::utf8_to_codepoint(ch_utf8); + }(); + auto const metadata = cp_metadata[cp]; + + if (!should_remove_cp(metadata, do_lower_case)) { + int8_t num_new_chars = 1; + // retrieve the normalized value for cp + auto const new_cp = do_lower_case || always_replace(metadata) ? get_first_cp(metadata) : cp; + replacement[0] = new_cp == 0 ? 
cp : new_cp; + + if (do_lower_case && is_multi_char_transform(metadata)) { + auto const next_cps = aux_table[cp]; + replacement[1] = static_cast(next_cps >> 32); + replacement[2] = static_cast(next_cps & 0xFFFFFFFF); + num_new_chars = 2 + (replacement[2] != 0); + } + + if (should_add_spaces(metadata, do_lower_case) && (num_new_chars == 1)) { + replacement[1] = replacement[0]; + replacement[0] = SPACE_CODE_POINT; // add spaces around the new codepoint + replacement[2] = SPACE_CODE_POINT; + num_new_chars = 3; + } + + // convert codepoints back to UTF-8 in-place + for (int k = 0; k < num_new_chars; ++k) { + auto const new_cp = replacement[k]; + if (new_cp) { cp_to_utf8(new_cp, reinterpret_cast(replacement + k)); } + } + } + } + + // employ an optimized coalesced writer to output replacement as a block of transposed data + using block_store = + cub::BlockStore; + __shared__ typename block_store::TempStorage bs_stg; + auto block_base = d_output + blockIdx.x * blockDim.x * MAX_NEW_CHARS; + block_store(bs_stg).Store(block_base, replacement); +} + +/** + * @brief Computes the output sizes for each row + * + * The input offsets are used with segmented-reduce to count the number of + * non-zero values for each output row. 
+ * + * @param d_normalized The UTF-8 encoded normalized values + * @param offsets These identify the row boundaries + * @param offset Only non-zero if the input column has been sliced + * @param size The number of output rows (same as the number of input rows) + * @param stream Stream used for allocating device memory and launching kernels + * @return The sizes of each output row + */ +template +rmm::device_uvector compute_sizes(cudf::device_span d_normalized, + OffsetType offsets, + int64_t offset, + cudf::size_type size, + rmm::cuda_stream_view stream) +{ + auto output_sizes = rmm::device_uvector(size, stream); + + auto d_data = d_normalized.data(); + + // counts the non-zero bytes in the d_data array + auto d_in = cudf::detail::make_counting_transform_iterator( + 0, cuda::proclaim_return_type([d_data] __device__(auto idx) { + idx = idx * MAX_NEW_CHARS; + // transform function counts number of non-zero bytes in uint32_t value + auto tfn = [](uint32_t v) -> cudf::size_type { + return ((v & 0xFF) > 0) + ((v & 0xFF00) > 0) + ((v & 0xFF0000) > 0) + + ((v & 0xFF000000) > 0); + }; + auto const begin = d_data + idx; + auto const end = begin + MAX_NEW_CHARS; + return thrust::transform_reduce(thrust::seq, begin, end, tfn, 0, thrust::plus{}); + })); + + // DeviceSegmentedReduce is used to compute the size of each output row + auto d_out = output_sizes.begin(); + auto temp = std::size_t{0}; + if (offset == 0) { + cub::DeviceSegmentedReduce::Sum( + nullptr, temp, d_in, d_out, size, offsets, offsets + 1, stream.value()); + auto d_temp = rmm::device_buffer{temp, stream}; + cub::DeviceSegmentedReduce::Sum( + d_temp.data(), temp, d_in, d_out, size, offsets, offsets + 1, stream.value()); + } else { + // offsets need to be normalized for segmented-reduce to work efficiently + auto offsets_itr = thrust::transform_iterator( + offsets, + cuda::proclaim_return_type([offset] __device__(auto o) { return o - offset; })); + cub::DeviceSegmentedReduce::Sum( + nullptr, temp, d_in, d_out, 
size, offsets_itr, offsets_itr + 1, stream.value()); + auto d_temp = rmm::device_buffer{temp, stream}; + cub::DeviceSegmentedReduce::Sum( + d_temp.data(), temp, d_in, d_out, size, offsets_itr, offsets_itr + 1, stream.value()); + } + + return output_sizes; +} + +// handles ranges above int32 max +template +OutputIterator remove_copy_safe(InputIterator first, + InputIterator last, + OutputIterator result, + T const& value, + rmm::cuda_stream_view stream) +{ + auto const copy_size = std::min(static_cast(std::distance(first, last)), + static_cast(std::numeric_limits::max())); + + auto itr = first; + while (itr != last) { + auto const copy_end = + static_cast(std::distance(itr, last)) <= copy_size ? last : itr + copy_size; + result = thrust::remove_copy(rmm::exec_policy(stream), itr, copy_end, result, value); + itr = copy_end; + } + return result; +} + +// handles ranges above int32 max +template +Iterator remove_safe(Iterator first, Iterator last, T const& value, rmm::cuda_stream_view stream) +{ + auto const size = std::min(static_cast(std::distance(first, last)), + static_cast(std::numeric_limits::max())); + + auto result = first; + auto itr = first; + while (itr != last) { + auto end = static_cast(std::distance(itr, last)) <= size ? 
last : itr + size; + result = thrust::remove(rmm::exec_policy(stream), itr, end, value); + itr = end; + } + return result; +} +} // namespace + +std::unique_ptr normalize_characters(cudf::strings_column_view const& input, + character_normalizer const& normalizer, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + if (input.is_empty()) { return cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING}); } + + auto [first_offset, last_offset] = + cudf::strings::detail::get_first_and_last_offset(input, stream); + auto const chars_size = last_offset - first_offset; + auto const d_input_chars = input.chars_begin(stream) + first_offset; + + if (chars_size == 0) { return std::make_unique(input.parent(), stream, mr); } + + constexpr int64_t block_size = 256; + cudf::detail::grid_1d grid{chars_size, block_size}; + auto const max_new_char_total = cudf::util::round_up_safe(chars_size, block_size) * MAX_NEW_CHARS; + + auto const& parameters = normalizer._impl; + + auto d_normalized = rmm::device_uvector(max_new_char_total, stream); + data_normalizer_kernel<<>>( + d_input_chars, + chars_size, + parameters->cp_metadata.data(), + parameters->aux_table.data(), + parameters->do_lower_case, + d_normalized.data()); + + // This removes space added around any special tokens in the form of [ttt]. + // An alternate approach is to do a multi-replace of '[ ttt ]' with '[ttt]' right + // before returning the output strings column. 
+ auto const special_tokens = parameters->get_special_tokens(); + if (!special_tokens.empty()) { + special_tokens_kernel<<>>( + d_normalized.data(), chars_size, special_tokens); + } + + // Use segmented-reduce over the non-zero codepoints to get the size of the output rows + auto const input_offsets = + cudf::detail::offsetalator_factory::make_input_iterator(input.offsets(), input.offset()); + auto output_sizes = + compute_sizes(d_normalized, input_offsets, first_offset, input.size(), stream); + + // convert the sizes to offsets + auto [offsets, total_size] = cudf::strings::detail::make_offsets_child_column( + output_sizes.begin(), output_sizes.end(), stream, mr); + + // create output chars by calling remove_copy(0) on the bytes in d_normalized + auto chars = rmm::device_uvector(total_size, stream, mr); + auto const begin = reinterpret_cast(d_normalized.begin()); + // the remove() above speeds up the remove_copy() by roughly 10% + auto const end = + reinterpret_cast(remove_safe(d_normalized.begin(), d_normalized.end(), 0, stream)); + remove_copy_safe(begin, end, chars.data(), 0, stream); + + return cudf::make_strings_column(input.size(), + std::move(offsets), + chars.release(), + input.null_count(), + cudf::detail::copy_bitmask(input.parent(), stream, mr)); +} + +} // namespace detail + +std::unique_ptr normalize_characters(cudf::strings_column_view const& input, + character_normalizer const& normalizer, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + CUDF_FUNC_RANGE(); + return detail::normalize_characters(input, normalizer, stream, mr); +} + } // namespace nvtext diff --git a/cpp/src/text/normalize.cuh b/cpp/src/text/normalize.cuh new file mode 100644 index 00000000000..3972726d536 --- /dev/null +++ b/cpp/src/text/normalize.cuh @@ -0,0 +1,100 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "text/subword/detail/cp_data.h" + +namespace nvtext { +namespace detail { + +/** + * @brief Bit used to filter out invalid code points. + * + * When normalizing characters to code point values, if this bit is set, + * the code point should be filtered out before returning from the normalizer. + */ +constexpr uint32_t FILTER_BIT = 22; + +/** + * @brief Retrieve new code point from metadata value. + * + * @param metadata Value from the codepoint_metadata table. + * @return The replacement character if appropriate. + */ +__device__ constexpr uint32_t get_first_cp(uint32_t metadata) { return metadata & NEW_CP_MASK; } + +/** + * @brief Retrieve token category from the metadata value. + * + * Category values are 0-5: + * 0 - character should be padded + * 1 - pad character if lower-case + * 2 - character should be removed + * 3 - remove character if lower-case + * 4 - whitespace character -- always replace + * 5 - uncategorized + * + * @param metadata Value from the codepoint_metadata table. + * @return Category value. + */ +__device__ constexpr uint32_t extract_token_cat(uint32_t metadata) +{ + return (metadata >> TOKEN_CAT_SHIFT) & TOKEN_CAT_MASK; +} + +/** + * @brief Return true if category of metadata value specifies the character should be removed. 
+ */ +__device__ constexpr bool should_remove_cp(uint32_t metadata, bool lower_case) +{ + auto const cat = extract_token_cat(metadata); + return (cat == TOKEN_CAT_REMOVE_CHAR) || (lower_case && (cat == TOKEN_CAT_REMOVE_CHAR_IF_LOWER)); +} + +/** + * @brief Return true if category of metadata value specifies the character should be padded. + */ +__device__ constexpr bool should_add_spaces(uint32_t metadata, bool lower_case) +{ + auto const cat = extract_token_cat(metadata); + return (cat == TOKEN_CAT_ADD_SPACE) || (lower_case && (cat == TOKEN_CAT_ADD_SPACE_IF_LOWER)); +} + +/** + * @brief Return true if category of metadata value specifies the character should be replaced. + */ +__device__ constexpr bool always_replace(uint32_t metadata) +{ + return extract_token_cat(metadata) == TOKEN_CAT_ALWAYS_REPLACE; +} + +/** + * @brief Returns true if metadata value includes a multi-character transform bit equal to 1. + */ +__device__ constexpr bool is_multi_char_transform(uint32_t metadata) +{ + return (metadata >> MULTICHAR_SHIFT) & MULTICHAR_MASK; +} + +/** + * @brief Returns true if the byte passed in could be a valid head byte for + * a utf8 character. That is, not binary `10xxxxxx` + */ +__device__ constexpr bool is_head_byte(unsigned char utf8_byte) { return (utf8_byte >> 6) != 2; } + +} // namespace detail +} // namespace nvtext diff --git a/cpp/src/text/subword/data_normalizer.cu b/cpp/src/text/subword/data_normalizer.cu index 7a39199011e..4c54409c41a 100644 --- a/cpp/src/text/subword/data_normalizer.cu +++ b/cpp/src/text/subword/data_normalizer.cu @@ -14,6 +14,7 @@ * limitations under the License. */ +#include "text/normalize.cuh" #include "text/subword/detail/data_normalizer.hpp" #include "text/subword/detail/tokenizer_utils.cuh" @@ -38,81 +39,6 @@ namespace nvtext { namespace detail { namespace { -/** - * @brief Bit used to filter out invalid code points. 
- * - * When normalizing characters to code point values, if this bit is set, - * the code point should be filtered out before returning from the normalizer. - */ -constexpr uint32_t FILTER_BIT = 22; - -/** - * @brief Retrieve new code point from metadata value. - * - * @param metadata Value from the codepoint_metadata table. - * @return The replacement character if appropriate. - */ -__device__ uint32_t get_first_cp(uint32_t metadata) { return metadata & NEW_CP_MASK; } - -/** - * @brief Retrieve token category from the metadata value. - * - * Category values are 0-5: - * 0 - character should be padded - * 1 - pad character if lower-case - * 2 - character should be removed - * 3 - remove character if lower-case - * 4 - whitespace character -- always replace - * 5 - uncategorized - * - * @param metadata Value from the codepoint_metadata table. - * @return Category value. - */ -__device__ uint32_t extract_token_cat(uint32_t metadata) -{ - return (metadata >> TOKEN_CAT_SHIFT) & TOKEN_CAT_MASK; -} - -/** - * @brief Return true if category of metadata value specifies the character should be replaced. - */ -__device__ bool should_remove_cp(uint32_t metadata, bool lower_case) -{ - auto const cat = extract_token_cat(metadata); - return (cat == TOKEN_CAT_REMOVE_CHAR) || (lower_case && (cat == TOKEN_CAT_REMOVE_CHAR_IF_LOWER)); -} - -/** - * @brief Return true if category of metadata value specifies the character should be padded. - */ -__device__ bool should_add_spaces(uint32_t metadata, bool lower_case) -{ - auto const cat = extract_token_cat(metadata); - return (cat == TOKEN_CAT_ADD_SPACE) || (lower_case && (cat == TOKEN_CAT_ADD_SPACE_IF_LOWER)); -} - -/** - * @brief Return true if category of metadata value specifies the character should be replaced. 
- */ -__device__ bool always_replace(uint32_t metadata) -{ - return extract_token_cat(metadata) == TOKEN_CAT_ALWAYS_REPLACE; -} - -/** - * @brief Returns true if metadata value includes a multi-character transform bit equal to 1. - */ -__device__ bool is_multi_char_transform(uint32_t metadata) -{ - return (metadata >> MULTICHAR_SHIFT) & MULTICHAR_MASK; -} - -/** - * @brief Returns true if the byte passed in could be a valid head byte for - * a utf8 character. That is, not binary `10xxxxxx` - */ -__device__ bool is_head_byte(unsigned char utf8_byte) { return (utf8_byte >> 6) != 2; } - /** * @brief Converts a UTF-8 character into a unicode code point value. * diff --git a/cpp/tests/text/normalize_tests.cpp b/cpp/tests/text/normalize_tests.cpp index 2515cc917fa..530148eb654 100644 --- a/cpp/tests/text/normalize_tests.cpp +++ b/cpp/tests/text/normalize_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * Copyright (c) 2020-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -74,6 +74,10 @@ TEST_F(TextNormalizeTest, NormalizeEmptyTest) EXPECT_EQ(results->size(), 0); results = nvtext::normalize_characters(strings_view, false); EXPECT_EQ(results->size(), 0); + + auto normalizer = nvtext::create_character_normalizer(true); + results = nvtext::normalize_characters(strings_view, *normalizer); + EXPECT_EQ(results->size(), 0); } TEST_F(TextNormalizeTest, AllNullStrings) @@ -84,6 +88,10 @@ TEST_F(TextNormalizeTest, AllNullStrings) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, strings); results = nvtext::normalize_characters(strings_view, false); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, strings); + + auto normalizer = nvtext::create_character_normalizer(true); + results = nvtext::normalize_characters(strings_view, *normalizer); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, strings); } TEST_F(TextNormalizeTest, SomeNullStrings) @@ -93,27 +101,21 @@ TEST_F(TextNormalizeTest, SomeNullStrings) auto results = nvtext::normalize_characters(strings_view, false); cudf::test::strings_column_wrapper expected({"", " . 
", "a"}, {false, true, true}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + + auto normalizer = nvtext::create_character_normalizer(true); + results = nvtext::normalize_characters(strings_view, *normalizer); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } TEST_F(TextNormalizeTest, NormalizeCharacters) { // These include punctuation, accents, whitespace, and CJK characters - std::vector h_strings{"abc£def", - nullptr, - "éè â îô\taeio", - "\tĂĆĖÑ Ü", - "ACEN U", - "P^NP", - "$41.07", - "[a,b]", - "丏丟", - ""}; - auto validity = - thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; }); - cudf::test::strings_column_wrapper strings(h_strings.begin(), h_strings.end(), validity); - cudf::strings_column_view strings_view(strings); + auto input = cudf::test::strings_column_wrapper( + {"abc£def", "", "éè â îô\taeio", "\tĂĆĖÑ Ü", "ACEN U", "P^NP", "$41.07", "[a,b]", "丏丟", ""}, + {1, 0, 1, 1, 1, 1, 1, 1, 1, 1}); + auto sv = cudf::strings_column_view(input); { - auto results = nvtext::normalize_characters(strings_view, true); + auto results = nvtext::normalize_characters(sv, true); cudf::test::strings_column_wrapper expected({"abc£def", "", "ee a io aeio", @@ -124,11 +126,11 @@ TEST_F(TextNormalizeTest, NormalizeCharacters) " [ a , b ] ", " 丏 丟 ", ""}, - validity); + {1, 0, 1, 1, 1, 1, 1, 1, 1, 1}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } { - auto results = nvtext::normalize_characters(strings_view, false); + auto results = nvtext::normalize_characters(sv, false); cudf::test::strings_column_wrapper expected({"abc£def", "", "éè â îô aeio", @@ -139,11 +141,117 @@ TEST_F(TextNormalizeTest, NormalizeCharacters) " [ a , b ] ", " 丏 丟 ", ""}, - validity); + {1, 0, 1, 1, 1, 1, 1, 1, 1, 1}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } } +TEST_F(TextNormalizeTest, WithNormalizer) +{ + auto long_row = + "this entry is intended to pad out past 256 bytes which is currently the block size"; + // the following 
include punctuation, accents, whitespace, and CJK characters + auto input = cudf::test::strings_column_wrapper({"abc£def", + "", + "éè â îô\taeio", + "\tĂĆĖÑ Ü", + "ACEN U", + "P^NP", + "$41.07", + "[a,b]", + "丏丟", + "", + long_row, + long_row, + long_row}, + {1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}); + + auto const sv = cudf::strings_column_view(input); + + auto normalizer = nvtext::create_character_normalizer(true); + auto results = nvtext::normalize_characters(sv, *normalizer); + auto expected = cudf::test::strings_column_wrapper({"abc£def", + "", + "ee a io aeio", + " acen u", + "acen u", + "p ^ np", + " $ 41 . 07", + " [ a , b ] ", + " 丏 丟 ", + "", + long_row, + long_row, + long_row}, + {1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + results = nvtext::normalize_characters(sv, *normalizer); // test normalizer re-use + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + + normalizer = nvtext::create_character_normalizer(false); + results = nvtext::normalize_characters(sv, *normalizer); + expected = cudf::test::strings_column_wrapper({"abc£def", + "", + "éè â îô aeio", + " ĂĆĖÑ Ü", + "ACEN U", + "P ^ NP", + " $ 41 . 
07", + " [ a , b ] ", + " 丏 丟 ", + "", + long_row, + long_row, + long_row}, + {1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + results = nvtext::normalize_characters(sv, *normalizer); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); +} + +TEST_F(TextNormalizeTest, SpecialTokens) +{ + auto long_row = + "this entry is intended to pad out past 256 bytes which is currently the block size"; + auto input = + cudf::test::strings_column_wrapper({"[BOS]Some strings with [PAD] special[SEP]tokens[EOS]", + "[bos]these should[sep]work too[eos]", + "some[non]tokens[eol]too", + long_row, + long_row, + long_row}); + + auto sv = cudf::strings_column_view(input); + auto special_tokens = cudf::test::strings_column_wrapper({"[BOS]", "[EOS]", "[SEP]", "[PAD]"}); + auto stv = cudf::strings_column_view(special_tokens); + + auto normalizer = nvtext::create_character_normalizer(true, stv); + auto results = nvtext::normalize_characters(sv, *normalizer); + auto expected = cudf::test::strings_column_wrapper( + {" [bos] some strings with [pad] special [sep] tokens [eos] ", + " [bos] these should [sep] work too [eos] ", + "some [ non ] tokens [ eol ] too", + long_row, + long_row, + long_row}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + results = nvtext::normalize_characters(sv, *normalizer); // and again + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + + normalizer = nvtext::create_character_normalizer(false, stv); + results = nvtext::normalize_characters(sv, *normalizer); + expected = cudf::test::strings_column_wrapper( + {" [BOS] Some strings with [PAD] special [SEP] tokens [EOS] ", + " [ bos ] these should [ sep ] work too [ eos ] ", + "some [ non ] tokens [ eol ] too", + long_row, + long_row, + long_row}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + results = nvtext::normalize_characters(sv, *normalizer); // and again + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); +} + TEST_F(TextNormalizeTest, 
NormalizeSlicedColumn) { cudf::test::strings_column_wrapper strings( @@ -151,10 +259,21 @@ TEST_F(TextNormalizeTest, NormalizeSlicedColumn) std::vector sliced = cudf::split(strings, {4}); auto results = nvtext::normalize_characters(cudf::strings_column_view(sliced.front()), true); - cudf::test::strings_column_wrapper expected({"abc£def", "ee a io aeio", "acen u", "p ^ np"}); + auto expected = + cudf::test::strings_column_wrapper({"abc£def", "ee a io aeio", "acen u", "p ^ np"}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + + results = nvtext::normalize_characters(cudf::strings_column_view(sliced[1]), false); + expected = cudf::test::strings_column_wrapper({" $ 41 . 07", " [ a , b ] ", " 丏 丟 "}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + + auto normalizer = nvtext::create_character_normalizer(true); + results = nvtext::normalize_characters(cudf::strings_column_view(sliced.front()), *normalizer); + expected = cudf::test::strings_column_wrapper({"abc£def", "ee a io aeio", "acen u", "p ^ np"}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); - results = nvtext::normalize_characters(cudf::strings_column_view(sliced[1]), false); - cudf::test::strings_column_wrapper expected2({" $ 41 . 07", " [ a , b ] ", " 丏 丟 "}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected2); + normalizer = nvtext::create_character_normalizer(false); + results = nvtext::normalize_characters(cudf::strings_column_view(sliced[1]), *normalizer); + expected = cudf::test::strings_column_wrapper({" $ 41 . 07", " [ a , b ] ", " 丏 丟 "}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } diff --git a/python/cudf/cudf/core/character_normalizer.py b/python/cudf/cudf/core/character_normalizer.py new file mode 100644 index 00000000000..1240c0e1eb7 --- /dev/null +++ b/python/cudf/cudf/core/character_normalizer.py @@ -0,0 +1,46 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. 
+ +from __future__ import annotations + +import pylibcudf as plc + +import cudf + + +class CharacterNormalizer: + """ + A normalizer object used to normalize input text. + + Parameters + ---------- + do_lower : bool + If True, the normalizer should also lower-case + while normalizing. + special_tokens : cudf.Series + Series of special tokens. + """ + + def __init__( + self, + do_lower: bool, + special_tokens: cudf.Series = cudf.Series([], dtype="object"), + ) -> None: + self.normalizer = plc.nvtext.normalize.CharacterNormalizer( + do_lower, special_tokens._column.to_pylibcudf(mode="read") + ) + + def normalize(self, text: cudf.Series) -> cudf.Series: + """ + Parameters + ---------- + text : cudf.Series + The strings to be normalized. + + Returns + ------- + cudf.Series + Normalized strings + """ + result = text._column.normalize_characters(self.normalizer) + + return cudf.Series._from_column(result) diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 04a72017c33..c0ad33ec7d6 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -4679,8 +4679,10 @@ def normalize_characters(self, do_lower: bool = True) -> SeriesOrIndex: r""" Normalizes strings characters for tokenizing. - This uses the normalizer that is built into the - subword_tokenize function which includes: + .. deprecated:: 25.04 + Use `CharacterNormalizer` instead. + + The normalizer function includes: - adding padding around punctuation (unicode category starts with "P") as well as certain ASCII symbols like "^" and "$" @@ -4720,8 +4722,13 @@ def normalize_characters(self, do_lower: bool = True) -> SeriesOrIndex: 2 $ 99 dtype: object """ + warnings.warn( + "normalize_characters is deprecated and will be removed in a future " + "version. 
Use CharacterNormalizer instead.", + FutureWarning, + ) return self._return_or_inplace( - self._column.normalize_characters(do_lower) + self._column.characters_normalize(do_lower) ) def tokenize(self, delimiter: str = " ") -> SeriesOrIndex: @@ -6256,14 +6263,25 @@ def normalize_spaces(self) -> Self: ) @acquire_spill_lock() - def normalize_characters(self, do_lower: bool = True) -> Self: + def characters_normalize(self, do_lower: bool = True) -> Self: return ColumnBase.from_pylibcudf( # type: ignore[return-value] - plc.nvtext.normalize.normalize_characters( + plc.nvtext.normalize.characters_normalize( self.to_pylibcudf(mode="read"), do_lower, ) ) + @acquire_spill_lock() + def normalize_characters( + self, normalizer: plc.nvtext.normalize.CharacterNormalizer + ) -> Self: + return ColumnBase.from_pylibcudf( # type: ignore[return-value] + plc.nvtext.normalize.normalize_characters( + self.to_pylibcudf(mode="read"), + normalizer, + ) + ) + @acquire_spill_lock() def replace_tokens( self, targets: Self, replacements: Self, delimiter: plc.Scalar diff --git a/python/cudf/cudf/tests/text/test_text_methods.py b/python/cudf/cudf/tests/text/test_text_methods.py index 86e1e46c1a2..dc45827d2e8 100644 --- a/python/cudf/cudf/tests/text/test_text_methods.py +++ b/python/cudf/cudf/tests/text/test_text_methods.py @@ -8,6 +8,7 @@ import cudf from cudf.core.byte_pair_encoding import BytePairEncoder +from cudf.core.character_normalizer import CharacterNormalizer from cudf.core.tokenize_vocabulary import TokenizeVocabulary from cudf.testing import assert_eq @@ -251,7 +252,8 @@ def test_normalize_characters(): ] ) - actual = strings.str.normalize_characters() + normalizer_lower = CharacterNormalizer(True) + actual = normalizer_lower.normalize(strings.str) assert type(expected) is type(actual) assert_eq(expected, actual) @@ -265,7 +267,9 @@ def test_normalize_characters(): "Stock ^ $ 1", ] ) - actual = strings.str.normalize_characters(do_lower=False) + + normalizer = 
CharacterNormalizer(False) + actual = normalizer.normalize(strings.str) assert type(expected) is type(actual) assert_eq(expected, actual) diff --git a/python/pylibcudf/pylibcudf/libcudf/nvtext/normalize.pxd b/python/pylibcudf/pylibcudf/libcudf/nvtext/normalize.pxd index f8b082c8429..2cf2bfb8ac9 100644 --- a/python/pylibcudf/pylibcudf/libcudf/nvtext/normalize.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/nvtext/normalize.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. from libcpp cimport bool from libcpp.memory cimport unique_ptr from pylibcudf.exception_handler cimport libcudf_exception_handler @@ -16,3 +16,16 @@ cdef extern from "nvtext/normalize.hpp" namespace "nvtext" nogil: const column_view & strings, bool do_lower_case ) except +libcudf_exception_handler + + cdef struct character_normalizer "nvtext::character_normalizer": + pass + + cdef unique_ptr[character_normalizer] create_character_normalizer( + bool do_lower_case, + const column_view & strings + ) except +libcudf_exception_handler + + cdef unique_ptr[column] normalize_characters( + const column_view & strings, + const character_normalizer & normalizer + ) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/nvtext/normalize.pxd b/python/pylibcudf/pylibcudf/nvtext/normalize.pxd index 90676145afa..e6688e19762 100644 --- a/python/pylibcudf/pylibcudf/nvtext/normalize.pxd +++ b/python/pylibcudf/pylibcudf/nvtext/normalize.pxd @@ -1,9 +1,18 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. 
from libcpp cimport bool +from libcpp.memory cimport unique_ptr from pylibcudf.column cimport Column +from pylibcudf.libcudf.nvtext.normalize cimport character_normalizer +cdef class CharacterNormalizer: + cdef unique_ptr[character_normalizer] c_obj cpdef Column normalize_spaces(Column input) -cpdef Column normalize_characters(Column input, bool do_lower_case) +cpdef Column characters_normalize(Column input, bool do_lower_case) + +cpdef Column normalize_characters( + Column input, + CharacterNormalizer normalizer +) diff --git a/python/pylibcudf/pylibcudf/nvtext/normalize.pyi b/python/pylibcudf/pylibcudf/nvtext/normalize.pyi index 1d90a5a8960..d722ef6c79e 100644 --- a/python/pylibcudf/pylibcudf/nvtext/normalize.pyi +++ b/python/pylibcudf/pylibcudf/nvtext/normalize.pyi @@ -1,6 +1,12 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. from pylibcudf.column import Column +class CharacterNormalizer: + def __init__(self, do_lower_case: bool, special_tokens: Column): ... + def normalize_spaces(input: Column) -> Column: ... -def normalize_characters(input: Column, do_lower_case: bool) -> Column: ... +def characters_normalize(input: Column, do_lower_case: bool) -> Column: ... +def normalize_characters( + input: Column, normalizer: CharacterNormalizer +) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/nvtext/normalize.pyx b/python/pylibcudf/pylibcudf/nvtext/normalize.pyx index b259ccaefa6..6a18c205841 100644 --- a/python/pylibcudf/pylibcudf/nvtext/normalize.pyx +++ b/python/pylibcudf/pylibcudf/nvtext/normalize.pyx @@ -1,16 +1,37 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. 
+from cython.operator cimport dereference from libcpp cimport bool from libcpp.memory cimport unique_ptr from libcpp.utility cimport move from pylibcudf.column cimport Column from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.nvtext.normalize cimport ( - normalize_characters as cpp_normalize_characters, - normalize_spaces as cpp_normalize_spaces, -) +from pylibcudf.libcudf.column.column_view cimport column_view +from pylibcudf.libcudf.nvtext cimport normalize as cpp_normalize -__all__ = ["normalize_characters", "normalize_spaces"] +__all__ = [ + "CharacterNormalizer" + "normalize_characters", + "normalize_spaces", + "characters_normalize" +] + +cdef class CharacterNormalizer: + """The normalizer object to be used with ``normalize_characters``. + + For details, see :cpp:class:`cudf::nvtext::character_normalizer`. + """ + def __cinit__(self, bool do_lower_case, Column tokens): + cdef column_view c_tokens = tokens.view() + with nogil: + self.c_obj = move( + cpp_normalize.create_character_normalizer( + do_lower_case, + c_tokens + ) + ) + + __hash__ = None cpdef Column normalize_spaces(Column input): """ @@ -32,12 +53,12 @@ cpdef Column normalize_spaces(Column input): cdef unique_ptr[column] c_result with nogil: - c_result = cpp_normalize_spaces(input.view()) + c_result = cpp_normalize.normalize_spaces(input.view()) return Column.from_libcudf(move(c_result)) -cpdef Column normalize_characters(Column input, bool do_lower_case): +cpdef Column characters_normalize(Column input, bool do_lower_case): """ Normalizes strings characters for tokenizing. 
@@ -60,6 +81,38 @@ cpdef Column normalize_characters(Column input, bool do_lower_case): cdef unique_ptr[column] c_result with nogil: - c_result = cpp_normalize_characters(input.view(), do_lower_case) + c_result = cpp_normalize.normalize_characters( + input.view(), + do_lower_case + ) + + return Column.from_libcudf(move(c_result)) + + +cpdef Column normalize_characters(Column input, CharacterNormalizer normalizer): + """ + Normalizes strings characters for tokenizing. + + For details, see :cpp:func:`normalize_characters` + + Parameters + ---------- + input : Column + Input strings + normalizer : CharacterNormalizer + Normalizer object used for modifying the input column text + + Returns + ------- + Column + Normalized strings column + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = cpp_normalize.normalize_characters( + input.view(), + dereference(normalizer.c_obj.get()) + ) return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_normalize.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_normalize.py index 25b6d1389ec..47bbb191be6 100644 --- a/python/pylibcudf/pylibcudf/tests/test_nvtext_normalize.py +++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_normalize.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. 
import pyarrow as pa import pytest @@ -15,7 +15,7 @@ def norm_spaces_input_data(): @pytest.fixture(scope="module") def norm_chars_input_data(): - arr = ["éâîô\teaio", "ĂĆĖÑÜ", "ACENU", "$24.08", "[a,bb]"] + arr = ["éâîô\teaio", "ĂĆĖÑÜ", "ACENU", "$24.08", "[a,bb]", "[pad]"] return pa.array(arr) @@ -29,15 +29,98 @@ def test_normalize_spaces(norm_spaces_input_data): @pytest.mark.parametrize("do_lower", [True, False]) def test_normalize_characters(norm_chars_input_data, do_lower): - result = plc.nvtext.normalize.normalize_characters( + result = plc.nvtext.normalize.characters_normalize( plc.interop.from_arrow(norm_chars_input_data), do_lower, ) - expected = pa.array( - ["eaio eaio", "acenu", "acenu", " $ 24 . 08", " [ a , bb ] "] + if do_lower: + expected = pa.array( + [ + "eaio eaio", + "acenu", + "acenu", + " $ 24 . 08", + " [ a , bb ] ", + " [ pad ] ", + ] + ) + else: + expected = pa.array( + [ + "éâîô eaio", + "ĂĆĖÑÜ", + "ACENU", + " $ 24 . 08", + " [ a , bb ] ", + " [ pad ] ", + ] + ) + assert_column_eq(result, expected) + + +@pytest.mark.parametrize("do_lower", [True, False]) +def test_normalizer(norm_chars_input_data, do_lower): + result = plc.nvtext.normalize.normalize_characters( + plc.interop.from_arrow(norm_chars_input_data), + plc.nvtext.normalize.CharacterNormalizer( + do_lower, + plc.column_factories.make_empty_column(plc.types.TypeId.STRING), + ), + ) + if do_lower: + expected = pa.array( + [ + "eaio eaio", + "acenu", + "acenu", + " $ 24 . 08", + " [ a , bb ] ", + " [ pad ] ", + ] + ) + else: + expected = pa.array( + [ + "éâîô eaio", + "ĂĆĖÑÜ", + "ACENU", + " $ 24 . 
08", + " [ a , bb ] ", + " [ pad ] ", + ] + ) + assert_column_eq(result, expected) + + +@pytest.mark.parametrize("do_lower", [True, False]) +def test_normalizer_with_special_tokens(norm_chars_input_data, do_lower): + special_tokens = pa.array(["[pad]"]) + result = plc.nvtext.normalize.normalize_characters( + plc.interop.from_arrow(norm_chars_input_data), + plc.nvtext.normalize.CharacterNormalizer( + do_lower, plc.interop.from_arrow(special_tokens) + ), ) - if not do_lower: + if do_lower: + expected = pa.array( + [ + "eaio eaio", + "acenu", + "acenu", + " $ 24 . 08", + " [ a , bb ] ", + " [pad] ", + ] + ) + else: expected = pa.array( - ["éâîô eaio", "ĂĆĖÑÜ", "ACENU", " $ 24 . 08", " [ a , bb ] "] + [ + "éâîô eaio", + "ĂĆĖÑÜ", + "ACENU", + " $ 24 . 08", + " [ a , bb ] ", + " [pad] ", + ] ) assert_column_eq(result, expected) From 5eb552754020bed652f3f278a6b5cc494eeb9bce Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 25 Feb 2025 16:15:40 -0800 Subject: [PATCH 06/42] Remove unused var (#18096) The `cython_lib_dir` was removed as part of the switch to publishing a libcudf wheel. 
Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/18096 --- python/cudf/CMakeLists.txt | 4 ---- python/cudf_kafka/CMakeLists.txt | 4 ---- python/pylibcudf/CMakeLists.txt | 4 ---- 3 files changed, 12 deletions(-) diff --git a/python/cudf/CMakeLists.txt b/python/cudf/CMakeLists.txt index 2a17bc5dbb7..090e475471d 100644 --- a/python/cudf/CMakeLists.txt +++ b/python/cudf/CMakeLists.txt @@ -37,7 +37,3 @@ rapids_cython_init() add_subdirectory(cudf/_lib) add_subdirectory(udf_cpp) - -if(DEFINED cython_lib_dir) - rapids_cython_add_rpath_entries(TARGET cudf PATHS "${cython_lib_dir}") -endif() diff --git a/python/cudf_kafka/CMakeLists.txt b/python/cudf_kafka/CMakeLists.txt index 3e12eb6aa41..13b859bc33b 100644 --- a/python/cudf_kafka/CMakeLists.txt +++ b/python/cudf_kafka/CMakeLists.txt @@ -35,7 +35,3 @@ include(rapids-cython-core) rapids_cython_init() add_subdirectory(cudf_kafka/_lib) - -if(DEFINED cython_lib_dir) - rapids_cython_add_rpath_entries(TARGET cudf_kafka PATHS "${cython_lib_dir}") -endif() diff --git a/python/pylibcudf/CMakeLists.txt b/python/pylibcudf/CMakeLists.txt index fe6e73a3f14..153570a4a7e 100644 --- a/python/pylibcudf/CMakeLists.txt +++ b/python/pylibcudf/CMakeLists.txt @@ -37,7 +37,3 @@ include(rapids-cython-core) rapids_cython_init() add_subdirectory(pylibcudf) - -if(DEFINED cython_lib_dir) - rapids_cython_add_rpath_entries(TARGET cudf PATHS "${cython_lib_dir}") -endif() From d8b3d801ec4830102242db1fa60a88e1a0bb7299 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 25 Feb 2025 16:58:14 -0800 Subject: [PATCH 07/42] Fix scatter_by_map with spilling enabled (#18095) closes https://github.com/rapidsai/cudf/issues/18088 Before the old Cython bindings of `columns_split` spill locked the conversion from libcudf to a cudf Python column. 
When I replaced these bindings, this spill locking was removed during the refactor. I'm spot checking that other APIs are not affected. If so I can open PRs for those Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Matthew Murray (https://github.com/Matt711) URL: https://github.com/rapidsai/cudf/pull/18095 --- python/cudf/cudf/core/indexed_frame.py | 6 +++++- python/cudf/cudf/tests/test_spilling.py | 11 ++++++++++- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 9c48b31a309..211d161696e 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -3308,9 +3308,13 @@ def _split(self, splits, keep_index: bool = True) -> list[Self]: splits, ) + @acquire_spill_lock() + def split_from_pylibcudf(split: list[plc.Column]) -> list[ColumnBase]: + return [ColumnBase.from_pylibcudf(col) for col in split] + return [ self._from_columns_like_self( - [ColumnBase.from_pylibcudf(col) for col in split], + split_from_pylibcudf(split), self._column_names, self.index.names if keep_index else None, ) diff --git a/python/cudf/cudf/tests/test_spilling.py b/python/cudf/cudf/tests/test_spilling.py index 13d98e43ddc..08226dd7f6d 100644 --- a/python/cudf/cudf/tests/test_spilling.py +++ b/python/cudf/cudf/tests/test_spilling.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022-2024, NVIDIA CORPORATION. +# Copyright (c) 2022-2025, NVIDIA CORPORATION. 
from __future__ import annotations import contextlib @@ -784,3 +784,12 @@ def test_spilling_and_copy_on_write(manager: SpillManager): assert not a.is_spilled assert a.owner.exposed assert not b.owner.exposed + + +def test_scatter_by_map(): + data = range(10) + with cudf.option_context("spill", True): + df = cudf.DataFrame(data) + result = df.scatter_by_map(data) + for i, res in zip(data, result): + assert_eq(res, cudf.DataFrame([i], index=[i])) From 46b9799ea55b899e08f6b758ec90e9742a72d159 Mon Sep 17 00:00:00 2001 From: "Richard (Rick) Zamora" Date: Tue, 25 Feb 2025 19:14:57 -0600 Subject: [PATCH 08/42] Fix `test_scan_csv_multi` cudf-polars test (#18064) The current implementation of `test_scan_csv_multi` does not work if the compute task is run on distinct worker processes (because it changes directory in lieu of using a proper file path). Authors: - Richard (Rick) Zamora (https://github.com/rjzamora) Approvers: - Peter Andreas Entschev (https://github.com/pentschev) - Matthew Roeschke (https://github.com/mroeschke) URL: https://github.com/rapidsai/cudf/pull/18064 --- python/cudf_polars/tests/test_scan.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/python/cudf_polars/tests/test_scan.py b/python/cudf_polars/tests/test_scan.py index 9c58a24c065..8ff0db084b1 100644 --- a/python/cudf_polars/tests/test_scan.py +++ b/python/cudf_polars/tests/test_scan.py @@ -1,9 +1,7 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. 
# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations -import os - import pytest import polars as pl @@ -203,8 +201,11 @@ def test_scan_csv_multi(tmp_path, filename, glob, nrows_skiprows): f.write("""foo,bar,baz\n1,2,3\n3,4,5""") with (tmp_path / "test*.csv").open("w") as f: f.write("""foo,bar,baz\n1,2,3\n3,4,5""") - os.chdir(tmp_path) - q = pl.scan_csv(filename, glob=glob, n_rows=n_rows, skip_rows=skiprows) + if isinstance(filename, list): + source = [tmp_path / fn for fn in filename] + else: + source = tmp_path / filename + q = pl.scan_csv(source, glob=glob, n_rows=n_rows, skip_rows=skiprows) assert_gpu_result_equal(q) From 72d5792c79f11c90f43c6991dd54e082b3c0ad98 Mon Sep 17 00:00:00 2001 From: "Richard (Rick) Zamora" Date: Wed, 26 Feb 2025 08:14:26 -0600 Subject: [PATCH 09/42] Relax inconsistent schema handling in `dask_cudf.read_parquet` (#17554) Addresses an issue raised offline by @praateekmahajan Authors: - Richard (Rick) Zamora (https://github.com/rjzamora) Approvers: - Tom Augspurger (https://github.com/TomAugspurger) - Mads R. B. Kristensen (https://github.com/madsbk) URL: https://github.com/rapidsai/cudf/pull/17554 --- .../dask_cudf/dask_cudf/_legacy/io/parquet.py | 18 +++---- .../dask_cudf/io/tests/test_parquet.py | 48 ++++++++++++++++++- 2 files changed, 53 insertions(+), 13 deletions(-) diff --git a/python/dask_cudf/dask_cudf/_legacy/io/parquet.py b/python/dask_cudf/dask_cudf/_legacy/io/parquet.py index c0792663c7e..c0b9d71653c 100644 --- a/python/dask_cudf/dask_cudf/_legacy/io/parquet.py +++ b/python/dask_cudf/dask_cudf/_legacy/io/parquet.py @@ -434,18 +434,12 @@ def set_object_dtypes_from_pa_schema(df, schema): # pyarrow schema. if schema: for col_name, col in df._data.items(): - if col_name is None: - # Pyarrow cannot handle `None` as a field name. 
- # However, this should be a simple range index that - # we can ignore anyway - continue - typ = cudf_dtype_from_pa_type(schema.field(col_name).type) - if ( - col_name in schema.names - and not isinstance(typ, (cudf.ListDtype, cudf.StructDtype)) - and isinstance(col, cudf.core.column.StringColumn) - ): - df._data[col_name] = col.astype(typ) + if col_name in schema.names: + typ = cudf_dtype_from_pa_type(schema.field(col_name).type) + if not isinstance( + typ, (cudf.ListDtype, cudf.StructDtype) + ) and isinstance(col, cudf.core.column.StringColumn): + df._data[col_name] = col.astype(typ) to_parquet = dd.to_parquet diff --git a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py index 9f7031f4d2a..3a88668e6d2 100644 --- a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py +++ b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py @@ -6,6 +6,7 @@ import numpy as np import pandas as pd +import pyarrow as pa import pytest import dask @@ -486,6 +487,52 @@ def test_create_metadata_file_inconsistent_schema(tmpdir): dd.assert_eq(ddf1.compute(), ddf2.compute()) +@pytest.mark.parametrize("specify_schema", [True, False]) +def test_read_inconsistent_schema(tmpdir, specify_schema): + if specify_schema: + # If we specify the expected schema, + # we also need to specify the partitioning. 
+ kwargs = { + "dataset": { + "schema": pa.schema( + [ + ("id", pa.int64()), + ("text", pa.string()), + ("meta1", pa.struct([("field1", pa.string())])), + ] + ), + "partitioning": None, + }, + } + else: + kwargs = {} + + records = [ + {"id": 123, "text": "foo"}, + { + "text": "bar", + "meta1": [{"field1": "cat"}], + "id": 456, + }, + ] + columns = ["text", "id"] + pd.DataFrame(records[:1]).to_parquet(tmpdir / "part.0.parquet") + pd.DataFrame(records[1:]).to_parquet(tmpdir / "part.1.parquet") + # Check that cuDF and Dask cuDF match + dd.assert_eq( + cudf.read_parquet( + tmpdir, columns=columns, allow_mismatched_pq_schemas=True + ), + dask_cudf.read_parquet(tmpdir, columns=columns, **kwargs), + check_index=False, + ) + # Check that "pandas" and "cudf" backends match + dd.assert_eq( + dd.read_parquet(tmpdir, columns=columns), + dask_cudf.read_parquet(tmpdir, columns=columns, **kwargs), + ) + + @pytest.mark.parametrize( "data", [ @@ -526,7 +573,6 @@ def test_cudf_list_struct_write(tmpdir): def test_null_partition(tmpdir): - import pyarrow as pa from pyarrow.dataset import HivePartitioning ids = pd.Series([0, 1, None], dtype="Int64") From e5d866bc68c4762ebd6e3e888e4abeaf4ccd9302 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 26 Feb 2025 07:01:27 -0800 Subject: [PATCH 10/42] Short circuit Index.equal if compared Index isn't same type (#18067) closes https://github.com/rapidsai/cudf/issues/8689 Before, comparing two different Index subclasses would execute a GPU kernel when we know they wouldn't be equal (e.g. DatetimeIndex equals RangeIndex). This PR add a short circuit clause to check that we are comparing the same subclasses. Also ensures we don't return a `np.bool_` object from this result. 
Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/18067 --- python/cudf/cudf/core/column/column.py | 2 +- python/cudf/cudf/core/index.py | 9 +++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 06dc4058115..67a0aa7a781 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -713,7 +713,7 @@ def all(self, skipna: bool = True) -> bool: # is empty. if self.null_count == self.size: return True - return self.reduce("all") + return bool(self.reduce("all")) def any(self, skipna: bool = True) -> bool: # Early exit for fast cases. diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 1730a692dc1..f4e5f6e96ae 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -1286,6 +1286,15 @@ def equals(self, other) -> bool: elif other_is_categorical and not self_is_categorical: self = self.astype(other.dtype) check_dtypes = True + elif ( + not self_is_categorical + and not other_is_categorical + and not isinstance(other, RangeIndex) + and not isinstance(self, type(other)) + ): + # Can compare Index to CategoricalIndex or RangeIndex + # Other comparisons are invalid + return False try: return self._column.equals( From 1a8d6368405fac3c5e55592fef2d9259081b045c Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 26 Feb 2025 11:00:27 -0800 Subject: [PATCH 11/42] Enforce deprecation of dtype parameter in sum/product (#18070) xref https://github.com/rapidsai/cudf/pull/16313 Deprecated in 24.08 Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/18070 --- python/cudf/cudf/core/column/column.py | 20 
+++++--------------- python/cudf/cudf/core/column/timedelta.py | 3 +-- python/cudf/cudf/core/indexed_frame.py | 8 -------- python/cudf/cudf/tests/test_reductions.py | 8 -------- 4 files changed, 6 insertions(+), 33 deletions(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 67a0aa7a781..b57d1f03981 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -2,7 +2,6 @@ from __future__ import annotations -import warnings from collections import abc from collections.abc import MutableSequence, Sequence from functools import cached_property @@ -1946,8 +1945,7 @@ def _reduce( skipna=skipna, min_count=min_count ) if isinstance(preprocessed, ColumnBase): - dtype = kwargs.pop("dtype", None) - return preprocessed.reduce(op, dtype, **kwargs) + return preprocessed.reduce(op, **kwargs) return preprocessed def _can_return_nan(self, skipna: bool | None = None) -> bool: @@ -2110,16 +2108,8 @@ def scan(self, scan_op: str, inclusive: bool, **kwargs) -> Self: ) ) - def reduce(self, reduction_op: str, dtype=None, **kwargs) -> ScalarLike: - if dtype is not None: - warnings.warn( - "dtype is deprecated and will be remove in a future release. " - "Cast the result (e.g. 
.astype) after the operation instead.", - FutureWarning, - ) - col_dtype = dtype - else: - col_dtype = self._reduction_result_dtype(reduction_op) + def reduce(self, reduction_op: str, **kwargs) -> ScalarLike: + col_dtype = self._reduction_result_dtype(reduction_op) # check empty case if len(self) <= self.null_count: @@ -2148,7 +2138,7 @@ def reduce(self, reduction_op: str, dtype=None, **kwargs) -> ScalarLike: }: scale = -plc_scalar.type().scale() # https://docs.microsoft.com/en-us/sql/t-sql/data-types/precision-scale-and-length-transact-sql - p = col_dtype.precision + p = col_dtype.precision # type: ignore[union-attr] nrows = len(self) if reduction_op in {"min", "max"}: new_p = p @@ -2162,7 +2152,7 @@ def reduce(self, reduction_op: str, dtype=None, **kwargs) -> ScalarLike: raise NotImplementedError( f"{reduction_op} not implemented for decimal types." ) - precision = max(min(new_p, col_dtype.MAX_PRECISION), 0) + precision = max(min(new_p, col_dtype.MAX_PRECISION), 0) # type: ignore[union-attr] new_dtype = type(col_dtype)(precision, scale) result_col = result_col.astype(new_dtype) elif isinstance(col_dtype, IntervalDtype): diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index 1cbbac0f8cc..8b0ef9f0cc8 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -452,14 +452,13 @@ def sum( self, skipna: bool | None = None, min_count: int = 0, - dtype: Dtype | None = None, ) -> pd.Timedelta: return pd.Timedelta( # Since sum isn't overridden in Numerical[Base]Column, mypy only # sees the signature from Reducible (which doesn't have the extra # parameters from ColumnBase._reduce) so we have to ignore this. 
self.astype(np.dtype(np.int64)).sum( # type: ignore - skipna=skipna, min_count=min_count, dtype=dtype + skipna=skipna, min_count=min_count ), unit=self.time_unit, ).as_unit(self.time_unit) diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 211d161696e..9d426ad6bf7 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -1328,7 +1328,6 @@ def sum( self, axis=no_default, skipna=True, - dtype=None, numeric_only=False, min_count=0, **kwargs, @@ -1342,8 +1341,6 @@ def sum( Axis for the function to be applied on. skipna: bool, default True Exclude NA/null values when computing the result. - dtype: data type - Data type to cast the result to. numeric_only : bool, default False If True, includes only float, int, boolean columns. If False, will raise error in-case there are @@ -1373,7 +1370,6 @@ def sum( "sum", axis=axis, skipna=skipna, - dtype=dtype, numeric_only=numeric_only, min_count=min_count, **kwargs, @@ -1384,7 +1380,6 @@ def product( self, axis=no_default, skipna=True, - dtype=None, numeric_only=False, min_count=0, **kwargs, @@ -1398,8 +1393,6 @@ def product( Axis for the function to be applied on. skipna: bool, default True Exclude NA/null values when computing the result. - dtype: data type - Data type to cast the result to. numeric_only : bool, default False If True, includes only float, int, boolean columns. 
If False, will raise error in-case there are @@ -1432,7 +1425,6 @@ def product( "prod" if axis in {1, "columns"} else "product", axis=axis, skipna=skipna, - dtype=dtype, numeric_only=numeric_only, min_count=min_count, **kwargs, diff --git a/python/cudf/cudf/tests/test_reductions.py b/python/cudf/cudf/tests/test_reductions.py index 80ffce9e8be..75e38b9246a 100644 --- a/python/cudf/cudf/tests/test_reductions.py +++ b/python/cudf/cudf/tests/test_reductions.py @@ -512,14 +512,6 @@ def test_reduction_column_multiindex(): assert_eq(result, expected) -@pytest.mark.parametrize("op", ["sum", "product"]) -def test_dtype_deprecated(op): - ser = cudf.Series(range(5)) - with pytest.warns(FutureWarning): - result = getattr(ser, op)(dtype=np.dtype(np.int8)) - assert isinstance(result, np.int8) - - @pytest.mark.parametrize( "columns", [pd.RangeIndex(2), pd.Index([0, 1], dtype="int8")] ) From 54e740af7a08b99cca84f4f668886031a2c36e71 Mon Sep 17 00:00:00 2001 From: MithunR Date: Wed, 26 Feb 2025 11:12:10 -0800 Subject: [PATCH 12/42] Remove static column vectors from window function tests. (#18099) Fixes #18079. This commit fixes the failures reported in #18079, where the use of static column vector objects in the tests causes the use of a CUDA runtime context before it's been initialized, causing the tests to fail with: ``` parallel_for failed: cudaErrorInvalidResourceHandle: invalid resource handle ``` The solution is to switch the static column vectors to runtime, as a member of the test utility class `rolling_runner`. 
Authors: - MithunR (https://github.com/mythrocks) Approvers: - Bradley Dice (https://github.com/bdice) - David Wendt (https://github.com/davidwendt) URL: https://github.com/rapidsai/cudf/pull/18099 --- cpp/tests/rolling/offset_row_window_test.cpp | 28 +++++++++++--------- 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/cpp/tests/rolling/offset_row_window_test.cpp b/cpp/tests/rolling/offset_row_window_test.cpp index dcaa47e722b..4477ca388df 100644 --- a/cpp/tests/rolling/offset_row_window_test.cpp +++ b/cpp/tests/rolling/offset_row_window_test.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2024, NVIDIA CORPORATION. + * Copyright (c) 2021-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -43,18 +43,21 @@ auto constexpr null = int32_t{0}; // NULL representation for int32_t; auto no_nulls_list() { return nulls_at({}); } struct OffsetRowWindowTest : public cudf::test::BaseFixture { - static ints_column const _keys; // {0, 0, 0, 0, 0, 0, 1, 1, 1, 1}; - static ints_column const _values; // {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; - struct rolling_runner { cudf::window_bounds _preceding, _following; cudf::size_type _min_periods; bool _grouped = true; + ints_column const _keys; // {0, 0, 0, 0, 0, 0, 1, 1, 1, 1}; + ints_column const _values; // {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; rolling_runner(cudf::window_bounds const& preceding, cudf::window_bounds const& following, cudf::size_type min_periods_ = 1) - : _preceding{preceding}, _following{following}, _min_periods{min_periods_} + : _preceding{preceding}, + _following{following}, + _min_periods{min_periods_}, + _keys{0, 0, 0, 0, 0, 0, 1, 1, 1, 1}, + _values{0, 1, 2, 3, 4, 5, 6, 7, 8, 9} { } @@ -80,9 +83,6 @@ struct OffsetRowWindowTest : public cudf::test::BaseFixture { }; }; -ints_column const OffsetRowWindowTest::_keys{0, 0, 0, 0, 0, 0, 1, 1, 1, 1}; -ints_column const OffsetRowWindowTest::_values{0, 1, 2, 3, 4, 
5, 6, 7, 8, 9}; - auto const AGG_COUNT_NON_NULL = cudf::make_count_aggregation(cudf::null_policy::EXCLUDE); auto const AGG_COUNT_ALL = @@ -96,7 +96,8 @@ TEST_F(OffsetRowWindowTest, OffsetRowWindow_Grouped_3_to_Minus_1) { auto const preceding = cudf::window_bounds::get(3); auto const following = cudf::window_bounds::get(-1); - auto run_rolling = rolling_runner{preceding, following}.min_periods(1).grouped(true); + auto run_rolling = rolling_runner{preceding, following}; + run_rolling.min_periods(1).grouped(true); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*run_rolling(*AGG_COUNT_NON_NULL), ints_column{{0, 1, 2, 2, 2, 2, 0, 1, 2, 2}, nulls_at({0, 6})}); @@ -136,7 +137,8 @@ TEST_F(OffsetRowWindowTest, OffsetRowWindow_Ungrouped_3_to_Minus_1) { auto const preceding = cudf::window_bounds::get(3); auto const following = cudf::window_bounds::get(-1); - auto run_rolling = rolling_runner{preceding, following}.min_periods(1).grouped(false); + auto run_rolling = rolling_runner{preceding, following}; + run_rolling.min_periods(1).grouped(false); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*run_rolling(*AGG_COUNT_NON_NULL), ints_column{{0, 1, 2, 2, 2, 2, 2, 2, 2, 2}, nulls_at({0})}); @@ -176,7 +178,8 @@ TEST_F(OffsetRowWindowTest, OffsetRowWindow_Grouped_0_to_2) { auto const preceding = cudf::window_bounds::get(0); auto const following = cudf::window_bounds::get(2); - auto run_rolling = rolling_runner{preceding, following}.min_periods(1).grouped(true); + auto run_rolling = rolling_runner{preceding, following}; + run_rolling.min_periods(1).grouped(true); CUDF_TEST_EXPECT_COLUMNS_EQUAL( *run_rolling(*AGG_COUNT_NON_NULL), @@ -219,7 +222,8 @@ TEST_F(OffsetRowWindowTest, OffsetRowWindow_Ungrouped_0_to_2) { auto const preceding = cudf::window_bounds::get(0); auto const following = cudf::window_bounds::get(2); - auto run_rolling = rolling_runner{preceding, following}.min_periods(1).grouped(false); + auto run_rolling = rolling_runner{preceding, following}; + run_rolling.min_periods(1).grouped(false); 
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*run_rolling(*AGG_COUNT_NON_NULL), ints_column{{2, 2, 2, 2, 2, 2, 2, 2, 1, null}, nulls_at({9})}); From 79d0b75a5327f72cdc14297885257a8979bdf0f2 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 26 Feb 2025 12:01:22 -0800 Subject: [PATCH 13/42] Align StringColumn constructor with ColumnBase base class (#18086) With this PR, the constructors of all subclasses of `ColumnBase` are aligned. This will allow us to, in the future, more easily align on an interface for a public `ColumnBase` that other libraries can use to extend cudf. Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/18086 --- python/cudf/cudf/core/column/column.py | 9 +++++---- python/cudf/cudf/core/column/string.py | 24 ++++++++++++++---------- python/cudf/cudf/tests/test_string.py | 10 +++++++++- 3 files changed, 28 insertions(+), 15 deletions(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index b57d1f03981..89ac39b2be5 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -2312,13 +2312,14 @@ def build_column( offset=offset, null_count=null_count, ) - elif dtype.type in (np.object_, np.str_): + elif dtype == CUDF_STRING_DTYPE: return cudf.core.column.StringColumn( - data=data, - mask=mask, + data=data, # type: ignore[arg-type] size=size, + dtype=dtype, + mask=mask, offset=offset, - children=children, + children=children, # type: ignore[arg-type] null_count=null_count, ) elif isinstance(dtype, ListDtype): diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index c0ad33ec7d6..28e8b98edfe 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -21,7 +21,7 @@ import cudf.core.column.datetime as datetime from cudf.api.types import 
is_integer, is_scalar, is_string_dtype from cudf.core._internals import binaryop -from cudf.core.buffer import acquire_spill_lock +from cudf.core.buffer import Buffer, acquire_spill_lock from cudf.core.column.column import ColumnBase from cudf.core.column.methods import ColumnMethods from cudf.core.scalar import pa_scalar_to_plc_scalar @@ -46,7 +46,6 @@ ScalarLike, SeriesOrIndex, ) - from cudf.core.buffer import Buffer from cudf.core.column.lists import ListColumn from cudf.core.column.numerical import NumericalColumn @@ -5595,13 +5594,14 @@ class StringColumn(column.ColumnBase): Parameters ---------- + data : Buffer + Buffer of the string data mask : Buffer The validity mask offset : int Data offset children : Tuple[Column] - Two non-null columns containing the string data and offsets - respectively + Columns containing the offsets """ _start_offset: int | None @@ -5629,14 +5629,20 @@ class StringColumn(column.ColumnBase): def __init__( self, - data: Buffer | None = None, + data: Buffer, + size: int | None, + dtype: np.dtype, mask: Buffer | None = None, - size: int | None = None, # TODO: make non-optional offset: int = 0, null_count: int | None = None, - children: tuple["column.ColumnBase", ...] 
= (), + children: tuple[column.ColumnBase] = (), # type: ignore[assignment] ): - dtype = cudf.api.types.dtype("object") + if not isinstance(data, Buffer): + raise ValueError("data must be a Buffer") + if dtype != CUDF_STRING_DTYPE: + raise ValueError(f"dtypy must be {CUDF_STRING_DTYPE}") + if len(children) > 1: + raise ValueError("StringColumn must have at most 1 offset column.") if size is None: for child in children: @@ -5731,8 +5737,6 @@ def base_size(self) -> int: # override for string column @property def data(self): - if self.base_data is None: - return None if self._data is None: if ( self.offset == 0 diff --git a/python/cudf/cudf/tests/test_string.py b/python/cudf/cudf/tests/test_string.py index 164fcb06624..18aee0001c4 100644 --- a/python/cudf/cudf/tests/test_string.py +++ b/python/cudf/cudf/tests/test_string.py @@ -13,8 +13,11 @@ import pyarrow as pa import pytest +import rmm + import cudf from cudf import concat +from cudf.core.buffer import as_buffer from cudf.core.column.string import StringColumn from cudf.core.index import Index from cudf.testing import assert_eq @@ -1202,7 +1205,12 @@ def test_string_misc_name(ps_gs, name): def test_string_no_children_properties(): - empty_col = StringColumn(children=()) + empty_col = StringColumn( + as_buffer(rmm.DeviceBuffer(size=0)), + size=0, + dtype=np.dtype("object"), + children=(), + ) assert empty_col.base_children == () assert empty_col.base_size == 0 From aa7f436bdc22fb5b25903252c437e32fbc8b33c0 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 26 Feb 2025 18:55:25 -0800 Subject: [PATCH 14/42] Allow pivot_table to accept single label index and column arguments (#18115) closes https://github.com/rapidsai/cudf/issues/12410 closes https://github.com/rapidsai/cudf/issues/12409 The fix just mirrors the pandas logic. 
Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/18115 --- python/cudf/cudf/core/reshape.py | 20 +++++++++----------- python/cudf/cudf/tests/test_reshape.py | 19 +++++++++++++++++++ 2 files changed, 28 insertions(+), 11 deletions(-) diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py index c5d2fd349e9..7d76907916f 100644 --- a/python/cudf/cudf/core/reshape.py +++ b/python/cudf/cudf/core/reshape.py @@ -1526,9 +1526,9 @@ def pivot_table( ---------- data : DataFrame values : column name or list of column names to aggregate, optional - index : list of column names + index : scalar or list of column names Values to group by in the rows. - columns : list of column names + columns : scalar or list of column names Values to group by in the columns. aggfunc : str or dict, default "mean" If dict is passed, the key is column to aggregate @@ -1562,6 +1562,11 @@ def pivot_table( if sort is not True: raise NotImplementedError("sort is not supported yet") + if is_scalar(index): + index = [index] + if is_scalar(columns): + columns = [columns] + keys = index + columns values_passed = values is not None @@ -1620,15 +1625,8 @@ def pivot_table( table = table.fillna(fill_value) # discard the top level - if values_passed and not values_multi and table._data.multiindex: - column_names = table._data.level_names[1:] - table_columns = tuple( - map(lambda column: column[1:], table._column_names) - ) - table.columns = pd.MultiIndex.from_tuples( - tuples=table_columns, names=column_names - ) - + if values_passed and not values_multi and table._data.nlevels > 1: + table.columns = table._data.to_pandas_index.droplevel(0) if len(index) == 0 and len(columns) > 0: table = table.T diff --git a/python/cudf/cudf/tests/test_reshape.py b/python/cudf/cudf/tests/test_reshape.py index 7fbe072dde7..eae73e47955 100644 --- a/python/cudf/cudf/tests/test_reshape.py +++ 
b/python/cudf/cudf/tests/test_reshape.py @@ -798,6 +798,25 @@ def test_dataframe_pivot_table_simple(aggfunc, fill_value): assert_eq(expected, actual, check_dtype=False) +@pytest.mark.parametrize("index", ["A", ["A"]]) +@pytest.mark.parametrize("columns", ["C", ["C"]]) +def test_pivot_table_scalar_index_columns(index, columns): + data = { + "A": ["one", "one", "two", "three"] * 6, + "B": ["A", "B", "C"] * 8, + "C": ["foo", "foo", "foo", "bar", "bar", "bar"] * 4, + "D": range(24), + "E": range(24), + } + result = cudf.DataFrame(data).pivot_table( + values="D", index=index, columns=columns, aggfunc="sum" + ) + expected = pd.DataFrame(data).pivot_table( + values="D", index=index, columns=columns, aggfunc="sum" + ) + assert_eq(result, expected) + + def test_crosstab_simple(): a = np.array( [ From 7713bc1e8a339644815421b442abd6f91e04e15b Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 26 Feb 2025 19:06:55 -0800 Subject: [PATCH 15/42] Simplify DecimalDtype and DecimalColumn operations (#18111) Broken off (the non-breaking parts) from https://github.com/rapidsai/cudf/pull/18035/ as that PR will probably not move forward since it would require a pyarrow minimum version bump to 19 Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/18111 --- docs/cudf/source/conf.py | 1 + python/cudf/cudf/core/column/decimal.py | 30 +++++----------- python/cudf/cudf/core/column/timedelta.py | 4 ++- python/cudf/cudf/core/dtypes.py | 43 ++++++++++++----------- python/cudf/cudf/core/scalar.py | 6 ++-- 5 files changed, 37 insertions(+), 47 deletions(-) diff --git a/docs/cudf/source/conf.py b/docs/cudf/source/conf.py index c74da8d0ca9..8eea644363b 100644 --- a/docs/cudf/source/conf.py +++ b/docs/cudf/source/conf.py @@ -585,6 +585,7 @@ def on_missing_reference(app, env, node, contnode): ("py:class", "pd.DataFrame"), ("py:class", 
"pandas.core.indexes.frozen.FrozenList"), ("py:class", "pa.Array"), + ("py:class", "pa.Decimal128Type"), ("py:class", "ScalarLike"), ("py:class", "ParentType"), ("py:class", "pyarrow.lib.DataType"), diff --git a/python/cudf/cudf/core/column/decimal.py b/python/cudf/cudf/core/column/decimal.py index 3c603c8e6ef..8db6f805bce 100644 --- a/python/cudf/cudf/core/column/decimal.py +++ b/python/cudf/cudf/core/column/decimal.py @@ -13,7 +13,6 @@ import pylibcudf as plc import cudf -from cudf.api.types import is_scalar from cudf.core._internals import binaryop from cudf.core.buffer import acquire_spill_lock, as_buffer from cudf.core.column.column import ColumnBase @@ -73,11 +72,8 @@ def __cuda_array_interface__(self): def as_decimal_column( self, dtype: Dtype, - ) -> "DecimalBaseColumn": - if ( - isinstance(dtype, cudf.core.dtypes.DecimalDtype) - and dtype.scale < self.dtype.scale - ): + ) -> DecimalBaseColumn: + if isinstance(dtype, DecimalDtype) and dtype.scale < self.dtype.scale: warnings.warn( "cuDF truncates when downcasting decimals to a lower scale. " "To round, use Series.round() or DataFrame.round()." @@ -204,22 +200,17 @@ def normalize_binop_value(self, other) -> Self | cudf.Scalar: other = other.astype(self.dtype) return other if isinstance(other, cudf.Scalar) and isinstance( - # TODO: Should it be possible to cast scalars of other numerical - # types to decimal? other.dtype, - cudf.core.dtypes.DecimalDtype, + DecimalDtype, ): + # TODO: Should it be possible to cast scalars of other numerical + # types to decimal? 
if _same_precision_and_scale(self.dtype, other.dtype): other = other.astype(self.dtype) return other - elif is_scalar(other) and isinstance(other, (int, Decimal)): - other = Decimal(other) - metadata = other.as_tuple() - precision = max(len(metadata.digits), metadata.exponent) - scale = -cast(int, metadata.exponent) - return cudf.Scalar( - other, dtype=self.dtype.__class__(precision, scale) - ) + elif isinstance(other, (int, Decimal)): + dtype = self.dtype._from_decimal(Decimal(other)) + return cudf.Scalar(other, dtype=dtype) return NotImplemented def as_numerical_column( @@ -373,11 +364,6 @@ def __init__( children=children, ) - def __setitem__(self, key, value): - if isinstance(value, np.integer): - value = int(value) - super().__setitem__(key, value) - @classmethod def from_arrow(cls, data: pa.Array): dtype = Decimal64Dtype.from_arrow(data.type) diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index 8b0ef9f0cc8..d02681d389d 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -309,7 +309,9 @@ def total_seconds(self) -> ColumnBase: # https://github.com/rapidsai/cudf/issues/17664 return ( (self.astype(np.dtype(np.int64)) * conversion) - .astype(cudf.Decimal128Dtype(38, 9)) + .astype( + cudf.Decimal128Dtype(cudf.Decimal128Dtype.MAX_PRECISION, 9) + ) .round(decimals=abs(int(math.log10(conversion)))) .astype(np.dtype(np.float64)) ) diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py index 977208f5eb4..ac9c4d23cc2 100644 --- a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -776,35 +776,36 @@ def _recursively_replace_fields(self, result: dict) -> dict: class DecimalDtype(_BaseDtype): _metadata = ("precision", "scale") - def __init__(self, precision, scale=0): + def __init__(self, precision: int, scale: int = 0) -> None: self._validate(precision, scale) - self._typ = pa.decimal128(precision, scale) + self._precision = 
precision + self._scale = scale @property - def str(self): + def str(self) -> str: return f"{self.name!s}({self.precision}, {self.scale})" @property - def precision(self): + def precision(self) -> int: """ The decimal precision, in number of decimal digits (an integer). """ - return self._typ.precision + return self._precision @precision.setter - def precision(self, value): + def precision(self, value: int) -> None: self._validate(value, self.scale) - self._typ = pa.decimal128(precision=value, scale=self.scale) + self._precision = value @property - def scale(self): + def scale(self) -> int: """ The decimal scale (an integer). """ - return self._typ.scale + return self._scale @property - def itemsize(self): + def itemsize(self) -> int: """ Length of one column element in bytes. """ @@ -815,14 +816,14 @@ def type(self): # might need to account for precision and scale here return decimal.Decimal - def to_arrow(self): + def to_arrow(self) -> pa.Decimal128Type: """ Return the equivalent ``pyarrow`` dtype. 
""" - return self._typ + return pa.decimal128(self.precision, self.scale) @classmethod - def from_arrow(cls, typ): + def from_arrow(cls, typ: pa.Decimal128Type) -> Self: """ Construct a cudf decimal dtype from a ``pyarrow`` dtype @@ -856,23 +857,23 @@ def __repr__(self): ) @classmethod - def _validate(cls, precision, scale=0): + def _validate(cls, precision: int, scale: int) -> None: if precision > cls.MAX_PRECISION: raise ValueError( f"Cannot construct a {cls.__name__}" f" with precision > {cls.MAX_PRECISION}" ) if abs(scale) > precision: - raise ValueError(f"scale={scale} exceeds precision={precision}") + raise ValueError(f"{scale=} cannot exceed {precision=}") @classmethod - def _from_decimal(cls, decimal): + def _from_decimal(cls, decimal: decimal.Decimal) -> Self: """ Create a cudf.DecimalDtype from a decimal.Decimal object """ metadata = decimal.as_tuple() - precision = max(len(metadata.digits), -metadata.exponent) - return cls(precision, -metadata.exponent) + precision = max(len(metadata.digits), -metadata.exponent) # type: ignore[operator] + return cls(precision, -metadata.exponent) # type: ignore[operator] def serialize(self) -> tuple[dict, list]: return ( @@ -885,7 +886,7 @@ def serialize(self) -> tuple[dict, list]: ) @classmethod - def deserialize(cls, header: dict, frames: list): + def deserialize(cls, header: dict, frames: list) -> Self: _check_type(cls, header, frames, is_valid_class=issubclass) return cls(header["precision"], header["scale"]) @@ -896,8 +897,8 @@ def __eq__(self, other: Dtype) -> bool: return False return self.precision == other.precision and self.scale == other.scale - def __hash__(self): - return hash(self._typ) + def __hash__(self) -> int: + return hash(self.to_arrow()) @doc_apply( diff --git a/python/cudf/cudf/core/scalar.py b/python/cudf/cudf/core/scalar.py index cf85282cccb..29139768a36 100644 --- a/python/cudf/cudf/core/scalar.py +++ b/python/cudf/cudf/core/scalar.py @@ -85,9 +85,9 @@ def _preprocess_host_value(value, dtype) -> 
tuple[ScalarLike, Dtype]: return value.as_py(), dtype if isinstance(dtype, cudf.core.dtypes.DecimalDtype): - value = pa.scalar( - value, type=pa.decimal128(dtype.precision, dtype.scale) - ).as_py() + if isinstance(value, np.integer): + value = int(value) + value = pa.scalar(value, type=dtype.to_arrow()).as_py() if isinstance(value, decimal.Decimal) and dtype is None: dtype = cudf.Decimal128Dtype._from_decimal(value) From 601d0a10c853ef837c948e536a8b5a11f4cd26ab Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Wed, 26 Feb 2025 21:34:11 -0600 Subject: [PATCH 16/42] Add `as_proxy_object` API to `cudf.pandas` (#18072) This is a public API to proxify true `pandas` or `cudf` objects. Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Bradley Dice (https://github.com/bdice) - Matthew Roeschke (https://github.com/mroeschke) URL: https://github.com/rapidsai/cudf/pull/18072 --- python/cudf/cudf/pandas/__init__.py | 7 +- python/cudf/cudf/pandas/fast_slow_proxy.py | 27 +++++- .../cudf_pandas_tests/test_cudf_pandas.py | 88 +++++++++++++++++++ 3 files changed, 120 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/pandas/__init__.py b/python/cudf/cudf/pandas/__init__.py index 52fc945709e..742a6b57e59 100644 --- a/python/cudf/cudf/pandas/__init__.py +++ b/python/cudf/cudf/pandas/__init__.py @@ -8,12 +8,17 @@ import pylibcudf import rmm.mr -from .fast_slow_proxy import is_proxy_instance, is_proxy_object +from .fast_slow_proxy import ( + as_proxy_object, + is_proxy_instance, + is_proxy_object, +) from .magics import load_ipython_extension from .profiler import Profiler __all__ = [ "Profiler", + "as_proxy_object", "install", "is_proxy_instance", "is_proxy_object", diff --git a/python/cudf/cudf/pandas/fast_slow_proxy.py b/python/cudf/cudf/pandas/fast_slow_proxy.py index 45944452c17..147971e8bee 100644 --- a/python/cudf/cudf/pandas/fast_slow_proxy.py +++ b/python/cudf/cudf/pandas/fast_slow_proxy.py @@ -151,7 +151,7 @@ def make_final_proxy_type( 
additional_attributes Mapping of additional attributes to add to the class (optional), these will override any defaulted attributes (e.g. - ``__init__`). If you want to remove a defaulted attribute + ``__init__``). If you want to remove a defaulted attribute completely, pass the special sentinel ``_DELETE`` as a value. postprocess Optional function called to allow the proxy to postprocess @@ -1335,6 +1335,31 @@ def _get_proxy_base_class(cls): return object +def as_proxy_object(obj: Any) -> Any: + """ + Wraps a cudf or pandas object in a proxy object if applicable. + + There will be no memory transfer, i.e., GPU objects stay on GPU and + CPU objects stay on CPU. The object will be wrapped in a + proxy object. This is useful for ensuring that the object is + compatible with the fast-slow proxy system. + + Parameters + ---------- + obj : Any + The object to wrap. + + Returns + ------- + Any + The wrapped proxy object if applicable, otherwise the original object. + """ + if _is_final_type(obj): + typ = get_final_type_map()[type(obj)] + return typ._fsproxy_wrap(obj, None) + return obj + + def is_proxy_instance(obj, type): return is_proxy_object(obj) and obj.__class__.__name__ == type.__name__ diff --git a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py index 47de8fb1435..d3bfd9298c2 100644 --- a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py +++ b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py @@ -44,6 +44,7 @@ OOMFallbackError, TypeFallbackError, _Unusable, + as_proxy_object, is_proxy_object, ) from cudf.testing import assert_eq @@ -1979,6 +1980,93 @@ def test_numpy_data_access(): assert type(expected) is type(actual) +@pytest.mark.parametrize( + "obj", + [ + pd.DataFrame({"a": [1, 2, 3]}), + pd.Series([1, 2, 3]), + pd.Index([1, 2, 3]), + pd.Categorical([1, 2, 3]), + pd.to_datetime(["2021-01-01", "2021-01-02"]), + pd.to_timedelta(["1 days", "2 days"]), + xpd.DataFrame({"a": [1, 2, 3]}), + xpd.Series([1, 2, 
3]), + xpd.Index([1, 2, 3]), + xpd.Categorical([1, 2, 3]), + xpd.to_datetime(["2021-01-01", "2021-01-02"]), + xpd.to_timedelta(["1 days", "2 days"]), + cudf.DataFrame({"a": [1, 2, 3]}), + cudf.Series([1, 2, 3]), + cudf.Index([1, 2, 3]), + cudf.Index([1, 2, 3], dtype="category"), + cudf.to_datetime(["2021-01-01", "2021-01-02"]), + cudf.Index([1, 2, 3], dtype="timedelta64[ns]"), + [1, 2, 3], + {"a": 1, "b": 2}, + (1, 2, 3), + ], +) +def test_as_proxy_object(obj): + proxy_obj = as_proxy_object(obj) + if isinstance( + obj, + ( + pd.DataFrame, + pd.Series, + pd.Index, + pd.Categorical, + xpd.DataFrame, + xpd.Series, + xpd.Index, + xpd.Categorical, + cudf.DataFrame, + cudf.Series, + cudf.Index, + ), + ): + assert is_proxy_object(proxy_obj) + if isinstance(proxy_obj, xpd.DataFrame): + tm.assert_frame_equal(proxy_obj, xpd.DataFrame(obj)) + elif isinstance(proxy_obj, xpd.Series): + tm.assert_series_equal(proxy_obj, xpd.Series(obj)) + elif isinstance(proxy_obj, xpd.Index): + tm.assert_index_equal(proxy_obj, xpd.Index(obj)) + else: + tm.assert_equal(proxy_obj, obj) + else: + assert not is_proxy_object(proxy_obj) + assert proxy_obj == obj + + +def test_as_proxy_object_doesnot_copy_series(): + s = pd.Series([1, 2, 3]) + proxy_obj = as_proxy_object(s) + s[0] = 10 + assert proxy_obj[0] == 10 + tm.assert_series_equal(s, proxy_obj) + + +def test_as_proxy_object_doesnot_copy_dataframe(): + df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + proxy_obj = as_proxy_object(df) + df.iloc[0, 0] = 10 + assert proxy_obj.iloc[0, 0] == 10 + tm.assert_frame_equal(df, proxy_obj) + + +def test_as_proxy_object_doesnot_copy_index(): + idx = pd.Index([1, 2, 3]) + proxy_obj = as_proxy_object(idx) + assert proxy_obj._fsproxy_wrapped is idx + + +def test_as_proxy_object_no_op_for_intermediates(): + s = pd.Series(["abc", "def", "ghi"]) + str_attr = s.str + proxy_obj = as_proxy_object(str_attr) + assert proxy_obj is str_attr + + def test_pickle_round_trip_proxy_numpy_array(array): arr, proxy_arr = 
array pickled_arr = BytesIO() From 10048b813bc4054c9a092f31194a676e7459e840 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 26 Feb 2025 20:26:23 -0800 Subject: [PATCH 17/42] Make Column.view/can_cast_safely accept a dtype object (#18066) Partially broken off from https://github.com/rapidsai/cudf/pull/17978 Since Column objects are technically private, not marking this as breaking. Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/18066 --- python/cudf/cudf/core/column/categorical.py | 3 +- python/cudf/cudf/core/column/column.py | 10 ++---- python/cudf/cudf/core/column/datetime.py | 3 +- python/cudf/cudf/core/column/string.py | 35 +++++++++------------ python/cudf/cudf/core/column/timedelta.py | 13 +++++--- python/cudf/cudf/tests/test_column.py | 15 ++++++--- python/cudf/cudf/utils/utils.py | 3 +- 7 files changed, 44 insertions(+), 38 deletions(-) diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index a57ff9a7817..d41e448254c 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -36,6 +36,7 @@ ColumnBinaryOperand, ColumnLike, Dtype, + DtypeObj, ScalarLike, SeriesOrIndex, SeriesOrSingleColumnIndex, @@ -1168,7 +1169,7 @@ def _mimic_inplace( self._codes = other_col.codes return out - def view(self, dtype: Dtype) -> ColumnBase: + def view(self, dtype: DtypeObj) -> ColumnBase: raise NotImplementedError( "Categorical column views are not currently supported" ) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 89ac39b2be5..61f4f7d52fb 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -950,7 +950,7 @@ def copy(self, deep: bool = True) -> Self: ), ) - def view(self, dtype: Dtype) -> ColumnBase: + def 
view(self, dtype: DtypeObj) -> ColumnBase: """ View the data underlying a column as different dtype. The source column must divide evenly into the size of @@ -959,13 +959,9 @@ def view(self, dtype: Dtype) -> ColumnBase: Parameters ---------- - dtype : NumPy dtype, string + dtype : Dtype object The dtype to view the data as - """ - - dtype = cudf.dtype(dtype) - if dtype.kind in ("o", "u", "s"): raise TypeError( "Bytes viewed as str without metadata is ambiguous" @@ -1586,7 +1582,7 @@ def distinct_count(self, dropna: bool = True) -> int: self._distinct_count[dropna] = result return self._distinct_count[dropna] - def can_cast_safely(self, to_dtype: Dtype) -> bool: + def can_cast_safely(self, to_dtype: DtypeObj) -> bool: raise NotImplementedError() @acquire_spill_lock() diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index 92d5c39e69d..213e91d7b3f 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -47,6 +47,7 @@ ColumnBinaryOperand, DatetimeLikeScalar, Dtype, + DtypeObj, ScalarLike, ) from cudf.core.column.numerical import NumericalColumn @@ -837,7 +838,7 @@ def is_unique(self) -> bool: def isin(self, values: Sequence) -> ColumnBase: return cudf.core.tools.datetimes._isin_datetimelike(self, values) - def can_cast_safely(self, to_dtype: Dtype) -> bool: + def can_cast_safely(self, to_dtype: DtypeObj) -> bool: if to_dtype.kind == "M": # type: ignore[union-attr] to_res, _ = np.datetime_data(to_dtype) self_res, _ = np.datetime_data(self.dtype) diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 28e8b98edfe..944f5cd6d26 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -43,6 +43,7 @@ ColumnBinaryOperand, ColumnLike, Dtype, + DtypeObj, ScalarLike, SeriesOrIndex, ) @@ -5640,7 +5641,7 @@ def __init__( if not isinstance(data, Buffer): raise ValueError("data must be a Buffer") if 
dtype != CUDF_STRING_DTYPE: - raise ValueError(f"dtypy must be {CUDF_STRING_DTYPE}") + raise ValueError(f"dtype must be {CUDF_STRING_DTYPE}") if len(children) > 1: raise ValueError("StringColumn must have at most 1 offset column.") @@ -5826,23 +5827,22 @@ def __contains__(self, item: ScalarLike) -> bool: other = [item] if is_scalar(item) else item return self.contains(column.as_column(other, dtype=self.dtype)).any() - def as_numerical_column(self, dtype: Dtype) -> NumericalColumn: - out_dtype = cudf.api.types.dtype(dtype) - if out_dtype.kind == "b": + def as_numerical_column(self, dtype: np.dtype) -> NumericalColumn: + if dtype.kind == "b": with acquire_spill_lock(): plc_column = plc.strings.attributes.count_characters( self.to_pylibcudf(mode="read") ) result = ColumnBase.from_pylibcudf(plc_column) return (result > np.int8(0)).fillna(False) - elif out_dtype.kind in {"i", "u"}: + elif dtype.kind in {"i", "u"}: if not self.is_integer().all(): raise ValueError( "Could not convert strings to integer " "type due to presence of non-integer values." 
) cast_func = plc.strings.convert.convert_integers.to_integers - elif out_dtype.kind == "f": + elif dtype.kind == "f": if not self.is_float().all(): raise ValueError( "Could not convert strings to float " @@ -5850,10 +5850,8 @@ def as_numerical_column(self, dtype: Dtype) -> NumericalColumn: ) cast_func = plc.strings.convert.convert_floats.to_floats else: - raise ValueError( - f"dtype must be a numerical type, not {out_dtype}" - ) - plc_dtype = dtype_to_pylibcudf_type(out_dtype) + raise ValueError(f"dtype must be a numerical type, not {dtype}") + plc_dtype = dtype_to_pylibcudf_type(dtype) with acquire_spill_lock(): return type(self).from_pylibcudf( # type: ignore[return-value] cast_func(self.to_pylibcudf(mode="read"), plc_dtype) @@ -5973,17 +5971,15 @@ def to_pandas( else: return super().to_pandas(nullable=nullable, arrow_type=arrow_type) - def can_cast_safely(self, to_dtype: Dtype) -> bool: - to_dtype = cudf.api.types.dtype(to_dtype) - + def can_cast_safely(self, to_dtype: DtypeObj) -> bool: if self.dtype == to_dtype: return True - elif to_dtype.kind in {"i", "u"} and not self.is_integer().all(): - return False - elif to_dtype.kind == "f" and not self.is_float().all(): - return False - else: + elif to_dtype.kind in {"i", "u"} and self.is_integer().all(): + return True + elif to_dtype.kind == "f" and self.is_float().all(): return True + else: + return False def find_and_replace( self, @@ -6122,12 +6118,11 @@ def _binaryop( return NotImplemented @copy_docstring(ColumnBase.view) - def view(self, dtype) -> ColumnBase: + def view(self, dtype: DtypeObj) -> ColumnBase: if self.null_count > 0: raise ValueError( "Can not produce a view of a string column with nulls" ) - dtype = cudf.api.types.dtype(dtype) str_byte_offset = self.base_children[0].element_indexing(self.offset) str_end_byte_offset = self.base_children[0].element_indexing( self.offset + self.size diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index 
d02681d389d..e4d47f492c2 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -28,7 +28,12 @@ if TYPE_CHECKING: from collections.abc import Sequence - from cudf._typing import ColumnBinaryOperand, DatetimeLikeScalar, Dtype + from cudf._typing import ( + ColumnBinaryOperand, + DatetimeLikeScalar, + Dtype, + DtypeObj, + ) _unit_to_nanoseconds_conversion = { "ns": 1, @@ -380,10 +385,10 @@ def find_and_replace( ), ) - def can_cast_safely(self, to_dtype: Dtype) -> bool: - if to_dtype.kind == "m": # type: ignore[union-attr] + def can_cast_safely(self, to_dtype: DtypeObj) -> bool: + if to_dtype.kind == "m": to_res, _ = np.datetime_data(to_dtype) - self_res, _ = np.datetime_data(self.dtype) + self_res = self.time_unit max_int = np.iinfo(np.int64).max diff --git a/python/cudf/cudf/tests/test_column.py b/python/cudf/cudf/tests/test_column.py index 2996a88c171..b7cd2388f30 100644 --- a/python/cudf/cudf/tests/test_column.py +++ b/python/cudf/cudf/tests/test_column.py @@ -290,6 +290,8 @@ def test_column_chunked_array_creation(): ], ) def test_column_view_valid_numeric_to_numeric(data, from_dtype, to_dtype): + from_dtype = np.dtype(from_dtype) + to_dtype = np.dtype(to_dtype) cpu_data = np.asarray(data, dtype=from_dtype) gpu_data = as_column(data, dtype=from_dtype) @@ -314,6 +316,8 @@ def test_column_view_valid_numeric_to_numeric(data, from_dtype, to_dtype): ], ) def test_column_view_invalid_numeric_to_numeric(data, from_dtype, to_dtype): + from_dtype = np.dtype(from_dtype) + to_dtype = np.dtype(to_dtype) cpu_data = np.asarray(data, dtype=from_dtype) gpu_data = as_column(data, dtype=from_dtype) @@ -337,6 +341,7 @@ def test_column_view_invalid_numeric_to_numeric(data, from_dtype, to_dtype): ], ) def test_column_view_valid_string_to_numeric(data, to_dtype): + to_dtype = np.dtype(to_dtype) expect = cudf.Series._from_column(cudf.Series(data)._column.view(to_dtype)) got = cudf.Series(str_host_view(data, to_dtype)) @@ -352,7 +357,7 
@@ def test_column_view_nulls_widths_even(): sr = cudf.Series(data, dtype="int32") expect = cudf.Series(expect_data, dtype="float32") - got = cudf.Series._from_column(sr._column.view("float32")) + got = cudf.Series._from_column(sr._column.view(np.dtype(np.float32))) assert_eq(expect, got) @@ -364,7 +369,7 @@ def test_column_view_nulls_widths_even(): sr = cudf.Series(data, dtype="float64") expect = cudf.Series(expect_data, dtype="int64") - got = cudf.Series._from_column(sr._column.view("int64")) + got = cudf.Series._from_column(sr._column.view(np.dtype(np.int64))) assert_eq(expect, got) @@ -376,7 +381,7 @@ def test_column_view_numeric_slice(slc): expect = cudf.Series(data[slc].view("int64")) got = cudf.Series._from_column( - sr._column.slice(slc.start, slc.stop).view("int64") + sr._column.slice(slc.start, slc.stop).view(np.dtype(np.int64)) ) assert_eq(expect, got) @@ -389,7 +394,9 @@ def test_column_view_string_slice(slc): data = ["a", "bcde", "cd", "efg", "h"] expect = cudf.Series._from_column( - cudf.Series(data)._column.slice(slc.start, slc.stop).view("int8") + cudf.Series(data) + ._column.slice(slc.start, slc.stop) + .view(np.dtype(np.int8)) ) got = cudf.Series(str_host_view(data[slc], "int8")) diff --git a/python/cudf/cudf/utils/utils.py b/python/cudf/cudf/utils/utils.py index fd946937945..2678a4f8116 100644 --- a/python/cudf/cudf/utils/utils.py +++ b/python/cudf/cudf/utils/utils.py @@ -18,9 +18,10 @@ import cudf.api.types from cudf.core import column from cudf.core.buffer import as_buffer +from cudf.utils.dtypes import SIZE_TYPE_DTYPE # The size of the mask in bytes -mask_dtype = cudf.api.types.dtype(np.int32) +mask_dtype = SIZE_TYPE_DTYPE mask_bitsize = mask_dtype.itemsize * 8 # Mapping from ufuncs to the corresponding binary operators. 
From b8ec71a24b4b8a3e3a997f38881ddfedd698610e Mon Sep 17 00:00:00 2001 From: Matthew Murray <41342305+Matt711@users.noreply.github.com> Date: Thu, 27 Feb 2025 06:56:24 -0500 Subject: [PATCH 18/42] Bump polars version to <1.24 (#18076) The PR upgrades the Polars version to 1.23. Authors: - Matthew Murray (https://github.com/Matt711) - GALI PREM SAGAR (https://github.com/galipremsagar) - Bradley Dice (https://github.com/bdice) Approvers: - Matthew Roeschke (https://github.com/mroeschke) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/18076 --- ci/test_cudf_polars_polars_tests.sh | 2 ++ .../environments/all_cuda-118_arch-x86_64.yaml | 2 +- .../environments/all_cuda-128_arch-x86_64.yaml | 2 +- conda/recipes/cudf-polars/meta.yaml | 2 +- dependencies.yaml | 2 +- .../cudf_polars/cudf_polars/testing/plugin.py | 1 - python/cudf_polars/cudf_polars/utils/dtypes.py | 18 ++++++++++++++---- python/cudf_polars/pyproject.toml | 2 +- 8 files changed, 21 insertions(+), 10 deletions(-) diff --git a/ci/test_cudf_polars_polars_tests.sh b/ci/test_cudf_polars_polars_tests.sh index 3466edacfc5..1df7bb61834 100755 --- a/ci/test_cudf_polars_polars_tests.sh +++ b/ci/test_cudf_polars_polars_tests.sh @@ -26,6 +26,8 @@ git clone https://github.com/pola-rs/polars.git --branch "${TAG}" --depth 1 # Install requirements for running polars tests rapids-logger "Install polars test requirements" +# TODO: Remove sed command when polars-cloud supports 1.23 +sed -i '/^polars-cloud$/d' polars/py-polars/requirements-dev.txt rapids-pip-retry install -r polars/py-polars/requirements-dev.txt -r polars/py-polars/requirements-ci.txt # shellcheck disable=SC2317 diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index e7dbb765099..a23981b4e72 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -66,7 +66,7 @@ dependencies: - pandas - 
pandas>=2.0,<2.2.4dev0 - pandoc -- polars>=1.20,<1.23 +- polars>=1.20,<1.24 - pre-commit - ptxcompiler - pyarrow>=14.0.0,<20.0.0a0 diff --git a/conda/environments/all_cuda-128_arch-x86_64.yaml b/conda/environments/all_cuda-128_arch-x86_64.yaml index 342ec8d4b59..e2b9302dc36 100644 --- a/conda/environments/all_cuda-128_arch-x86_64.yaml +++ b/conda/environments/all_cuda-128_arch-x86_64.yaml @@ -64,7 +64,7 @@ dependencies: - pandas - pandas>=2.0,<2.2.4dev0 - pandoc -- polars>=1.20,<1.23 +- polars>=1.20,<1.24 - pre-commit - pyarrow>=14.0.0,<20.0.0a0 - pydata-sphinx-theme>=0.15.4 diff --git a/conda/recipes/cudf-polars/meta.yaml b/conda/recipes/cudf-polars/meta.yaml index 1d36ab2a3e4..64a147d3c63 100644 --- a/conda/recipes/cudf-polars/meta.yaml +++ b/conda/recipes/cudf-polars/meta.yaml @@ -43,7 +43,7 @@ requirements: run: - python - pylibcudf ={{ version }} - - polars >=1.20,<1.23 + - polars >=1.20,<1.24 - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }} test: diff --git a/dependencies.yaml b/dependencies.yaml index c7869eee922..1578dadc793 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -813,7 +813,7 @@ dependencies: common: - output_types: [conda, requirements, pyproject] packages: - - polars>=1.20,<1.23 + - polars>=1.20,<1.24 run_cudf_polars_experimental: common: - output_types: [conda, requirements, pyproject] diff --git a/python/cudf_polars/cudf_polars/testing/plugin.py b/python/cudf_polars/cudf_polars/testing/plugin.py index a7b10a6e8fa..9b798688992 100644 --- a/python/cudf_polars/cudf_polars/testing/plugin.py +++ b/python/cudf_polars/cudf_polars/testing/plugin.py @@ -197,7 +197,6 @@ def pytest_configure(config: pytest.Config) -> None: "tests/unit/io/test_multiscan.py::test_include_file_paths[scan_csv-write_csv]": "Need to expose include_file_paths xref: cudf#18012", "tests/unit/streaming/test_streaming_io.py::test_parquet_eq_statistics[False]": "Debug output on stderr doesn't match", # Maybe flaky, order-dependent? 
- "tests/unit/test_projections.py::test_schema_full_outer_join_projection_pd_13287": "Order-specific result check, query is correct but in different order", "tests/unit/test_queries.py::test_group_by_agg_equals_zero_3535": "libcudf sums all nulls to null, not zero", } diff --git a/python/cudf_polars/cudf_polars/utils/dtypes.py b/python/cudf_polars/cudf_polars/utils/dtypes.py index 6bb5d78c488..85a4f007cf0 100644 --- a/python/cudf_polars/cudf_polars/utils/dtypes.py +++ b/python/cudf_polars/cudf_polars/utils/dtypes.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. # SPDX-License-Identifier: Apache-2.0 """Datatype utilities.""" @@ -71,7 +71,9 @@ def can_cast(from_: plc.DataType, to: plc.DataType) -> bool: ------- True if casting is supported, False otherwise """ - has_empty = from_.id() == plc.TypeId.EMPTY or to.id() == plc.TypeId.EMPTY + to_is_empty = to.id() == plc.TypeId.EMPTY + from_is_empty = from_.id() == plc.TypeId.EMPTY + has_empty = to_is_empty or from_is_empty return ( ( from_ == to @@ -84,8 +86,16 @@ def can_cast(from_: plc.DataType, to: plc.DataType) -> bool: ) ) ) - or (from_.id() == plc.TypeId.STRING and is_numeric_not_bool(to)) - or (to.id() == plc.TypeId.STRING and is_numeric_not_bool(from_)) + or ( + from_.id() == plc.TypeId.STRING + and not to_is_empty + and is_numeric_not_bool(to) + ) + or ( + to.id() == plc.TypeId.STRING + and not from_is_empty + and is_numeric_not_bool(from_) + ) ) diff --git a/python/cudf_polars/pyproject.toml b/python/cudf_polars/pyproject.toml index 9026a0c29ca..e9fc054efc2 100644 --- a/python/cudf_polars/pyproject.toml +++ b/python/cudf_polars/pyproject.toml @@ -19,7 +19,7 @@ authors = [ license = { text = "Apache 2.0" } requires-python = ">=3.10" dependencies = [ - "polars>=1.20,<1.23", + "polars>=1.20,<1.24", "pylibcudf==25.4.*,>=0.0.0a0", ] # This list was generated by 
`rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. classifiers = [ From 25f17ad02615afd7cbb9ee2784de392f6e0c7a66 Mon Sep 17 00:00:00 2001 From: Matthew Murray <41342305+Matt711@users.noreply.github.com> Date: Thu, 27 Feb 2025 13:13:23 -0500 Subject: [PATCH 19/42] Make pylibcudf traits raise exceptions gracefully rather than terminating in C++ (#18117) Closes #18110 Authors: - Matthew Murray (https://github.com/Matt711) Approvers: - Matthew Roeschke (https://github.com/mroeschke) URL: https://github.com/rapidsai/cudf/pull/18117 --- .../pylibcudf/libcudf/utilities/traits.pxd | 40 +++++++++---------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/python/pylibcudf/pylibcudf/libcudf/utilities/traits.pxd b/python/pylibcudf/pylibcudf/libcudf/utilities/traits.pxd index 93f13a7e11f..33749141590 100644 --- a/python/pylibcudf/pylibcudf/libcudf/utilities/traits.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/utilities/traits.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. 
from libcpp cimport bool from libcpp.vector cimport vector from pylibcudf.exception_handler cimport libcudf_exception_handler @@ -6,22 +6,22 @@ from pylibcudf.libcudf.types cimport data_type cdef extern from "cudf/utilities/traits.hpp" namespace "cudf" nogil: - cdef bool is_relationally_comparable(data_type) - cdef bool is_equality_comparable(data_type) - cdef bool is_numeric(data_type) - cdef bool is_numeric_not_bool(data_type) - cdef bool is_index_type(data_type) - cdef bool is_unsigned(data_type) - cdef bool is_integral(data_type) - cdef bool is_integral_not_bool(data_type) - cdef bool is_floating_point(data_type) - cdef bool is_boolean(data_type) - cdef bool is_timestamp(data_type) - cdef bool is_fixed_point(data_type) - cdef bool is_duration(data_type) - cdef bool is_chrono(data_type) - cdef bool is_dictionary(data_type) - cdef bool is_fixed_width(data_type) - cdef bool is_compound(data_type) - cdef bool is_nested(data_type) - cdef bool is_bit_castable(data_type, data_type) + cdef bool is_relationally_comparable(data_type) except +libcudf_exception_handler + cdef bool is_equality_comparable(data_type) except +libcudf_exception_handler + cdef bool is_numeric(data_type) except +libcudf_exception_handler + cdef bool is_numeric_not_bool(data_type) except +libcudf_exception_handler + cdef bool is_index_type(data_type) except +libcudf_exception_handler + cdef bool is_unsigned(data_type) except +libcudf_exception_handler + cdef bool is_integral(data_type) except +libcudf_exception_handler + cdef bool is_integral_not_bool(data_type) except +libcudf_exception_handler + cdef bool is_floating_point(data_type) except +libcudf_exception_handler + cdef bool is_boolean(data_type) except +libcudf_exception_handler + cdef bool is_timestamp(data_type) except +libcudf_exception_handler + cdef bool is_fixed_point(data_type) except +libcudf_exception_handler + cdef bool is_duration(data_type) except +libcudf_exception_handler + cdef bool is_chrono(data_type) except 
+libcudf_exception_handler + cdef bool is_dictionary(data_type) except +libcudf_exception_handler + cdef bool is_fixed_width(data_type) except +libcudf_exception_handler + cdef bool is_compound(data_type) except +libcudf_exception_handler + cdef bool is_nested(data_type) except +libcudf_exception_handler + cdef bool is_bit_castable(data_type, data_type) except +libcudf_exception_handler From b92d2c0adcca94a5cd04d9206fc89ca059f50f36 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 27 Feb 2025 10:14:24 -0800 Subject: [PATCH 20/42] Remove now non-existent job (#18123) This job was removed from PRs in https://github.com/rapidsai/cudf/pull/18091 but I forgot to remove the corresponding nightly test job. --- .github/workflows/test.yaml | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 7046fd0e5dc..8357a12e221 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -46,18 +46,6 @@ jobs: arch: "amd64" container_image: "rapidsai/ci-conda:latest" run_script: "ci/test_cpp_memcheck.sh" - static-configure: - secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.04 - with: - build_type: ${{ inputs.build_type }} - branch: ${{ inputs.branch }} - date: ${{ inputs.date }} - sha: ${{ inputs.sha }} - # Use the wheel container so we can skip conda solves and since our - # primary static consumers (Spark) are not in conda anyway. - container_image: "rapidsai/ci-wheel:latest" - run_script: "ci/configure_cpp_static.sh" cpp-linters: secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.04 From 960bb28f426d004ed96ac066e07675d87bb186de Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Thu, 27 Feb 2025 13:25:54 -0600 Subject: [PATCH 21/42] Use cpu16 for build CI jobs (#18124) We use `cpu16` for PR jobs that build libcudf (conda and wheels). 
We also need to use `cpu16` for the corresponding jobs in `build.yaml`. --- .github/workflows/build.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 11104037c5e..148861c0fa2 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -34,6 +34,7 @@ jobs: branch: ${{ inputs.branch }} date: ${{ inputs.date }} sha: ${{ inputs.sha }} + node_type: "cpu16" python-build: needs: [cpp-build] secrets: inherit @@ -77,6 +78,7 @@ jobs: branch: ${{ inputs.branch }} sha: ${{ inputs.sha }} date: ${{ inputs.date }} + node_type: "cpu16" script: ci/build_wheel_libcudf.sh wheel-publish-libcudf: needs: wheel-build-libcudf From 08ea13a407f09babe647fef8cf98595c7e710f0b Mon Sep 17 00:00:00 2001 From: Michael Schellenberger Costa Date: Thu, 27 Feb 2025 20:35:44 +0100 Subject: [PATCH 22/42] Add include for `` (#18102) There are some files that use `std::function` and it seems they were relying on transitive includes from CCCL headers because building cudf fails with CCCL 2.8, which is the next CCCL release in line for rapids Authors: - Michael Schellenberger Costa (https://github.com/miscco) Approvers: - David Wendt (https://github.com/davidwendt) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/18102 --- cpp/benchmarks/common/random_distribution_factory.cuh | 3 ++- cpp/src/column/column_device_view.cu | 3 ++- cpp/src/io/functions.cpp | 1 + cpp/src/io/json/host_tree_algorithms.cu | 3 ++- cpp/src/io/json/read_json.cu | 1 + cpp/src/io/orc/aggregate_orc_metadata.cpp | 1 + cpp/src/io/orc/writer_impl.cu | 1 + cpp/src/io/parquet/reader_impl_chunking.cu | 1 + cpp/src/io/parquet/writer_impl.cu | 1 + cpp/src/lists/dremel.cu | 4 +++- cpp/src/strings/regex/regex.cuh | 3 ++- cpp/src/strings/replace/multi_re.cu | 3 ++- cpp/src/table/row_operators.cu | 4 +++- cpp/src/text/bpe/load_merge_pairs.cu | 3 ++- cpp/tests/groupby/tdigest_tests.cu | 4 +++- 
cpp/tests/io/metadata_utilities.cpp | 4 +++- cpp/tests/io/parquet_writer_test.cpp | 1 + cpp/tests/reductions/scan_tests.cpp | 3 ++- 18 files changed, 33 insertions(+), 11 deletions(-) diff --git a/cpp/benchmarks/common/random_distribution_factory.cuh b/cpp/benchmarks/common/random_distribution_factory.cuh index c27616132d0..32424fbaaa3 100644 --- a/cpp/benchmarks/common/random_distribution_factory.cuh +++ b/cpp/benchmarks/common/random_distribution_factory.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * Copyright (c) 2020-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -29,6 +29,7 @@ #include #include +#include #include #include diff --git a/cpp/src/column/column_device_view.cu b/cpp/src/column/column_device_view.cu index 9dc39f01ab3..c304d705f9b 100644 --- a/cpp/src/column/column_device_view.cu +++ b/cpp/src/column/column_device_view.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -25,6 +25,7 @@ #include #include +#include #include namespace cudf { diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp index 53c1d335a40..204aca8a69c 100644 --- a/cpp/src/io/functions.cpp +++ b/cpp/src/io/functions.cpp @@ -36,6 +36,7 @@ #include #include +#include #include namespace cudf::io { diff --git a/cpp/src/io/json/host_tree_algorithms.cu b/cpp/src/io/json/host_tree_algorithms.cu index 7b9fc25d1cc..e506d60a2be 100644 --- a/cpp/src/io/json/host_tree_algorithms.cu +++ b/cpp/src/io/json/host_tree_algorithms.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2025, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -46,6 +46,7 @@ #include #include +#include namespace cudf::io::json::detail { diff --git a/cpp/src/io/json/read_json.cu b/cpp/src/io/json/read_json.cu index 0c95c2b05e8..c265ac5e316 100644 --- a/cpp/src/io/json/read_json.cu +++ b/cpp/src/io/json/read_json.cu @@ -43,6 +43,7 @@ #include #include +#include #include namespace cudf::io::json::detail { diff --git a/cpp/src/io/orc/aggregate_orc_metadata.cpp b/cpp/src/io/orc/aggregate_orc_metadata.cpp index 050bf692c14..77643d294e8 100644 --- a/cpp/src/io/orc/aggregate_orc_metadata.cpp +++ b/cpp/src/io/orc/aggregate_orc_metadata.cpp @@ -19,6 +19,7 @@ #include "io/utilities/row_selection.hpp" #include +#include #include namespace cudf::io::orc::detail { diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu index dbf5e293c4e..3a20ffbce19 100644 --- a/cpp/src/io/orc/writer_impl.cu +++ b/cpp/src/io/orc/writer_impl.cu @@ -64,6 +64,7 @@ #include #include +#include #include #include #include diff --git a/cpp/src/io/parquet/reader_impl_chunking.cu b/cpp/src/io/parquet/reader_impl_chunking.cu index 03a37327e9b..be1e7d38fff 100644 --- a/cpp/src/io/parquet/reader_impl_chunking.cu +++ b/cpp/src/io/parquet/reader_impl_chunking.cu @@ -40,6 +40,7 @@ #include #include +#include #include namespace cudf::io::parquet::detail { diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index 9e50fafa8a7..4a410cec558 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -53,6 +53,7 @@ #include #include +#include #include #include #include diff --git a/cpp/src/lists/dremel.cu b/cpp/src/lists/dremel.cu index 469442d46d4..d7b1bf360fe 100644 --- a/cpp/src/lists/dremel.cu +++ b/cpp/src/lists/dremel.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * Copyright (c) 2022-2025, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -36,6 +36,8 @@ #include #include +#include + namespace cudf::detail { namespace { /** diff --git a/cpp/src/strings/regex/regex.cuh b/cpp/src/strings/regex/regex.cuh index d22fb04696c..6071a9fdd2d 100644 --- a/cpp/src/strings/regex/regex.cuh +++ b/cpp/src/strings/regex/regex.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -27,6 +27,7 @@ #include #include +#include #include namespace cudf { diff --git a/cpp/src/strings/replace/multi_re.cu b/cpp/src/strings/replace/multi_re.cu index 0777253bb38..af8b53ccd8c 100644 --- a/cpp/src/strings/replace/multi_re.cu +++ b/cpp/src/strings/replace/multi_re.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -39,6 +39,7 @@ #include #include +#include namespace cudf { namespace strings { diff --git a/cpp/src/table/row_operators.cu b/cpp/src/table/row_operators.cu index 990c4855a14..d77cc0cf17a 100644 --- a/cpp/src/table/row_operators.cu +++ b/cpp/src/table/row_operators.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * Copyright (c) 2022-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -33,6 +33,8 @@ #include +#include + namespace cudf { namespace experimental { diff --git a/cpp/src/text/bpe/load_merge_pairs.cu b/cpp/src/text/bpe/load_merge_pairs.cu index a13a435a271..9118fe54ab2 100644 --- a/cpp/src/text/bpe/load_merge_pairs.cu +++ b/cpp/src/text/bpe/load_merge_pairs.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * Copyright (c) 2022-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -33,6 +33,7 @@ #include #include +#include #include #include diff --git a/cpp/tests/groupby/tdigest_tests.cu b/cpp/tests/groupby/tdigest_tests.cu index 883a5093bd1..ad92e322ee2 100644 --- a/cpp/tests/groupby/tdigest_tests.cu +++ b/cpp/tests/groupby/tdigest_tests.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2024, NVIDIA CORPORATION. + * Copyright (c) 2021-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -30,6 +30,8 @@ #include #include +#include + namespace { /** * @brief Functor to generate a tdigest by key. diff --git a/cpp/tests/io/metadata_utilities.cpp b/cpp/tests/io/metadata_utilities.cpp index 380d66c53f9..980d8d8b3d1 100644 --- a/cpp/tests/io/metadata_utilities.cpp +++ b/cpp/tests/io/metadata_utilities.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2024, NVIDIA CORPORATION. + * Copyright (c) 2021-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -17,6 +17,8 @@ #include #include +#include + namespace cudf::test { void expect_metadata_equal(cudf::io::table_input_metadata in_meta, diff --git a/cpp/tests/io/parquet_writer_test.cpp b/cpp/tests/io/parquet_writer_test.cpp index e201dc0565c..d99e19822c0 100644 --- a/cpp/tests/io/parquet_writer_test.cpp +++ b/cpp/tests/io/parquet_writer_test.cpp @@ -33,6 +33,7 @@ #include #include +#include using cudf::test::iterators::no_nulls; diff --git a/cpp/tests/reductions/scan_tests.cpp b/cpp/tests/reductions/scan_tests.cpp index 5f911597b02..c6c419706e0 100644 --- a/cpp/tests/reductions/scan_tests.cpp +++ b/cpp/tests/reductions/scan_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -28,6 +28,7 @@ #include #include +#include #include using aggregation = cudf::aggregation; From 4fda491e84bf212e16ab8d6ee5cf97da6d67362b Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Thu, 27 Feb 2025 15:42:40 -0500 Subject: [PATCH 23/42] Add new nvtext tokenized minhash API (#17944) Creates a new minhash API that works on ngrams of row elements given a list column of strings. ``` std::unique_ptr minhash_ngrams( cudf::lists_column_view const& input, cudf::size_type ngrams, uint32_t seed, cudf::device_span parameter_a, cudf::device_span parameter_b, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); ``` The input column is expected to be rows of words (strings) and each row is hashed using a sliding window of words (ngrams) and then the permuted algorithm is re-used to produce the minhash values. 
Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Ayush Dattagupta (https://github.com/ayushdg) - Matthew Murray (https://github.com/Matt711) - Yunsong Wang (https://github.com/PointKernel) - Muhammad Haseeb (https://github.com/mhaseeb123) - Shruti Shivakumar (https://github.com/shrshi) URL: https://github.com/rapidsai/cudf/pull/17944 --- cpp/include/nvtext/minhash.hpp | 94 +++++ cpp/src/text/minhash.cu | 392 +++++++++++++++--- cpp/tests/text/minhash_tests.cpp | 173 +++++++- python/cudf/cudf/core/column/string.py | 114 +++++ .../cudf/cudf/tests/text/test_text_methods.py | 42 ++ .../pylibcudf/libcudf/nvtext/minhash.pxd | 18 +- python/pylibcudf/pylibcudf/nvtext/minhash.pxd | 18 +- python/pylibcudf/pylibcudf/nvtext/minhash.pyi | 8 +- python/pylibcudf/pylibcudf/nvtext/minhash.pyx | 96 ++++- .../pylibcudf/tests/test_nvtext_minhash.py | 48 ++- 10 files changed, 911 insertions(+), 92 deletions(-) diff --git a/cpp/include/nvtext/minhash.hpp b/cpp/include/nvtext/minhash.hpp index 43f060fdafa..5f978a0d8ec 100644 --- a/cpp/include/nvtext/minhash.hpp +++ b/cpp/include/nvtext/minhash.hpp @@ -125,5 +125,99 @@ std::unique_ptr minhash64( rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); +/** + * @brief Returns the minhash values for each input row + * + * This function uses MurmurHash3_x86_32 for the hash algorithm. + * + * The input row is first hashed using the given `seed` over a sliding window + * of `ngrams` of strings. 
These hash values are then combined with the `a` + * and `b` parameter values using the following formula: + * ``` + * max_hash = max of uint32 + * mp = (1 << 61) - 1 + * hv[i] = hash value of a ngrams at i + * pv[i] = ((hv[i] * a[i] + b[i]) % mp) & max_hash + * ``` + * + * This calculation is performed on each set of ngrams and the minimum value + * is computed as follows: + * ``` + * mh[j,i] = min(pv[i]) for all ngrams in row j + * and where i=[0,a.size()) + * ``` + * + * Any null row entries result in corresponding null output rows. + * + * @throw std::invalid_argument if the ngrams < 2 + * @throw std::invalid_argument if parameter_a is empty + * @throw std::invalid_argument if `parameter_b.size() != parameter_a.size()` + * @throw std::overflow_error if `parameter_a.size() * input.size()` exceeds the column size limit + * + * @param input Strings column to compute minhash + * @param ngrams The number of strings to hash within each row + * @param seed Seed value used for the hash algorithm + * @param parameter_a Values used for the permuted calculation + * @param parameter_b Values used for the permuted calculation + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return List column of minhash values for each string per seed + */ +std::unique_ptr minhash_ngrams( + cudf::lists_column_view const& input, + cudf::size_type ngrams, + uint32_t seed, + cudf::device_span parameter_a, + cudf::device_span parameter_b, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); + +/** + * @brief Returns the minhash values for each input row + * + * This function uses MurmurHash3_x64_128 for the hash algorithm. + * + * The input row is first hashed using the given `seed` over a sliding window + * of `ngrams` of strings. 
These hash values are then combined with the `a` + * and `b` parameter values using the following formula: + * ``` + * max_hash = max of uint64 + * mp = (1 << 61) - 1 + * hv[i] = hash value of a ngrams at i + * pv[i] = ((hv[i] * a[i] + b[i]) % mp) & max_hash + * ``` + * + * This calculation is performed on each set of ngrams and the minimum value + * is computed as follows: + * ``` + * mh[j,i] = min(pv[i]) for all ngrams in row j + * and where i=[0,a.size()) + * ``` + * + * Any null row entries result in corresponding null output rows. + * + * @throw std::invalid_argument if the ngrams < 2 + * @throw std::invalid_argument if parameter_a is empty + * @throw std::invalid_argument if `parameter_b.size() != parameter_a.size()` + * @throw std::overflow_error if `parameter_a.size() * input.size()` exceeds the column size limit + * + * @param input List strings column to compute minhash + * @param ngrams The number of strings to hash within each row + * @param seed Seed value used for the hash algorithm + * @param parameter_a Values used for the permuted calculation + * @param parameter_b Values used for the permuted calculation + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return List column of minhash values for each string per seed + */ +std::unique_ptr minhash64_ngrams( + cudf::lists_column_view const& input, + cudf::size_type ngrams, + uint64_t seed, + cudf::device_span parameter_a, + cudf::device_span parameter_b, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); + /** @} */ // end of group } // namespace CUDF_EXPORT nvtext diff --git a/cpp/src/text/minhash.cu b/cpp/src/text/minhash.cu index 50c16c8ba6c..663595af5df 100644 --- a/cpp/src/text/minhash.cu +++ b/cpp/src/text/minhash.cu @@ -21,6 +21,7 @@ #include #include #include +#include #include 
#include #include @@ -62,19 +63,20 @@ constexpr cudf::thread_index_type tile_size = block_size; constexpr cuda::std::size_t params_per_thread = 16; // Separate kernels are used to process strings above and below this value (in bytes). -constexpr cudf::size_type wide_string_threshold = 1 << 18; // 256K +constexpr cudf::size_type wide_row_threshold = 1 << 18; // 256K // The number of blocks per string for the above-threshold kernel processing. -constexpr cudf::size_type blocks_per_string = 64; +constexpr cudf::size_type blocks_per_row = 64; // The above values were determined using the redpajama and books_sample datasets /** * @brief Hashing kernel launched as a thread per tile-size (block or warp) + * for strings column * * This kernel computes the hashes for each string using the seed and the specified * hash function. The width is used to compute rolling substrings to hash over. * The hashes are stored in d_hashes to be used in the minhash_kernel. * - * This kernel also counts the number of strings above the wide_string_threshold + * This kernel also counts the number of strings above the wide_row_threshold * and proactively initializes the output values for those strings. 
* * @tparam HashFunction The hash function to use for this kernel @@ -84,7 +86,7 @@ constexpr cudf::size_type blocks_per_string = 64; * @param seed The seed used for the hash function * @param width Width in characters used for determining substrings to hash * @param d_hashes The resulting hash values are stored here - * @param threshold_count Stores the number of strings above wide_string_threshold + * @param threshold_count Stores the number of strings above wide_row_threshold * @param param_count Number of parameters (used for the proactive initialize) * @param d_results Final results vector (used for the proactive initialize) */ @@ -146,7 +148,7 @@ CUDF_KERNEL void minhash_seed_kernel(cudf::column_device_view const d_strings, } // logic appended here so an extra kernel is not required - if (size_bytes >= wide_string_threshold) { + if (size_bytes >= wide_row_threshold) { if (lane_idx == 0) { // count the number of wide strings cuda::atomic_ref ref{*threshold_count}; @@ -160,31 +162,130 @@ CUDF_KERNEL void minhash_seed_kernel(cudf::column_device_view const d_strings, } } +/** + * @brief Hashing kernel launched as a thread per tile-size (block or warp) + * for a lists column + * + * This kernel computes the hashes for each row using the seed and the specified + * hash function. The ngrams identifies consecutive strings to hash over in + * sliding window formation. The hashes are stored in d_hashes and used as input + * to the minhash_kernel. + * + * This kernel also counts the number of rows above the wide_row_threshold + * and proactively initializes the output values for those rows. 
+ * + * @tparam HashFunction The hash function to use for this kernel + * @tparam hash_value_type Derived from HashFunction result_type + * + * @param d_input The input column to hash + * @param seed The seed used for the hash function + * @param ngrams Number of strings in each row to hash + * @param d_hashes The resulting hash values are stored here + * @param threshold_count Stores the number of rows above wide_row_threshold + * @param param_count Number of parameters (used for the proactive initialize) + * @param d_results Final results vector (used for the proactive initialize) + */ +template +CUDF_KERNEL void minhash_ngrams_kernel(cudf::detail::lists_column_device_view const d_input, + hash_value_type seed, + cudf::size_type ngrams, + hash_value_type* d_hashes, + cudf::size_type* threshold_count, + cudf::size_type param_count, + hash_value_type* d_results) +{ + auto const tid = cudf::detail::grid_1d::global_thread_id(); + auto const row_idx = tid / tile_size; + if (row_idx >= d_input.size()) { return; } + if (d_input.is_null(row_idx)) { return; } + + // retrieve this row's offset to locate the output position in d_hashes + auto const offsets_itr = d_input.offsets().data() + d_input.offset(); + auto const offset = offsets_itr[row_idx]; + auto const size_row = offsets_itr[row_idx + 1] - offset; + if (size_row == 0) { return; } + + auto const d_row = cudf::list_device_view(d_input, row_idx); + auto const lane_idx = static_cast(tid % tile_size); + + // hashes for this row/thread are stored here + auto seed_hashes = d_hashes + offset - offsets_itr[0] + lane_idx; + auto const hasher = HashFunction(seed); + + for (auto idx = lane_idx; idx < size_row; idx += tile_size, seed_hashes += tile_size) { + if (d_row.is_null(idx)) { + *seed_hashes = 0; + continue; + } + + auto next_idx = cuda::std::min(idx + ngrams, size_row - 1); + if ((idx != 0) && ((next_idx - idx) < ngrams)) { + *seed_hashes = 0; + continue; + } + + auto const first_str = d_row.element(idx); + auto const 
last_str = d_row.element(next_idx); + // build super-string since adjacent strings are contiguous in memory + auto const size = static_cast( + thrust::distance(first_str.data(), last_str.data()) + last_str.size_bytes()); + auto const hash_str = cudf::string_view(first_str.data(), size); + hash_value_type hv; + if constexpr (std::is_same_v) { + hv = hasher(hash_str); + } else { + hv = cuda::std::get<0>(hasher(hash_str)); + } + // disallowing hash to zero case + *seed_hashes = cuda::std::max(hv, hash_value_type{1}); + } + + // logic appended here to count long rows so an extra kernel is not required + if (size_row >= wide_row_threshold) { + if (lane_idx == 0) { + // count the number of wide rows + cuda::atomic_ref ref{*threshold_count}; + ref.fetch_add(1, cuda::std::memory_order_relaxed); + } + // initialize the output -- only needed for wider rows + auto d_output = d_results + (row_idx * param_count); + for (auto i = lane_idx; i < param_count; i += tile_size) { + d_output[i] = cuda::std::numeric_limits::max(); + } + } +} + /** * @brief Permutation calculation kernel * - * This kernel uses the hashes from the minhash_seed_kernel and the parameter_a and - * parameter_b values to compute the final output results. + * This kernel uses the hashes from the minhash_seed_kernel or minhash_ngrams_kernel + * and the 'parameter_a' and 'parameter_b' values to compute the final output. * The output is the number of input rows (N) by the number of parameter values (M). - * Each output[i] is the calculated result for parameter_a/b[0:M]. + * Each row output[i] is the calculated result for parameter_a/b[0:M]. + * + * This kernel is launched with either blocks per row of 1 for rows + * below the wide_row_threshold or blocks per row = blocks_per_rows + * for rows above wide_row_threshold. 
* - * This kernel is launched with either blocks per strings of 1 for strings - * below the wide_strings_threshold or blocks per string = blocks_per_strings - * for strings above wide_strings_threshold. + * Note that this was refactored to accommodate lists of strings which is possible + * since there is no need here to access the characters, only the hash values. + * The offsets and width are used to locate and count the hash values produced by + * kernels above for each input row. * + * @tparam offsets_type Type for the offsets iterator for the input column * @tparam hash_value_type Derived from HashFunction result_type - * @tparam blocks_per_string Number of blocks used to process each string + * @tparam blocks_per_row Number of blocks used to process each row * - * @param d_strings The input strings to hash - * @param indices The indices of the strings in d_strings to process + * @param offsets_itr The offsets are used to address the d_hashes + * @param indices The indices of the rows in the input column * @param parameter_a 1st set of parameters for the calculation result * @param parameter_b 2nd set of parameters for the calculation result - * @param width Used for calculating the number of available hashes in each string - * @param d_hashes The hash values computed in minhash_seed_kernel + * @param width Used for calculating the number of available hashes in each row + * @param d_hashes The hash values computed in one of the hash kernels * @param d_results Final results vector of calculate values */ -template -CUDF_KERNEL void minhash_kernel(cudf::column_device_view const d_strings, +template +CUDF_KERNEL void minhash_kernel(offsets_type offsets_itr, cudf::device_span indices, cudf::device_span parameter_a, cudf::device_span parameter_b, @@ -193,41 +294,36 @@ CUDF_KERNEL void minhash_kernel(cudf::column_device_view const d_strings, hash_value_type* d_results) { auto const tid = cudf::detail::grid_1d::global_thread_id(); - auto const idx = (tid / 
blocks_per_string) / block_size; + auto const idx = (tid / blocks_per_row) / block_size; if (idx >= indices.size()) { return; } - auto const str_idx = indices[idx]; - if (d_strings.is_null(str_idx)) { return; } + auto const row_idx = indices[idx]; auto const block = cooperative_groups::this_thread_block(); - int const section_idx = block.group_index().x % blocks_per_string; + int const section_idx = block.group_index().x % blocks_per_row; - auto const offsets = d_strings.child(cudf::strings_column_view::offsets_column_index); - auto const offsets_itr = - cudf::detail::input_offsetalator(offsets.head(), offsets.type(), d_strings.offset()); - auto const offset = offsets_itr[str_idx]; - auto const size_bytes = static_cast(offsets_itr[str_idx + 1] - offset); + auto const offset = offsets_itr[row_idx]; + auto const row_size = static_cast(offsets_itr[row_idx + 1] - offset); // number of items to process in this block; - // last block also includes any remainder values from the size_bytes/blocks_per_string truncation + // last block also includes any remainder values from the row_size/blocks_per_row truncation // example: - // each section_size for string with size 588090 and blocks_per_string=64 is 9188 + // each section_size for string with size 588090 and blocks_per_row=64 is 9188 // except the last section which is 9188 + (588090 % 64) = 9246 - auto const section_size = - (size_bytes / blocks_per_string) + - (section_idx < (blocks_per_string - 1) ? 0 : size_bytes % blocks_per_string); - auto const section_offset = section_idx * (size_bytes / blocks_per_string); + auto const section_size = (row_size / blocks_per_row) + + (section_idx < (blocks_per_row - 1) ? 
0 : row_size % blocks_per_row); + auto const section_offset = section_idx * (row_size / blocks_per_row); // hash values for this block/section auto const seed_hashes = d_hashes + offset - offsets_itr[0] + section_offset; // width used here as a max value since a string's char-count <= byte-count auto const hashes_size = - section_idx < (blocks_per_string - 1) + section_idx < (blocks_per_row - 1) ? section_size - : cuda::std::max(static_cast(size_bytes > 0), section_size - width + 1); + : cuda::std::max(static_cast(row_size > 0), section_size - width + 1); - auto const init = size_bytes == 0 ? 0 : cuda::std::numeric_limits::max(); + auto const init = row_size == 0 ? 0 : cuda::std::numeric_limits::max(); auto const lane_idx = block.thread_rank(); - auto const d_output = d_results + (str_idx * parameter_a.size()); + auto const d_output = d_results + (row_idx * parameter_a.size()); auto const begin = seed_hashes + lane_idx; auto const end = seed_hashes + hashes_size; @@ -273,7 +369,7 @@ CUDF_KERNEL void minhash_kernel(cudf::column_device_view const d_strings, // cooperative groups does not have a min function and cub::BlockReduce was slower auto const minv = thrust::reduce(thrust::seq, values, values + block_size, init, thrust::minimum{}); - if constexpr (blocks_per_string > 1) { + if constexpr (blocks_per_row > 1) { // accumulates mins for each block into d_output cuda::atomic_ref ref{d_output[lane_idx + i]}; ref.fetch_min(minv, cuda::std::memory_order_relaxed); @@ -285,6 +381,46 @@ CUDF_KERNEL void minhash_kernel(cudf::column_device_view const d_strings, } } +/** + * @brief Partition input rows by row size + * + * The returned index is the first row above the wide_row_threshold size. + * The returned vector are the indices partitioned above and below the + * wide_row_threshold size. 
+ * + * @param size Number of rows in the input column + * @param threshold_count Number of rows above wide_row_threshold + * @param tfn Transform function returns the size of each row + * @param stream Stream used for allocation and kernel launches + */ +template +std::pair> partition_input( + cudf::size_type size, + cudf::size_type threshold_count, + transform_fn tfn, + rmm::cuda_stream_view stream) +{ + auto indices = rmm::device_uvector(size, stream); + thrust::sequence(rmm::exec_policy(stream), indices.begin(), indices.end()); + cudf::size_type threshold_index = threshold_count < size ? size : 0; + + // if we counted a split of above/below threshold then + // compute partitions based on the size of each string + if ((threshold_count > 0) && (threshold_count < size)) { + auto sizes = rmm::device_uvector(size, stream); + auto begin = thrust::counting_iterator(0); + auto end = begin + size; + thrust::transform(rmm::exec_policy_nosync(stream), begin, end, sizes.data(), tfn); + // these 2 are slightly faster than using partition() + thrust::sort_by_key( + rmm::exec_policy_nosync(stream), sizes.begin(), sizes.end(), indices.begin()); + auto const lb = thrust::lower_bound( + rmm::exec_policy_nosync(stream), sizes.begin(), sizes.end(), wide_row_threshold); + threshold_index = static_cast(thrust::distance(sizes.begin(), lb)); + } + return {threshold_index, std::move(indices)}; +} + template std::unique_ptr minhash_fn(cudf::strings_column_view const& input, hash_value_type seed, @@ -334,40 +470,112 @@ std::unique_ptr minhash_fn(cudf::strings_column_view const& input, d_threshold_count.data(), parameter_a.size(), d_results); - auto const threshold_count = d_threshold_count.value(stream); - auto indices = rmm::device_uvector(input.size(), stream); - thrust::sequence(rmm::exec_policy(stream), indices.begin(), indices.end()); - cudf::size_type threshold_index = threshold_count < input.size() ? 
input.size() : 0; + auto transform_fn = [d_strings = *d_strings] __device__(auto idx) -> cudf::size_type { + if (d_strings.is_null(idx)) { return 0; } + return d_strings.element(idx).size_bytes(); + }; + auto [threshold_index, indices] = + partition_input(input.size(), d_threshold_count.value(stream), transform_fn, stream); - // if we counted a split of above/below threshold then - // compute partitions based on the size of each string - if ((threshold_count > 0) && (threshold_count < input.size())) { - auto sizes = rmm::device_uvector(input.size(), stream); - thrust::transform(rmm::exec_policy_nosync(stream), - thrust::counting_iterator(0), - thrust::counting_iterator(input.size()), - sizes.data(), - cuda::proclaim_return_type( - [d_strings = *d_strings] __device__(auto idx) -> cudf::size_type { - if (d_strings.is_null(idx)) { return 0; } - return d_strings.element(idx).size_bytes(); - })); - thrust::sort_by_key( - rmm::exec_policy_nosync(stream), sizes.begin(), sizes.end(), indices.begin()); - auto const lb = thrust::lower_bound( - rmm::exec_policy_nosync(stream), sizes.begin(), sizes.end(), wide_string_threshold); - threshold_index = static_cast(thrust::distance(sizes.begin(), lb)); + auto input_offsets = + cudf::detail::offsetalator_factory::make_input_iterator(input.offsets(), input.offset()); + using offsets_type = decltype(input_offsets); + + // handle the strings below the threshold width + if (threshold_index > 0) { + auto d_indices = cudf::device_span(indices.data(), threshold_index); + cudf::detail::grid_1d grid{static_cast(d_indices.size()) * block_size, + block_size}; + minhash_kernel + <<>>( + input_offsets, d_indices, parameter_a, parameter_b, width, d_hashes.data(), d_results); + } + + // handle the strings above the threshold width + if (threshold_index < input.size()) { + auto const count = static_cast(input.size() - threshold_index); + auto d_indices = + cudf::device_span(indices.data() + threshold_index, count); + cudf::detail::grid_1d 
grid{count * block_size * blocks_per_row, block_size}; + minhash_kernel + <<>>( + input_offsets, d_indices, parameter_a, parameter_b, width, d_hashes.data(), d_results); } + return results; +} + +template +std::unique_ptr minhash_ngrams_fn( + cudf::lists_column_view const& input, + cudf::size_type ngrams, + hash_value_type seed, + cudf::device_span parameter_a, + cudf::device_span parameter_b, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + CUDF_EXPECTS(ngrams >= 2, + "Parameter ngrams should be an integer value of 2 or greater", + std::invalid_argument); + CUDF_EXPECTS(!parameter_a.empty(), "Parameters A and B cannot be empty", std::invalid_argument); + CUDF_EXPECTS(parameter_a.size() == parameter_b.size(), + "Parameters A and B should have the same number of elements", + std::invalid_argument); + CUDF_EXPECTS( + (static_cast(input.size()) * parameter_a.size()) < + static_cast(std::numeric_limits::max()), + "The number of parameters times the number of input rows exceeds the column size limit", + std::overflow_error); + + auto const output_type = cudf::data_type{cudf::type_to_id()}; + if (input.is_empty()) { return cudf::make_empty_column(output_type); } + + auto const d_input = cudf::column_device_view::create(input.parent(), stream); + + auto results = + cudf::make_numeric_column(output_type, + input.size() * static_cast(parameter_a.size()), + cudf::mask_state::UNALLOCATED, + stream, + mr); + auto d_results = results->mutable_view().data(); + + cudf::detail::grid_1d grid{static_cast(input.size()) * block_size, + block_size}; + auto const hashes_size = input.child().size(); + auto d_hashes = rmm::device_uvector(hashes_size, stream); + auto d_threshold_count = cudf::detail::device_scalar(0, stream); + + auto d_list = cudf::detail::lists_column_device_view(*d_input); + minhash_ngrams_kernel + <<>>(d_list, + seed, + ngrams, + d_hashes.data(), + d_threshold_count.data(), + parameter_a.size(), + d_results); + + auto sizes_fn = [d_list] 
__device__(auto idx) -> cudf::size_type { + if (d_list.is_null(idx)) { return 0; } + return cudf::list_device_view(d_list, idx).size(); + }; + auto [threshold_index, indices] = + partition_input(input.size(), d_threshold_count.value(stream), sizes_fn, stream); + + auto input_offsets = input.offsets_begin(); // already includes input.offset() + using offset_type = decltype(input_offsets); + // handle the strings below the threshold width if (threshold_index > 0) { auto d_indices = cudf::device_span(indices.data(), threshold_index); cudf::detail::grid_1d grid{static_cast(d_indices.size()) * block_size, block_size}; - minhash_kernel + minhash_kernel <<>>( - *d_strings, d_indices, parameter_a, parameter_b, width, d_hashes.data(), d_results); + input_offsets, d_indices, parameter_a, parameter_b, ngrams, d_hashes.data(), d_results); } // handle the strings above the threshold width @@ -375,10 +583,10 @@ std::unique_ptr minhash_fn(cudf::strings_column_view const& input, auto const count = static_cast(input.size() - threshold_index); auto d_indices = cudf::device_span(indices.data() + threshold_index, count); - cudf::detail::grid_1d grid{count * block_size * blocks_per_string, block_size}; - minhash_kernel + cudf::detail::grid_1d grid{count * block_size * blocks_per_row, block_size}; + minhash_kernel <<>>( - *d_strings, d_indices, parameter_a, parameter_b, width, d_hashes.data(), d_results); + input_offsets, d_indices, parameter_a, parameter_b, ngrams, d_hashes.data(), d_results); } return results; @@ -426,6 +634,20 @@ std::unique_ptr minhash(cudf::strings_column_view const& input, return build_list_result(input.parent(), std::move(hashes), parameter_a.size(), stream, mr); } +std::unique_ptr minhash_ngrams(cudf::lists_column_view const& input, + cudf::size_type ngrams, + uint32_t seed, + cudf::device_span parameter_a, + cudf::device_span parameter_b, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + using HashFunction = 
cudf::hashing::detail::MurmurHash3_x86_32; + auto hashes = detail::minhash_ngrams_fn( + input, ngrams, seed, parameter_a, parameter_b, stream, mr); + return build_list_result(input.parent(), std::move(hashes), parameter_a.size(), stream, mr); +} + std::unique_ptr minhash64(cudf::strings_column_view const& input, uint64_t seed, cudf::device_span parameter_a, @@ -440,6 +662,20 @@ std::unique_ptr minhash64(cudf::strings_column_view const& input, return build_list_result(input.parent(), std::move(hashes), parameter_a.size(), stream, mr); } +std::unique_ptr minhash64_ngrams(cudf::lists_column_view const& input, + cudf::size_type ngrams, + uint64_t seed, + cudf::device_span parameter_a, + cudf::device_span parameter_b, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + using HashFunction = cudf::hashing::detail::MurmurHash3_x64_128; + auto hashes = detail::minhash_ngrams_fn( + input, ngrams, seed, parameter_a, parameter_b, stream, mr); + return build_list_result(input.parent(), std::move(hashes), parameter_a.size(), stream, mr); +} + } // namespace detail std::unique_ptr minhash(cudf::strings_column_view const& input, @@ -454,6 +690,19 @@ std::unique_ptr minhash(cudf::strings_column_view const& input, return detail::minhash(input, seed, parameter_a, parameter_b, width, stream, mr); } +std::unique_ptr minhash_ngrams(cudf::lists_column_view const& input, + cudf::size_type ngrams, + uint32_t seed, + cudf::device_span parameter_a, + cudf::device_span parameter_b, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) + +{ + CUDF_FUNC_RANGE(); + return detail::minhash_ngrams(input, ngrams, seed, parameter_a, parameter_b, stream, mr); +} + std::unique_ptr minhash64(cudf::strings_column_view const& input, uint64_t seed, cudf::device_span parameter_a, @@ -466,4 +715,17 @@ std::unique_ptr minhash64(cudf::strings_column_view const& input, return detail::minhash64(input, seed, parameter_a, parameter_b, width, stream, mr); } +std::unique_ptr 
minhash64_ngrams(cudf::lists_column_view const& input, + cudf::size_type ngrams, + uint64_t seed, + cudf::device_span parameter_a, + cudf::device_span parameter_b, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) + +{ + CUDF_FUNC_RANGE(); + return detail::minhash64_ngrams(input, ngrams, seed, parameter_a, parameter_b, stream, mr); +} + } // namespace nvtext diff --git a/cpp/tests/text/minhash_tests.cpp b/cpp/tests/text/minhash_tests.cpp index 8bfb17e0efd..db43484ab09 100644 --- a/cpp/tests/text/minhash_tests.cpp +++ b/cpp/tests/text/minhash_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * Copyright (c) 2023-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -187,6 +187,15 @@ TEST_F(MinHashTest, EmptyTest) auto params64 = cudf::test::fixed_width_column_wrapper({1, 2, 3}); results = nvtext::minhash64(view, 0, cudf::column_view(params64), cudf::column_view(params64), 4); EXPECT_EQ(results->size(), 0); + + auto empty = cudf::test::lists_column_wrapper(); + auto lview = cudf::lists_column_view(empty); + results = + nvtext::minhash_ngrams(lview, 4, 0, cudf::column_view(params), cudf::column_view(params)); + EXPECT_EQ(results->size(), 0); + results = + nvtext::minhash64_ngrams(lview, 4, 0, cudf::column_view(params64), cudf::column_view(params64)); + EXPECT_EQ(results->size(), 0); } TEST_F(MinHashTest, ErrorsTest) @@ -194,17 +203,20 @@ TEST_F(MinHashTest, ErrorsTest) auto input = cudf::test::strings_column_wrapper({"this string intentionally left blank"}); auto view = cudf::strings_column_view(input); auto empty = cudf::test::fixed_width_column_wrapper(); - EXPECT_THROW(nvtext::minhash(view, 0, cudf::column_view(empty), cudf::column_view(empty), 0), - std::invalid_argument); + auto eview = cudf::column_view(empty); + EXPECT_THROW(nvtext::minhash(view, 0, eview, eview, 0), std::invalid_argument); auto empty64 = 
cudf::test::fixed_width_column_wrapper(); - EXPECT_THROW( - nvtext::minhash64(view, 0, cudf::column_view(empty64), cudf::column_view(empty64), 0), - std::invalid_argument); - EXPECT_THROW(nvtext::minhash(view, 0, cudf::column_view(empty), cudf::column_view(empty), 4), - std::invalid_argument); - EXPECT_THROW( - nvtext::minhash64(view, 0, cudf::column_view(empty64), cudf::column_view(empty64), 4), - std::invalid_argument); + auto eview64 = cudf::column_view(empty64); + EXPECT_THROW(nvtext::minhash64(view, 0, eview64, eview64, 0), std::invalid_argument); + EXPECT_THROW(nvtext::minhash(view, 0, eview, eview, 4), std::invalid_argument); + EXPECT_THROW(nvtext::minhash64(view, 0, eview64, eview64, 4), std::invalid_argument); + + auto empty_list = cudf::test::lists_column_wrapper(); + auto lview = cudf::lists_column_view(empty_list); + EXPECT_THROW(nvtext::minhash_ngrams(lview, 0, 0, eview, eview), std::invalid_argument); + EXPECT_THROW(nvtext::minhash64_ngrams(lview, 0, 0, eview64, eview64), std::invalid_argument); + EXPECT_THROW(nvtext::minhash_ngrams(lview, 4, 0, eview, eview), std::invalid_argument); + EXPECT_THROW(nvtext::minhash64_ngrams(lview, 4, 0, eview64, eview64), std::invalid_argument); std::vector h_input(50000, ""); input = cudf::test::strings_column_wrapper(h_input.begin(), h_input.end()); @@ -212,16 +224,133 @@ TEST_F(MinHashTest, ErrorsTest) auto const zeroes = thrust::constant_iterator(0); auto params = cudf::test::fixed_width_column_wrapper(zeroes, zeroes + 50000); - EXPECT_THROW(nvtext::minhash(view, 0, cudf::column_view(params), cudf::column_view(params), 4), - std::overflow_error); + auto pview = cudf::column_view(params); + EXPECT_THROW(nvtext::minhash(view, 0, pview, pview, 4), std::overflow_error); auto params64 = cudf::test::fixed_width_column_wrapper(zeroes, zeroes + 50000); - EXPECT_THROW( - nvtext::minhash64(view, 0, cudf::column_view(params64), cudf::column_view(params64), 4), - std::overflow_error); - - EXPECT_THROW(nvtext::minhash(view, 0, 
cudf::column_view(params), cudf::column_view(empty), 4), - std::invalid_argument); - EXPECT_THROW( - nvtext::minhash64(view, 0, cudf::column_view(params64), cudf::column_view(empty64), 4), - std::invalid_argument); + auto pview64 = cudf::column_view(params64); + EXPECT_THROW(nvtext::minhash64(view, 0, pview64, pview64, 4), std::overflow_error); + + auto offsets = cudf::test::fixed_width_column_wrapper( + thrust::counting_iterator(0), + thrust::counting_iterator(h_input.size() + 1)); + auto input_ngrams = + cudf::make_lists_column(h_input.size(), offsets.release(), input.release(), 0, {}); + lview = cudf::lists_column_view(input_ngrams->view()); + EXPECT_THROW(nvtext::minhash_ngrams(lview, 4, 0, pview, pview), std::overflow_error); + EXPECT_THROW(nvtext::minhash64_ngrams(lview, 4, 0, pview64, pview64), std::overflow_error); +} + +TEST_F(MinHashTest, Ngrams) +{ + using LCWS = cudf::test::lists_column_wrapper; + auto input = + LCWS({LCWS{"The", "quick", "brown", "fox", "jumpéd", "over", "the", "lazy", "brown", "dog."}, + LCWS{"The", "quick", "brown", "fox", "jumpéd", "over", "the", "lazy", "", "dog."}, + LCWS{"short", "row"}}); + + auto view = cudf::lists_column_view(input); + + auto first = thrust::counting_iterator(10); + auto params = cudf::test::fixed_width_column_wrapper(first, first + 3); + auto results = + nvtext::minhash_ngrams(view, 4, 0, cudf::column_view(params), cudf::column_view(params)); + using LCW32 = cudf::test::lists_column_wrapper; + // clang-format off + LCW32 expected({ + LCW32{ 230924604u, 55492793u, 963436400u}, + LCW32{ 230924604u, 367515795u, 963436400u}, + LCW32{2380648568u, 1330223236u, 279797904u} + }); + // clang-format on + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + + auto params64 = cudf::test::fixed_width_column_wrapper(first, first + 3); + auto results64 = + nvtext::minhash64_ngrams(view, 4, 0, cudf::column_view(params64), cudf::column_view(params64)); + using LCW64 = cudf::test::lists_column_wrapper; + // clang-format off + 
LCW64 expected64({ + LCW64{ 208926840193078200ul, 576399628675212695ul, 312927673584437419ul}, + LCW64{ 677038498284219393ul, 326338087730412201ul, 298455901014050223ul}, + LCW64{1493265692486268500ul, 720255058049417768ul, 2253087432826260995ul} + }); + // clang-format on + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results64, expected64); +} + +TEST_F(MinHashTest, NgramsWide) +{ + auto many = std::vector(1024, "hello"); + auto str_data = cudf::test::strings_column_wrapper(many.begin(), many.end()); + auto offsets = + cudf::test::fixed_width_column_wrapper({0ul, many.size() / 2, many.size()}); + auto input = cudf::make_lists_column(2, offsets.release(), str_data.release(), 0, {}); + + auto view = cudf::lists_column_view(input->view()); + + auto first = thrust::counting_iterator(10); + auto params = cudf::test::fixed_width_column_wrapper(first, first + 3); + auto results = + nvtext::minhash_ngrams(view, 4, 0, cudf::column_view(params), cudf::column_view(params)); + using LCW32 = cudf::test::lists_column_wrapper; + // clang-format off + LCW32 expected({ + LCW32{ 571536396u, 2346676954u, 4121817512u}, + LCW32{ 571536396u, 2346676954u, 4121817512u} + }); + // clang-format on + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + + auto params64 = cudf::test::fixed_width_column_wrapper(first, first + 3); + auto results64 = + nvtext::minhash64_ngrams(view, 4, 0, cudf::column_view(params64), cudf::column_view(params64)); + using LCW64 = cudf::test::lists_column_wrapper; + // clang-format off + LCW64 expected64({ + LCW64{ 1947142336021414174ul, 1219519365938078011ul, 491896395854741840ul}, + LCW64{ 1947142336021414174ul, 1219519365938078011ul, 491896395854741840ul} + }); + // clang-format on + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results64, expected64); +} + +TEST_F(MinHashTest, NgramsSliced) +{ + using LCWS = cudf::test::lists_column_wrapper; + auto input = + LCWS({LCWS{"ignored", "row"}, + LCWS{"The", "quick", "brown", "fox", "jumpéd", "over", "the", "lazy", "brown", "dog."}, + 
LCWS{"The", "quick", "brown", "fox", "jumpéd", "over", "the", "lazy", "", "dog."}, + LCWS{"short", "row"}, + LCWS{"ignored", "row"}}); + + auto view = cudf::lists_column_view(cudf::slice(input, {1, 4}).front()); + auto first = thrust::counting_iterator(10); + + auto params = cudf::test::fixed_width_column_wrapper(first, first + 3); + auto results = + nvtext::minhash_ngrams(view, 4, 0, cudf::column_view(params), cudf::column_view(params)); + + using LCW32 = cudf::test::lists_column_wrapper; + // clang-format off + LCW32 expected({ + LCW32{ 230924604u, 55492793u, 963436400u}, + LCW32{ 230924604u, 367515795u, 963436400u}, + LCW32{2380648568u, 1330223236u, 279797904u} + }); + // clang-format on + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + + auto params64 = cudf::test::fixed_width_column_wrapper(first, first + 3); + auto results64 = + nvtext::minhash64_ngrams(view, 4, 0, cudf::column_view(params64), cudf::column_view(params64)); + using LCW64 = cudf::test::lists_column_wrapper; + // clang-format off + LCW64 expected64({ + LCW64{ 208926840193078200ul, 576399628675212695ul, 312927673584437419ul}, + LCW64{ 677038498284219393ul, 326338087730412201ul, 298455901014050223ul}, + LCW64{1493265692486268500ul, 720255058049417768ul, 2253087432826260995ul} + }); + // clang-format on + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results64, expected64); } diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 944f5cd6d26..b82ec1958fb 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -5533,6 +5533,120 @@ def minhash64( self._column.minhash64(seed, a_column, b_column, width) # type: ignore[arg-type] ) + def minhash_ngrams( + self, ngrams: int, seed: np.uint32, a: ColumnLike, b: ColumnLike + ) -> SeriesOrIndex: + """ + Compute the minhash of a list column of strings. + + This uses the MurmurHash3_x86_32 algorithm for the hash function. 
+ + Calculation uses the formula (hv * a + b) % mersenne_prime + where hv is the hash of a ngrams of strings within each row, + a and b are provided values and mersenne_prime is 2^61-1. + + Parameters + ---------- + ngrams : int + Number of strings to hash within each row. + seed : uint32 + The seed used for the hash algorithm. + a : ColumnLike + Values for minhash calculation. + Must be of type uint32. + b : ColumnLike + Values for minhash calculation. + Must be of type uint32. + + Examples + -------- + >>> import cudf + >>> import numpy as np + >>> s = cudf.Series([['this', 'is', 'my'], ['favorite', 'book']]) + >>> a = cudf.Series([1, 2, 3], dtype=np.uint32) + >>> b = cudf.Series([4, 5, 6], dtype=np.uint32) + >>> s.str.minhash_ngrams(ngrams=2, seed=0, a=a, b=b) + 0 [416367551, 832735099, 1249102647] + 1 [1906668704, 3813337405, 1425038810] + dtype: list + """ + a_column = column.as_column(a) + if a_column.dtype != np.uint32: + raise ValueError( + f"Expecting a Series with dtype uint32, got {type(a)}" + ) + b_column = column.as_column(b) + if b_column.dtype != np.uint32: + raise ValueError( + f"Expecting a Series with dtype uint32, got {type(b)}" + ) + plc_column = plc.nvtext.minhash.minhash_ngrams( + self._column.to_pylibcudf(mode="read"), + ngrams, + seed, + a._column.to_pylibcudf(mode="read"), + b._column.to_pylibcudf(mode="read"), + ) + result = ColumnBase.from_pylibcudf(plc_column) + return self._return_or_inplace(result) + + def minhash64_ngrams( + self, ngrams: int, seed: np.uint64, a: ColumnLike, b: ColumnLike + ) -> SeriesOrIndex: + """ + Compute the minhash of a list column of strings. + + This uses the MurmurHash3_x64_128 algorithm for the hash function. + + Calculation uses the formula (hv * a + b) % mersenne_prime + where hv is the hash of a ngrams of strings within each row, + a and b are provided values and mersenne_prime is 2^61-1. + + Parameters + ---------- + ngrams : int + Number of strings to hash within each row. 
+ seed : uint64 + The seed used for the hash algorithm. + a : ColumnLike + Values for minhash calculation. + Must be of type uint64. + b : ColumnLike + Values for minhash calculation. + Must be of type uint64. + + Examples + -------- + >>> import cudf + >>> import numpy as np + >>> s = cudf.Series([['this', 'is', 'my'], ['favorite', 'book']]) + >>> a = cudf.Series([2, 3], dtype=np.uint64) + >>> b = cudf.Series([5, 6], dtype=np.uint64) + >>> s.str.minhash64_ngrams(ngrams=2, seed=0, a=a, b=b) + 0 [1304293339825194559, 1956440009737791829] + 1 [472203876238918632, 1861227318965224922] + dtype: list + """ + a_column = column.as_column(a) + if a_column.dtype != np.uint64: + raise ValueError( + f"Expecting a Series with dtype uint64, got {type(a)}" + ) + b_column = column.as_column(b) + if b_column.dtype != np.uint64: + raise ValueError( + f"Expecting a Series with dtype uint64, got {type(b)}" + ) + plc_column = plc.nvtext.minhash.minhash64_ngrams( + self._column.to_pylibcudf(mode="read"), + ngrams, + seed, + a._column.to_pylibcudf(mode="read"), + b._column.to_pylibcudf(mode="read"), + ) + result = ColumnBase.from_pylibcudf(plc_column) + return self._return_or_inplace(result) + def jaccard_index(self, input: cudf.Series, width: int) -> SeriesOrIndex: """ Compute the Jaccard index between this column and the given diff --git a/python/cudf/cudf/tests/text/test_text_methods.py b/python/cudf/cudf/tests/text/test_text_methods.py index dc45827d2e8..47b41bd1e39 100644 --- a/python/cudf/cudf/tests/text/test_text_methods.py +++ b/python/cudf/cudf/tests/text/test_text_methods.py @@ -930,6 +930,48 @@ def test_minhash(): strings.str.minhash64(1, a=params, b=params, width=8) +def test_minhash_ngrams(): + strings = cudf.Series( + [["this", "is", "my"], ["favorite", "book", "today"]] + ) + + params = cudf.Series([1, 2, 3], dtype=np.uint32) + expected = cudf.Series( + [ + cudf.Series([416367548, 832735096, 1249102644], dtype=np.uint32), + cudf.Series([1408797893, 2817595786, 
4226393679], dtype=np.uint32), + ] + ) + actual = strings.str.minhash_ngrams(ngrams=2, seed=0, a=params, b=params) + assert_eq(expected, actual) + + params = cudf.Series([1, 2, 3], dtype=np.uint64) + expected = cudf.Series( + [ + cudf.Series( + [652146669912597278, 1304293339825194556, 1956440009737791826], + dtype=np.uint64, + ), + cudf.Series( + [1776622609581023632, 1247402209948353305, 718181810315682986], + dtype=np.uint64, + ), + ] + ) + actual = strings.str.minhash64_ngrams(ngrams=2, seed=0, a=params, b=params) + assert_eq(expected, actual) + + # test wrong input types + with pytest.raises(ValueError): + strings.str.minhash_ngrams(ngrams=7, seed=1, a="a", b="b") + with pytest.raises(ValueError): + params = cudf.Series([0, 1, 2], dtype=np.int32) + strings.str.minhash_ngrams(ngrams=6, seed=1, a=params, b=params) + with pytest.raises(ValueError): + params = cudf.Series([0, 1, 2], dtype=np.uint32) + strings.str.minhash64_ngrams(ngrams=8, seed=1, a=params, b=params) + + def test_jaccard_index(): str1 = cudf.Series(["the brown dog", "jumped about"]) str2 = cudf.Series(["the black cat", "jumped around"]) diff --git a/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd b/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd index 9d1e8cba425..bfbb99e8eb0 100644 --- a/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. +# Copyright (c) 2023-2025, NVIDIA CORPORATION. 
from libc.stdint cimport uint32_t, uint64_t from libcpp.memory cimport unique_ptr from pylibcudf.exception_handler cimport libcudf_exception_handler @@ -25,3 +25,19 @@ cdef extern from "nvtext/minhash.hpp" namespace "nvtext" nogil: const column_view &b, const size_type width, ) except + + + cdef unique_ptr[column] minhash_ngrams( + const column_view &strings, + const size_type ngrams, + const uint32_t seed, + const column_view &a, + const column_view &b, + ) except + + + cdef unique_ptr[column] minhash64_ngrams( + const column_view &strings, + const size_type ngrams, + const uint64_t seed, + const column_view &a, + const column_view &b, + ) except + diff --git a/python/pylibcudf/pylibcudf/nvtext/minhash.pxd b/python/pylibcudf/pylibcudf/nvtext/minhash.pxd index 0af53748cdc..f1e099ca7da 100644 --- a/python/pylibcudf/pylibcudf/nvtext/minhash.pxd +++ b/python/pylibcudf/pylibcudf/nvtext/minhash.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. from libc.stdint cimport uint32_t, uint64_t from pylibcudf.column cimport Column @@ -24,3 +24,19 @@ cpdef Column minhash64( Column b, size_type width ) + +cpdef Column minhash_ngrams( + Column input, + size_type width, + uint32_t seed, + Column a, + Column b +) + +cpdef Column minhash64_ngrams( + Column input, + size_type width, + uint64_t seed, + Column a, + Column b +) diff --git a/python/pylibcudf/pylibcudf/nvtext/minhash.pyi b/python/pylibcudf/pylibcudf/nvtext/minhash.pyi index 5d88cfbbea0..bb50a150798 100644 --- a/python/pylibcudf/pylibcudf/nvtext/minhash.pyi +++ b/python/pylibcudf/pylibcudf/nvtext/minhash.pyi @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. from pylibcudf.column import Column @@ -8,3 +8,9 @@ def minhash( def minhash64( input: Column, seed: int, a: Column, b: Column, width: int ) -> Column: ... 
+def minhash_ngrams( + input: Column, ngrams: int, seed: int, a: Column, b: Column +) -> Column: ... +def minhash64_ngrams( + input: Column, ngrams: int, seed: int, a: Column, b: Column +) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/nvtext/minhash.pyx b/python/pylibcudf/pylibcudf/nvtext/minhash.pyx index 84811cda867..cdc4a4f3ac8 100644 --- a/python/pylibcudf/pylibcudf/nvtext/minhash.pyx +++ b/python/pylibcudf/pylibcudf/nvtext/minhash.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. from libc.stdint cimport uint32_t, uint64_t from libcpp.memory cimport unique_ptr @@ -8,12 +8,16 @@ from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.nvtext.minhash cimport ( minhash as cpp_minhash, minhash64 as cpp_minhash64, + minhash_ngrams as cpp_minhash_ngrams, + minhash64_ngrams as cpp_minhash64_ngrams, ) from pylibcudf.libcudf.types cimport size_type __all__ = [ "minhash", "minhash64", + "minhash_ngrams", + "minhash64_ngrams", ] cpdef Column minhash( @@ -103,3 +107,93 @@ cpdef Column minhash64( ) return Column.from_libcudf(move(c_result)) + +cpdef Column minhash_ngrams( + Column input, + size_type ngrams, + uint32_t seed, + Column a, + Column b +): + """ + Returns the minhash values for each input row of strings. + This function uses MurmurHash3_x86_32 for the hash algorithm. + + For details, see :cpp:func:`minhash_ngrams`. + + Parameters + ---------- + input : Column + List column of strings to compute minhash + ngrams : size_type + Number of consecutive strings to hash in each row + seed : uint32_t + Seed used for the hash function + a : Column + 1st parameter value used for the minhash algorithm. + b : Column + 2nd parameter value used for the minhash algorithm. + + Returns + ------- + Column + List column of minhash values for each row per + value in columns a and b. 
+ """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = cpp_minhash_ngrams( + input.view(), + ngrams, + seed, + a.view(), + b.view() + ) + + return Column.from_libcudf(move(c_result)) + +cpdef Column minhash64_ngrams( + Column input, + size_type ngrams, + uint64_t seed, + Column a, + Column b +): + """ + Returns the minhash values for each input row of strings. + This function uses MurmurHash3_x64_128 for the hash algorithm. + + For details, see :cpp:func:`minhash64_ngrams`. + + Parameters + ---------- + input : Column + Strings column to compute minhash + ngrams : size_type + Number of consecutive strings to hash in each row + seed : uint64_t + Seed used for the hash function + a : Column + 1st parameter value used for the minhash algorithm. + b : Column + 2nd parameter value used for the minhash algorithm. + + Returns + ------- + Column + List column of minhash values for each row per + value in columns a and b. + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = cpp_minhash64_ngrams( + input.view(), + ngrams, + seed, + a.view(), + b.view() + ) + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_minhash.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_minhash.py index ad7a6f7a762..ff8545f0617 100644 --- a/python/pylibcudf/pylibcudf/tests/test_nvtext_minhash.py +++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_minhash.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. 
import pyarrow as pa import pytest @@ -33,3 +33,49 @@ def test_minhash(minhash_input_data, width): assert pa_result.type == pa.list_( pa.field("element", seed_type, nullable=False) ) + + +@pytest.fixture(scope="module", params=[pa.uint32(), pa.uint64()]) +def minhash_ngrams_input_data(request): + input_arr = pa.array( + [ + ["foo", "bar", "foo foo", "bar bar", "foo bar", "bar foo"], + [ + "one", + "two", + "three", + "four", + "five", + "six", + "seven", + "eight", + "nine", + "ten", + "eleven", + ], + ] + ) + ab = pa.array([2, 3, 4, 5], request.param) + return input_arr, ab, request.param + + +@pytest.mark.parametrize("ngrams", [5, 10]) +def test_minhash_ngrams(minhash_ngrams_input_data, ngrams): + input_arr, ab, seed_type = minhash_ngrams_input_data + minhash_func = ( + plc.nvtext.minhash.minhash_ngrams + if seed_type == pa.uint32() + else plc.nvtext.minhash.minhash64_ngrams + ) + result = minhash_func( + plc.interop.from_arrow(input_arr), + ngrams, + 0, + plc.interop.from_arrow(ab), + plc.interop.from_arrow(ab), + ) + pa_result = plc.interop.to_arrow(result) + assert all(len(got) == len(ab) for got, s in zip(pa_result, input_arr)) + assert pa_result.type == pa.list_( + pa.field("element", seed_type, nullable=False) + ) From cf8938bc6b11de35337f6d4a04c73559420f3f4b Mon Sep 17 00:00:00 2001 From: Matthew Murray <41342305+Matt711@users.noreply.github.com> Date: Thu, 27 Feb 2025 16:42:46 -0500 Subject: [PATCH 24/42] Add a list of expected failures to narwhals tests (#18097) ## Description Adds an xfail list to the narwhals tests we run using cudf. Note: We can update/replace the dict when running Narwhals with cudf.pandas. xref #18031 ## Checklist - [ ] I am familiar with the [Contributing Guidelines](https://github.com/rapidsai/cudf/blob/HEAD/CONTRIBUTING.md). - [ ] New or existing tests cover these changes. - [ ] The documentation is up to date with these changes. 
--------- Co-authored-by: Vyas Ramasubramani Co-authored-by: GALI PREM SAGAR Co-authored-by: Bradley Dice --- ci/test_narwhals.sh | 1 + docs/cudf/source/conf.py | 1 + python/cudf/cudf/testing/__init__.py | 3 ++- .../cudf/cudf/testing/narwhals_test_plugin.py | 25 +++++++++++++++++++ 4 files changed, 29 insertions(+), 1 deletion(-) create mode 100644 python/cudf/cudf/testing/narwhals_test_plugin.py diff --git a/ci/test_narwhals.sh b/ci/test_narwhals.sh index 4a32ff0b0fd..28eceff2f80 100755 --- a/ci/test_narwhals.sh +++ b/ci/test_narwhals.sh @@ -26,6 +26,7 @@ rapids-logger "Run narwhals tests for cuDF" python -m pytest \ --cache-clear \ --junitxml="${RAPIDS_TESTS_DIR}/junit-cudf-narwhals.xml" \ + -p cudf.testing.narwhals_test_plugin \ --numprocesses=8 \ --dist=worksteal \ --constructors=cudf diff --git a/docs/cudf/source/conf.py b/docs/cudf/source/conf.py index 8eea644363b..92b37c4b3f2 100644 --- a/docs/cudf/source/conf.py +++ b/docs/cudf/source/conf.py @@ -207,6 +207,7 @@ def clean_all_xml_files(path): exclude_patterns = [ "venv", "**/includes/**", + "narwhals_test_plugin", ] # The name of the Pygments (syntax highlighting) style to use. diff --git a/python/cudf/cudf/testing/__init__.py b/python/cudf/cudf/testing/__init__.py index 4e92b43b9f9..a4afa54f754 100644 --- a/python/cudf/cudf/testing/__init__.py +++ b/python/cudf/cudf/testing/__init__.py @@ -1,5 +1,6 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. +from cudf.testing import narwhals_test_plugin from cudf.testing.testing import ( assert_eq, assert_frame_equal, diff --git a/python/cudf/cudf/testing/narwhals_test_plugin.py b/python/cudf/cudf/testing/narwhals_test_plugin.py new file mode 100644 index 00000000000..d794bd0120a --- /dev/null +++ b/python/cudf/cudf/testing/narwhals_test_plugin.py @@ -0,0 +1,25 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. 
+# SPDX-License-Identifier: Apache-2.0 + +"""Plugin for running narwhals test suite with cudf.""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from collections.abc import Mapping + +EXPECTED_FAILURES: Mapping[str, str] = { + "tests/frame/select_test.py::test_select_duplicates[cudf]": "cuDF doesn't support having multiple columns with same names", +} + + +def pytest_collection_modifyitems(session, config, items) -> None: + """Mark known failing tests.""" + import pytest + + for item in items: + if item.nodeid in EXPECTED_FAILURES: + exp_val = EXPECTED_FAILURES[item.nodeid] + item.add_marker(pytest.mark.xfail(reason=exp_val)) From 83a29ce1e99221436e6d7a8ac06d87ee0982bf20 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Fri, 28 Feb 2025 14:41:49 +0000 Subject: [PATCH 25/42] Minor improvements in arrow interop (#18053) When ingesting data from an arrow stream, if the stream contains only a single chunk we can avoid the concatenation. Additionally, explicitly raise exceptions if the arrow-side column length would exceed cudf column size limits. Authors: - Lawrence Mitchell (https://github.com/wence-) Approvers: - Devavret Makkar (https://github.com/devavret) - David Wendt (https://github.com/davidwendt) - Bradley Dice (https://github.com/bdice) - Basit Ayantunde (https://github.com/lamarrr) URL: https://github.com/rapidsai/cudf/pull/18053 --- cpp/include/cudf/interop.hpp | 12 +++++++++++- cpp/src/interop/from_arrow_device.cu | 9 +++++++++ cpp/src/interop/from_arrow_host.cu | 9 +++++++++ cpp/src/interop/from_arrow_stream.cu | 3 ++- 4 files changed, 31 insertions(+), 2 deletions(-) diff --git a/cpp/include/cudf/interop.hpp b/cpp/include/cudf/interop.hpp index 810f0377597..276a1ea77e2 100644 --- a/cpp/include/cudf/interop.hpp +++ b/cpp/include/cudf/interop.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * Copyright (c) 2020-2025, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -327,6 +327,8 @@ unique_device_array_t to_arrow_host( * * @throws cudf::data_type_error if the input array is not a struct array. * + * @throws std::overflow_error if the input arrow object exceeds the column size limit. + * * The conversion will not call release on the input Array. * * @param schema `ArrowSchema` pointer to describe the type of the data @@ -367,6 +369,8 @@ std::unique_ptr from_arrow_column( * * @throws std::invalid_argument if the device_type is not `ARROW_DEVICE_CPU` * + * @throws std::overflow_error if the input arrow object exceeds the column size limit. + * * @throws cudf::data_type_error if the input array is not a struct array, * non-struct arrays should be passed to `from_arrow_host_column` instead. * @@ -411,6 +415,8 @@ std::unique_ptr from_arrow_stream( * * @throws cudf::data_type_error if input arrow data type is not supported in cudf. * + * @throws std::overflow_error if the input arrow object exceeds the column size limit. + * * The conversion will not call release on the input Array. * * @param schema `ArrowSchema` pointer to describe the type of the data @@ -483,6 +489,8 @@ using unique_table_view_t = * * @throws cudf::data_type_error if the input arrow data type is not supported. * + * @throws std::overflow_error if the input arrow object exceeds the column size limit. + * * Each child of the input struct will be the columns of the resulting table_view. * * @note The custom deleter used for the unique_ptr to the table_view maintains ownership @@ -528,6 +536,8 @@ using unique_column_view_t = * * @throws cudf::data_type_error input arrow data type is not supported. * + * @throws std::overflow_error if the input arrow object exceeds the column size limit. 
+ * * @note The custom deleter used for the unique_ptr to the table_view maintains ownership * over any memory which is allocated, such as converting boolean columns from the bitmap * used by Arrow to the 1-byte per value for cudf. diff --git a/cpp/src/interop/from_arrow_device.cu b/cpp/src/interop/from_arrow_device.cu index 29c4dfd35ac..836da2987e2 100644 --- a/cpp/src/interop/from_arrow_device.cu +++ b/cpp/src/interop/from_arrow_device.cu @@ -40,6 +40,10 @@ #include #include +#include +#include +#include + namespace cudf { namespace detail { @@ -317,6 +321,11 @@ dispatch_tuple_t get_column(ArrowSchemaView* schema, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { + CUDF_EXPECTS( + input->length <= static_cast(std::numeric_limits::max()), + "Total number of rows in Arrow column exceeds the column size limit.", + std::overflow_error); + return type.id() != type_id::EMPTY ? std::move(type_dispatcher( type, dispatch_from_arrow_device{}, schema, input, type, skip_mask, stream, mr)) diff --git a/cpp/src/interop/from_arrow_host.cu b/cpp/src/interop/from_arrow_host.cu index ea5487a2960..0be1557faaf 100644 --- a/cpp/src/interop/from_arrow_host.cu +++ b/cpp/src/interop/from_arrow_host.cu @@ -43,6 +43,10 @@ #include #include +#include +#include +#include + namespace cudf { namespace detail { @@ -381,6 +385,11 @@ std::unique_ptr get_column_copy(ArrowSchemaView* schema, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { + CUDF_EXPECTS( + input->length <= static_cast(std::numeric_limits::max()), + "Total number of rows in Arrow column exceeds the column size limit.", + std::overflow_error); + return type.id() != type_id::EMPTY ? 
std::move(type_dispatcher( type, dispatch_copy_from_arrow_host{stream, mr}, schema, input, type, skip_mask)) diff --git a/cpp/src/interop/from_arrow_stream.cu b/cpp/src/interop/from_arrow_stream.cu index deff62be576..ce1db96ca43 100644 --- a/cpp/src/interop/from_arrow_stream.cu +++ b/cpp/src/interop/from_arrow_stream.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -121,6 +121,7 @@ std::unique_ptr
from_arrow_stream(ArrowArrayStream* input, schema.release(&schema); + if (chunks.size() == 1) { return std::move(chunks[0]); } auto chunk_views = std::vector{}; chunk_views.reserve(chunks.size()); std::transform( From 09ebf31011f27d343c32ef406b90c3ecc12b0107 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 28 Feb 2025 16:36:46 -0800 Subject: [PATCH 26/42] Use protocol for dlpack instead of deprecated function (#18134) This PR adapts cudf's dlpack tests for compatibility with cupy 13.4, which was just released yesterday on PyPI and containers https://github.com/cupy/cupy/pull/8722 that breaks the legacy toDlpack functionality. --- python/cudf/cudf/core/df_protocol.py | 2 +- python/cudf/cudf/core/subword_tokenizer.py | 2 +- python/cudf/cudf/tests/test_dlpack.py | 6 +++--- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py index cc9f39d70ef..5f2dfe98a3e 100644 --- a/python/cudf/cudf/core/df_protocol.py +++ b/python/cudf/cudf/core/df_protocol.py @@ -105,7 +105,7 @@ def __dlpack__(self): # DLPack not implemented in NumPy yet, so leave it out here. 
try: cuda_array = as_cuda_array(self._buf).view(self._dtype) - return cp.asarray(cuda_array).toDlpack() + return cp.asarray(cuda_array).__dlpack__() except ValueError: raise TypeError(f"dtype {self._dtype} unsupported by `dlpack`") diff --git a/python/cudf/cudf/core/subword_tokenizer.py b/python/cudf/cudf/core/subword_tokenizer.py index 50d1a11c39b..24e6aa40de0 100644 --- a/python/cudf/cudf/core/subword_tokenizer.py +++ b/python/cudf/cudf/core/subword_tokenizer.py @@ -19,7 +19,7 @@ def _cast_to_appropriate_type(ar, cast_type): elif cast_type == "tf": from tensorflow.experimental.dlpack import from_dlpack - return from_dlpack(ar.astype("int32").toDlpack()) + return from_dlpack(ar.astype("int32").__dlpack__()) class SubwordTokenizer: diff --git a/python/cudf/cudf/tests/test_dlpack.py b/python/cudf/cudf/tests/test_dlpack.py index 20c24bd7564..187a5524e8e 100644 --- a/python/cudf/cudf/tests/test_dlpack.py +++ b/python/cudf/cudf/tests/test_dlpack.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. +# Copyright (c) 2019-2025, NVIDIA CORPORATION. 
import itertools from contextlib import ExitStack as does_not_raise @@ -140,7 +140,7 @@ def test_to_dlpack_cupy_2d(data_2d): def test_from_dlpack_cupy_1d(data_1d): cupy_array = cupy.array(data_1d) cupy_host_array = cupy_array.get() - dlt = cupy_array.toDlpack() + dlt = cupy_array.__dlpack__() gs = cudf.from_dlpack(dlt) cudf_host_array = gs.to_numpy(na_value=np.nan) @@ -151,7 +151,7 @@ def test_from_dlpack_cupy_1d(data_1d): def test_from_dlpack_cupy_2d(data_2d): cupy_array = cupy.array(data_2d, order="F") cupy_host_array = cupy_array.get().flatten() - dlt = cupy_array.toDlpack() + dlt = cupy_array.__dlpack__() gdf = cudf.from_dlpack(dlt) cudf_host_array = np.array(gdf.to_pandas()).flatten() From 0cf66982df885513921372f0dcbcc32b6d4cd243 Mon Sep 17 00:00:00 2001 From: Tianyu Liu Date: Mon, 3 Mar 2025 12:03:39 -0500 Subject: [PATCH 27/42] Update calls to KvikIO's config setter (#18144) ## Description KvikIO has changed the function names of the config setters to improve clarity (https://github.com/rapidsai/kvikio/pull/644). This PR updates the setter calls in cuDF accordingly. ## Checklist - [x] I am familiar with the [Contributing Guidelines](https://github.com/rapidsai/cudf/blob/HEAD/CONTRIBUTING.md). - [x] New or existing tests cover these changes. - [x] The documentation is up to date with these changes. 
--- cpp/src/io/utilities/config_utils.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/io/utilities/config_utils.cpp b/cpp/src/io/utilities/config_utils.cpp index 46816604918..fa6f04eed73 100644 --- a/cpp/src/io/utilities/config_utils.cpp +++ b/cpp/src/io/utilities/config_utils.cpp @@ -36,10 +36,10 @@ void set_up_kvikio() cudaFree(nullptr); auto const compat_mode = kvikio::getenv_or("KVIKIO_COMPAT_MODE", kvikio::CompatMode::ON); - kvikio::defaults::compat_mode_reset(compat_mode); + kvikio::defaults::set_compat_mode(compat_mode); auto const nthreads = getenv_or("KVIKIO_NTHREADS", 4u); - kvikio::defaults::thread_pool_nthreads_reset(nthreads); + kvikio::defaults::set_thread_pool_nthreads(nthreads); }); } From 1c0ea5e7f7968fbeb6852a533df30795ad754b2b Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Mon, 3 Mar 2025 11:18:37 -0800 Subject: [PATCH 28/42] Reduce memory use when writing tables with very short columns to ORC (#18136) Closes #18059 To avoid estimating the maximum compressed size for each actual block in the file, ORC writer uses the estimate for the (uncompressed) block size limit, which defaults to 256KB. However, when we write many small blocks, this compressed block size estimate is much larger than what is needed, leading to high memory use for wide/short tables. This PR adds logic to take the actual block size into account, and to use the size of the actual largest block in the file, not the largest possible block. This changes the memory usage by orders of magnitude in some tests. 
--------- Co-authored-by: Bradley Dice --- cpp/src/io/orc/writer_impl.cu | 20 +++++++++++++++++++- cpp/src/utilities/host_memory.cpp | 1 + cpp/tests/CMakeLists.txt | 4 ++-- 3 files changed, 22 insertions(+), 3 deletions(-) diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu index 3a20ffbce19..217aff48d5e 100644 --- a/cpp/src/io/orc/writer_impl.cu +++ b/cpp/src/io/orc/writer_impl.cu @@ -2226,6 +2226,22 @@ stripe_dictionaries build_dictionaries(orc_table_view& orc_table, std::move(dict_order_owner)}; } +[[nodiscard]] uint32_t find_largest_stream_size(device_2dspan ss, + rmm::cuda_stream_view stream) +{ + auto const longest_stream = thrust::max_element( + rmm::exec_policy(stream), + ss.data(), + ss.data() + ss.count(), + cuda::proclaim_return_type([] __device__(auto const& lhs, auto const& rhs) { + return lhs.stream_size < rhs.stream_size; + })); + + auto const h_longest_stream = cudf::detail::make_host_vector_sync( + device_span{longest_stream, 1}, stream); + return h_longest_stream[0].stream_size; +} + /** * @brief Perform the processing steps needed to convert the input table into the output ORC data * for writing, such as compression and ORC encoding. 
@@ -2319,7 +2335,9 @@ auto convert_table_to_orc_data(table_view const& input, size_t compressed_bfr_size = 0; size_t num_compressed_blocks = 0; - auto const max_compressed_block_size = max_compressed_size(compression, compression_blocksize); + auto const largest_stream_size = find_largest_stream_size(strm_descs, stream); + auto const max_compressed_block_size = + max_compressed_size(compression, std::min(largest_stream_size, compression_blocksize)); auto const padded_max_compressed_block_size = util::round_up_unsafe(max_compressed_block_size, block_align); auto const padded_block_header_size = diff --git a/cpp/src/utilities/host_memory.cpp b/cpp/src/utilities/host_memory.cpp index 94d27d976c3..e41d772a479 100644 --- a/cpp/src/utilities/host_memory.cpp +++ b/cpp/src/utilities/host_memory.cpp @@ -29,6 +29,7 @@ namespace cudf { namespace { + class fixed_pinned_pool_memory_resource { using upstream_mr = rmm::mr::pinned_host_memory_resource; using host_pooled_mr = rmm::mr::pool_memory_resource; diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index cfc6a0dc425..e3ca8b70b87 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -309,7 +309,7 @@ ConfigureTest( ConfigureTest( ORC_TEST io/orc_chunked_reader_test.cu io/orc_test.cpp GPUS 1 - PERCENT 30 + PERCENT 100 ) ConfigureTest( PARQUET_TEST @@ -340,7 +340,7 @@ ConfigureTest(JSON_TREE_CSR io/json/json_tree_csr.cu) ConfigureTest( DATA_CHUNK_SOURCE_TEST io/text/data_chunk_source_test.cpp GPUS 1 - PERCENT 30 + PERCENT 100 ) target_link_libraries(DATA_CHUNK_SOURCE_TEST PRIVATE ZLIB::ZLIB) ConfigureTest(LOGICAL_STACK_TEST io/fst/logical_stack_test.cu) From 34235f4ebacd5982aad4c42d6886706761ac862c Mon Sep 17 00:00:00 2001 From: Matthew Murray <41342305+Matt711@users.noreply.github.com> Date: Mon, 3 Mar 2025 17:06:30 -0500 Subject: [PATCH 29/42] Use protocol for dlpack instead of deprecated function in cupy notebook (#18147) Follow up to #18134 --- docs/cudf/source/user_guide/cupy-interop.ipynb | 
4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/cudf/source/user_guide/cupy-interop.ipynb b/docs/cudf/source/user_guide/cupy-interop.ipynb index 112f0bcfca6..93e62d90c0f 100644 --- a/docs/cudf/source/user_guide/cupy-interop.ipynb +++ b/docs/cudf/source/user_guide/cupy-interop.ipynb @@ -566,7 +566,7 @@ "%%timeit\n", "\n", "fortran_arr = cp.asfortranarray(reshaped_arr)\n", - "reshaped_df = cudf.from_dlpack(fortran_arr.toDlpack())" + "reshaped_df = cudf.from_dlpack(fortran_arr.__dlpack__())" ] }, { @@ -1418,7 +1418,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.12" + "version": "3.12.9" } }, "nbformat": 4, From b6a6d390f92080481606e91f40450cc4e140fa97 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 3 Mar 2025 14:22:56 -0800 Subject: [PATCH 30/42] Skip failing test (#18146) This test is failing in multiple places right now, such as [this run](https://github.com/rapidsai/cudf/actions/runs/13595690128/job/38014725800) on https://github.com/rapidsai/cudf/pull/18133 and [this run](https://github.com/rapidsai/cudf/actions/runs/13636334843/job/38118996773?pr=18136) on https://github.com/rapidsai/cudf/pull/18136. Let's skip it until we can debug why so that we unblock other CI. --------- Co-authored-by: Peter Andreas Entschev --- ci/run_cudf_polars_pytests.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ci/run_cudf_polars_pytests.sh b/ci/run_cudf_polars_pytests.sh index e881055e9e3..5a1d5f56bf0 100755 --- a/ci/run_cudf_polars_pytests.sh +++ b/ci/run_cudf_polars_pytests.sh @@ -17,5 +17,5 @@ python -m pytest --cache-clear "$@" tests --executor dask-experimental # Test the "dask-experimental" executor with Distributed cluster # Not all tests pass yet, deselecting by name those that are failing. 
python -m pytest --cache-clear "$@" tests --executor dask-experimental --dask-cluster \ - -k "not test_groupby_maintain_order_random and not test_scan_csv_multi and not test_select_literal_series" \ - --cov-fail-under=89 # Override coverage, Distributed cluster coverage not yet 100% + -k "not test_groupby_maintain_order_random and not test_scan_csv_multi and not test_select_literal_series and not test_can_convert_lists and not test_executor_basics and not test_replace_literal and not test_hconcat_different_heights and not test_join and not test_dataframescan and not test_strip_chars" \ + --cov-fail-under=80 # Override coverage, Distributed cluster coverage not yet 100% From 93d98af8450d466705062ca23f58f6082fca3e98 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Mon, 3 Mar 2025 19:02:23 -0500 Subject: [PATCH 31/42] Optimization improvement for substr in cudf::string_view (#18062) Slight optimization improvement sets the character count in the `cudf::string_view` produced by `cudf::string_view::substr` when the number of output characters is known. This can save redundant character counting in downstream usage of the new string. 
Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Devavret Makkar (https://github.com/devavret) - Shruti Shivakumar (https://github.com/shrshi) URL: https://github.com/rapidsai/cudf/pull/18062 --- cpp/include/cudf/strings/string_view.cuh | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/cpp/include/cudf/strings/string_view.cuh b/cpp/include/cudf/strings/string_view.cuh index b91748cfc7d..15539c50da9 100644 --- a/cpp/include/cudf/strings/string_view.cuh +++ b/cpp/include/cudf/strings/string_view.cuh @@ -443,10 +443,12 @@ __device__ inline size_type string_view::rfind(char_utf8 chr, size_type pos, siz __device__ inline string_view string_view::substr(size_type pos, size_type count) const { if (pos < 0 || pos >= length()) { return string_view{}; } - auto const itr = begin() + pos; - auto const spos = itr.byte_offset(); - auto const epos = count >= 0 ? (itr + count).byte_offset() : size_bytes(); - return {data() + spos, epos - spos}; + auto const spos = begin() + pos; + auto const epos = count >= 0 ? 
(spos + count) : const_iterator{*this, _length, size_bytes()}; + auto ss = string_view{data() + spos.byte_offset(), epos.byte_offset() - spos.byte_offset()}; + // this potentially saves redundant character counting downstream + if (_length != UNKNOWN_STRING_LENGTH) { ss._length = epos.position() - spos.position(); } + return ss; } __device__ inline size_type string_view::character_offset(size_type bytepos) const From 08f536a602d288f3c31abf7f2a22a8538b13f62d Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 3 Mar 2025 19:33:19 -0800 Subject: [PATCH 32/42] Preserve DataFrame.column subclass and type during binop (#18113) closes https://github.com/rapidsai/cudf/issues/11148 Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/18113 --- python/cudf/cudf/core/dataframe.py | 41 ++++++++++++++++-------- python/cudf/cudf/core/indexed_frame.py | 10 ++---- python/cudf/cudf/core/series.py | 15 ++++----- python/cudf/cudf/tests/test_dataframe.py | 15 +++++++++ 4 files changed, 53 insertions(+), 28 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 69db055fe87..3cc42dbe982 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -2055,18 +2055,28 @@ def _make_operands_and_index_for_binop( dict[str | None, tuple[ColumnBase, Any, bool, Any]] | NotImplementedType, BaseIndex | None, - bool, + dict[str, Any], ]: lhs, rhs = self._data, other index = self.index fill_requires_key = False left_default: Any = False equal_columns = False - can_use_self_column_name = True + ca_attributes: dict[str, Any] = {} + + def _fill_same_ca_attributes( + attrs: dict[str, Any], ca: ColumnAccessor + ) -> dict[str, Any]: + attrs["rangeindex"] = ca.rangeindex + attrs["multiindex"] = ca.multiindex + attrs["label_dtype"] = ca.label_dtype + attrs["level_names"] 
= ca.level_names + return attrs if _is_scalar_or_zero_d_array(other): rhs = {name: other for name in self._data} equal_columns = True + ca_attributes = _fill_same_ca_attributes(ca_attributes, self._data) elif isinstance(other, Series): if ( not (self_pd_columns := self._data.to_pandas_index).equals( @@ -2085,9 +2095,12 @@ def _make_operands_and_index_for_binop( # NULL!) and the right value (result is NaN). left_default = as_column(np.nan, length=len(self)) equal_columns = other_pd_index.equals(self_pd_columns) - can_use_self_column_name = ( - equal_columns or other_pd_index.names == self_pd_columns.names - ) + if equal_columns: + ca_attributes = _fill_same_ca_attributes( + ca_attributes, self._data + ) + elif other_pd_index.names == self_pd_columns.names: + ca_attributes["level_names"] = self._data.level_names elif isinstance(other, DataFrame): if ( not can_reindex @@ -2110,17 +2123,19 @@ def _make_operands_and_index_for_binop( # the fill value. left_default = fill_value equal_columns = self._column_names == other._column_names - can_use_self_column_name = ( - equal_columns - or self._data._level_names == other._data._level_names - ) + if self._data.to_pandas_index.equals(other._data.to_pandas_index): + ca_attributes = _fill_same_ca_attributes( + ca_attributes, self._data + ) + elif self._data._level_names == other._data._level_names: + ca_attributes["level_names"] = self._data.level_names elif isinstance(other, (dict, abc.Mapping)): # Need to fail early on host mapping types because we ultimately # convert everything to a dict. 
- return NotImplemented, None, True + return NotImplemented, None, ca_attributes if not isinstance(rhs, (dict, abc.Mapping)): - return NotImplemented, None, True + return NotImplemented, None, ca_attributes operands = { k: ( @@ -2150,8 +2165,8 @@ def _make_operands_and_index_for_binop( raise ValueError("other must be a DataFrame or Series.") sorted_dict = {key: operands[key] for key in column_names_list} - return sorted_dict, index, can_use_self_column_name - return operands, index, can_use_self_column_name + return sorted_dict, index, ca_attributes + return operands, index, ca_attributes @classmethod @_performance_tracking diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 9d426ad6bf7..8a625dc9225 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -4888,20 +4888,16 @@ def _binaryop( ( operands, out_index, - can_use_self_column_name, + ca_attributes, ) = self._make_operands_and_index_for_binop( other, op, fill_value, reflect, can_reindex ) if operands is NotImplemented: return NotImplemented - - level_names = ( - self._data._level_names if can_use_self_column_name else None - ) return self._from_data( ColumnAccessor( type(self)._colwise_binop(operands, op), - level_names=level_names, + **ca_attributes, ), index=out_index, ) @@ -4917,7 +4913,7 @@ def _make_operands_and_index_for_binop( dict[str | None, tuple[ColumnBase, Any, bool, Any]] | NotImplementedType, cudf.BaseIndex | None, - bool, + dict[str, Any], ]: raise NotImplementedError( f"Binary operations are not supported for {self.__class__}" diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index f6f1b31dc43..d25550553b1 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -1531,7 +1531,7 @@ def _make_operands_and_index_for_binop( dict[str | None, tuple[ColumnBase, Any, bool, Any]] | NotImplementedType, BaseIndex | None, - bool, + dict[str, Any], ]: # 
Specialize binops to align indices. if isinstance(other, Series): @@ -1547,15 +1547,14 @@ def _make_operands_and_index_for_binop( else: lhs = self - try: - can_use_self_column_name = cudf.utils.utils._is_same_name( - self.name, other.name - ) - except AttributeError: - can_use_self_column_name = False + ca_attributes = {} + if hasattr(other, "name") and cudf.utils.utils._is_same_name( + self.name, other.name + ): + ca_attributes["level_names"] = self._data._level_names operands = lhs._make_operands_for_binop(other, fill_value, reflect) - return operands, lhs.index, can_use_self_column_name + return operands, lhs.index, ca_attributes @copy_docstring(CategoricalAccessor) # type: ignore @property diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 15c11db5a84..d6bbbf601be 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -11083,6 +11083,21 @@ def test_dataframe_columns_set_preserve_type(klass): pd.testing.assert_index_equal(result, expected) +@pytest.mark.parametrize( + "expected", + [ + pd.RangeIndex(1, 2, name="a"), + pd.Index([1], dtype=np.int8, name="a"), + pd.MultiIndex.from_arrays([[1]], names=["a"]), + ], +) +@pytest.mark.parametrize("binop", [lambda df: df == df, lambda df: df - 1]) +def test_dataframe_binop_preserves_column_metadata(expected, binop): + df = cudf.DataFrame([1], columns=expected) + result = binop(df).columns + pd.testing.assert_index_equal(result, expected, exact=True) + + @pytest.mark.parametrize( "scalar", [ From 43bbd7f0fcafd0f29db80f9b57913f8c63e74fd9 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 3 Mar 2025 19:44:05 -0800 Subject: [PATCH 33/42] Remove some unnecessary module imports (#18143) Noticed while working on https://github.com/rapidsai/cudf/pull/18141. 
Also made some imports more specific to make it easier to see what we need Authors: - Matthew Roeschke (https://github.com/mroeschke) - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/18143 --- python/cudf/cudf/core/column/methods.py | 2 -- python/cudf/cudf/core/column/string.py | 1 - python/cudf/cudf/core/dataframe.py | 1 - python/cudf/cudf/core/indexed_frame.py | 4 ++-- python/cudf/cudf/core/udf/groupby_utils.py | 8 +++----- python/cudf/cudf/utils/utils.py | 1 - 6 files changed, 5 insertions(+), 12 deletions(-) diff --git a/python/cudf/cudf/core/column/methods.py b/python/cudf/cudf/core/column/methods.py index b42e4419d72..e545bb4bc5e 100644 --- a/python/cudf/cudf/core/column/methods.py +++ b/python/cudf/cudf/core/column/methods.py @@ -5,8 +5,6 @@ from typing import Literal, Union, overload import cudf -import cudf.core.column -import cudf.core.column_accessor from cudf.utils.utils import NotIterable ParentType = Union["cudf.Series", "cudf.core.index.Index"] diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index b82ec1958fb..97ec41f4c39 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -16,7 +16,6 @@ import pylibcudf as plc import cudf -import cudf.api.types import cudf.core.column.column as column import cudf.core.column.datetime as datetime from cudf.api.types import is_integer, is_scalar, is_string_dtype diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 3cc42dbe982..f909d72687c 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -35,7 +35,6 @@ import pylibcudf as plc import cudf -import cudf.core.common from cudf.api.extensions import no_default from cudf.api.types import ( _is_scalar_or_zero_d_array, diff --git a/python/cudf/cudf/core/indexed_frame.py 
b/python/cudf/cudf/core/indexed_frame.py index 8a625dc9225..2f4ad360d8b 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -26,8 +26,8 @@ import pylibcudf as plc import cudf -import cudf.core import cudf.core.algorithms +import cudf.core.common from cudf.api.extensions import no_default from cudf.api.types import ( _is_non_decimal_numeric_dtype, @@ -3908,7 +3908,7 @@ def _reindex( } result = self.__class__._from_data( - data=cudf.core.column_accessor.ColumnAccessor( + data=ColumnAccessor( cols, multiindex=multiindex, level_names=level_names, diff --git a/python/cudf/cudf/core/udf/groupby_utils.py b/python/cudf/cudf/core/udf/groupby_utils.py index 814d3e9fc85..943b6ebfd1c 100644 --- a/python/cudf/cudf/core/udf/groupby_utils.py +++ b/python/cudf/cudf/core/udf/groupby_utils.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022-2024, NVIDIA CORPORATION. +# Copyright (c) 2022-2025, NVIDIA CORPORATION. import cupy as cp @@ -8,7 +8,7 @@ from numba.cuda.cudadrv.devices import get_context from numba.np import numpy_support -import cudf.core.udf.utils +from cudf.core.column import column_empty from cudf.core.udf.groupby_typing import ( SUPPORTED_GROUPBY_NUMPY_TYPES, Group, @@ -154,9 +154,7 @@ def jit_groupby_apply(offsets, grouped_values, function, *args): offsets = cp.asarray(offsets) ngroups = len(offsets) - 1 - output = cudf.core.column.column_empty( - ngroups, dtype=return_type, for_numba=True - ) + output = column_empty(ngroups, dtype=return_type, for_numba=True) launch_args = [ offsets, output, diff --git a/python/cudf/cudf/utils/utils.py b/python/cudf/cudf/utils/utils.py index 2678a4f8116..601a7a369e8 100644 --- a/python/cudf/cudf/utils/utils.py +++ b/python/cudf/cudf/utils/utils.py @@ -15,7 +15,6 @@ import rmm import cudf -import cudf.api.types from cudf.core import column from cudf.core.buffer import as_buffer from cudf.utils.dtypes import SIZE_TYPE_DTYPE From 3636040c366c0af2a6bd95e9beff167665a45b86 Mon Sep 17 00:00:00 2001 From: 
Michael Schellenberger Costa Date: Tue, 4 Mar 2025 05:14:19 +0100 Subject: [PATCH 34/42] Replace more deprecated `CUB` functors (#18119) They will be removed in a future CCCL release Authors: - Michael Schellenberger Costa (https://github.com/miscco) - Bradley Dice (https://github.com/bdice) - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - David Wendt (https://github.com/davidwendt) - Nghia Truong (https://github.com/ttnghia) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/18119 --- cpp/benchmarks/common/generate_input.cu | 2 +- .../cudf/detail/utilities/functional.hpp | 31 +++++++++++++++++++ .../cudf/table/experimental/row_operators.cuh | 7 +++-- cpp/src/binaryop/compiled/binary_ops.cu | 7 +++-- cpp/src/filling/repeat.cu | 5 +-- cpp/src/groupby/sort/group_rank_scan.cu | 5 +-- cpp/src/groupby/sort/group_replace_nulls.cu | 5 +-- cpp/src/groupby/sort/group_scan_util.cuh | 7 +++-- .../sort/group_single_pass_reduction_util.cuh | 8 ++--- cpp/src/io/avro/reader_impl.cu | 7 +++-- cpp/src/io/comp/nvcomp_adapter.cu | 5 +-- cpp/src/io/fst/logical_stack.cuh | 5 +-- cpp/src/io/json/column_tree_construction.cu | 5 +-- cpp/src/io/json/host_tree_algorithms.cu | 5 +-- cpp/src/io/json/json_column.cu | 9 +++--- cpp/src/io/json/json_tree.cu | 7 +++-- cpp/src/io/json/write_json.cu | 4 +-- cpp/src/io/orc/stripe_data.cu | 4 ++- cpp/src/io/orc/stripe_enc.cu | 10 +++--- cpp/src/io/parquet/delta_enc.cuh | 6 ++-- cpp/src/io/parquet/page_string_decode.cu | 5 ++- cpp/src/io/parquet/reader_impl_chunking.cu | 6 ++-- cpp/src/io/parquet/reader_impl_preprocess.cu | 3 +- .../io/statistics/typed_statistics_chunk.cuh | 12 ++++--- cpp/src/io/utilities/data_casting.cu | 3 +- cpp/src/lists/set_operations.cu | 7 +++-- cpp/src/quantiles/tdigest/tdigest.cu | 5 +-- .../quantiles/tdigest/tdigest_aggregation.cu | 11 ++++--- cpp/src/reductions/segmented/simple.cuh | 5 +-- .../rolling/detail/rolling_collect_list.cu | 6 ++-- cpp/src/sort/rank.cu | 7 +++-- 
cpp/src/strings/split/split.cu | 7 +++-- cpp/src/strings/split/split_re.cu | 5 +-- cpp/src/text/bpe/byte_pair_encoding.cu | 9 ++++-- cpp/src/text/minhash.cu | 3 +- cpp/tests/iterator/iterator_tests.cuh | 9 +++--- 36 files changed, 160 insertions(+), 87 deletions(-) create mode 100644 cpp/include/cudf/detail/utilities/functional.hpp diff --git a/cpp/benchmarks/common/generate_input.cu b/cpp/benchmarks/common/generate_input.cu index 8d6aacd2ef1..f1af62eaa87 100644 --- a/cpp/benchmarks/common/generate_input.cu +++ b/cpp/benchmarks/common/generate_input.cu @@ -580,7 +580,7 @@ std::unique_ptr create_random_utf8_string_column(data_profile cons null_mask.begin(), lengths.begin(), cuda::proclaim_return_type([] __device__(auto) { return 0; }), - thrust::logical_not{}); + cuda::std::logical_not{}); auto valid_lengths = thrust::make_transform_iterator( thrust::make_zip_iterator(thrust::make_tuple(lengths.begin(), null_mask.begin())), valid_or_zero{}); diff --git a/cpp/include/cudf/detail/utilities/functional.hpp b/cpp/include/cudf/detail/utilities/functional.hpp new file mode 100644 index 00000000000..114c69bbe46 --- /dev/null +++ b/cpp/include/cudf/detail/utilities/functional.hpp @@ -0,0 +1,31 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#include +#include + +namespace cudf::detail { + +#if CCCL_MAJOR_VERSION >= 3 +using cuda::maximum; +using cuda::minimum; +#else +using thrust::maximum; +using thrust::minimum; +#endif + +} // namespace cudf::detail diff --git a/cpp/include/cudf/table/experimental/row_operators.cuh b/cpp/include/cudf/table/experimental/row_operators.cuh index 8214ea6e83b..6ace930c1fe 100644 --- a/cpp/include/cudf/table/experimental/row_operators.cuh +++ b/cpp/include/cudf/table/experimental/row_operators.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * Copyright (c) 2022-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -33,6 +33,7 @@ #include #include +#include #include #include #include @@ -1466,9 +1467,9 @@ class device_row_comparator { auto rvalid = detail::make_validity_iterator(rcol); if (nulls_are_equal == null_equality::UNEQUAL) { if (thrust::any_of( - thrust::seq, lvalid, lvalid + lcol.size(), thrust::logical_not()) or + thrust::seq, lvalid, lvalid + lcol.size(), cuda::std::logical_not()) or thrust::any_of( - thrust::seq, rvalid, rvalid + rcol.size(), thrust::logical_not())) { + thrust::seq, rvalid, rvalid + rcol.size(), cuda::std::logical_not())) { return false; } } else { diff --git a/cpp/src/binaryop/compiled/binary_ops.cu b/cpp/src/binaryop/compiled/binary_ops.cu index 3c558f1e264..70e26ae4285 100644 --- a/cpp/src/binaryop/compiled/binary_ops.cu +++ b/cpp/src/binaryop/compiled/binary_ops.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -241,8 +242,8 @@ struct null_considering_binop { return invalid_str; else if (lhs_valid && rhs_valid) { return (op == binary_operator::NULL_MAX) - ? thrust::maximum()(lhs_value, rhs_value) - : thrust::minimum()(lhs_value, rhs_value); + ? cudf::detail::maximum()(lhs_value, rhs_value) + : cudf::detail::minimum()(lhs_value, rhs_value); } else if (lhs_valid) return lhs_value; else diff --git a/cpp/src/filling/repeat.cu b/cpp/src/filling/repeat.cu index 2e78954d78a..2695288af64 100644 --- a/cpp/src/filling/repeat.cu +++ b/cpp/src/filling/repeat.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -81,7 +82,7 @@ struct count_checker { if (static_cast(std::numeric_limits::max()) > std::numeric_limits::max()) { auto max = thrust::reduce( - rmm::exec_policy(stream), count.begin(), count.end(), 0, thrust::maximum()); + rmm::exec_policy(stream), count.begin(), count.end(), 0, cudf::detail::maximum()); CUDF_EXPECTS(max <= std::numeric_limits::max(), "count exceeds the column size limit", std::overflow_error); diff --git a/cpp/src/groupby/sort/group_rank_scan.cu b/cpp/src/groupby/sort/group_rank_scan.cu index 583357d9090..a0ba81bccb2 100644 --- a/cpp/src/groupby/sort/group_rank_scan.cu +++ b/cpp/src/groupby/sort/group_rank_scan.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2024, NVIDIA CORPORATION. + * Copyright (c) 2021-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -29,6 +29,7 @@ #include #include +#include #include #include #include @@ -146,7 +147,7 @@ std::unique_ptr rank_generator(column_view const& grouped_values, group_labels_begin + group_labels.size(), mutable_rank_begin, mutable_rank_begin, - thrust::equal_to{}, + cuda::std::equal_to{}, scan_op); return ranks; } diff --git a/cpp/src/groupby/sort/group_replace_nulls.cu b/cpp/src/groupby/sort/group_replace_nulls.cu index 088ed05e5eb..f94ae71a23c 100644 --- a/cpp/src/groupby/sort/group_replace_nulls.cu +++ b/cpp/src/groupby/sort/group_replace_nulls.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2024, NVIDIA CORPORATION. + * Copyright (c) 2021-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -23,6 +23,7 @@ #include +#include #include #include #include @@ -55,7 +56,7 @@ std::unique_ptr group_replace_nulls(cudf::column_view const& grouped_val thrust::make_tuple(gather_map.begin(), thrust::make_discard_iterator())); auto func = cudf::detail::replace_policy_functor(); - thrust::equal_to eq; + cuda::std::equal_to eq; if (replace_policy == cudf::replace_policy::PRECEDING) { thrust::inclusive_scan_by_key(rmm::exec_policy(stream), group_labels.begin(), diff --git a/cpp/src/groupby/sort/group_scan_util.cuh b/cpp/src/groupby/sort/group_scan_util.cuh index a90445fabe1..160d0a3b276 100644 --- a/cpp/src/groupby/sort/group_scan_util.cuh +++ b/cpp/src/groupby/sort/group_scan_util.cuh @@ -37,6 +37,7 @@ #include #include +#include #include #include #include @@ -122,7 +123,7 @@ struct group_scan_functor() group_labels.end(), inp_iter, out_iter, - thrust::equal_to{}, + cuda::std::equal_to{}, binop); }; @@ -167,7 +168,7 @@ struct group_scan_functor(0), gather_map.begin(), - thrust::equal_to{}, + cuda::std::equal_to{}, binop_generator.binop()); // diff --git a/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh b/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh 
index 662c380eff5..9dba468bf14 100644 --- a/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh +++ b/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh @@ -175,7 +175,7 @@ struct group_reduction_functor< inp_iter, thrust::make_discard_iterator(), out_iter, - thrust::equal_to{}, + cuda::std::equal_to{}, binop); }; @@ -201,7 +201,7 @@ struct group_reduction_functor< rmm::device_uvector validity(num_groups, stream); do_reduction(cudf::detail::make_validity_iterator(*d_values_ptr), validity.begin(), - thrust::logical_or{}); + cuda::std::logical_or{}); auto [null_mask, null_count] = cudf::detail::valid_if(validity.begin(), validity.end(), cuda::std::identity{}, stream, mr); @@ -238,7 +238,7 @@ struct group_reduction_functor< inp_iter, thrust::make_discard_iterator(), out_iter, - thrust::equal_to{}, + cuda::std::equal_to{}, binop); }; @@ -254,7 +254,7 @@ struct group_reduction_functor< auto validity = rmm::device_uvector(num_groups, stream); do_reduction(cudf::detail::make_validity_iterator(*d_values_ptr), validity.begin(), - thrust::logical_or{}); + cuda::std::logical_or{}); auto [null_mask, null_count] = cudf::detail::valid_if(validity.begin(), validity.end(), cuda::std::identity{}, stream, mr); diff --git a/cpp/src/io/avro/reader_impl.cu b/cpp/src/io/avro/reader_impl.cu index 11d5749ee38..2be2e42c2b3 100644 --- a/cpp/src/io/avro/reader_impl.cu +++ b/cpp/src/io/avro/reader_impl.cu @@ -21,6 +21,7 @@ #include "io/utilities/hostdevice_vector.hpp" #include +#include #include #include #include @@ -300,8 +301,10 @@ rmm::device_buffer decompress_data(datasource& source, size_t const uncompressed_data_size = std::reduce(uncompressed_data_sizes.begin(), uncompressed_data_sizes.end()); - size_t const max_uncomp_block_size = std::reduce( - uncompressed_data_sizes.begin(), uncompressed_data_sizes.end(), 0, thrust::maximum()); + size_t const max_uncomp_block_size = std::reduce(uncompressed_data_sizes.begin(), + uncompressed_data_sizes.end(), + 0, + 
cudf::detail::maximum()); size_t temp_size = 0; status = diff --git a/cpp/src/io/comp/nvcomp_adapter.cu b/cpp/src/io/comp/nvcomp_adapter.cu index cf5996dfd93..30501c3f2e2 100644 --- a/cpp/src/io/comp/nvcomp_adapter.cu +++ b/cpp/src/io/comp/nvcomp_adapter.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * Copyright (c) 2022-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -15,6 +15,7 @@ */ #include "nvcomp_adapter.cuh" +#include #include #include @@ -122,7 +123,7 @@ std::pair max_chunk_and_total_input_size(device_span()); + cudf::detail::maximum()); auto const sum = thrust::reduce(rmm::exec_policy(stream), input_sizes.begin(), input_sizes.end()); return {max, sum}; } diff --git a/cpp/src/io/fst/logical_stack.cuh b/cpp/src/io/fst/logical_stack.cuh index 7b217d08da3..4b80b981030 100644 --- a/cpp/src/io/fst/logical_stack.cuh +++ b/cpp/src/io/fst/logical_stack.cuh @@ -27,6 +27,7 @@ #include #include +#include #include #include #include @@ -400,7 +401,7 @@ void sparse_stack_op_to_top_of_stack(StackSymbolItT d_symbols, d_kv_operations.Current(), detail::AddStackLevelFromStackOp{symbol_to_stack_op}, num_symbols_in, - cub::Equality{}, + cuda::std::equal_to{}, stream)); stack_level_scan_bytes = std::max(gen_segments_scan_bytes, scan_by_key_bytes); } else { @@ -499,7 +500,7 @@ void sparse_stack_op_to_top_of_stack(StackSymbolItT d_symbols, d_kv_operations.Current(), detail::AddStackLevelFromStackOp{symbol_to_stack_op}, num_symbols_in, - cub::Equality{}, + cuda::std::equal_to{}, stream)); } else { CUDF_CUDA_TRY(cub::DeviceScan::InclusiveScan( diff --git a/cpp/src/io/json/column_tree_construction.cu b/cpp/src/io/json/column_tree_construction.cu index c4fe7926706..13d1751e03d 100644 --- a/cpp/src/io/json/column_tree_construction.cu +++ b/cpp/src/io/json/column_tree_construction.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. 
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,6 +17,7 @@ #include "nested_json.hpp" #include +#include #include #include #include @@ -208,7 +209,7 @@ std::tuple reduce_to_column_tree( thrust::make_constant_iterator(1), non_leaf_nodes.begin(), non_leaf_nodes_children.begin(), - thrust::equal_to()); + cuda::std::equal_to()); thrust::scatter(rmm::exec_policy_nosync(stream), non_leaf_nodes_children.begin(), diff --git a/cpp/src/io/json/host_tree_algorithms.cu b/cpp/src/io/json/host_tree_algorithms.cu index e506d60a2be..712d280c11f 100644 --- a/cpp/src/io/json/host_tree_algorithms.cu +++ b/cpp/src/io/json/host_tree_algorithms.cu @@ -20,6 +20,7 @@ #include #include +#include #include #include #include @@ -1007,13 +1008,13 @@ void scatter_offsets(tree_meta_t const& tree, col.string_offsets.begin(), col.string_offsets.end(), col.string_offsets.begin(), - thrust::maximum{}); + cudf::detail::maximum{}); } else if (col.type == json_col_t::ListColumn) { thrust::inclusive_scan(rmm::exec_policy_nosync(stream), col.child_offsets.begin(), col.child_offsets.end(), col.child_offsets.begin(), - thrust::maximum{}); + cudf::detail::maximum{}); } } stream.synchronize(); diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu index 1fe58a0449f..c0790c2f73d 100644 --- a/cpp/src/io/json/json_column.cu +++ b/cpp/src/io/json/json_column.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * Copyright (c) 2022-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -130,8 +131,8 @@ reduce_to_column_tree(tree_meta_t const& tree, ordered_row_offsets, unique_col_ids.begin(), max_row_offsets.begin(), - thrust::equal_to(), - thrust::maximum()); + cuda::std::equal_to(), + cudf::detail::maximum()); // 3. reduce_by_key {col_id}, {node_categories} - custom opp (*+v=*, v+v=v, *+#=E) rmm::device_uvector column_categories(num_columns, stream); @@ -142,7 +143,7 @@ reduce_to_column_tree(tree_meta_t const& tree, thrust::make_permutation_iterator(tree.node_categories.begin(), ordered_node_ids.begin()), unique_col_ids.begin(), column_categories.begin(), - thrust::equal_to(), + cuda::std::equal_to(), [] __device__(NodeT type_a, NodeT type_b) -> NodeT { auto is_a_leaf = (type_a == NC_VAL || type_a == NC_STR); auto is_b_leaf = (type_b == NC_VAL || type_b == NC_STR); diff --git a/cpp/src/io/json/json_tree.cu b/cpp/src/io/json/json_tree.cu index e2fe926ea19..e0d6f51aad9 100644 --- a/cpp/src/io/json/json_tree.cu +++ b/cpp/src/io/json/json_tree.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * Copyright (c) 2022-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -213,8 +214,8 @@ void propagate_first_sibling_to_other(cudf::device_span node_l sorted_node_levels.end(), thrust::make_permutation_iterator(parent_node_ids.begin(), sorted_order.begin()), thrust::make_permutation_iterator(parent_node_ids.begin(), sorted_order.begin()), - thrust::equal_to{}, - thrust::maximum{}); + cuda::std::equal_to{}, + cudf::detail::maximum{}); } // Generates a tree representation of the given tokens, token_indices. 
diff --git a/cpp/src/io/json/write_json.cu b/cpp/src/io/json/write_json.cu index 1587c4da9c8..b8f0fe7cb07 100644 --- a/cpp/src/io/json/write_json.cu +++ b/cpp/src/io/json/write_json.cu @@ -333,8 +333,8 @@ std::unique_ptr struct_to_strings(table_view const& strings_columns, validity_iterator, d_str_separator.begin(), false, - thrust::equal_to{}, - thrust::logical_or{}); + cuda::std::equal_to{}, + cuda::std::logical_or{}); thrust::for_each(rmm::exec_policy_nosync(stream), thrust::make_counting_iterator(0), thrust::make_counting_iterator(total_rows), diff --git a/cpp/src/io/orc/stripe_data.cu b/cpp/src/io/orc/stripe_data.cu index c0887304db9..426e470a151 100644 --- a/cpp/src/io/orc/stripe_data.cu +++ b/cpp/src/io/orc/stripe_data.cu @@ -18,6 +18,7 @@ #include "io/utilities/column_buffer.hpp" #include "orc_gpu.hpp" +#include #include #include @@ -1511,10 +1512,11 @@ static __device__ void DecodeRowPositions(orcdec_state_s* s, } if (t == nrows - 1) { s->u.rowdec.nz_count = min(nz_count, s->top.data.max_vals); } __syncthreads(); + // TBD: Brute-forcing this, there might be a more efficient way to find the thread with the // last row last_row = (nz_count == s->u.rowdec.nz_count) ? row_plus1 : 0; - last_row = block_reduce(temp_storage).Reduce(last_row, cub::Max()); + last_row = block_reduce(temp_storage).Reduce(last_row, cudf::detail::maximum{}); nz_pos = (valid) ? 
nz_count : 0; if (t == 0) { s->top.data.nrows = last_row; } if (valid && nz_pos - 1 < s->u.rowdec.nz_count) { s->u.rowdec.row[nz_pos - 1] = row_plus1; } diff --git a/cpp/src/io/orc/stripe_enc.cu b/cpp/src/io/orc/stripe_enc.cu index 3a1f3a88da4..2ccf3f5d284 100644 --- a/cpp/src/io/orc/stripe_enc.cu +++ b/cpp/src/io/orc/stripe_enc.cu @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -366,8 +367,9 @@ static __device__ uint32_t IntegerRLE( orcenc_state_s* s, T const* inbuf, uint32_t inpos, uint32_t numvals, int t, Storage& temp_storage) { using block_reduce = cub::BlockReduce; - uint8_t* dst = s->stream.data_ptrs[cid] + s->strm_pos[cid]; - uint32_t out_cnt = 0; + + uint8_t* dst = s->stream.data_ptrs[cid] + s->strm_pos[cid]; + uint32_t out_cnt = 0; __shared__ uint64_t block_vmin; while (numvals > 0) { @@ -413,9 +415,9 @@ static __device__ uint32_t IntegerRLE( T vmin = (t < literal_run) ? v0 : cuda::std::numeric_limits::max(); T vmax = (t < literal_run) ? v0 : cuda::std::numeric_limits::min(); uint32_t literal_mode, literal_w; - vmin = block_reduce(temp_storage).Reduce(vmin, cub::Min()); + vmin = block_reduce(temp_storage).Reduce(vmin, cudf::detail::minimum{}); __syncthreads(); - vmax = block_reduce(temp_storage).Reduce(vmax, cub::Max()); + vmax = block_reduce(temp_storage).Reduce(vmax, cudf::detail::maximum{}); if (t == 0) { uint32_t mode1_w, mode2_w; typename std::make_unsigned::type vrange_mode1, vrange_mode2; diff --git a/cpp/src/io/parquet/delta_enc.cuh b/cpp/src/io/parquet/delta_enc.cuh index 56b7c8065ee..8dba755b73a 100644 --- a/cpp/src/io/parquet/delta_enc.cuh +++ b/cpp/src/io/parquet/delta_enc.cuh @@ -19,6 +19,7 @@ #include "parquet_gpu.hpp" #include +#include #include #include @@ -221,6 +222,7 @@ class delta_binary_packer { inline __device__ uint8_t* flush() { using cudf::detail::warp_size; + __shared__ T block_min; int const t = threadIdx.x; @@ -240,7 +242,7 @@ class delta_binary_packer { : cuda::std::numeric_limits::max(); 
// Find min delta for the block. - auto const min_delta = block_reduce(*_block_tmp).Reduce(delta, cub::Min()); + auto const min_delta = block_reduce(*_block_tmp).Reduce(delta, cudf::detail::minimum{}); if (t == 0) { block_min = min_delta; } __syncthreads(); @@ -250,7 +252,7 @@ class delta_binary_packer { // Get max normalized delta for each warp, and use that to determine how many bits to use // for the bitpacking of this warp. - U const warp_max = warp_reduce(_warp_tmp[warp_id]).Reduce(norm_delta, cub::Max()); + U const warp_max = warp_reduce(_warp_tmp[warp_id]).Reduce(norm_delta, cudf::detail::maximum{}); __syncwarp(); if (lane_id == 0) { _mb_bits[warp_id] = sizeof(long long) * 8 - __clzll(warp_max); } diff --git a/cpp/src/io/parquet/page_string_decode.cu b/cpp/src/io/parquet/page_string_decode.cu index 7d670057cf9..fe9b05c8054 100644 --- a/cpp/src/io/parquet/page_string_decode.cu +++ b/cpp/src/io/parquet/page_string_decode.cu @@ -21,6 +21,7 @@ #include "rle_stream.cuh" #include +#include #include #include @@ -498,6 +499,7 @@ __device__ thrust::pair totalDeltaByteArraySize(uint8_t const* d { using cudf::detail::warp_size; using WarpReduce = cub::WarpReduce; + __shared__ typename WarpReduce::TempStorage temp_storage[2]; __shared__ __align__(16) delta_binary_decoder prefixes; @@ -550,7 +552,8 @@ __device__ thrust::pair totalDeltaByteArraySize(uint8_t const* d // note: warp_sum will only be valid on lane 0. 
auto const warp_sum = WarpReduce(temp_storage[warp_id]).Sum(lane_sum); __syncwarp(); - auto const warp_max = WarpReduce(temp_storage[warp_id]).Reduce(lane_max, cub::Max()); + auto const warp_max = + WarpReduce(temp_storage[warp_id]).Reduce(lane_max, cudf::detail::maximum{}); if (lane_id == 0) { total_bytes += warp_sum; diff --git a/cpp/src/io/parquet/reader_impl_chunking.cu b/cpp/src/io/parquet/reader_impl_chunking.cu index be1e7d38fff..5242b18b574 100644 --- a/cpp/src/io/parquet/reader_impl_chunking.cu +++ b/cpp/src/io/parquet/reader_impl_chunking.cu @@ -1149,7 +1149,7 @@ void include_decompression_scratch_size(device_span chunk page_keys + pages.size(), decomp_iter, decomp_info.begin(), - thrust::equal_to{}, + cuda::std::equal_to{}, decomp_sum{}); // retrieve to host so we can call nvcomp to get compression scratch sizes @@ -1388,7 +1388,7 @@ void reader::impl::setup_next_subpass(read_mode mode) page_keys + pass.pages.size(), page_size, c_info.begin(), - thrust::equal_to{}, + cuda::std::equal_to{}, cumulative_page_sum{}); // include scratch space needed for decompression. for certain codecs (eg ZSTD) this @@ -1703,7 +1703,7 @@ void reader::impl::compute_output_chunks_for_subpass() page_keys + subpass.pages.size(), page_input, c_info.begin(), - thrust::equal_to{}, + cuda::std::equal_to{}, cumulative_page_sum{}); auto iter = thrust::make_counting_iterator(0); // cap the max row in all pages by the max row we expect in the subpass. 
input chunking diff --git a/cpp/src/io/parquet/reader_impl_preprocess.cu b/cpp/src/io/parquet/reader_impl_preprocess.cu index e1e9bac5a07..052ed80bc14 100644 --- a/cpp/src/io/parquet/reader_impl_preprocess.cu +++ b/cpp/src/io/parquet/reader_impl_preprocess.cu @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -608,7 +609,7 @@ void decode_page_headers(pass_intermediate_data& pass, level_bit_size, level_bit_size + pass.chunks.size(), 0, - thrust::maximum()); + cudf::detail::maximum()); pass.level_type_size = std::max(1, cudf::util::div_rounding_up_safe(max_level_bits, 8)); // sort the pages in chunk/schema order. diff --git a/cpp/src/io/statistics/typed_statistics_chunk.cuh b/cpp/src/io/statistics/typed_statistics_chunk.cuh index dc023e69423..34e663447e3 100644 --- a/cpp/src/io/statistics/typed_statistics_chunk.cuh +++ b/cpp/src/io/statistics/typed_statistics_chunk.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2024, NVIDIA CORPORATION. + * Copyright (c) 2021-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -27,6 +27,7 @@ #include "statistics_type_identification.cuh" #include "temp_storage_wrapper.cuh" +#include #include #include @@ -202,11 +203,12 @@ __inline__ __device__ typed_statistics_chunk block_reduce( using E = typename detail::extrema_type::type; using extrema_reduce = cub::BlockReduce; using count_reduce = cub::BlockReduce; - output_chunk.minimum_value = - extrema_reduce(storage.template get()).Reduce(output_chunk.minimum_value, cub::Min()); + + output_chunk.minimum_value = extrema_reduce(storage.template get()) + .Reduce(output_chunk.minimum_value, cudf::detail::minimum{}); __syncthreads(); - output_chunk.maximum_value = - extrema_reduce(storage.template get()).Reduce(output_chunk.maximum_value, cub::Max()); + output_chunk.maximum_value = extrema_reduce(storage.template get()) + .Reduce(output_chunk.maximum_value, cudf::detail::maximum{}); __syncthreads(); output_chunk.non_nulls = count_reduce(storage.template get()).Sum(output_chunk.non_nulls); diff --git a/cpp/src/io/utilities/data_casting.cu b/cpp/src/io/utilities/data_casting.cu index 2750a17d328..c6391d49294 100644 --- a/cpp/src/io/utilities/data_casting.cu +++ b/cpp/src/io/utilities/data_casting.cu @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -814,7 +815,7 @@ static std::unique_ptr parse_string(string_view_pair_it str_tuples, str_tuples + col_size, cuda::proclaim_return_type([] __device__(auto t) { return t.second; }), size_type{0}, - thrust::maximum{}); + cudf::detail::maximum{}); auto sizes = rmm::device_uvector(col_size, stream); auto d_sizes = sizes.data(); diff --git a/cpp/src/lists/set_operations.cu b/cpp/src/lists/set_operations.cu index 6f2acbb0712..0ed4b5193b7 100644 --- a/cpp/src/lists/set_operations.cu +++ b/cpp/src/lists/set_operations.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * Copyright (c) 2022-2025, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -34,6 +34,7 @@ #include #include +#include #include #include #include @@ -103,8 +104,8 @@ std::unique_ptr have_overlap(lists_column_view const& lhs, contained.begin(), // values to reduce list_indices.begin(), // out keys overlap_results.begin(), // out values - thrust::equal_to{}, // comp for keys - thrust::logical_or{}); // reduction op for values + cuda::std::equal_to{}, // comp for keys + cuda::std::logical_or{}); // reduction op for values auto const num_non_empty_segments = thrust::distance(overlap_results.begin(), end.second); auto [null_mask, null_count] = diff --git a/cpp/src/quantiles/tdigest/tdigest.cu b/cpp/src/quantiles/tdigest/tdigest.cu index 3a365477366..83423649507 100644 --- a/cpp/src/quantiles/tdigest/tdigest.cu +++ b/cpp/src/quantiles/tdigest/tdigest.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2024, NVIDIA CORPORATION. + * Copyright (c) 2021-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -395,7 +396,7 @@ std::unique_ptr percentile_approx(tdigest_column_view const& input, return std::pair{rmm::device_buffer{}, null_count}; } return cudf::detail::valid_if( - tdigest_is_empty, tdigest_is_empty + tdv.size(), thrust::logical_not{}, stream, mr); + tdigest_is_empty, tdigest_is_empty + tdv.size(), cuda::std::logical_not{}, stream, mr); }(); return cudf::make_lists_column(input.size(), diff --git a/cpp/src/quantiles/tdigest/tdigest_aggregation.cu b/cpp/src/quantiles/tdigest/tdigest_aggregation.cu index fd98d262154..f07b8695024 100644 --- a/cpp/src/quantiles/tdigest/tdigest_aggregation.cu +++ b/cpp/src/quantiles/tdigest/tdigest_aggregation.cu @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include @@ -793,7 +794,7 @@ std::unique_ptr compute_tdigests(int delta, centroids_begin, // values thrust::make_discard_iterator(), // key output output, // output - thrust::equal_to{}, // key equality check + cuda::std::equal_to{}, // key equality check merge_centroids{}); // create final tdigest column @@ -1161,8 +1162,8 @@ std::unique_ptr merge_tdigests(tdigest_column_view const& tdv, min_iter, thrust::make_discard_iterator(), merged_min_col->mutable_view().begin(), - thrust::equal_to{}, // key equality check - thrust::minimum{}); + cuda::std::equal_to{}, // key equality check + cudf::detail::minimum{}); auto merged_max_col = cudf::make_numeric_column( data_type{type_id::FLOAT64}, num_groups, mask_state::UNALLOCATED, stream, mr); @@ -1176,8 +1177,8 @@ std::unique_ptr merge_tdigests(tdigest_column_view const& tdv, max_iter, thrust::make_discard_iterator(), merged_max_col->mutable_view().begin(), - thrust::equal_to{}, // key equality check - thrust::maximum{}); + cuda::std::equal_to{}, // key equality check + cudf::detail::maximum{}); auto tdigest_offsets = tdv.centroids().offsets(); diff --git a/cpp/src/reductions/segmented/simple.cuh 
b/cpp/src/reductions/segmented/simple.cuh index 6c35e750e6b..d9b1fefe09a 100644 --- a/cpp/src/reductions/segmented/simple.cuh +++ b/cpp/src/reductions/segmented/simple.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * Copyright (c) 2022-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -249,7 +250,7 @@ std::unique_ptr fixed_point_segmented_reduction( counts.begin(), counts.end(), size_type{0}, - thrust::maximum{}); + cudf::detail::maximum{}); auto const new_scale = numeric::scale_type{col.type().scale() * max_count}; diff --git a/cpp/src/rolling/detail/rolling_collect_list.cu b/cpp/src/rolling/detail/rolling_collect_list.cu index 8a98b65b406..d189b397afd 100644 --- a/cpp/src/rolling/detail/rolling_collect_list.cu +++ b/cpp/src/rolling/detail/rolling_collect_list.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2024, NVIDIA CORPORATION. + * Copyright (c) 2021-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -18,6 +18,7 @@ #include #include +#include #include #include @@ -53,6 +54,7 @@ std::unique_ptr get_list_child_to_list_row_mapping(cudf::column_view con // offsets == [0, 2, 5, 5, 8, 11, 13] // scatter result == [0, 0, 1, 0, 0, 2, 0, 0, 1, 0, 0, 1, 0] // + auto const num_child_rows{ cudf::detail::get_value(offsets, offsets.size() - 1, stream)}; auto per_row_mapping = make_fixed_width_column( @@ -83,7 +85,7 @@ std::unique_ptr get_list_child_to_list_row_mapping(cudf::column_view con per_row_mapping_begin, per_row_mapping_begin + num_child_rows, per_row_mapping_begin, - thrust::maximum{}); + cudf::detail::maximum{}); return per_row_mapping; } diff --git a/cpp/src/sort/rank.cu b/cpp/src/sort/rank.cu index e7dca2277ec..35a9a3ec38d 100644 --- a/cpp/src/sort/rank.cu +++ b/cpp/src/sort/rank.cu @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -145,7 +146,7 @@ void tie_break_ranks_transform(cudf::device_span dense_rank_sor tie_iter, thrust::make_discard_iterator(), tie_sorted.begin(), - thrust::equal_to{}, + cuda::std::equal_to{}, tie_breaker); using TransformerReturnType = cuda::std::decay_t>; @@ -202,7 +203,7 @@ void rank_min(cudf::device_span group_keys, thrust::make_counting_iterator(1), sorted_order_view, rank_mutable_view.begin(), - thrust::minimum{}, + cudf::detail::minimum{}, cuda::std::identity{}, stream); } @@ -220,7 +221,7 @@ void rank_max(cudf::device_span group_keys, thrust::make_counting_iterator(1), sorted_order_view, rank_mutable_view.begin(), - thrust::maximum{}, + cudf::detail::maximum{}, cuda::std::identity{}, stream); } diff --git a/cpp/src/strings/split/split.cu b/cpp/src/strings/split/split.cu index 352ca83c8b2..9d30e3d0026 100644 --- a/cpp/src/strings/split/split.cu +++ b/cpp/src/strings/split/split.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -135,7 +136,7 @@ std::unique_ptr
split_fn(strings_column_view const& input, return static_cast(d_offsets[idx + 1] - d_offsets[idx]); }), 0, - thrust::maximum{}); + cudf::detail::maximum{}); // build strings columns for each token position for (size_type col = 0; col < columns_count; ++col) { @@ -346,7 +347,7 @@ std::unique_ptr
whitespace_split_fn(size_type strings_count, // column count is the maximum number of tokens for any string size_type const columns_count = thrust::reduce( - rmm::exec_policy(stream), token_counts.begin(), token_counts.end(), 0, thrust::maximum{}); + rmm::exec_policy(stream), token_counts.begin(), token_counts.end(), 0, cudf::detail::maximum{}); std::vector> results; // boundary case: if no columns, return one null column (issue #119) diff --git a/cpp/src/strings/split/split_re.cu b/cpp/src/strings/split/split_re.cu index ef96b9d3f36..68b610bcb93 100644 --- a/cpp/src/strings/split/split_re.cu +++ b/cpp/src/strings/split/split_re.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * Copyright (c) 2022-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -227,7 +228,7 @@ std::unique_ptr
split_re(strings_column_view const& input, return static_cast(d_offsets[idx + 1] - d_offsets[idx]); }), 0, - thrust::maximum{}); + cudf::detail::maximum{}); // boundary case: if no columns, return one all-null column (custrings issue #119) if (columns_count == 0) { diff --git a/cpp/src/text/bpe/byte_pair_encoding.cu b/cpp/src/text/bpe/byte_pair_encoding.cu index 0aacfd16f67..972bcc32077 100644 --- a/cpp/src/text/bpe/byte_pair_encoding.cu +++ b/cpp/src/text/bpe/byte_pair_encoding.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * Copyright (c) 2022-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -212,7 +213,8 @@ CUDF_KERNEL void bpe_parallel_fn(cudf::column_device_view const d_strings, } } // compute the min rank across the block - auto const reduce_rank = block_reduce(temp_storage).Reduce(min_rank, cub::Min(), num_valid); + auto const reduce_rank = + block_reduce(temp_storage).Reduce(min_rank, cudf::detail::minimum{}, num_valid); if (lane_idx == 0) { block_min_rank = reduce_rank; } __syncthreads(); @@ -277,7 +279,8 @@ CUDF_KERNEL void bpe_parallel_fn(cudf::column_device_view const d_strings, } // re-compute the minimum rank across the block (since new pairs are created above) - auto const reduce_rank = block_reduce(temp_storage).Reduce(min_rank, cub::Min(), num_valid); + auto const reduce_rank = + block_reduce(temp_storage).Reduce(min_rank, cudf::detail::minimum{}, num_valid); if (lane_idx == 0) { block_min_rank = reduce_rank; } __syncthreads(); } // if no min ranks are found we are done, otherwise start again diff --git a/cpp/src/text/minhash.cu b/cpp/src/text/minhash.cu index 663595af5df..61a7375772b 100644 --- a/cpp/src/text/minhash.cu +++ b/cpp/src/text/minhash.cu @@ -24,6 +24,7 @@ #include #include #include +#include #include #include 
#include @@ -368,7 +369,7 @@ CUDF_KERNEL void minhash_kernel(offsets_type offsets_itr, auto const values = block_values + (lane_idx * block_size); // cooperative groups does not have a min function and cub::BlockReduce was slower auto const minv = - thrust::reduce(thrust::seq, values, values + block_size, init, thrust::minimum{}); + thrust::reduce(thrust::seq, values, values + block_size, init, cudf::detail::minimum{}); if constexpr (blocks_per_row > 1) { // accumulates mins for each block into d_output cuda::atomic_ref ref{d_output[lane_idx + i]}; diff --git a/cpp/tests/iterator/iterator_tests.cuh b/cpp/tests/iterator/iterator_tests.cuh index 119d8e7b138..d6a991f675c 100644 --- a/cpp/tests/iterator/iterator_tests.cuh +++ b/cpp/tests/iterator/iterator_tests.cuh @@ -19,6 +19,7 @@ #include #include +#include #include // for meanvar #include #include @@ -28,7 +29,7 @@ #include #include -#include +#include #include #include #include @@ -59,7 +60,7 @@ struct IteratorTest : public cudf::test::BaseFixture { d_in, dev_result.begin(), num_items, - thrust::minimum{}, + cudf::detail::minimum{}, init, cudf::get_default_stream().value()); @@ -72,7 +73,7 @@ struct IteratorTest : public cudf::test::BaseFixture { d_in, dev_result.begin(), num_items, - thrust::minimum{}, + cudf::detail::minimum{}, init, cudf::get_default_stream().value()); @@ -98,7 +99,7 @@ struct IteratorTest : public cudf::test::BaseFixture { d_in_last, dev_expected.begin(), dev_results.begin(), - thrust::equal_to{}); + cuda::std::equal_to{}); auto result = thrust::all_of(rmm::exec_policy(cudf::get_default_stream()), dev_results.begin(), dev_results.end(), From 45bd05d51435fe4b50ee48a256b3eb4772c5b086 Mon Sep 17 00:00:00 2001 From: Gil Forsyth Date: Mon, 3 Mar 2025 23:27:38 -0500 Subject: [PATCH 35/42] Port all conda recipes to `rattler-build` (#18054) Port all condabuild recipes over to use `rattler-build` instead. 
Contributes to rapidsai/build-planning#47 - To satisfy `rattler`, this changes all the licenses in the `pyproject.toml` files to the SPDX-compliant `Apache-2.0` instead of `Apache 2.0` Authors: - Gil Forsyth (https://github.com/gforsyth) Approvers: - Bradley Dice (https://github.com/bdice) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/18054 --- ci/build_cpp.sh | 20 +- ci/build_python.sh | 115 +++++-- conda/recipes/cudf-polars/build.sh | 4 - conda/recipes/cudf-polars/meta.yaml | 61 ---- conda/recipes/cudf-polars/recipe.yaml | 67 ++++ conda/recipes/cudf/build.sh | 4 - conda/recipes/cudf/meta.yaml | 119 ------- conda/recipes/cudf/recipe.yaml | 126 +++++++ conda/recipes/cudf_kafka/build.sh | 3 - conda/recipes/cudf_kafka/meta.yaml | 86 ----- conda/recipes/cudf_kafka/recipe.yaml | 85 +++++ conda/recipes/custreamz/build.sh | 4 - conda/recipes/custreamz/meta.yaml | 65 ---- conda/recipes/custreamz/recipe.yaml | 54 +++ conda/recipes/dask-cudf/build.sh | 4 - conda/recipes/dask-cudf/meta.yaml | 62 ---- conda/recipes/dask-cudf/recipe.yaml | 50 +++ conda/recipes/libcudf/build.sh | 9 - conda/recipes/libcudf/install_libcudf.sh | 4 - .../libcudf/install_libcudf_example.sh | 5 - .../recipes/libcudf/install_libcudf_kafka.sh | 4 - .../recipes/libcudf/install_libcudf_tests.sh | 5 - conda/recipes/libcudf/meta.yaml | 220 ------------ conda/recipes/libcudf/recipe.yaml | 323 ++++++++++++++++++ conda/recipes/pylibcudf/build.sh | 4 - conda/recipes/pylibcudf/meta.yaml | 100 ------ conda/recipes/pylibcudf/recipe.yaml | 106 ++++++ python/cudf/pyproject.toml | 2 +- python/cudf_kafka/pyproject.toml | 2 +- python/cudf_polars/pyproject.toml | 2 +- python/custreamz/pyproject.toml | 2 +- python/dask_cudf/pyproject.toml | 2 +- python/libcudf/pyproject.toml | 2 +- python/pylibcudf/pyproject.toml | 2 +- 34 files changed, 915 insertions(+), 808 deletions(-) delete mode 100644 conda/recipes/cudf-polars/build.sh delete mode 100644 
conda/recipes/cudf-polars/meta.yaml create mode 100644 conda/recipes/cudf-polars/recipe.yaml delete mode 100644 conda/recipes/cudf/build.sh delete mode 100644 conda/recipes/cudf/meta.yaml create mode 100644 conda/recipes/cudf/recipe.yaml delete mode 100644 conda/recipes/cudf_kafka/build.sh delete mode 100644 conda/recipes/cudf_kafka/meta.yaml create mode 100644 conda/recipes/cudf_kafka/recipe.yaml delete mode 100644 conda/recipes/custreamz/build.sh delete mode 100644 conda/recipes/custreamz/meta.yaml create mode 100644 conda/recipes/custreamz/recipe.yaml delete mode 100644 conda/recipes/dask-cudf/build.sh delete mode 100644 conda/recipes/dask-cudf/meta.yaml create mode 100644 conda/recipes/dask-cudf/recipe.yaml delete mode 100644 conda/recipes/libcudf/build.sh delete mode 100644 conda/recipes/libcudf/install_libcudf.sh delete mode 100644 conda/recipes/libcudf/install_libcudf_example.sh delete mode 100644 conda/recipes/libcudf/install_libcudf_kafka.sh delete mode 100644 conda/recipes/libcudf/install_libcudf_tests.sh delete mode 100644 conda/recipes/libcudf/meta.yaml create mode 100644 conda/recipes/libcudf/recipe.yaml delete mode 100644 conda/recipes/pylibcudf/build.sh delete mode 100644 conda/recipes/pylibcudf/meta.yaml create mode 100644 conda/recipes/pylibcudf/recipe.yaml diff --git a/ci/build_cpp.sh b/ci/build_cpp.sh index 0c324d01cdf..78a15bc8092 100755 --- a/ci/build_cpp.sh +++ b/ci/build_cpp.sh @@ -17,10 +17,24 @@ rapids-logger "Begin cpp build" sccache --zero-stats -# With boa installed conda build forward to boa -RAPIDS_PACKAGE_VERSION=$(rapids-generate-version) rapids-conda-retry build \ - conda/recipes/libcudf +RAPIDS_PACKAGE_VERSION=$(rapids-generate-version) +export RAPIDS_PACKAGE_VERSION + +source rapids-rattler-channel-string + +# --no-build-id allows for caching with `sccache` +# more info is available at +# https://rattler.build/latest/tips_and_tricks/#using-sccache-or-ccache-with-rattler-build +rattler-build build --recipe conda/recipes/libcudf \ + 
--experimental \ + --no-build-id \ + --channel-priority disabled \ + --output-dir "$RAPIDS_CONDA_BLD_OUTPUT_DIR" \ + "${RATTLER_CHANNELS[@]}" sccache --show-adv-stats +# remove build_cache directory +rm -rf "$RAPIDS_CONDA_BLD_OUTPUT_DIR"/build_cache + rapids-upload-conda-to-s3 cpp diff --git a/ci/build_python.sh b/ci/build_python.sh index abbdc3f3a3b..1dd8b67dfbb 100755 --- a/ci/build_python.sh +++ b/ci/build_python.sh @@ -3,8 +3,6 @@ set -euo pipefail -rapids-configure-conda-channels - source rapids-configure-sccache source rapids-date-string @@ -19,53 +17,100 @@ rapids-logger "Begin py build" CPP_CHANNEL=$(rapids-download-conda-from-s3 cpp) +RAPIDS_PACKAGE_VERSION=$(head -1 ./VERSION) +export RAPIDS_PACKAGE_VERSION + +# populates `RATTLER_CHANNELS` array +source rapids-rattler-channel-string + +rapids-logger "Prepending channel ${CPP_CHANNEL} to RATTLER_CHANNELS" + +RATTLER_CHANNELS=("--channel" "${CPP_CHANNEL}" "${RATTLER_CHANNELS[@]}") + sccache --zero-stats -# TODO: Remove `--no-test` flag once importing on a CPU -# node works correctly -# With boa installed conda build forwards to the boa builder +rapids-logger "Building pylibcudf" -RAPIDS_PACKAGE_VERSION=$(head -1 ./VERSION) rapids-conda-retry build \ - --no-test \ - --channel "${CPP_CHANNEL}" \ - conda/recipes/pylibcudf +# TODO: Remove `--test skip` flag once importing on a CPU node works correctly +# --no-build-id allows for caching with `sccache` +# more info is available at +# https://rattler.build/latest/tips_and_tricks/#using-sccache-or-ccache-with-rattler-build +rattler-build build --recipe conda/recipes/pylibcudf \ + --experimental \ + --no-build-id \ + --channel-priority disabled \ + --output-dir "$RAPIDS_CONDA_BLD_OUTPUT_DIR" \ + --test skip \ + "${RATTLER_CHANNELS[@]}" sccache --show-adv-stats sccache --zero-stats -RAPIDS_PACKAGE_VERSION=$(head -1 ./VERSION) rapids-conda-retry build \ - --no-test \ - --channel "${CPP_CHANNEL}" \ - --channel "${RAPIDS_CONDA_BLD_OUTPUT_DIR}" \ - conda/recipes/cudf 
+rapids-logger "Building cudf" + +rattler-build build --recipe conda/recipes/cudf \ + --experimental \ + --no-build-id \ + --channel-priority disabled \ + --output-dir "$RAPIDS_CONDA_BLD_OUTPUT_DIR" \ + --test skip \ + "${RATTLER_CHANNELS[@]}" + +sccache --show-adv-stats +sccache --zero-stats + +rapids-logger "Building dask-cudf" + +rattler-build build --recipe conda/recipes/dask-cudf \ + --experimental \ + --no-build-id \ + --channel-priority disabled \ + --output-dir "$RAPIDS_CONDA_BLD_OUTPUT_DIR" \ + --test skip \ + "${RATTLER_CHANNELS[@]}" + +sccache --show-adv-stats +sccache --zero-stats + +rapids-logger "Building cudf_kafka" + +rattler-build build --recipe conda/recipes/cudf_kafka \ + --experimental \ + --no-build-id \ + --channel-priority disabled \ + --output-dir "$RAPIDS_CONDA_BLD_OUTPUT_DIR" \ + --test skip \ + "${RATTLER_CHANNELS[@]}" + +sccache --show-adv-stats +sccache --zero-stats + +rapids-logger "Building custreamz" + +rattler-build build --recipe conda/recipes/custreamz \ + --experimental \ + --no-build-id \ + --channel-priority disabled \ + --output-dir "$RAPIDS_CONDA_BLD_OUTPUT_DIR" \ + --test skip \ + "${RATTLER_CHANNELS[@]}" sccache --show-adv-stats sccache --zero-stats -RAPIDS_PACKAGE_VERSION=$(head -1 ./VERSION) rapids-conda-retry build \ - --no-test \ - --channel "${CPP_CHANNEL}" \ - --channel "${RAPIDS_CONDA_BLD_OUTPUT_DIR}" \ - conda/recipes/dask-cudf +rapids-logger "Building cudf-polars" -RAPIDS_PACKAGE_VERSION=$(head -1 ./VERSION) rapids-conda-retry build \ - --no-test \ - --channel "${CPP_CHANNEL}" \ - --channel "${RAPIDS_CONDA_BLD_OUTPUT_DIR}" \ - conda/recipes/cudf_kafka +rattler-build build --recipe conda/recipes/cudf-polars \ + --experimental \ + --no-build-id \ + --channel-priority disabled \ + --output-dir "$RAPIDS_CONDA_BLD_OUTPUT_DIR" \ + --test skip \ + "${RATTLER_CHANNELS[@]}" sccache --show-adv-stats -RAPIDS_PACKAGE_VERSION=$(head -1 ./VERSION) rapids-conda-retry build \ - --no-test \ - --channel "${CPP_CHANNEL}" \ - 
--channel "${RAPIDS_CONDA_BLD_OUTPUT_DIR}" \ - conda/recipes/custreamz - -RAPIDS_PACKAGE_VERSION=$(head -1 ./VERSION) rapids-conda-retry build \ - --no-test \ - --channel "${CPP_CHANNEL}" \ - --channel "${RAPIDS_CONDA_BLD_OUTPUT_DIR}" \ - conda/recipes/cudf-polars +# remove build_cache directory +rm -rf "$RAPIDS_CONDA_BLD_OUTPUT_DIR"/build_cache rapids-upload-conda-to-s3 python diff --git a/conda/recipes/cudf-polars/build.sh b/conda/recipes/cudf-polars/build.sh deleted file mode 100644 index 06e2f1bcb99..00000000000 --- a/conda/recipes/cudf-polars/build.sh +++ /dev/null @@ -1,4 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. - -# This assumes the script is executed from the root of the repo directory -./build.sh cudf_polars diff --git a/conda/recipes/cudf-polars/meta.yaml b/conda/recipes/cudf-polars/meta.yaml deleted file mode 100644 index 64a147d3c63..00000000000 --- a/conda/recipes/cudf-polars/meta.yaml +++ /dev/null @@ -1,61 +0,0 @@ -# Copyright (c) 2024-2025, NVIDIA CORPORATION. - -{% set version = environ['RAPIDS_PACKAGE_VERSION'].lstrip('v') %} -{% set minor_version = version.split('.')[0] + '.' + version.split('.')[1] %} -{% set py_version = environ['CONDA_PY'] %} -{% set cuda_version = '.'.join(environ['RAPIDS_CUDA_VERSION'].split('.')[:2]) %} -{% set cuda_major = cuda_version.split('.')[0] %} -{% set date_string = environ['RAPIDS_DATE_STRING'] %} - -package: - name: cudf-polars - version: {{ version }} - -source: - path: ../../.. 
- -build: - number: {{ GIT_DESCRIBE_NUMBER }} - string: cuda{{ cuda_major }}_py{{ py_version }}_{{ date_string }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }} - script_env: - - AWS_ACCESS_KEY_ID - - AWS_SECRET_ACCESS_KEY - - AWS_SESSION_TOKEN - - CMAKE_C_COMPILER_LAUNCHER - - CMAKE_CUDA_COMPILER_LAUNCHER - - CMAKE_CXX_COMPILER_LAUNCHER - - CMAKE_GENERATOR - - PARALLEL_LEVEL - - SCCACHE_BUCKET - - SCCACHE_IDLE_TIMEOUT - - SCCACHE_REGION - - SCCACHE_S3_KEY_PREFIX=cudf-polars-aarch64 # [aarch64] - - SCCACHE_S3_KEY_PREFIX=cudf-polars-linux64 # [linux64] - - SCCACHE_S3_USE_SSL - - SCCACHE_S3_NO_CREDENTIALS - -requirements: - host: - - python - - rapids-build-backend >=0.3.0,<0.4.0.dev0 - - setuptools - - cuda-version ={{ cuda_version }} - run: - - python - - pylibcudf ={{ version }} - - polars >=1.20,<1.24 - - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }} - -test: - requires: - - cuda-version ={{ cuda_version }} - imports: - - cudf_polars - - -about: - home: https://rapids.ai/ - license: Apache-2.0 - license_family: APACHE - license_file: LICENSE - summary: cudf-polars library diff --git a/conda/recipes/cudf-polars/recipe.yaml b/conda/recipes/cudf-polars/recipe.yaml new file mode 100644 index 00000000000..8eaf7e4f843 --- /dev/null +++ b/conda/recipes/cudf-polars/recipe.yaml @@ -0,0 +1,67 @@ +# Copyright (c) 2018-2025, NVIDIA CORPORATION. +schema_version: 1 + +context: + version: ${{ env.get("RAPIDS_PACKAGE_VERSION") }} + minor_version: ${{ (version | split("."))[:2] | join(".") }} + cuda_version: ${{ (env.get("RAPIDS_CUDA_VERSION") | split("."))[:2] | join(".") }} + cuda_major: '${{ (env.get("RAPIDS_CUDA_VERSION") | split("."))[0] }}' + date_string: '${{ env.get("RAPIDS_DATE_STRING") }}' + py_version: ${{ env.get("RAPIDS_PY_VERSION") }} + py_buildstring: ${{ py_version | version_to_buildstring }} + head_rev: ${{ git.head_rev(".")[:8] }} + +package: + name: cudf-polars + version: ${{ version }} + +source: + path: ../../.. 
+ +build: + string: cuda${{ cuda_major }}_py${{ py_buildstring }}_${{ date_string }}_${{ head_rev }} + script: + content: | + ./build.sh cudf_polars + secrets: + - AWS_ACCESS_KEY_ID + - AWS_SECRET_ACCESS_KEY + - AWS_SESSION_TOKEN + env: + CMAKE_C_COMPILER_LAUNCHER: ${{ env.get("CMAKE_C_COMPILER_LAUNCHER") }} + CMAKE_CUDA_COMPILER_LAUNCHER: ${{ env.get("CMAKE_CUDA_COMPILER_LAUNCHER") }} + CMAKE_CXX_COMPILER_LAUNCHER: ${{ env.get("CMAKE_CXX_COMPILER_LAUNCHER") }} + CMAKE_GENERATOR: ${{ env.get("CMAKE_GENERATOR") }} + SCCACHE_BUCKET: ${{ env.get("SCCACHE_BUCKET") }} + SCCACHE_IDLE_TIMEOUT: ${{ env.get("SCCACHE_IDLE_TIMEOUT") }} + SCCACHE_REGION: ${{ env.get("SCCACHE_REGION") }} + SCCACHE_S3_USE_SSL: ${{ env.get("SCCACHE_S3_USE_SSL") }} + SCCACHE_S3_NO_CREDENTIALS: ${{ env.get("SCCACHE_S3_NO_CREDENTIALS") }} + SCCACHE_S3_KEY_PREFIX: cudf-polars-${{ env.get("RAPIDS_CONDA_ARCH") }} + +requirements: + host: + - python =${{ py_version }} + - pip + - rapids-build-backend >=0.3.0,<0.4.0.dev0 + - setuptools + - cuda-version =${{ cuda_version }} + run: + - python + - pylibcudf =${{ version }} + - polars >=1.20,<1.24 + - ${{ pin_compatible("cuda-version", upper_bound="x", lower_bound="x") }} + ignore_run_exports: + by_name: + - cuda-version + +tests: + - python: + imports: + - cudf_polars + pip_check: false + +about: + homepage: ${{ load_from_file("python/cudf_polars/pyproject.toml").project.urls.Homepage }} + license: ${{ load_from_file("python/cudf_polars/pyproject.toml").project.license.text }} + summary: ${{ load_from_file("python/cudf_polars/pyproject.toml").project.description }} diff --git a/conda/recipes/cudf/build.sh b/conda/recipes/cudf/build.sh deleted file mode 100644 index 43d046402c7..00000000000 --- a/conda/recipes/cudf/build.sh +++ /dev/null @@ -1,4 +0,0 @@ -# Copyright (c) 2018-2022, NVIDIA CORPORATION. 
- -# This assumes the script is executed from the root of the repo directory -./build.sh cudf diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml deleted file mode 100644 index 43060ef1c87..00000000000 --- a/conda/recipes/cudf/meta.yaml +++ /dev/null @@ -1,119 +0,0 @@ -# Copyright (c) 2018-2025, NVIDIA CORPORATION. - -{% set version = environ['RAPIDS_PACKAGE_VERSION'].lstrip('v') %} -{% set minor_version = version.split('.')[0] + '.' + version.split('.')[1] %} -{% set py_version = environ['CONDA_PY'] %} -{% set cuda_version = '.'.join(environ['RAPIDS_CUDA_VERSION'].split('.')[:2]) %} -{% set cuda_major = cuda_version.split('.')[0] %} -{% set date_string = environ['RAPIDS_DATE_STRING'] %} - -package: - name: cudf - version: {{ version }} - -source: - path: ../../.. - -build: - number: {{ GIT_DESCRIBE_NUMBER }} - string: cuda{{ cuda_major }}_py{{ py_version }}_{{ date_string }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }} - script_env: - - AWS_ACCESS_KEY_ID - - AWS_SECRET_ACCESS_KEY - - AWS_SESSION_TOKEN - - CMAKE_C_COMPILER_LAUNCHER - - CMAKE_CUDA_COMPILER_LAUNCHER - - CMAKE_CXX_COMPILER_LAUNCHER - - CMAKE_GENERATOR - - PARALLEL_LEVEL - - SCCACHE_BUCKET - - SCCACHE_IDLE_TIMEOUT - - SCCACHE_REGION - - SCCACHE_S3_KEY_PREFIX=cudf-aarch64 # [aarch64] - - SCCACHE_S3_KEY_PREFIX=cudf-linux64 # [linux64] - - SCCACHE_S3_USE_SSL - - SCCACHE_S3_NO_CREDENTIALS - ignore_run_exports_from: - - {{ compiler('cuda') }} - {% if cuda_major != "11" %} - - cuda-cudart-dev - - libcufile-dev # [linux64] - {% endif %} - -requirements: - build: - - cmake {{ cmake_version }} - - ninja - - {{ compiler('c') }} - - {{ compiler('cxx') }} - {% if cuda_major == "11" %} - - {{ compiler('cuda') }} ={{ cuda_version }} - {% else %} - - {{ compiler('cuda') }} - {% endif %} - - cuda-version ={{ cuda_version }} - - {{ stdlib("c") }} - host: - - python - - cython >=3.0.3 - - rapids-build-backend >=0.3.0,<0.4.0.dev0 - - scikit-build-core >=0.10.0 - - dlpack >=0.8,<1.0 - - libcudf 
={{ version }} - - pylibcudf ={{ version }} - - rmm ={{ minor_version }} - {% if cuda_major == "11" %} - - cudatoolkit - {% else %} - - cuda-cudart-dev - - cuda-nvrtc - - libcufile-dev # [linux64] - {% endif %} - - cuda-version ={{ cuda_version }} - run: - - python - - typing_extensions >=4.0.0 - - pandas >=2.0,<2.2.4dev0 - - cupy >=12.0.0 - - numba-cuda >=0.4.0,<0.5.0a0 - - numba >=0.59.1,<0.62.0a0 - - numpy >=1.23,<2.1 - - pyarrow>=14.0.0,<20.0.0a0 - - libcudf ={{ version }} - - pylibcudf ={{ version }} - - {{ pin_compatible('rmm', max_pin='x.x') }} - - fsspec >=0.6.0 - {% if cuda_major == "11" %} - - cudatoolkit - - ptxcompiler >=0.7.0 - - cubinlinker # CUDA enhanced compatibility. - - cuda-python >=11.8.5,<12.0a0 - {% else %} - - cuda-cudart - - libcufile # [linux64] - # Needed by Numba for CUDA support - - cuda-nvcc-impl - # TODO: Add nvjitlink here - # xref: https://github.com/rapidsai/cudf/issues/12822 - - cuda-nvrtc - - cuda-python >=12.6.2,<13.0a0 - - pynvjitlink - {% endif %} - - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }} - - nvtx >=0.2.1 - - packaging - - cachetools - - rich - -test: - requires: - - cuda-version ={{ cuda_version }} - imports: - - cudf - -about: - home: https://rapids.ai/ - license: Apache-2.0 - license_family: APACHE - license_file: LICENSE - summary: cuDF GPU DataFrame core library diff --git a/conda/recipes/cudf/recipe.yaml b/conda/recipes/cudf/recipe.yaml new file mode 100644 index 00000000000..2cb330fb76d --- /dev/null +++ b/conda/recipes/cudf/recipe.yaml @@ -0,0 +1,126 @@ +# Copyright (c) 2018-2025, NVIDIA CORPORATION. 
+schema_version: 1 + +context: + version: ${{ env.get("RAPIDS_PACKAGE_VERSION") }} + minor_version: ${{ (version | split("."))[:2] | join(".") }} + cuda_version: ${{ (env.get("RAPIDS_CUDA_VERSION") | split("."))[:2] | join(".") }} + cuda_major: '${{ (env.get("RAPIDS_CUDA_VERSION") | split("."))[0] }}' + date_string: '${{ env.get("RAPIDS_DATE_STRING") }}' + py_version: ${{ env.get("RAPIDS_PY_VERSION") }} + py_buildstring: ${{ py_version | version_to_buildstring }} + head_rev: ${{ git.head_rev(".")[:8] }} + +package: + name: cudf + version: ${{ version }} + +source: + path: ../../.. + +build: + string: cuda${{ cuda_major }}_py${{ py_buildstring }}_${{ date_string }}_${{ head_rev }} + script: + content: | + ./build.sh cudf + secrets: + - AWS_ACCESS_KEY_ID + - AWS_SECRET_ACCESS_KEY + - AWS_SESSION_TOKEN + env: + CMAKE_C_COMPILER_LAUNCHER: ${{ env.get("CMAKE_C_COMPILER_LAUNCHER") }} + CMAKE_CUDA_COMPILER_LAUNCHER: ${{ env.get("CMAKE_CUDA_COMPILER_LAUNCHER") }} + CMAKE_CXX_COMPILER_LAUNCHER: ${{ env.get("CMAKE_CXX_COMPILER_LAUNCHER") }} + CMAKE_GENERATOR: ${{ env.get("CMAKE_GENERATOR") }} + SCCACHE_BUCKET: ${{ env.get("SCCACHE_BUCKET") }} + SCCACHE_IDLE_TIMEOUT: ${{ env.get("SCCACHE_IDLE_TIMEOUT") }} + SCCACHE_REGION: ${{ env.get("SCCACHE_REGION") }} + SCCACHE_S3_USE_SSL: ${{ env.get("SCCACHE_S3_USE_SSL") }} + SCCACHE_S3_NO_CREDENTIALS: ${{ env.get("SCCACHE_S3_NO_CREDENTIALS") }} + SCCACHE_S3_KEY_PREFIX: cudf-${{ env.get("RAPIDS_CONDA_ARCH") }} + +requirements: + build: + - cmake ${{ cmake_version }} + - ninja + - ${{ compiler("c") }} + - ${{ compiler("cxx") }} + - ${{ compiler("cuda") }} + - cuda-version =${{ cuda_version }} + - ${{ stdlib("c") }} + host: + - python =${{ py_version }} + - pip + - cython >=3.0.3 + - rapids-build-backend >=0.3.0,<0.4.0.dev0 + - scikit-build-core >=0.10.0 + - dlpack >=0.8,<1.0 + - libcudf =${{ version }} + - pylibcudf =${{ version }} + - rmm =${{ minor_version }} + - if: cuda_major == "11" + then: + - cudatoolkit + else: + - 
cuda-cudart-dev + - cuda-nvrtc + - if: linux64 + then: + - libcufile-dev + - cuda-version =${{ cuda_version }} + run: + - python + - typing_extensions >=4.0.0 + - pandas >=2.0,<2.2.4dev0 + - cupy >=12.0.0 + - numba-cuda >=0.4.0,<0.5.0a0 + - numba >=0.59.1,<0.62.0a0 + - numpy >=1.23,<2.1 + - pyarrow>=14.0.0,<20.0.0a0 + - libcudf =${{ version }} + - pylibcudf =${{ version }} + - ${{ pin_compatible("rmm", upper_bound="x.x") }} + - fsspec >=0.6.0 + - if: cuda_major == "11" + then: + - cudatoolkit + - ptxcompiler >=0.7.0 + - cubinlinker # CUDA enhanced compatibility. + - cuda-python >=11.8.5,<12.0a0 + else: + - cuda-cudart + # Needed by Numba for CUDA support + - cuda-nvcc-impl + # TODO: Add nvjitlink here + # xref: https://github.com/rapidsai/cudf/issues/12822 + - cuda-nvrtc + - cuda-python >=12.6.2,<13.0a0 + - pynvjitlink + - if: linux64 + then: + - libcufile + - ${{ pin_compatible("cuda-version", upper_bound="x", lower_bound="x") }} + - nvtx >=0.2.1 + - packaging + - cachetools + - rich + ignore_run_exports: + from_package: + - if: cuda_major != "11" + then: + - cuda-cudart-dev + - if: linux64 + then: libcufile-dev + by_name: + - cuda-version + +tests: + - python: + imports: + - cudf + pip_check: false + +about: + homepage: ${{ load_from_file("python/cudf/pyproject.toml").project.urls.Homepage }} + license: ${{ load_from_file("python/cudf/pyproject.toml").project.license.text }} + summary: ${{ load_from_file("python/cudf/pyproject.toml").project.description }} diff --git a/conda/recipes/cudf_kafka/build.sh b/conda/recipes/cudf_kafka/build.sh deleted file mode 100644 index 9458349d101..00000000000 --- a/conda/recipes/cudf_kafka/build.sh +++ /dev/null @@ -1,3 +0,0 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. 
- -./build.sh -v cudf_kafka diff --git a/conda/recipes/cudf_kafka/meta.yaml b/conda/recipes/cudf_kafka/meta.yaml deleted file mode 100644 index a070c041d99..00000000000 --- a/conda/recipes/cudf_kafka/meta.yaml +++ /dev/null @@ -1,86 +0,0 @@ -# Copyright (c) 2020-2025, NVIDIA CORPORATION. - -{% set version = environ['RAPIDS_PACKAGE_VERSION'].lstrip('v') %} -{% set minor_version = version.split('.')[0] + '.' + version.split('.')[1] %} -{% set py_version = environ['CONDA_PY'] %} -{% set cuda_version = '.'.join(environ['RAPIDS_CUDA_VERSION'].split('.')[:2]) %} -{% set cuda_major = cuda_version.split('.')[0] %} -{% set date_string = environ['RAPIDS_DATE_STRING'] %} - -package: - name: cudf_kafka - version: {{ version }} - -source: - path: ../../.. - -build: - number: {{ GIT_DESCRIBE_NUMBER }} - string: cuda{{ cuda_major }}_py{{ py_version }}_{{ date_string }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }} - script_env: - - AWS_ACCESS_KEY_ID - - AWS_SECRET_ACCESS_KEY - - AWS_SESSION_TOKEN - - CMAKE_C_COMPILER_LAUNCHER - - CMAKE_CUDA_COMPILER_LAUNCHER - - CMAKE_CXX_COMPILER_LAUNCHER - - CMAKE_GENERATOR - - PARALLEL_LEVEL - - SCCACHE_BUCKET - - SCCACHE_IDLE_TIMEOUT - - SCCACHE_REGION - - SCCACHE_S3_KEY_PREFIX=cudf-kafka-aarch64 # [aarch64] - - SCCACHE_S3_KEY_PREFIX=cudf-kafka-linux64 # [linux64] - - SCCACHE_S3_USE_SSL - - SCCACHE_S3_NO_CREDENTIALS - ignore_run_exports_from: - - {{ compiler('cuda') }} - {% if cuda_major != "11" %} - - cuda-cudart-dev - {% endif %} - -requirements: - build: - - cmake {{ cmake_version }} - - ninja - - {{ compiler('c') }} - - {{ compiler('cxx') }} - {% if cuda_major == "11" %} - - {{ compiler('cuda') }} ={{ cuda_version }} - {% else %} - - {{ compiler('cuda') }} - {% endif %} - - cuda-version ={{ cuda_version }} - - {{ stdlib("c") }} - host: - - python - - cython >=3.0.3 - - cuda-version ={{ cuda_version }} - - pylibcudf ={{ version }} - - libcudf_kafka ={{ version }} - - rapids-build-backend >=0.3.0,<0.4.0.dev0 - - scikit-build-core 
>=0.10.0 - {% if cuda_major != "11" %} - - cuda-cudart-dev - {% endif %} - run: - - python - - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }} - - libcudf_kafka ={{ version }} - - pylibcudf ={{ version }} - {% if cuda_major != "11" %} - - cuda-cudart - {% endif %} - -test: - requires: - - cuda-version ={{ cuda_version }} - imports: - - cudf_kafka - -about: - home: https://rapids.ai/ - license: Apache-2.0 - license_family: APACHE - license_file: LICENSE - summary: libcudf_kafka library diff --git a/conda/recipes/cudf_kafka/recipe.yaml b/conda/recipes/cudf_kafka/recipe.yaml new file mode 100644 index 00000000000..aba9d979e44 --- /dev/null +++ b/conda/recipes/cudf_kafka/recipe.yaml @@ -0,0 +1,85 @@ +# Copyright (c) 2018-2025, NVIDIA CORPORATION. +schema_version: 1 + +context: + version: ${{ env.get("RAPIDS_PACKAGE_VERSION") }} + minor_version: ${{ (version | split("."))[:2] | join(".") }} + cuda_version: ${{ (env.get("RAPIDS_CUDA_VERSION") | split("."))[:2] | join(".") }} + cuda_major: '${{ (env.get("RAPIDS_CUDA_VERSION") | split("."))[0] }}' + date_string: '${{ env.get("RAPIDS_DATE_STRING") }}' + py_version: ${{ env.get("RAPIDS_PY_VERSION") }} + py_buildstring: ${{ py_version | version_to_buildstring }} + head_rev: ${{ git.head_rev(".")[:8] }} + +package: + name: cudf_kafka + version: ${{ version }} + +source: + path: ../../.. 
+ +build: + string: cuda${{ cuda_major }}_py${{ py_buildstring }}_${{ date_string }}_${{ head_rev }} + script: + content: | + ./build.sh cudf_kafka + secrets: + - AWS_ACCESS_KEY_ID + - AWS_SECRET_ACCESS_KEY + - AWS_SESSION_TOKEN + env: + CMAKE_C_COMPILER_LAUNCHER: ${{ env.get("CMAKE_C_COMPILER_LAUNCHER") }} + CMAKE_CUDA_COMPILER_LAUNCHER: ${{ env.get("CMAKE_CUDA_COMPILER_LAUNCHER") }} + CMAKE_CXX_COMPILER_LAUNCHER: ${{ env.get("CMAKE_CXX_COMPILER_LAUNCHER") }} + CMAKE_GENERATOR: ${{ env.get("CMAKE_GENERATOR") }} + SCCACHE_BUCKET: ${{ env.get("SCCACHE_BUCKET") }} + SCCACHE_IDLE_TIMEOUT: ${{ env.get("SCCACHE_IDLE_TIMEOUT") }} + SCCACHE_REGION: ${{ env.get("SCCACHE_REGION") }} + SCCACHE_S3_USE_SSL: ${{ env.get("SCCACHE_S3_USE_SSL") }} + SCCACHE_S3_NO_CREDENTIALS: ${{ env.get("SCCACHE_S3_NO_CREDENTIALS") }} + SCCACHE_S3_KEY_PREFIX: cudf-kafka-${{ env.get("RAPIDS_CONDA_ARCH") }} + +requirements: + build: + - cmake ${{ cmake_version }} + - ninja + - ${{ compiler("c") }} + - ${{ compiler("cxx") }} + - ${{ compiler("cuda") }} + - cuda-version =${{ cuda_version }} + - ${{ stdlib("c") }} + host: + - python =${{ py_version }} + - pip + - cython >=3.0.3 + - cuda-version =${{ cuda_version }} + - pylibcudf =${{ version }} + - libcudf_kafka =${{ version }} + - rapids-build-backend >=0.3.0,<0.4.0.dev0 + - scikit-build-core >=0.10.0 + - if: cuda_major != "11" + then: cuda-cudart-dev + run: + - python + - ${{ pin_compatible("cuda-version", upper_bound="x", lower_bound="x") }} + - libcudf_kafka =${{ version }} + - pylibcudf =${{ version }} + - if: cuda_major != "11" + then: cuda-cudart + ignore_run_exports: + from_package: + - if: cuda_major != "11" + then: cuda-cudart-dev + by_name: + - cuda-version + +tests: + - python: + imports: + - cudf_kafka + pip_check: false + +about: + homepage: ${{ load_from_file("python/cudf_kafka/pyproject.toml").project.urls.Homepage }} + license: ${{ load_from_file("python/cudf_kafka/pyproject.toml").project.license.text }} + summary: ${{ 
load_from_file("python/cudf_kafka/pyproject.toml").project.description }} diff --git a/conda/recipes/custreamz/build.sh b/conda/recipes/custreamz/build.sh deleted file mode 100644 index 88fccf90c69..00000000000 --- a/conda/recipes/custreamz/build.sh +++ /dev/null @@ -1,4 +0,0 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. - -# This assumes the script is executed from the root of the repo directory -./build.sh -v custreamz diff --git a/conda/recipes/custreamz/meta.yaml b/conda/recipes/custreamz/meta.yaml deleted file mode 100644 index a031f05a73a..00000000000 --- a/conda/recipes/custreamz/meta.yaml +++ /dev/null @@ -1,65 +0,0 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. - -{% set version = environ['RAPIDS_PACKAGE_VERSION'].lstrip('v') %} -{% set minor_version = version.split('.')[0] + '.' + version.split('.')[1] %} -{% set py_version = environ['CONDA_PY'] %} -{% set cuda_version = '.'.join(environ['RAPIDS_CUDA_VERSION'].split('.')[:2]) %} -{% set cuda_major = cuda_version.split('.')[0] %} -{% set date_string = environ['RAPIDS_DATE_STRING'] %} - -package: - name: custreamz - version: {{ version }} - -source: - path: ../../.. 
- -build: - number: {{ GIT_DESCRIBE_NUMBER }} - string: cuda{{ cuda_major }}_py{{ py_version }}_{{ date_string }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }} - script_env: - - AWS_ACCESS_KEY_ID - - AWS_SECRET_ACCESS_KEY - - AWS_SESSION_TOKEN - - CMAKE_C_COMPILER_LAUNCHER - - CMAKE_CUDA_COMPILER_LAUNCHER - - CMAKE_CXX_COMPILER_LAUNCHER - - CMAKE_GENERATOR - - PARALLEL_LEVEL - - SCCACHE_BUCKET - - SCCACHE_IDLE_TIMEOUT - - SCCACHE_REGION - - SCCACHE_S3_KEY_PREFIX=custreamz-aarch64 # [aarch64] - - SCCACHE_S3_KEY_PREFIX=custreamz-linux64 # [linux64] - - SCCACHE_S3_USE_SSL - - SCCACHE_S3_NO_CREDENTIALS - -requirements: - host: - - python - - rapids-build-backend >=0.3.0,<0.4.0.dev0 - - setuptools - - python-confluent-kafka >=2.5.0,<2.6.0a0 - - cudf_kafka ={{ version }} - - cuda-version ={{ cuda_version }} - run: - - python - - streamz - - cudf ={{ version }} - - cudf_kafka ={{ version }} - - rapids-dask-dependency ={{ minor_version }} - - python-confluent-kafka >=2.5.0,<2.6.0a0 - - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }} - -test: - requires: - - cuda-version ={{ cuda_version }} - imports: - - custreamz - -about: - home: https://rapids.ai/ - license: Apache-2.0 - license_family: APACHE - license_file: LICENSE - summary: cuStreamz library diff --git a/conda/recipes/custreamz/recipe.yaml b/conda/recipes/custreamz/recipe.yaml new file mode 100644 index 00000000000..4713df9efad --- /dev/null +++ b/conda/recipes/custreamz/recipe.yaml @@ -0,0 +1,54 @@ +# Copyright (c) 2018-2025, NVIDIA CORPORATION. 
+schema_version: 1 + +context: + version: ${{ env.get("RAPIDS_PACKAGE_VERSION") }} + minor_version: ${{ (version | split("."))[:2] | join(".") }} + cuda_version: ${{ (env.get("RAPIDS_CUDA_VERSION") | split("."))[:2] | join(".") }} + cuda_major: '${{ (env.get("RAPIDS_CUDA_VERSION") | split("."))[0] }}' + date_string: '${{ env.get("RAPIDS_DATE_STRING") }}' + py_version: ${{ env.get("RAPIDS_PY_VERSION") }} + py_buildstring: ${{ py_version | version_to_buildstring }} + head_rev: ${{ git.head_rev(".")[:8] }} + +package: + name: custreamz + version: ${{ version }} + +source: + path: ../../.. + +build: + string: cuda${{ cuda_major }}_py${{ py_buildstring }}_${{ date_string }}_${{ head_rev }} + script: + content: | + ./build.sh custreamz + +requirements: + host: + - python =${{ py_version }} + - pip + - rapids-build-backend >=0.3.0,<0.4.0.dev0 + - setuptools + - python-confluent-kafka >=2.5.0,<2.6.0a0 + - cudf_kafka =${{ version }} + - cuda-version =${{ cuda_version }} + run: + - python + - streamz + - cudf =${{ version }} + - cudf_kafka =${{ version }} + - rapids-dask-dependency =${{ minor_version }} + - python-confluent-kafka >=2.5.0,<2.6.0a0 + - ${{ pin_compatible("cuda-version", upper_bound="x", lower_bound="x") }} + +tests: + - python: + imports: + - custreamz + pip_check: false + +about: + homepage: ${{ load_from_file("python/custreamz/pyproject.toml").project.urls.Homepage }} + license: ${{ load_from_file("python/custreamz/pyproject.toml").project.license.text }} + summary: ${{ load_from_file("python/custreamz/pyproject.toml").project.description }} diff --git a/conda/recipes/dask-cudf/build.sh b/conda/recipes/dask-cudf/build.sh deleted file mode 100644 index 473f52c28a0..00000000000 --- a/conda/recipes/dask-cudf/build.sh +++ /dev/null @@ -1,4 +0,0 @@ -# Copyright (c) 2018-2019, NVIDIA CORPORATION. 
- -# This assumes the script is executed from the root of the repo directory -./build.sh dask_cudf diff --git a/conda/recipes/dask-cudf/meta.yaml b/conda/recipes/dask-cudf/meta.yaml deleted file mode 100644 index a476d5d53df..00000000000 --- a/conda/recipes/dask-cudf/meta.yaml +++ /dev/null @@ -1,62 +0,0 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. - -{% set version = environ['RAPIDS_PACKAGE_VERSION'].lstrip('v') %} -{% set minor_version = version.split('.')[0] + '.' + version.split('.')[1] %} -{% set py_version = environ['CONDA_PY'] %} -{% set cuda_version = '.'.join(environ['RAPIDS_CUDA_VERSION'].split('.')[:2]) %} -{% set cuda_major = cuda_version.split('.')[0] %} -{% set date_string = environ['RAPIDS_DATE_STRING'] %} - -package: - name: dask-cudf - version: {{ version }} - -source: - path: ../../.. - -build: - number: {{ GIT_DESCRIBE_NUMBER }} - string: cuda{{ cuda_major }}_py{{ py_version }}_{{ date_string }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }} - script_env: - - AWS_ACCESS_KEY_ID - - AWS_SECRET_ACCESS_KEY - - AWS_SESSION_TOKEN - - CMAKE_C_COMPILER_LAUNCHER - - CMAKE_CUDA_COMPILER_LAUNCHER - - CMAKE_CXX_COMPILER_LAUNCHER - - CMAKE_GENERATOR - - PARALLEL_LEVEL - - SCCACHE_BUCKET - - SCCACHE_IDLE_TIMEOUT - - SCCACHE_REGION - - SCCACHE_S3_KEY_PREFIX=dask-cudf-aarch64 # [aarch64] - - SCCACHE_S3_KEY_PREFIX=dask-cudf-linux64 # [linux64] - - SCCACHE_S3_USE_SSL - - SCCACHE_S3_NO_CREDENTIALS - -requirements: - host: - - python - - rapids-build-backend >=0.3.0,<0.4.0.dev0 - - setuptools - - cuda-version ={{ cuda_version }} - run: - - python - - cudf ={{ version }} - - pynvml >=12.0.0,<13.0.0a0 - - rapids-dask-dependency ={{ minor_version }} - - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }} - -test: - requires: - - cuda-version ={{ cuda_version }} - imports: - - dask_cudf - - -about: - home: https://rapids.ai/ - license: Apache-2.0 - license_family: APACHE - license_file: LICENSE - summary: dask-cudf library diff --git 
a/conda/recipes/dask-cudf/recipe.yaml b/conda/recipes/dask-cudf/recipe.yaml new file mode 100644 index 00000000000..997150d2832 --- /dev/null +++ b/conda/recipes/dask-cudf/recipe.yaml @@ -0,0 +1,50 @@ +# Copyright (c) 2018-2025, NVIDIA CORPORATION. +schema_version: 1 + +context: + version: ${{ env.get("RAPIDS_PACKAGE_VERSION") }} + minor_version: ${{ (version | split("."))[:2] | join(".") }} + cuda_version: ${{ (env.get("RAPIDS_CUDA_VERSION") | split("."))[:2] | join(".") }} + cuda_major: '${{ (env.get("RAPIDS_CUDA_VERSION") | split("."))[0] }}' + date_string: '${{ env.get("RAPIDS_DATE_STRING") }}' + py_version: ${{ env.get("RAPIDS_PY_VERSION") }} + py_buildstring: ${{ py_version | version_to_buildstring }} + head_rev: ${{ git.head_rev(".")[:8] }} + +package: + name: dask-cudf + version: ${{ version }} + +source: + path: ../../.. + +build: + string: cuda${{ cuda_major }}_py${{ py_buildstring }}_${{ date_string }}_${{ head_rev }} + script: + content: | + ./build.sh dask_cudf + +requirements: + host: + - python =${{ py_version }} + - pip + - rapids-build-backend >=0.3.0,<0.4.0.dev0 + - setuptools + - cuda-version =${{ cuda_version }} + run: + - python + - cudf =${{ version }} + - pynvml >=12.0.0,<13.0.0a0 + - rapids-dask-dependency =${{ minor_version }} + - ${{ pin_compatible("cuda-version", upper_bound="x", lower_bound="x") }} + +tests: + - python: + imports: + - dask_cudf + pip_check: false + +about: + homepage: ${{ load_from_file("python/dask_cudf/pyproject.toml").project.urls.Homepage }} + license: ${{ load_from_file("python/dask_cudf/pyproject.toml").project.license.text }} + summary: ${{ load_from_file("python/dask_cudf/pyproject.toml").project.description }} diff --git a/conda/recipes/libcudf/build.sh b/conda/recipes/libcudf/build.sh deleted file mode 100644 index a3a0415575b..00000000000 --- a/conda/recipes/libcudf/build.sh +++ /dev/null @@ -1,9 +0,0 @@ -#!/bin/bash -# Copyright (c) 2018-2024, NVIDIA CORPORATION. 
- -export cudf_ROOT="$(realpath ./cpp/build)" - -./build.sh -n -v \ - libcudf libcudf_kafka benchmarks tests \ - --build_metrics --incl_cache_stats --allgpuarch \ - --cmake-args=\"-DCMAKE_INSTALL_LIBDIR=lib -DCUDF_ENABLE_ARROW_S3=ON\" diff --git a/conda/recipes/libcudf/install_libcudf.sh b/conda/recipes/libcudf/install_libcudf.sh deleted file mode 100644 index 173f8cfa90f..00000000000 --- a/conda/recipes/libcudf/install_libcudf.sh +++ /dev/null @@ -1,4 +0,0 @@ -#!/bin/bash -# Copyright (c) 2018-2022, NVIDIA CORPORATION. - -cmake --install cpp/build diff --git a/conda/recipes/libcudf/install_libcudf_example.sh b/conda/recipes/libcudf/install_libcudf_example.sh deleted file mode 100644 index 1a52dec99e3..00000000000 --- a/conda/recipes/libcudf/install_libcudf_example.sh +++ /dev/null @@ -1,5 +0,0 @@ -#!/bin/bash -# Copyright (c) 2018-2024, NVIDIA CORPORATION. - -# build and install libcudf examples -./cpp/examples/build.sh --install diff --git a/conda/recipes/libcudf/install_libcudf_kafka.sh b/conda/recipes/libcudf/install_libcudf_kafka.sh deleted file mode 100644 index 9eae2510027..00000000000 --- a/conda/recipes/libcudf/install_libcudf_kafka.sh +++ /dev/null @@ -1,4 +0,0 @@ -#!/bin/bash -# Copyright (c) 2018-2022, NVIDIA CORPORATION. - -cmake --install cpp/libcudf_kafka/build diff --git a/conda/recipes/libcudf/install_libcudf_tests.sh b/conda/recipes/libcudf/install_libcudf_tests.sh deleted file mode 100644 index 069462eec9d..00000000000 --- a/conda/recipes/libcudf/install_libcudf_tests.sh +++ /dev/null @@ -1,5 +0,0 @@ -#!/bin/bash -# Copyright (c) 2018-2022, NVIDIA CORPORATION. - -cmake --install cpp/build --component testing -cmake --install cpp/libcudf_kafka/build --component testing diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml deleted file mode 100644 index f7bd7280f0f..00000000000 --- a/conda/recipes/libcudf/meta.yaml +++ /dev/null @@ -1,220 +0,0 @@ -# Copyright (c) 2018-2025, NVIDIA CORPORATION. 
- -{% set version = environ['RAPIDS_PACKAGE_VERSION'].lstrip('v') %} -{% set minor_version = version.split('.')[0] + '.' + version.split('.')[1] %} -{% set cuda_version = '.'.join(environ['RAPIDS_CUDA_VERSION'].split('.')[:2]) %} -{% set cuda_major = cuda_version.split('.')[0] %} -{% set date_string = environ['RAPIDS_DATE_STRING'] %} - -package: - name: libcudf-split - -source: - path: ../../.. - -build: - script_env: - - AWS_ACCESS_KEY_ID - - AWS_SECRET_ACCESS_KEY - - AWS_SESSION_TOKEN - - CMAKE_C_COMPILER_LAUNCHER - - CMAKE_CUDA_COMPILER_LAUNCHER - - CMAKE_CXX_COMPILER_LAUNCHER - - CMAKE_GENERATOR - - PARALLEL_LEVEL - - RAPIDS_ARTIFACTS_DIR - - SCCACHE_BUCKET - - SCCACHE_IDLE_TIMEOUT - - SCCACHE_REGION - - SCCACHE_S3_KEY_PREFIX=libcudf-aarch64 # [aarch64] - - SCCACHE_S3_KEY_PREFIX=libcudf-linux64 # [linux64] - - SCCACHE_S3_USE_SSL - - SCCACHE_S3_NO_CREDENTIALS - -requirements: - build: - - cmake {{ cmake_version }} - - {{ compiler('c') }} - - {{ compiler('cxx') }} - {% if cuda_major == "11" %} - - {{ compiler('cuda') }} ={{ cuda_version }} - {% else %} - - {{ compiler('cuda') }} - {% endif %} - - cuda-version ={{ cuda_version }} - - ninja - - {{ stdlib("c") }} - host: - - librmm ={{ minor_version }} - - libkvikio ={{ minor_version }} - {% if cuda_major == "11" %} - - cudatoolkit - - libcufile {{ cuda11_libcufile_host_version }} # [linux64] - - libcufile-dev {{ cuda11_libcufile_host_version }} # [linux64] - - libcurand {{ cuda11_libcurand_host_version }} - - libcurand-dev {{ cuda11_libcurand_host_version }} - - cuda-nvrtc ={{ cuda_version }} - - cuda-nvrtc-dev ={{ cuda_version }} - - cuda-nvtx ={{ cuda_version }} - {% else %} - - cuda-nvrtc-dev - - cuda-nvtx-dev - - libcufile-dev # [linux64] - - libcurand-dev - {% endif %} - - cuda-version ={{ cuda_version }} - - nvcomp {{ nvcomp_version }} - - dlpack {{ dlpack_version }} - - librdkafka {{ librdkafka_version }} - - flatbuffers {{ flatbuffers_version }} - - rapids-logger =0.1 - - zlib {{ zlib_version }} - -outputs: 
- - name: libcudf - version: {{ version }} - script: install_libcudf.sh - build: - number: {{ GIT_DESCRIBE_NUMBER }} - string: cuda{{ cuda_major }}_{{ date_string }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }} - run_exports: - - {{ pin_subpackage("libcudf", max_pin="x.x") }} - ignore_run_exports_from: - - {{ compiler('cuda') }} - requirements: - build: - - cmake {{ cmake_version }} - host: - - cuda-version ={{ cuda_version }} - run: - - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }} - {% if cuda_major == "11" %} - - cudatoolkit - - libcufile {{ cuda11_libcufile_run_version }} # [linux64] - {% else %} - - cuda-nvrtc - - libcufile # [linux64] - {% endif %} - - nvcomp {{ nvcomp_version }} - - librmm ={{ minor_version }} - - libkvikio ={{ minor_version }} - - dlpack {{ dlpack_version }} - - rapids-logger =0.1 - test: - commands: - - test -f $PREFIX/lib/libcudf.so - - test -f $PREFIX/include/cudf/column/column.hpp - about: - home: https://rapids.ai/ - license: Apache-2.0 - license_family: APACHE - license_file: LICENSE - summary: libcudf library - - name: libcudf_kafka - version: {{ version }} - script: install_libcudf_kafka.sh - build: - number: {{ GIT_DESCRIBE_NUMBER }} - string: cuda{{ cuda_major }}_{{ date_string }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }} - ignore_run_exports_from: - - {{ compiler('cuda') }} - requirements: - build: - - cmake {{ cmake_version }} - host: - - librdkafka {{ librdkafka_version }} - - {{ pin_subpackage('libcudf', exact=True) }} - run: - - librdkafka {{ librdkafka_version }} - - {{ pin_subpackage('libcudf', exact=True) }} - test: - commands: - - test -f $PREFIX/lib/libcudf_kafka.so - about: - home: https://rapids.ai/ - license: Apache-2.0 - license_family: APACHE - license_file: LICENSE - summary: libcudf_kafka library - - name: libcudf-example - version: {{ version }} - script: install_libcudf_example.sh - build: - number: {{ GIT_DESCRIBE_NUMBER }} - string: cuda{{ cuda_major }}_{{ date_string }}_{{ 
GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }} - ignore_run_exports_from: - - {{ compiler('cuda') }} - {% if cuda_major != "11" %} - - cuda-nvtx-dev - {% endif %} - requirements: - build: - - cmake {{ cmake_version }} - - {{ compiler('c') }} - - {{ compiler('cxx') }} - {% if cuda_major == "11" %} - - {{ compiler('cuda') }} ={{ cuda_version }} - {% else %} - - {{ compiler('cuda') }} - {% endif %} - - cuda-version ={{ cuda_version }} - - ninja - - {{ stdlib("c") }} - host: - - {{ pin_subpackage('libcudf', exact=True) }} - {% if cuda_major == "11" %} - - cuda-nvtx ={{ cuda_version }} - {% else %} - - cuda-nvtx-dev - {% endif %} - - cuda-version ={{ cuda_version }} - run: - - {{ pin_subpackage('libcudf', exact=True) }} - - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }} - {% if cuda_major != "11" %} - - cuda-nvtx - {% endif %} - about: - home: https://rapids.ai/ - license: Apache-2.0 - license_family: APACHE - license_file: LICENSE - summary: libcudf example executables - - name: libcudf-tests - version: {{ version }} - script: install_libcudf_tests.sh - build: - number: {{ GIT_DESCRIBE_NUMBER }} - string: cuda{{ cuda_major }}_{{ date_string }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }} - ignore_run_exports_from: - - {{ compiler('cuda') }} - {% if cuda_major != "11" %} - - libcurand-dev - {% endif %} - requirements: - build: - - cmake {{ cmake_version }} - host: - - {{ pin_subpackage('libcudf', exact=True) }} - - {{ pin_subpackage('libcudf_kafka', exact=True) }} - - cuda-version ={{ cuda_version }} - {% if cuda_major == "11" %} - - libcurand {{ cuda11_libcurand_run_version }} - {% else %} - - libcurand-dev - {% endif %} - run: - - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }} - - {{ pin_subpackage('libcudf', exact=True) }} - - {{ pin_subpackage('libcudf_kafka', exact=True) }} - {% if cuda_major == "11" %} - - libcurand {{ cuda11_libcurand_run_version }} - {% else %} - - libcurand - {% endif %} - about: - home: https://rapids.ai/ - 
license: Apache-2.0 - license_family: APACHE - license_file: LICENSE - summary: libcudf test & benchmark executables diff --git a/conda/recipes/libcudf/recipe.yaml b/conda/recipes/libcudf/recipe.yaml new file mode 100644 index 00000000000..8653dc68a9f --- /dev/null +++ b/conda/recipes/libcudf/recipe.yaml @@ -0,0 +1,323 @@ +# Copyright (c) 2018-2025, NVIDIA CORPORATION. +schema_version: 1 + +context: + version: ${{ env.get("RAPIDS_PACKAGE_VERSION") }} + minor_version: ${{ (version | split("."))[:2] | join(".") }} + cuda_version: ${{ (env.get("RAPIDS_CUDA_VERSION") | split("."))[:2] | join(".") }} + cuda_major: '${{ (env.get("RAPIDS_CUDA_VERSION") | split("."))[0] }}' + date_string: '${{ env.get("RAPIDS_DATE_STRING") }}' + head_rev: ${{ git.head_rev(".")[:8] }} + +recipe: + name: libcudf-split + +cache: + source: + path: ../../.. + + build: + script: + content: | + + # Remove `-fdebug-prefix-map` line from CFLAGS and CXXFLAGS so the + # incrementing version number in the compile line doesn't break the + # cache + set -x + export CFLAGS=$(echo $CFLAGS | sed -E 's@\-fdebug\-prefix\-map[^ ]*@@g') + export CXXFLAGS=$(echo $CXXFLAGS | sed -E 's@\-fdebug\-prefix\-map[^ ]*@@g') + set +x + + ./build.sh -n -v \ + libcudf libcudf_kafka benchmarks tests \ + --build_metrics --incl_cache_stats --allgpuarch \ + --cmake-args=\"-DCUDF_ENABLE_ARROW_S3=ON\" + secrets: + - AWS_ACCESS_KEY_ID + - AWS_SECRET_ACCESS_KEY + - AWS_SESSION_TOKEN + env: + CMAKE_C_COMPILER_LAUNCHER: ${{ env.get("CMAKE_C_COMPILER_LAUNCHER") }} + CMAKE_CUDA_COMPILER_LAUNCHER: ${{ env.get("CMAKE_CUDA_COMPILER_LAUNCHER") }} + CMAKE_CXX_COMPILER_LAUNCHER: ${{ env.get("CMAKE_CXX_COMPILER_LAUNCHER") }} + CMAKE_GENERATOR: ${{ env.get("CMAKE_GENERATOR") }} + PARALLEL_LEVEL: ${{ env.get("PARALLEL_LEVEL") }} + RAPIDS_ARTIFACTS_DIR: ${{ env.get("RAPIDS_ARTIFACTS_DIR") }} + SCCACHE_BUCKET: ${{ env.get("SCCACHE_BUCKET") }} + SCCACHE_IDLE_TIMEOUT: ${{ env.get("SCCACHE_IDLE_TIMEOUT") }} + SCCACHE_REGION: ${{ 
env.get("SCCACHE_REGION") }} + SCCACHE_S3_USE_SSL: ${{ env.get("SCCACHE_S3_USE_SSL") }} + SCCACHE_S3_NO_CREDENTIALS: ${{ env.get("SCCACHE_S3_NO_CREDENTIALS") }} + SCCACHE_S3_KEY_PREFIX: libcudf-${{ env.get("RAPIDS_CONDA_ARCH") }} + + requirements: + build: + - ${{ compiler("c") }} + - ${{ compiler("cxx") }} + - ${{ compiler("cuda") }} + - cuda-version =${{ cuda_version }} + - cmake ${{ cmake_version }} + - ninja + - ${{ stdlib("c") }} + host: + - librmm =${{ minor_version }} + - libkvikio =${{ minor_version }} + - if: cuda_major == "11" + then: + - cudatoolkit + - libcurand =${{ cuda11_libcurand_host_version }} + - libcurand-dev =${{ cuda11_libcurand_host_version }} + - cuda-nvrtc =${{ cuda_version }} + - cuda-nvrtc-dev =${{ cuda_version }} + - cuda-nvtx =${{ cuda_version }} + - if: linux64 + then: + - libcufile =${{ cuda11_libcufile_host_version }} + - libcufile-dev =${{ cuda11_libcufile_host_version }} + else: + - cuda-nvrtc-dev + - cuda-nvtx-dev + - libcurand-dev + - if: linux64 + then: + - libcufile-dev + - cuda-version =${{ cuda_version }} + - nvcomp ${{ nvcomp_version }} + - dlpack ${{ dlpack_version }} + - librdkafka ${{ librdkafka_version }} + - flatbuffers =${{ flatbuffers_version }} + - rapids-logger =0.1 + - zlib ${{ zlib_version }} + +outputs: + - package: + name: libcudf + version: ${{ version }} + build: + script: + - cmake --install cpp/build + string: cuda${{ cuda_major }}_${{ date_string }}_${{ head_rev }} + dynamic_linking: + overlinking_behavior: "error" + requirements: + build: + - cmake ${{ cmake_version }} + - ${{ compiler("c") }} + host: + - cuda-version =${{ cuda_version }} + - libkvikio =${{ minor_version }} + - nvcomp ${{ nvcomp_version }} + - rapids-logger =0.1 + - zlib ${{ zlib_version }} + - if: cuda_major == "11" + then: cudatoolkit + else: cuda-cudart-dev + run: + - ${{ pin_compatible("cuda-version", upper_bound="x", lower_bound="x") }} + - if: cuda_major == "11" + then: + - cudatoolkit + - if: linux64 + then: + - libcufile ${{ 
cuda11_libcufile_run_version }} + else: + - cuda-nvrtc + - if: linux64 + then: + - libcufile + - nvcomp ${{ nvcomp_version }} + - librmm =${{ minor_version }} + - libkvikio =${{ minor_version }} + - dlpack ${{ dlpack_version }} + - rapids-logger =0.1 + run_exports: + - ${{ pin_subpackage("libcudf", upper_bound="x.x") }} + ignore_run_exports: + by_name: + - cuda-cudart + - cuda-nvrtc + - cuda-nvtx + - cuda-version + - flatbuffers + - libcurand + - libkvikio + - librdkafka + - librmm + - nvcomp + tests: + - script: + - test -f $PREFIX/lib/libcudf.so + - test -f $PREFIX/include/cudf/column/column.hpp + about: + homepage: ${{ load_from_file("python/libcudf/pyproject.toml").project.urls.Homepage }} + license: ${{ load_from_file("python/libcudf/pyproject.toml").project.license.text }} + summary: ${{ load_from_file("python/libcudf/pyproject.toml").project.description }} + + - package: + name: libcudf_kafka + version: ${{ version }} + build: + script: + - cmake --install cpp/libcudf_kafka/build + string: cuda${{ cuda_major }}_${{ date_string }}_${{ head_rev }} + dynamic_linking: + overlinking_behavior: "error" + requirements: + build: + - cmake ${{ cmake_version }} + - ${{ stdlib("c") }} + host: + - librdkafka ${{ librdkafka_version }} + - ${{ pin_subpackage("libcudf", exact=True) }} + run: + - librdkafka ${{ librdkafka_version }} + - ${{ pin_subpackage("libcudf", exact=True) }} + ignore_run_exports: + by_name: + - cuda-cudart + - cuda-nvrtc + - cuda-nvtx + - cuda-version + - flatbuffers + - libcurand + - libkvikio + - librdkafka + - librmm + - nvcomp + tests: + - script: + - test -f $PREFIX/lib/libcudf_kafka.so + about: + homepage: https://rapids.ai/ + license: Apache-2.0 + summary: libcudf_kafka library + + - package: + name: libcudf-example + version: ${{ version }} + build: + script: + content: | + ./cpp/examples/build.sh --install + secrets: + - AWS_ACCESS_KEY_ID + - AWS_SECRET_ACCESS_KEY + - AWS_SESSION_TOKEN + env: + CMAKE_C_COMPILER_LAUNCHER: ${{ 
env.get("CMAKE_C_COMPILER_LAUNCHER") }} + CMAKE_CUDA_COMPILER_LAUNCHER: ${{ env.get("CMAKE_CUDA_COMPILER_LAUNCHER") }} + CMAKE_CXX_COMPILER_LAUNCHER: ${{ env.get("CMAKE_CXX_COMPILER_LAUNCHER") }} + CMAKE_GENERATOR: ${{ env.get("CMAKE_GENERATOR") }} + PARALLEL_LEVEL: ${{ env.get("PARALLEL_LEVEL") }} + RAPIDS_ARTIFACTS_DIR: ${{ env.get("RAPIDS_ARTIFACTS_DIR") }} + SCCACHE_BUCKET: ${{ env.get("SCCACHE_BUCKET") }} + SCCACHE_IDLE_TIMEOUT: ${{ env.get("SCCACHE_IDLE_TIMEOUT") }} + SCCACHE_REGION: ${{ env.get("SCCACHE_REGION") }} + SCCACHE_S3_USE_SSL: ${{ env.get("SCCACHE_S3_USE_SSL") }} + SCCACHE_S3_NO_CREDENTIALS: ${{ env.get("SCCACHE_S3_NO_CREDENTIALS") }} + SCCACHE_S3_KEY_PREFIX: libcudf-${{ env.get("RAPIDS_CONDA_ARCH") }} + string: cuda${{ cuda_major }}_${{ date_string }}_${{ head_rev }} + dynamic_linking: + overlinking_behavior: "error" + requirements: + build: + - ${{ compiler("c") }} + - ${{ compiler("cuda") }} + - ${{ compiler("cxx") }} + - ${{ stdlib("c") }} + - cmake ${{ cmake_version }} + - cuda-version =${{ cuda_version }} + - ninja + host: + - ${{ pin_subpackage("libcudf", exact=True) }} + - cuda-version =${{ cuda_version }} + - if: cuda_major == "11" + then: + - cuda-nvtx =${{ cuda_version }} + - cudatoolkit + else: + - cuda-nvtx-dev + - cuda-cudart-dev + run: + - ${{ pin_subpackage("libcudf", exact=True) }} + - ${{ pin_compatible("cuda-version", upper_bound="x", lower_bound="x") }} + - if: cuda_major != "11" + then: + - cuda-nvtx + ignore_run_exports: + from_package: + - if: cuda_major != "11" + then: + - cuda-nvtx-dev + by_name: + - cuda-cudart + - cuda-nvrtc + - cuda-nvtx + - cuda-version + - flatbuffers + - libcurand + - libkvikio + - librdkafka + - librmm + - nvcomp + about: + homepage: ${{ load_from_file("python/libcudf/pyproject.toml").project.urls.Homepage }} + license: ${{ load_from_file("python/libcudf/pyproject.toml").project.license.text }} + summary: libcudf example executables + + - package: + name: libcudf-tests + version: ${{ version }} + 
build: + script: + - cmake --install cpp/build --component testing + - cmake --install cpp/libcudf_kafka/build --component testing + string: cuda${{ cuda_major }}_${{ date_string }}_${{ head_rev }} + dynamic_linking: + overlinking_behavior: "error" + missing_dso_allowlist: + - "libnvidia-ml.so.1" + requirements: + build: + - cmake ${{ cmake_version }} + - ${{ stdlib("c") }} + host: + - ${{ pin_subpackage("libcudf", exact=True) }} + - ${{ pin_subpackage("libcudf_kafka", exact=True) }} + - cuda-version =${{ cuda_version }} + - if: cuda_major == "11" + then: + - libcurand ${{ cuda11_libcurand_run_version }} + - cudatoolkit + else: + - libcurand-dev + - cuda-cudart-dev + run: + - ${{ pin_compatible("cuda-version", upper_bound="x", lower_bound="x") }} + - ${{ pin_subpackage("libcudf", exact=True) }} + - ${{ pin_subpackage("libcudf_kafka", exact=True) }} + - if: cuda_major == "11" + then: + - libcurand ${{ cuda11_libcurand_run_version }} + else: + - libcurand + ignore_run_exports: + from_package: + - if: cuda_major != "11" + then: + - libcurand-dev + by_name: + - cuda-cudart + - cuda-nvrtc + - cuda-nvtx + - cuda-version + - flatbuffers + - libcurand + - libkvikio + - librdkafka + - librmm + - nvcomp + about: + homepage: ${{ load_from_file("python/libcudf/pyproject.toml").project.urls.Homepage }} + license: ${{ load_from_file("python/libcudf/pyproject.toml").project.license.text }} + summary: libcudf test & benchmark executables diff --git a/conda/recipes/pylibcudf/build.sh b/conda/recipes/pylibcudf/build.sh deleted file mode 100644 index 483346504db..00000000000 --- a/conda/recipes/pylibcudf/build.sh +++ /dev/null @@ -1,4 +0,0 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. 
- -# This assumes the script is executed from the root of the repo directory -./build.sh pylibcudf diff --git a/conda/recipes/pylibcudf/meta.yaml b/conda/recipes/pylibcudf/meta.yaml deleted file mode 100644 index ae02cf8d4e5..00000000000 --- a/conda/recipes/pylibcudf/meta.yaml +++ /dev/null @@ -1,100 +0,0 @@ -# Copyright (c) 2018-2025, NVIDIA CORPORATION. - -{% set version = environ['RAPIDS_PACKAGE_VERSION'].lstrip('v') %} -{% set minor_version = version.split('.')[0] + '.' + version.split('.')[1] %} -{% set py_version = environ['CONDA_PY'] %} -{% set cuda_version = '.'.join(environ['RAPIDS_CUDA_VERSION'].split('.')[:2]) %} -{% set cuda_major = cuda_version.split('.')[0] %} -{% set date_string = environ['RAPIDS_DATE_STRING'] %} - -package: - name: pylibcudf - version: {{ version }} - -source: - path: ../../.. - -build: - number: {{ GIT_DESCRIBE_NUMBER }} - string: cuda{{ cuda_major }}_py{{ py_version }}_{{ date_string }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }} - script_env: - - AWS_ACCESS_KEY_ID - - AWS_SECRET_ACCESS_KEY - - AWS_SESSION_TOKEN - - CMAKE_C_COMPILER_LAUNCHER - - CMAKE_CUDA_COMPILER_LAUNCHER - - CMAKE_CXX_COMPILER_LAUNCHER - - CMAKE_GENERATOR - - PARALLEL_LEVEL - - SCCACHE_BUCKET - - SCCACHE_IDLE_TIMEOUT - - SCCACHE_REGION - - SCCACHE_S3_KEY_PREFIX=pylibcudf-aarch64 # [aarch64] - - SCCACHE_S3_KEY_PREFIX=pylibcudf-linux64 # [linux64] - - SCCACHE_S3_USE_SSL - - SCCACHE_S3_NO_CREDENTIALS - ignore_run_exports_from: - - {{ compiler('cuda') }} - {% if cuda_major != "11" %} - - cuda-cudart-dev - - libcufile-dev # [linux64] - {% endif %} - -requirements: - build: - - cmake {{ cmake_version }} - - ninja - - {{ compiler('c') }} - - {{ compiler('cxx') }} - {% if cuda_major == "11" %} - - {{ compiler('cuda') }} ={{ cuda_version }} - {% else %} - - {{ compiler('cuda') }} - {% endif %} - - cuda-version ={{ cuda_version }} - - {{ stdlib("c") }} - host: - - python - - cython >=3.0.3 - - rapids-build-backend >=0.3.0,<0.4.0.dev0 - - scikit-build-core >=0.10.0 
- - dlpack >=0.8,<1.0 - - libcudf ={{ version }} - - rmm ={{ minor_version }} - {% if cuda_major == "11" %} - - cudatoolkit - {% else %} - - cuda-cudart-dev - - cuda-nvrtc - - libcufile-dev # [linux64] - {% endif %} - - cuda-version ={{ cuda_version }} - run: - - python - - typing_extensions >=4.0.0 - - pandas >=2.0,<2.2.4dev0 - - numpy >=1.23,<2.1 - - pyarrow>=14.0.0,<20.0.0a0 - - libcudf ={{ version }} - - {{ pin_compatible('rmm', max_pin='x.x') }} - - fsspec >=0.6.0 - {% if cuda_major == "11" %} - - cuda-python >=11.8.5,<12.0a0 - {% else %} - - cuda-python >=12.6.2,<13.0a0 - {% endif %} - - nvtx >=0.2.1 - - packaging - -test: - requires: - - cuda-version ={{ cuda_version }} - imports: - - pylibcudf - -about: - home: https://rapids.ai/ - license: Apache-2.0 - license_family: APACHE - license_file: LICENSE - summary: pylibcudf library diff --git a/conda/recipes/pylibcudf/recipe.yaml b/conda/recipes/pylibcudf/recipe.yaml new file mode 100644 index 00000000000..476f4d83960 --- /dev/null +++ b/conda/recipes/pylibcudf/recipe.yaml @@ -0,0 +1,106 @@ +# Copyright (c) 2018-2025, NVIDIA CORPORATION. +schema_version: 1 + +context: + version: ${{ env.get("RAPIDS_PACKAGE_VERSION") }} + minor_version: ${{ (version | split("."))[:2] | join(".") }} + cuda_version: ${{ (env.get("RAPIDS_CUDA_VERSION") | split("."))[:2] | join(".") }} + cuda_major: '${{ (env.get("RAPIDS_CUDA_VERSION") | split("."))[0] }}' + date_string: '${{ env.get("RAPIDS_DATE_STRING") }}' + py_version: ${{ env.get("RAPIDS_PY_VERSION") }} + py_buildstring: ${{ py_version | version_to_buildstring }} + head_rev: ${{ git.head_rev(".")[:8] }} + +package: + name: pylibcudf + version: ${{ version }} + +source: + path: ../../.. 
+ +build: + string: cuda${{ cuda_major }}_py${{ py_buildstring }}_${{ date_string }}_${{ head_rev }} + script: + content: | + ./build.sh pylibcudf + secrets: + - AWS_ACCESS_KEY_ID + - AWS_SECRET_ACCESS_KEY + - AWS_SESSION_TOKEN + env: + CMAKE_C_COMPILER_LAUNCHER: ${{ env.get("CMAKE_C_COMPILER_LAUNCHER") }} + CMAKE_CUDA_COMPILER_LAUNCHER: ${{ env.get("CMAKE_CUDA_COMPILER_LAUNCHER") }} + CMAKE_CXX_COMPILER_LAUNCHER: ${{ env.get("CMAKE_CXX_COMPILER_LAUNCHER") }} + CMAKE_GENERATOR: ${{ env.get("CMAKE_GENERATOR") }} + SCCACHE_BUCKET: ${{ env.get("SCCACHE_BUCKET") }} + SCCACHE_IDLE_TIMEOUT: ${{ env.get("SCCACHE_IDLE_TIMEOUT") }} + SCCACHE_REGION: ${{ env.get("SCCACHE_REGION") }} + SCCACHE_S3_USE_SSL: ${{ env.get("SCCACHE_S3_USE_SSL") }} + SCCACHE_S3_NO_CREDENTIALS: ${{ env.get("SCCACHE_S3_NO_CREDENTIALS") }} + SCCACHE_S3_KEY_PREFIX: pylibcudf-${{ env.get("RAPIDS_CONDA_ARCH") }} + +requirements: + build: + - cmake ${{ cmake_version }} + - ninja + - ${{ compiler("c") }} + - ${{ compiler("cxx") }} + - ${{ compiler("cuda") }} + - cuda-version =${{ cuda_version }} + - ${{ stdlib("c") }} + host: + - python =${{ py_version }} + - pip + - cython >=3.0.3 + - rapids-build-backend >=0.3.0,<0.4.0.dev0 + - scikit-build-core >=0.10.0 + - dlpack >=0.8,<1.0 + - libcudf =${{ version }} + - rmm =${{ minor_version }} + - if: cuda_major == "11" + then: + - cudatoolkit + else: + - cuda-cudart-dev + - cuda-nvrtc + - if: linux64 + then: + - libcufile-dev + - cuda-version =${{ cuda_version }} + run: + - python + - typing_extensions >=4.0.0 + - pandas >=2.0,<2.2.4dev0 + - numpy >=1.23,<2.1 + - pyarrow>=14.0.0,<20.0.0a0 + - libcudf =${{ version }} + - ${{ pin_compatible("rmm", upper_bound="x.x") }} + - fsspec >=0.6.0 + - if: cuda_major == "11" + then: + - cuda-python >=11.8.5,<12.0a0 + else: + - cuda-python >=12.6.2,<13.0a0 + - nvtx >=0.2.1 + - packaging + ignore_run_exports: + from_package: + - if: cuda_major != "11" + then: + - cuda-cudart-dev + - if: linux64 + then: + - libcufile-dev + 
by_name: + - cuda-version + +tests: + - python: + imports: + - pylibcudf + pip_check: false + +about: + homepage: ${{ load_from_file("python/pylibcudf/pyproject.toml").project.urls.Homepage }} + license: ${{ load_from_file("python/pylibcudf/pyproject.toml").project.license.text }} + summary: ${{ load_from_file("python/pylibcudf/pyproject.toml").project.description }} diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml index 8b8abe90ac9..2ce5131ea8e 100644 --- a/python/cudf/pyproject.toml +++ b/python/cudf/pyproject.toml @@ -15,7 +15,7 @@ readme = { file = "README.md", content-type = "text/markdown" } authors = [ { name = "NVIDIA Corporation" }, ] -license = { text = "Apache 2.0" } +license = { text = "Apache-2.0" } requires-python = ">=3.10" dependencies = [ "cachetools", diff --git a/python/cudf_kafka/pyproject.toml b/python/cudf_kafka/pyproject.toml index 424010e632c..764c8c64a7e 100644 --- a/python/cudf_kafka/pyproject.toml +++ b/python/cudf_kafka/pyproject.toml @@ -15,7 +15,7 @@ readme = { file = "README.md", content-type = "text/markdown" } authors = [ { name = "NVIDIA Corporation" }, ] -license = { text = "Apache 2.0" } +license = { text = "Apache-2.0" } requires-python = ">=3.10" dependencies = [ "cudf==25.4.*,>=0.0.0a0", diff --git a/python/cudf_polars/pyproject.toml b/python/cudf_polars/pyproject.toml index e9fc054efc2..fb44caaa0c0 100644 --- a/python/cudf_polars/pyproject.toml +++ b/python/cudf_polars/pyproject.toml @@ -16,7 +16,7 @@ readme = { file = "README.md", content-type = "text/markdown" } authors = [ { name = "NVIDIA Corporation" }, ] -license = { text = "Apache 2.0" } +license = { text = "Apache-2.0" } requires-python = ">=3.10" dependencies = [ "polars>=1.20,<1.24", diff --git a/python/custreamz/pyproject.toml b/python/custreamz/pyproject.toml index 665b0a76ecf..b1fbe901189 100644 --- a/python/custreamz/pyproject.toml +++ b/python/custreamz/pyproject.toml @@ -16,7 +16,7 @@ readme = { file = "README.md", content-type = 
"text/markdown" } authors = [ { name = "NVIDIA Corporation" }, ] -license = { text = "Apache 2.0" } +license = { text = "Apache-2.0" } requires-python = ">=3.10" dependencies = [ "confluent-kafka>=2.5.0,<2.6.0a0", diff --git a/python/dask_cudf/pyproject.toml b/python/dask_cudf/pyproject.toml index 83493d7f2a4..fd2bac3c0d2 100644 --- a/python/dask_cudf/pyproject.toml +++ b/python/dask_cudf/pyproject.toml @@ -16,7 +16,7 @@ readme = { file = "README.md", content-type = "text/markdown" } authors = [ { name = "NVIDIA Corporation" }, ] -license = { text = "Apache 2.0" } +license = { text = "Apache-2.0" } requires-python = ">=3.10" dependencies = [ "cudf==25.4.*,>=0.0.0a0", diff --git a/python/libcudf/pyproject.toml b/python/libcudf/pyproject.toml index 01fe6097936..784a0c49894 100644 --- a/python/libcudf/pyproject.toml +++ b/python/libcudf/pyproject.toml @@ -27,7 +27,7 @@ readme = { file = "README.md", content-type = "text/markdown" } authors = [ { name = "NVIDIA Corporation" }, ] -license = { text = "Apache 2.0" } +license = { text = "Apache-2.0" } requires-python = ">=3.10" classifiers = [ "Intended Audience :: Developers", diff --git a/python/pylibcudf/pyproject.toml b/python/pylibcudf/pyproject.toml index e12d1ffdb39..8ea6f0e94a4 100644 --- a/python/pylibcudf/pyproject.toml +++ b/python/pylibcudf/pyproject.toml @@ -15,7 +15,7 @@ readme = { file = "README.md", content-type = "text/markdown" } authors = [ { name = "NVIDIA Corporation" }, ] -license = { text = "Apache 2.0" } +license = { text = "Apache-2.0" } requires-python = ">=3.10" dependencies = [ "cuda-python>=11.8.5,<12.0a0", From 45d80669367c6bf3b9dc0cd122f0ea36072cb7ea Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 3 Mar 2025 21:25:11 -0800 Subject: [PATCH 36/42] Remove cudf.Scalar from shift/fillna (#17922) Toward https://github.com/rapidsai/cudf/issues/17843 Authors: - Matthew Roeschke (https://github.com/mroeschke) - Vyas Ramasubramani 
(https://github.com/vyasr) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/17922 --- python/cudf/cudf/core/column/categorical.py | 9 +++++-- python/cudf/cudf/core/column/column.py | 23 +++++++++++------ python/cudf/cudf/core/column/datetime.py | 14 +++++++++++ python/cudf/cudf/core/column/decimal.py | 28 ++++++++++++++++++--- python/cudf/cudf/core/column/numerical.py | 13 +++++++--- python/cudf/cudf/core/column/timedelta.py | 15 +++++++++++ 6 files changed, 84 insertions(+), 18 deletions(-) diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index d41e448254c..c75d285e7de 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -20,6 +20,7 @@ from cudf.core.scalar import pa_scalar_to_plc_scalar from cudf.utils.dtypes import ( SIZE_TYPE_DTYPE, + cudf_dtype_to_pa_type, find_common_type, is_mixed_with_object_dtype, min_signed_type, @@ -1042,7 +1043,7 @@ def notnull(self) -> ColumnBase: def _validate_fillna_value( self, fill_value: ScalarLike | ColumnLike - ) -> cudf.Scalar | ColumnBase: + ) -> plc.Scalar | ColumnBase: """Align fill_value for .fillna based on column type.""" if cudf.api.types.is_scalar(fill_value): if fill_value != _DEFAULT_CATEGORICAL_VALUE: @@ -1052,7 +1053,11 @@ def _validate_fillna_value( raise ValueError( f"{fill_value=} must be in categories" ) from err - return cudf.Scalar(fill_value, dtype=self.codes.dtype) + return pa_scalar_to_plc_scalar( + pa.scalar( + fill_value, type=cudf_dtype_to_pa_type(self.codes.dtype) + ) + ) else: fill_value = column.as_column(fill_value, nan_as_null=False) if isinstance(fill_value.dtype, CategoricalDtype): diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 61f4f7d52fb..0d36fd3855b 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -891,12 +891,11 @@ def _fill( 
@acquire_spill_lock() def shift(self, offset: int, fill_value: ScalarLike) -> Self: - if not isinstance(fill_value, cudf.Scalar): - fill_value = cudf.Scalar(fill_value, dtype=self.dtype) + plc_fill_value = self._scalar_to_plc_scalar(fill_value) plc_col = plc.copying.shift( self.to_pylibcudf(mode="read"), offset, - fill_value.device_value, + plc_fill_value, ) return type(self).from_pylibcudf(plc_col) # type: ignore[return-value] @@ -1188,13 +1187,21 @@ def _check_scatter_key_length( f"{num_keys}" ) + def _scalar_to_plc_scalar(self, scalar: ScalarLike) -> plc.Scalar: + """Return a pylibcudf.Scalar that matches the type of self.dtype""" + if not isinstance(scalar, pa.Scalar): + scalar = pa.scalar(scalar) + return pa_scalar_to_plc_scalar( + scalar.cast(cudf_dtype_to_pa_type(self.dtype)) + ) + def _validate_fillna_value( self, fill_value: ScalarLike | ColumnLike - ) -> cudf.Scalar | ColumnBase: + ) -> plc.Scalar | ColumnBase: """Align fill_value for .fillna based on column type.""" if is_scalar(fill_value): - return cudf.Scalar(fill_value, dtype=self.dtype) - return as_column(fill_value) + return self._scalar_to_plc_scalar(fill_value) + return as_column(fill_value).astype(self.dtype) @acquire_spill_lock() def replace( @@ -1240,8 +1247,8 @@ def fillna( if method == "ffill" else plc.replace.ReplacePolicy.FOLLOWING ) - elif is_scalar(fill_value): - plc_replace = cudf.Scalar(fill_value).device_value + elif isinstance(fill_value, plc.Scalar): + plc_replace = fill_value else: plc_replace = fill_value.to_pylibcudf(mode="read") plc_column = plc.replace.replace_nulls( diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index 213e91d7b3f..64ddcae72a7 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -45,6 +45,7 @@ from cudf._typing import ( ColumnBinaryOperand, + ColumnLike, DatetimeLikeScalar, Dtype, DtypeObj, @@ -269,6 +270,19 @@ def __contains__(self, item: ScalarLike) -> bool: 
"cudf.core.column.NumericalColumn", self.astype(np.dtype(np.int64)) ) + def _validate_fillna_value( + self, fill_value: ScalarLike | ColumnLike + ) -> plc.Scalar | ColumnBase: + """Align fill_value for .fillna based on column type.""" + if ( + isinstance(fill_value, np.datetime64) + and self.time_unit != np.datetime_data(fill_value)[0] + ): + fill_value = fill_value.astype(self.dtype) + elif isinstance(fill_value, str) and fill_value.lower() == "nat": + fill_value = np.datetime64(fill_value, self.time_unit) + return super()._validate_fillna_value(fill_value) + @functools.cached_property def time_unit(self) -> str: return np.datetime_data(self.dtype)[0] diff --git a/python/cudf/cudf/core/column/decimal.py b/python/cudf/cudf/core/column/decimal.py index 8db6f805bce..848faf6a9ee 100644 --- a/python/cudf/cudf/core/column/decimal.py +++ b/python/cudf/cudf/core/column/decimal.py @@ -24,7 +24,8 @@ DecimalDtype, ) from cudf.core.mixins import BinaryOperand -from cudf.utils.dtypes import CUDF_STRING_DTYPE +from cudf.core.scalar import pa_scalar_to_plc_scalar +from cudf.utils.dtypes import CUDF_STRING_DTYPE, cudf_dtype_to_pa_type from cudf.utils.utils import pa_mask_buffer_to_mask if TYPE_CHECKING: @@ -165,16 +166,35 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str): return result + def _scalar_to_plc_scalar(self, scalar: ScalarLike) -> plc.Scalar: + """Return a pylibcudf.Scalar that matches the type of self.dtype""" + if not isinstance(scalar, pa.Scalar): + # e.g casting int to decimal type isn't allow, but OK in the constructor? 
+ pa_scalar = pa.scalar( + scalar, type=cudf_dtype_to_pa_type(self.dtype) + ) + else: + pa_scalar = scalar.cast(cudf_dtype_to_pa_type(self.dtype)) + plc_scalar = pa_scalar_to_plc_scalar(pa_scalar) + if isinstance(self.dtype, (Decimal32Dtype, Decimal64Dtype)): + # pyarrow.Scalar only supports Decimal128 so conversion + # from pyarrow would only return a pylibcudf.Scalar with Decimal128 + col = ColumnBase.from_pylibcudf( + plc.Column.from_scalar(plc_scalar, 1) + ).astype(self.dtype) + return plc.copying.get_element(col.to_pylibcudf(mode="read"), 0) + return plc_scalar + def _validate_fillna_value( self, fill_value: ScalarLike | ColumnLike - ) -> cudf.Scalar | ColumnBase: + ) -> plc.Scalar | ColumnBase: """Align fill_value for .fillna based on column type.""" if isinstance(fill_value, (int, Decimal)): - return cudf.Scalar(fill_value, dtype=self.dtype) + return super()._validate_fillna_value(fill_value) elif isinstance(fill_value, ColumnBase) and ( isinstance(self.dtype, DecimalDtype) or self.dtype.kind in "iu" ): - return fill_value.astype(self.dtype) + return super()._validate_fillna_value(fill_value) raise TypeError( "Decimal columns only support using fillna with decimal and " "integer values" diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index eecb294acee..77c5a6b6caf 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -559,15 +559,20 @@ def find_and_replace( def _validate_fillna_value( self, fill_value: ScalarLike | ColumnLike - ) -> cudf.Scalar | ColumnBase: + ) -> plc.Scalar | ColumnBase: """Align fill_value for .fillna based on column type.""" if is_scalar(fill_value): - cudf_obj: cudf.Scalar | ColumnBase = cudf.Scalar(fill_value) - if not as_column(cudf_obj).can_cast_safely(self.dtype): + cudf_obj = ColumnBase.from_pylibcudf( + plc.Column.from_scalar( + pa_scalar_to_plc_scalar(pa.scalar(fill_value)), 1 + ) + ) + if not 
cudf_obj.can_cast_safely(self.dtype): raise TypeError( f"Cannot safely cast non-equivalent " f"{type(fill_value).__name__} to {self.dtype.name}" ) + return super()._validate_fillna_value(fill_value) else: cudf_obj = as_column(fill_value, nan_as_null=False) if not cudf_obj.can_cast_safely(self.dtype): # type: ignore[attr-defined] @@ -576,7 +581,7 @@ def _validate_fillna_value( f"{cudf_obj.dtype.type.__name__} to " f"{self.dtype.type.__name__}" ) - return cudf_obj.astype(self.dtype) + return cudf_obj.astype(self.dtype) def can_cast_safely(self, to_dtype: DtypeObj) -> bool: """ diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index e4d47f492c2..654d2c2b800 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -30,9 +30,11 @@ from cudf._typing import ( ColumnBinaryOperand, + ColumnLike, DatetimeLikeScalar, Dtype, DtypeObj, + ScalarLike, ) _unit_to_nanoseconds_conversion = { @@ -142,6 +144,19 @@ def __contains__(self, item: DatetimeLikeScalar) -> bool: "cudf.core.column.NumericalColumn", self.astype(np.dtype(np.int64)) ) + def _validate_fillna_value( + self, fill_value: ScalarLike | ColumnLike + ) -> plc.Scalar | ColumnBase: + """Align fill_value for .fillna based on column type.""" + if ( + isinstance(fill_value, np.timedelta64) + and self.time_unit != np.datetime_data(fill_value)[0] + ): + fill_value = fill_value.astype(self.dtype) + elif isinstance(fill_value, str) and fill_value.lower() == "nat": + fill_value = np.timedelta64(fill_value, self.time_unit) + return super()._validate_fillna_value(fill_value) + @property def values(self): """ From 8645992542792870cf2d1a1416c8994db83553b5 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 3 Mar 2025 21:25:53 -0800 Subject: [PATCH 37/42] Add pylibcudf.Scalar.from_numpy for bool/int/float/str types (#18020) Towards https://github.com/rapidsai/cudf/issues/17054 Authors: - 
Matthew Roeschke (https://github.com/mroeschke) - Matthew Murray (https://github.com/Matt711) - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/18020 --- python/pylibcudf/pylibcudf/scalar.pyx | 148 +++++++++++++++++- .../pylibcudf/pylibcudf/tests/test_scalar.py | 42 +++++ 2 files changed, 189 insertions(+), 1 deletion(-) diff --git a/python/pylibcudf/pylibcudf/scalar.pyx b/python/pylibcudf/pylibcudf/scalar.pyx index 35abab7e838..e252d3072aa 100644 --- a/python/pylibcudf/pylibcudf/scalar.pyx +++ b/python/pylibcudf/pylibcudf/scalar.pyx @@ -2,7 +2,16 @@ from cpython cimport bool as py_bool, datetime from cython cimport no_gc_clear -from libc.stdint cimport int64_t +from libc.stdint cimport ( + int8_t, + int16_t, + int32_t, + int64_t, + uint8_t, + uint16_t, + uint32_t, + uint64_t, +) from libcpp cimport bool as cbool from libcpp.memory cimport unique_ptr from libcpp.utility cimport move @@ -25,6 +34,13 @@ from .types cimport DataType from functools import singledispatch +try: + import numpy as np + np_error = None +except ImportError as err: + np = None + np_error = err + __all__ = ["Scalar"] @@ -111,6 +127,24 @@ cdef class Scalar: """ return _from_py(py_val) + @classmethod + def from_numpy(cls, np_val): + """ + Convert a NumPy scalar to a Scalar. 
+ + Parameters + ---------- + np_val: numpy.generic + Value to convert to a pylibcudf.Scalar + + Returns + ------- + Scalar + New pylibcudf.Scalar + """ + return _from_numpy(np_val) + + cdef Scalar _new_scalar(unique_ptr[scalar] c_obj, DataType dtype): cdef Scalar s = Scalar.__new__(Scalar) s.c_obj.swap(c_obj) @@ -166,3 +200,115 @@ def _(py_val): cdef unique_ptr[scalar] c_obj = make_string_scalar(py_val.encode()) cdef Scalar slr = _new_scalar(move(c_obj), dtype) return slr + + +@singledispatch +def _from_numpy(np_val): + if np_error is not None: + raise np_error + raise TypeError(f"{type(np_val).__name__} cannot be converted to pylibcudf.Scalar") + + +if np is not None: + @_from_numpy.register(np.datetime64) + @_from_numpy.register(np.timedelta64) + def _(np_val): + raise NotImplementedError( + f"{type(np_val).__name__} is currently not supported." + ) + + @_from_numpy.register(np.bool_) + def _(np_val): + cdef DataType dtype = DataType(type_id.BOOL8) + cdef unique_ptr[scalar] c_obj = make_numeric_scalar(dtype.c_obj) + cdef cbool c_val = np_val + (c_obj.get()).set_value(c_val) + cdef Scalar slr = _new_scalar(move(c_obj), dtype) + return slr + + @_from_numpy.register(np.str_) + def _(np_val): + cdef DataType dtype = DataType(type_id.STRING) + cdef unique_ptr[scalar] c_obj = make_string_scalar(np_val.item().encode()) + cdef Scalar slr = _new_scalar(move(c_obj), dtype) + return slr + + @_from_numpy.register(np.int8) + def _(np_val): + dtype = DataType(type_id.INT8) + cdef unique_ptr[scalar] c_obj = make_numeric_scalar(dtype.c_obj) + (c_obj.get()).set_value(np_val) + cdef Scalar slr = _new_scalar(move(c_obj), dtype) + return slr + + @_from_numpy.register(np.int16) + def _(np_val): + dtype = DataType(type_id.INT16) + cdef unique_ptr[scalar] c_obj = make_numeric_scalar(dtype.c_obj) + (c_obj.get()).set_value(np_val) + cdef Scalar slr = _new_scalar(move(c_obj), dtype) + return slr + + @_from_numpy.register(np.int32) + def _(np_val): + dtype = DataType(type_id.INT32) + cdef 
unique_ptr[scalar] c_obj = make_numeric_scalar(dtype.c_obj) + (c_obj.get()).set_value(np_val) + cdef Scalar slr = _new_scalar(move(c_obj), dtype) + return slr + + @_from_numpy.register(np.int64) + def _(np_val): + dtype = DataType(type_id.INT64) + cdef unique_ptr[scalar] c_obj = make_numeric_scalar(dtype.c_obj) + (c_obj.get()).set_value(np_val) + cdef Scalar slr = _new_scalar(move(c_obj), dtype) + return slr + + @_from_numpy.register(np.uint8) + def _(np_val): + dtype = DataType(type_id.UINT8) + cdef unique_ptr[scalar] c_obj = make_numeric_scalar(dtype.c_obj) + (c_obj.get()).set_value(np_val) + cdef Scalar slr = _new_scalar(move(c_obj), dtype) + return slr + + @_from_numpy.register(np.uint16) + def _(np_val): + dtype = DataType(type_id.UINT16) + cdef unique_ptr[scalar] c_obj = make_numeric_scalar(dtype.c_obj) + (c_obj.get()).set_value(np_val) + cdef Scalar slr = _new_scalar(move(c_obj), dtype) + return slr + + @_from_numpy.register(np.uint32) + def _(np_val): + dtype = DataType(type_id.UINT32) + cdef unique_ptr[scalar] c_obj = make_numeric_scalar(dtype.c_obj) + (c_obj.get()).set_value(np_val) + cdef Scalar slr = _new_scalar(move(c_obj), dtype) + return slr + + @_from_numpy.register(np.uint64) + def _(np_val): + dtype = DataType(type_id.UINT64) + cdef unique_ptr[scalar] c_obj = make_numeric_scalar(dtype.c_obj) + (c_obj.get()).set_value(np_val) + cdef Scalar slr = _new_scalar(move(c_obj), dtype) + return slr + + @_from_numpy.register(np.float32) + def _(np_val): + dtype = DataType(type_id.FLOAT32) + cdef unique_ptr[scalar] c_obj = make_numeric_scalar(dtype.c_obj) + (c_obj.get()).set_value(np_val) + cdef Scalar slr = _new_scalar(move(c_obj), dtype) + return slr + + @_from_numpy.register(np.float64) + def _(np_val): + dtype = DataType(type_id.FLOAT64) + cdef unique_ptr[scalar] c_obj = make_numeric_scalar(dtype.c_obj) + (c_obj.get()).set_value(np_val) + cdef Scalar slr = _new_scalar(move(c_obj), dtype) + return slr diff --git 
a/python/pylibcudf/pylibcudf/tests/test_scalar.py b/python/pylibcudf/pylibcudf/tests/test_scalar.py index 45afae91c9a..056fcd5f63c 100644 --- a/python/pylibcudf/pylibcudf/tests/test_scalar.py +++ b/python/pylibcudf/pylibcudf/tests/test_scalar.py @@ -7,6 +7,11 @@ import pylibcudf as plc +@pytest.fixture(scope="module") +def np(): + return pytest.importorskip("numpy") + + @pytest.mark.parametrize( "val", [True, False, -1, 0, 1 - 1.0, 0.0, 1.52, "", "a1!"] ) @@ -28,3 +33,40 @@ def test_from_py_notimplemented(val): def test_from_py_typeerror(val): with pytest.raises(TypeError): plc.Scalar.from_py(val) + + +@pytest.mark.parametrize( + "np_type", + [ + "bool_", + "str_", + "int8", + "int16", + "int32", + "int64", + "uint8", + "uint16", + "uint32", + "uint64", + "float32", + "float64", + ], +) +def test_from_numpy(np, np_type): + np_klass = getattr(np, np_type) + np_val = np_klass("1" if np_type == "str_" else 1) + result = plc.Scalar.from_numpy(np_val) + expected = pa.scalar(np_val) + assert plc.interop.to_arrow(result).equals(expected) + + +@pytest.mark.parametrize("np_type", ["datetime64", "timedelta64"]) +def test_from_numpy_notimplemented(np, np_type): + np_val = getattr(np, np_type)(1, "ns") + with pytest.raises(NotImplementedError): + plc.Scalar.from_numpy(np_val) + + +def test_from_numpy_typeerror(np): + with pytest.raises(TypeError): + plc.Scalar.from_numpy(np.void(5)) From c0c9dfe6ede37ed3d5160891fab747f9a0fab29a Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 3 Mar 2025 21:25:57 -0800 Subject: [PATCH 38/42] Use more, cheaper dtype checking utilities in cudf Python (#18139) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Avoids using potentially more expensive dtype checking utilities referenced in https://github.com/rapidsai/cudf/issues/12494 `is_string_dtype` -> `== CUDF_STRING_DTYPE` `is_decimal_dtype` -> `isinstance` `is_numeric_dtype` -> (new) 
`is_dtype_obj_numeric` ```python In [1]: import numpy as np In [2]: from cudf.api.types import is_numeric_dtype In [3]: from cudf.utils.dtypes import is_dtype_obj_numeric In [4]: dtype = np.dtype(np.int64) In [5]: %timeit is_dtype_obj_numeric(dtype) 211 ns ± 2.26 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each) In [6]: %timeit is_numeric_dtype(dtype) 1.14 μs ± 2.61 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each) ``` Also standardizes some imports from `cudf.api.types` Authors: - Matthew Roeschke (https://github.com/mroeschke) - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/18139 --- python/cudf/cudf/api/types.py | 13 ------- python/cudf/cudf/core/_internals/where.py | 12 ++++-- python/cudf/cudf/core/column/categorical.py | 11 +++--- python/cudf/cudf/core/column/column.py | 16 ++++---- python/cudf/cudf/core/column/lists.py | 8 ++-- python/cudf/cudf/core/column/numerical.py | 4 +- python/cudf/cudf/core/column/string.py | 4 +- python/cudf/cudf/core/dataframe.py | 40 ++++++++++---------- python/cudf/cudf/core/groupby/groupby.py | 21 +++++----- python/cudf/cudf/core/index.py | 14 ++++--- python/cudf/cudf/core/indexed_frame.py | 28 +++++++------- python/cudf/cudf/core/join/_join_helpers.py | 18 ++++++--- python/cudf/cudf/core/multiindex.py | 22 +++++------ python/cudf/cudf/core/reshape.py | 4 +- python/cudf/cudf/core/scalar.py | 4 +- python/cudf/cudf/core/series.py | 6 ++- python/cudf/cudf/core/single_column_frame.py | 8 ++-- python/cudf/cudf/core/tools/datetimes.py | 2 +- python/cudf/cudf/core/tools/numeric.py | 13 ++++--- python/cudf/cudf/core/window/ewm.py | 4 +- python/cudf/cudf/io/dlpack.py | 7 ++-- python/cudf/cudf/testing/testing.py | 10 ++--- python/cudf/cudf/utils/dtypes.py | 14 +++++++ 23 files changed, 155 insertions(+), 128 deletions(-) diff --git a/python/cudf/cudf/api/types.py b/python/cudf/cudf/api/types.py index 
37ef83c8820..8d7d64ab31e 100644 --- a/python/cudf/cudf/api/types.py +++ b/python/cudf/cudf/api/types.py @@ -73,19 +73,6 @@ def is_numeric_dtype(obj): return pd_types.is_numeric_dtype(obj) -# A version of numerical type check that does not include cudf decimals for -# places where we need to distinguish fixed and floating point numbers. -def _is_non_decimal_numeric_dtype(obj): - if isinstance(obj, _BaseDtype) or isinstance( - getattr(obj, "dtype", None), _BaseDtype - ): - return False - try: - return pd_types.is_numeric_dtype(obj) - except TypeError: - return False - - def is_integer(obj): """Return True if given object is integer. diff --git a/python/cudf/cudf/core/_internals/where.py b/python/cudf/cudf/core/_internals/where.py index 73011d6ffe0..cf49dfb2194 100644 --- a/python/cudf/cudf/core/_internals/where.py +++ b/python/cudf/cudf/core/_internals/where.py @@ -7,9 +7,13 @@ import numpy as np import cudf -from cudf.api.types import _is_non_decimal_numeric_dtype, is_scalar +from cudf.api.types import is_scalar from cudf.core.dtypes import CategoricalDtype -from cudf.utils.dtypes import find_common_type, is_mixed_with_object_dtype +from cudf.utils.dtypes import ( + find_common_type, + is_dtype_obj_numeric, + is_mixed_with_object_dtype, +) if TYPE_CHECKING: from cudf._typing import DtypeObj, ScalarLike @@ -18,7 +22,7 @@ def _normalize_categorical(input_col, other): if isinstance(input_col, cudf.core.column.CategoricalColumn): - if cudf.api.types.is_scalar(other): + if is_scalar(other): try: other = input_col._encode(other) except ValueError: @@ -81,7 +85,7 @@ def _check_and_cast_columns_with_other( ) return _normalize_categorical(source_col, other.astype(source_dtype)) - if _is_non_decimal_numeric_dtype(source_dtype) and as_column( + if is_dtype_obj_numeric(source_dtype, include_decimal=False) and as_column( other ).can_cast_safely(source_dtype): common_dtype = source_dtype diff --git a/python/cudf/cudf/core/column/categorical.py 
b/python/cudf/cudf/core/column/categorical.py index c75d285e7de..ed285934161 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -14,6 +14,7 @@ import pylibcudf as plc import cudf +from cudf.api.types import is_scalar from cudf.core.column import column from cudf.core.column.methods import ColumnMethods from cudf.core.dtypes import CategoricalDtype, IntervalDtype @@ -623,12 +624,10 @@ def ordered(self) -> bool: return self.dtype.ordered def __setitem__(self, key, value): - if cudf.api.types.is_scalar( - value - ) and cudf.utils.utils._is_null_host_scalar(value): + if is_scalar(value) and cudf.utils.utils._is_null_host_scalar(value): to_add_categories = 0 else: - if cudf.api.types.is_scalar(value): + if is_scalar(value): arr = column.as_column(value, length=1, nan_as_null=False) else: arr = column.as_column(value, nan_as_null=False) @@ -644,7 +643,7 @@ def __setitem__(self, key, value): "category, set the categories first" ) - if cudf.api.types.is_scalar(value): + if is_scalar(value): value = self._encode(value) if value is not None else value else: value = cudf.core.column.as_column(value).astype(self.dtype) @@ -1045,7 +1044,7 @@ def _validate_fillna_value( self, fill_value: ScalarLike | ColumnLike ) -> plc.Scalar | ColumnBase: """Align fill_value for .fillna based on column type.""" - if cudf.api.types.is_scalar(fill_value): + if is_scalar(fill_value): if fill_value != _DEFAULT_CATEGORICAL_VALUE: try: fill_value = self._encode(fill_value) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 0d36fd3855b..5a8064dc49d 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -23,13 +23,10 @@ import cudf from cudf.api.types import ( - _is_non_decimal_numeric_dtype, _is_pandas_nullable_extension_dtype, infer_dtype, - is_decimal_dtype, is_dtype_equal, is_scalar, - is_string_dtype, ) from cudf.core._compat import PANDAS_GE_210 
from cudf.core._internals import ( @@ -69,6 +66,7 @@ find_common_type, get_time_unit, is_column_like, + is_dtype_obj_numeric, is_mixed_with_object_dtype, min_signed_type, min_unsigned_type, @@ -858,7 +856,7 @@ def _fill( if end <= begin or begin >= self.size: return self if inplace else self.copy() - if not inplace or is_string_dtype(self.dtype): + if not inplace or self.dtype == CUDF_STRING_DTYPE: with acquire_spill_lock(): result = type(self).from_pylibcudf( plc.filling.fill( @@ -868,7 +866,7 @@ def _fill( fill_value, ) ) - if is_string_dtype(self.dtype): + if self.dtype == CUDF_STRING_DTYPE: return self._mimic_inplace(result, inplace=True) return result # type: ignore[return-value] @@ -1599,7 +1597,10 @@ def cast(self, dtype: Dtype) -> ColumnBase: self.to_pylibcudf(mode="read"), dtype_to_pylibcudf_type(dtype) ) ) - if is_decimal_dtype(result.dtype): + if isinstance( + result.dtype, + (cudf.Decimal128Dtype, cudf.Decimal64Dtype, cudf.Decimal32Dtype), + ): result.dtype.precision = dtype.precision # type: ignore[union-attr] return result @@ -2993,7 +2994,8 @@ def concat_columns(objs: "MutableSequence[ColumnBase]") -> ColumnBase: # Notice, we can always cast pure null columns not_null_col_dtypes = [o.dtype for o in objs if o.null_count != len(o)] if len(not_null_col_dtypes) and all( - _is_non_decimal_numeric_dtype(dtype) and dtype.kind == "M" + is_dtype_obj_numeric(dtype, include_decimal=False) + and dtype.kind == "M" for dtype in not_null_col_dtypes ): common_dtype = find_common_type(not_null_col_dtypes) diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index 837763ee30c..ca29f83225b 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -14,7 +14,7 @@ import cudf import cudf.core.column.column as column -from cudf.api.types import _is_non_decimal_numeric_dtype, is_scalar +from cudf.api.types import is_scalar from cudf.core.buffer import acquire_spill_lock from 
cudf.core.column.column import ColumnBase, as_column from cudf.core.column.methods import ColumnMethods, ParentType @@ -22,7 +22,7 @@ from cudf.core.dtypes import ListDtype from cudf.core.missing import NA from cudf.core.scalar import pa_scalar_to_plc_scalar -from cudf.utils.dtypes import SIZE_TYPE_DTYPE +from cudf.utils.dtypes import SIZE_TYPE_DTYPE, is_dtype_obj_numeric if TYPE_CHECKING: from collections.abc import Sequence @@ -718,8 +718,8 @@ def take(self, lists_indices: ColumnLike) -> ParentType: "lists_indices and list column is of different size." ) if ( - not _is_non_decimal_numeric_dtype( - lists_indices_col.children[1].dtype + not is_dtype_obj_numeric( + lists_indices_col.children[1].dtype, include_decimal=False ) or lists_indices_col.children[1].dtype.kind not in "iu" ): diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index 77c5a6b6caf..249afe9aba6 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -14,7 +14,7 @@ import cudf import cudf.core.column.column as column -from cudf.api.types import is_integer, is_scalar +from cudf.api.types import infer_dtype, is_integer, is_scalar from cudf.core._internals import binaryop from cudf.core.buffer import acquire_spill_lock, as_buffer from cudf.core.column.column import ColumnBase, as_column @@ -439,7 +439,7 @@ def _process_values_for_isin( except (MixedTypeError, TypeError) as e: # There is a corner where `values` can be of `object` dtype # but have values of homogeneous type. 
- inferred_dtype = cudf.api.types.infer_dtype(values) + inferred_dtype = infer_dtype(values) if ( self.dtype.kind in {"i", "u"} and inferred_dtype == "integer" ) or ( diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 97ec41f4c39..9f3512369a0 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -18,7 +18,7 @@ import cudf import cudf.core.column.column as column import cudf.core.column.datetime as datetime -from cudf.api.types import is_integer, is_scalar, is_string_dtype +from cudf.api.types import is_integer, is_scalar from cudf.core._internals import binaryop from cudf.core.buffer import Buffer, acquire_spill_lock from cudf.core.column.column import ColumnBase @@ -75,7 +75,7 @@ def __init__(self, parent): if isinstance(parent.dtype, cudf.ListDtype) else parent.dtype ) - if not is_string_dtype(value_type): + if value_type != CUDF_STRING_DTYPE: raise AttributeError( "Can only use .str accessor with string values" ) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index f909d72687c..eec0bacd5c8 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -41,10 +41,7 @@ is_dict_like, is_dtype_equal, is_list_like, - is_numeric_dtype, - is_object_dtype, is_scalar, - is_string_dtype, ) from cudf.core import column, indexing_utils, reshape from cudf.core._compat import PANDAS_LT_300 @@ -90,6 +87,7 @@ cudf_dtype_from_pydata_dtype, find_common_type, is_column_like, + is_dtype_obj_numeric, min_signed_type, ) from cudf.utils.performance_tracking import _performance_tracking @@ -145,7 +143,7 @@ def __setitem__(self, key, value): return self._setitem_tuple_arg(key, value) @_performance_tracking - def _can_downcast_to_series(self, df, arg): + def _can_downcast_to_series(self, df: DataFrame, arg): """ This method encapsulates the logic used to determine whether or not the result of a loc/iloc @@ -170,8 +168,8 @@ def 
_can_downcast_to_series(self, df, arg): arg[1], slice ): return True - dtypes = df.dtypes.values.tolist() - all_numeric = all(is_numeric_dtype(t) for t in dtypes) + dtypes = [dtype for _, dtype in df._dtypes] + all_numeric = all(is_dtype_obj_numeric(t) for t in dtypes) if all_numeric or ( len(dtypes) and all(t == dtypes[0] for t in dtypes) ): @@ -348,7 +346,7 @@ def _getitem_tuple_arg(self, arg): df.index.name = columns_df.index.name if not isinstance( df.index, MultiIndex - ) and is_numeric_dtype(df.index.dtype): + ) and is_dtype_obj_numeric(df.index.dtype): # Preserve the original index type. df.index = df.index.astype(self._frame.index.dtype) df = df.sort_values(by=[tmp_col_name, cantor_name]) @@ -3158,7 +3156,7 @@ def where(self, cond, other=None, inplace=False, axis=None, level=None): # If other was provided, process that next. if isinstance(other, DataFrame): other_cols = [other._data[col] for col in self._column_names] - elif cudf.api.types.is_scalar(other): + elif is_scalar(other): other_cols = [other] * len(self._column_names) elif isinstance(other, cudf.Series): other_cols = other.to_pandas() @@ -3788,14 +3786,14 @@ def agg(self, aggs, axis=None): * Not supporting: ``axis``, ``*args``, ``**kwargs`` """ - dtypes = [self[col].dtype for col in self._column_names] + dtypes = [dtype for _, dtype in self._dtypes] common_dtype = find_common_type(dtypes) if common_dtype.kind != "b" and any( dtype.kind == "b" for dtype in dtypes ): raise MixedTypeError("Cannot create a column with mixed types") - if any(is_string_dtype(dt) for dt in dtypes): + if any(dt == CUDF_STRING_DTYPE for dt in dtypes): raise NotImplementedError( "DataFrame.agg() is not supported for " "frames containing string columns" @@ -4934,7 +4932,7 @@ def apply_rows( """ for col in incols: current_col_dtype = self._data[col].dtype - if is_string_dtype(current_col_dtype) or isinstance( + if current_col_dtype == CUDF_STRING_DTYPE or isinstance( current_col_dtype, cudf.CategoricalDtype ): raise 
TypeError( @@ -6294,8 +6292,8 @@ def make_false_column_like_self(): else: # These checks must happen after the conversions above # since numpy can't handle categorical dtypes. - self_is_str = is_string_dtype(self_col.dtype) - other_is_str = is_string_dtype(other_col.dtype) + self_is_str = self_col.dtype == CUDF_STRING_DTYPE + other_is_str = other_col.dtype == CUDF_STRING_DTYPE if self_is_str != other_is_str: # Strings can't compare to anything else. @@ -6352,8 +6350,8 @@ def _prepare_for_rowwise_op(self, method, skipna, numeric_only): common_dtype = find_common_type(filtered.dtypes) if ( not numeric_only - and is_string_dtype(common_dtype) - and any(not is_string_dtype(dt) for dt in filtered.dtypes) + and common_dtype == CUDF_STRING_DTYPE + and any(dtype != CUDF_STRING_DTYPE for dtype in filtered._dtypes) ): raise TypeError( f"Cannot perform row-wise {method} across mixed-dtype columns," @@ -6476,7 +6474,9 @@ def _reduce( if numeric_only: numeric_cols = ( - name for name, dtype in self._dtypes if is_numeric_dtype(dtype) + name + for name, dtype in self._dtypes + if is_dtype_obj_numeric(dtype) ) source = self._get_columns_by_label(numeric_cols) if source.empty: @@ -6507,7 +6507,7 @@ def _reduce( raise NotImplementedError( f"Column {col_label} with type {col.dtype} does not support {op}" ) from err - elif not is_numeric_dtype(col.dtype): + elif not is_dtype_obj_numeric(col.dtype): raise TypeError( "Non numeric columns passed with " "`numeric_only=False`, pass `numeric_only=True` " @@ -6523,9 +6523,9 @@ def _reduce( source_dtypes = [dtype for _, dtype in source._dtypes] common_dtype = find_common_type(source_dtypes) if ( - is_object_dtype(common_dtype) + common_dtype == CUDF_STRING_DTYPE and any( - not is_object_dtype(dtype) for dtype in source_dtypes + dtype != CUDF_STRING_DTYPE for dtype in source_dtypes ) or common_dtype.kind != "b" and any(dtype.kind == "b" for dtype in source_dtypes) @@ -8603,7 +8603,7 @@ def _find_common_dtypes_and_categories( # default to the 
first non-null dtype dtypes[idx] = cols[0].dtype # If all the non-null dtypes are int/float, find a common dtype - if all(is_numeric_dtype(col.dtype) for col in cols): + if all(is_dtype_obj_numeric(col.dtype) for col in cols): dtypes[idx] = find_common_type([col.dtype for col in cols]) # If all categorical dtypes, combine the categories elif all( diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 38b519c6d5f..df11ebd4f94 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -20,11 +20,7 @@ import cudf from cudf.api.extensions import no_default -from cudf.api.types import ( - is_list_like, - is_numeric_dtype, - is_string_dtype, -) +from cudf.api.types import is_list_like, is_scalar from cudf.core._compat import PANDAS_LT_300 from cudf.core._internals import aggregation, sorting, stream_compaction from cudf.core.abc import Serializable @@ -44,7 +40,12 @@ from cudf.core.multiindex import MultiIndex from cudf.core.scalar import pa_scalar_to_plc_scalar from cudf.core.udf.groupby_utils import _can_be_jitted, jit_groupby_apply -from cudf.utils.dtypes import SIZE_TYPE_DTYPE, cudf_dtype_to_pa_type +from cudf.utils.dtypes import ( + CUDF_STRING_DTYPE, + SIZE_TYPE_DTYPE, + cudf_dtype_to_pa_type, + is_dtype_obj_numeric, +) from cudf.utils.performance_tracking import _performance_tracking from cudf.utils.utils import GetAttrGetItemMixin @@ -91,7 +92,7 @@ @singledispatch def get_valid_aggregation(dtype): - if is_string_dtype(dtype): + if dtype == CUDF_STRING_DTYPE: return _STRING_AGGS return "ALL" @@ -1788,7 +1789,7 @@ def _post_process_chunk_results( ): if not len(chunk_results): return self.obj.head(0) - if isinstance(chunk_results, ColumnBase) or cudf.api.types.is_scalar( + if isinstance(chunk_results, ColumnBase) or is_scalar( chunk_results[0] ): data = ColumnAccessor( @@ -3077,7 +3078,9 @@ def _reduce_numeric_only(self, op: str): columns = list( name for name, dtype in 
self.obj._dtypes - if (is_numeric_dtype(dtype) and name not in self.grouping.names) + if ( + is_dtype_obj_numeric(dtype) and name not in self.grouping.names + ) ) return self[columns].agg(op) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index f4e5f6e96ae..05a2a46c051 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -20,12 +20,11 @@ import cudf from cudf.api.extensions import no_default from cudf.api.types import ( - _is_non_decimal_numeric_dtype, is_dtype_equal, + is_hashable, is_integer, is_list_like, is_scalar, - is_string_dtype, ) from cudf.core._base_index import BaseIndex, _return_get_indexer_result from cudf.core._compat import PANDAS_LT_300 @@ -57,6 +56,7 @@ cudf_dtype_from_pa_type, cudf_dtype_to_pa_type, find_common_type, + is_dtype_obj_numeric, is_mixed_with_object_dtype, ) from cudf.utils.performance_tracking import _performance_tracking @@ -232,7 +232,7 @@ class RangeIndex(BaseIndex, BinaryOperand): def __init__( self, start, stop=None, step=1, dtype=None, copy=False, name=None ): - if not cudf.api.types.is_hashable(name): + if not is_hashable(name): raise ValueError("Name must be a hashable value.") self._name = name if dtype is not None and cudf.dtype(dtype).kind != "i": @@ -1786,7 +1786,7 @@ def isin(self, values, level=None) -> cupy.ndarray: @property @_performance_tracking def str(self): - if is_string_dtype(self.dtype): + if self.dtype == CUDF_STRING_DTYPE: return StringMethods(parent=self) else: raise AttributeError( @@ -3366,7 +3366,7 @@ def interval_range( "freq, exactly three must be specified" ) - if periods is not None and not cudf.api.types.is_integer(periods): + if periods is not None and not is_integer(periods): warnings.warn( "Non-integer 'periods' in cudf.date_range, and cudf.interval_range" " are deprecated and will raise in a future version.", @@ -3390,7 +3390,9 @@ def interval_range( pa_freq = pa.scalar(freq) if any( - not 
_is_non_decimal_numeric_dtype(cudf_dtype_from_pa_type(x.type)) + not is_dtype_obj_numeric( + cudf_dtype_from_pa_type(x.type), include_decimal=False + ) for x in (pa_start, pa.scalar(periods), pa_freq, pa_end) ): raise ValueError("start, end, periods, freq must be numeric values.") diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 2f4ad360d8b..2f33a860608 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -30,7 +30,6 @@ import cudf.core.common from cudf.api.extensions import no_default from cudf.api.types import ( - _is_non_decimal_numeric_dtype, is_dict_like, is_list_like, is_scalar, @@ -60,7 +59,11 @@ from cudf.utils import docutils, ioutils from cudf.utils._numba import _CUDFNumbaConfig from cudf.utils.docutils import copy_docstring -from cudf.utils.dtypes import SIZE_TYPE_DTYPE +from cudf.utils.dtypes import ( + SIZE_TYPE_DTYPE, + is_column_like, + is_dtype_obj_numeric, +) from cudf.utils.performance_tracking import _performance_tracking from cudf.utils.utils import _warn_no_dask_cudf @@ -71,6 +74,7 @@ ColumnLike, DataFrameOrSeries, Dtype, + DtypeObj, NotImplementedType, ) @@ -6402,9 +6406,9 @@ def rank( dropped_cols = False source = self if numeric_only: - if isinstance( - source, cudf.Series - ) and not _is_non_decimal_numeric_dtype(self.dtype): # type: ignore[attr-defined] + if isinstance(source, cudf.Series) and not is_dtype_obj_numeric( + source.dtype, include_decimal=False + ): # type: ignore[attr-defined] raise TypeError( "Series.rank does not allow numeric_only=True with " "non-numeric dtype." 
@@ -6412,7 +6416,7 @@ def rank( numeric_cols = ( name for name, dtype in self._dtypes - if _is_non_decimal_numeric_dtype(dtype) + if is_dtype_obj_numeric(dtype, include_decimal=False) ) source = self._get_columns_by_label(numeric_cols) if source.empty: @@ -6554,7 +6558,7 @@ def _check_duplicate_level_names(specified, level_names): @_performance_tracking def _get_replacement_values_for_columns( - to_replace: Any, value: Any, columns_dtype_map: dict[Any, Any] + to_replace: Any, value: Any, columns_dtype_map: dict[Any, DtypeObj] ) -> tuple[dict[Any, bool], dict[Any, Any], dict[Any, Any]]: """ Returns a per column mapping for the values to be replaced, new @@ -6587,24 +6591,22 @@ def _get_replacement_values_for_columns( if is_scalar(to_replace) and is_scalar(value): to_replace_columns = {col: [to_replace] for col in columns_dtype_map} values_columns = {col: [value] for col in columns_dtype_map} - elif cudf.api.types.is_list_like(to_replace) or isinstance( + elif is_list_like(to_replace) or isinstance( to_replace, (ColumnBase, BaseIndex) ): if is_scalar(value): to_replace_columns = {col: to_replace for col in columns_dtype_map} values_columns = { col: [value] - if _is_non_decimal_numeric_dtype(columns_dtype_map[col]) + if is_dtype_obj_numeric(dtype, include_decimal=False) else as_column( value, length=len(to_replace), dtype=cudf.dtype(type(value)), ) - for col in columns_dtype_map + for col, dtype in columns_dtype_map.items() } - elif cudf.api.types.is_list_like( - value - ) or cudf.utils.dtypes.is_column_like(value): + elif is_list_like(value) or is_column_like(value): if len(to_replace) != len(value): raise ValueError( f"Replacement lists must be " diff --git a/python/cudf/cudf/core/join/_join_helpers.py b/python/cudf/cudf/core/join/_join_helpers.py index c329bf11d97..331aa57fca4 100644 --- a/python/cudf/cudf/core/join/_join_helpers.py +++ b/python/cudf/cudf/core/join/_join_helpers.py @@ -9,9 +9,15 @@ import numpy as np import cudf -from cudf.api.types import 
is_decimal_dtype, is_dtype_equal, is_numeric_dtype +from cudf.api.types import is_dtype_equal from cudf.core.column import CategoricalColumn -from cudf.core.dtypes import CategoricalDtype +from cudf.core.dtypes import ( + CategoricalDtype, + Decimal32Dtype, + Decimal64Dtype, + Decimal128Dtype, +) +from cudf.utils.dtypes import is_dtype_obj_numeric if TYPE_CHECKING: from cudf.core.column import ColumnBase @@ -81,15 +87,17 @@ def _match_join_keys( if is_dtype_equal(ltype, rtype): return lcol, rcol - if is_decimal_dtype(ltype) or is_decimal_dtype(rtype): + if isinstance( + ltype, (Decimal32Dtype, Decimal64Dtype, Decimal128Dtype) + ) or isinstance(rtype, (Decimal32Dtype, Decimal64Dtype, Decimal128Dtype)): raise TypeError( "Decimal columns can only be merged with decimal columns " "of the same precision and scale" ) if ( - is_numeric_dtype(ltype) - and is_numeric_dtype(rtype) + is_dtype_obj_numeric(ltype) + and is_dtype_obj_numeric(rtype) and not (ltype.kind == "m" or rtype.kind == "m") ): common_type = ( diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 87a8849a260..f681c043186 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -17,7 +17,7 @@ import cudf from cudf.api.extensions import no_default -from cudf.api.types import is_integer, is_list_like, is_object_dtype, is_scalar +from cudf.api.types import is_integer, is_list_like, is_scalar from cudf.core import column from cudf.core._base_index import _return_get_indexer_result from cudf.core._internals import sorting @@ -33,7 +33,11 @@ ensure_index, ) from cudf.core.join._join_helpers import _match_join_keys -from cudf.utils.dtypes import SIZE_TYPE_DTYPE, is_column_like +from cudf.utils.dtypes import ( + CUDF_STRING_DTYPE, + SIZE_TYPE_DTYPE, + is_column_like, +) from cudf.utils.performance_tracking import _performance_tracking from cudf.utils.utils import NotIterable, _external_only_api, _is_same_name @@ -42,7 +46,7 @@ from 
typing_extensions import Self - from cudf._typing import DataFrameOrSeries + from cudf._typing import DataFrameOrSeries, Dtype def _maybe_indices_to_slice(indices: cp.ndarray) -> slice | cp.ndarray: @@ -233,8 +237,8 @@ def to_series(self, index=None, name=None): ) @_performance_tracking - def astype(self, dtype, copy: bool = True) -> Self: - if not is_object_dtype(dtype): + def astype(self, dtype: Dtype, copy: bool = True) -> Self: + if cudf.dtype(dtype) != CUDF_STRING_DTYPE: raise TypeError( "Setting a MultiIndex dtype to anything other than object is " "not supported" @@ -1699,16 +1703,12 @@ def _is_sorted(self, ascending=None, null_position=None) -> bool: Returns True, if sorted as expected by ``ascending`` and ``null_position``, False otherwise. """ - if ascending is not None and not cudf.api.types.is_list_like( - ascending - ): + if ascending is not None and not is_list_like(ascending): raise TypeError( f"Expected a list-like or None for `ascending`, got " f"{type(ascending)}" ) - if null_position is not None and not cudf.api.types.is_list_like( - null_position - ): + if null_position is not None and not is_list_like(null_position): raise TypeError( f"Expected a list-like or None for `null_position`, got " f"{type(null_position)}" diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py index 7d76907916f..b7412f2cc85 100644 --- a/python/cudf/cudf/core/reshape.py +++ b/python/cudf/cudf/core/reshape.py @@ -12,7 +12,7 @@ import cudf from cudf.api.extensions import no_default -from cudf.api.types import is_scalar +from cudf.api.types import is_list_like, is_scalar from cudf.core._compat import PANDAS_LT_300 from cudf.core.column import ( ColumnBase, @@ -1362,7 +1362,7 @@ def _one_hot_encode_column( def _length_check_params(obj, columns, name): - if cudf.api.types.is_list_like(obj): + if is_list_like(obj): if len(obj) != len(columns): raise ValueError( f"Length of '{name}' ({len(obj)}) did not match the " diff --git 
a/python/cudf/cudf/core/scalar.py b/python/cudf/cudf/core/scalar.py index 29139768a36..8579b7398f0 100644 --- a/python/cudf/cudf/core/scalar.py +++ b/python/cudf/cudf/core/scalar.py @@ -9,7 +9,6 @@ from typing import TYPE_CHECKING, Any import numpy as np -import pandas as pd import pyarrow as pa import pylibcudf as plc @@ -25,6 +24,7 @@ from cudf.core.missing import NA, NaT from cudf.core.mixins import BinaryOperand from cudf.utils.dtypes import ( + CUDF_STRING_DTYPE, cudf_dtype_from_pa_type, get_allowed_combinations_for_operator, to_cudf_compatible_scalar, @@ -191,7 +191,7 @@ def _to_plc_scalar(value: ScalarLike, dtype: Dtype) -> plc.Scalar: if isinstance(dtype, cudf.core.dtypes._BaseDtype): pa_type = dtype.to_arrow() - elif pd.api.types.is_string_dtype(dtype): + elif dtype == CUDF_STRING_DTYPE: # Have to manually convert object types, which we use internally # for strings but pyarrow only supports as unicode 'U' pa_type = pa.string() diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index d25550553b1..42247ce689e 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -20,7 +20,6 @@ import cudf from cudf.api.extensions import no_default from cudf.api.types import ( - _is_non_decimal_numeric_dtype, _is_scalar_or_zero_d_array, is_dict_like, is_integer, @@ -64,6 +63,7 @@ from cudf.utils.dtypes import ( can_convert_to_column, find_common_type, + is_dtype_obj_numeric, is_mixed_with_object_dtype, to_cudf_compatible_scalar, ) @@ -357,7 +357,9 @@ def _loc_to_iloc(self, arg): "as labels (consistent with DataFrame behavior). 
To access " "a value by position, use `ser.iloc[pos]`" ) - if not _is_non_decimal_numeric_dtype(index_dtype) and not ( + if not is_dtype_obj_numeric( + index_dtype, include_decimal=False + ) and not ( isinstance(index_dtype, cudf.CategoricalDtype) and index_dtype.categories.dtype.kind in "iu" ): diff --git a/python/cudf/cudf/core/single_column_frame.py b/python/cudf/cudf/core/single_column_frame.py index f9713ca62d1..aa59d3af640 100644 --- a/python/cudf/cudf/core/single_column_frame.py +++ b/python/cudf/cudf/core/single_column_frame.py @@ -12,12 +12,12 @@ from cudf.api.types import ( _is_scalar_or_zero_d_array, is_integer, - is_numeric_dtype, + is_scalar, ) from cudf.core.column import ColumnBase, as_column from cudf.core.column_accessor import ColumnAccessor from cudf.core.frame import Frame -from cudf.utils.dtypes import SIZE_TYPE_DTYPE +from cudf.utils.dtypes import SIZE_TYPE_DTYPE, is_dtype_obj_numeric from cudf.utils.performance_tracking import _performance_tracking from cudf.utils.utils import NotIterable @@ -54,7 +54,7 @@ def _reduce( if axis not in (None, 0, no_default): raise NotImplementedError("axis parameter is not implemented yet") - if numeric_only and not is_numeric_dtype(self.dtype): + if numeric_only and not is_dtype_obj_numeric(self.dtype): raise TypeError( f"Series.{op} does not allow numeric_only={numeric_only} " "with non-numeric dtypes." 
@@ -374,7 +374,7 @@ def where(self, cond, other=None, inplace=False): """Array conditional must be same shape as self""" ) - if not cudf.api.types.is_scalar(other): + if not is_scalar(other): other = cudf.core.column.as_column(other) input_col, other = _check_and_cast_columns_with_other( diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py index 4478be2fd04..89abc120de9 100644 --- a/python/cudf/cudf/core/tools/datetimes.py +++ b/python/cudf/cudf/core/tools/datetimes.py @@ -882,7 +882,7 @@ def date_range( "three must be specified" ) - if periods is not None and not cudf.api.types.is_integer(periods): + if periods is not None and not is_integer(periods): warnings.warn( "Non-integer 'periods' in cudf.date_range, and cudf.interval_range" " are deprecated and will raise in a future version.", diff --git a/python/cudf/cudf/core/tools/numeric.py b/python/cudf/cudf/core/tools/numeric.py index 9746234cfb1..18e96ee4a68 100644 --- a/python/cudf/cudf/core/tools/numeric.py +++ b/python/cudf/cudf/core/tools/numeric.py @@ -8,11 +8,14 @@ import pandas as pd import cudf -from cudf.api.types import _is_non_decimal_numeric_dtype, is_string_dtype from cudf.core.column import as_column from cudf.core.dtypes import CategoricalDtype from cudf.core.index import ensure_index -from cudf.utils.dtypes import can_convert_to_column +from cudf.utils.dtypes import ( + CUDF_STRING_DTYPE, + can_convert_to_column, + is_dtype_obj_numeric, +) if TYPE_CHECKING: from cudf.core.column.numerical import NumericalColumn @@ -142,7 +145,7 @@ def to_numeric( return arg else: raise e - elif is_string_dtype(dtype): + elif dtype == CUDF_STRING_DTYPE: try: col = _convert_str_col(col, errors, downcast) # type: ignore[arg-type] except ValueError as e: @@ -152,7 +155,7 @@ def to_numeric( raise e elif isinstance(dtype, (cudf.ListDtype, cudf.StructDtype)): raise ValueError("Input does not support nested datatypes") - elif _is_non_decimal_numeric_dtype(dtype): + elif 
is_dtype_obj_numeric(dtype, include_decimal=False): pass else: raise ValueError("Unrecognized datatype") @@ -218,7 +221,7 @@ def _convert_str_col( ------- Converted numeric column """ - if not is_string_dtype(col): + if col.dtype != CUDF_STRING_DTYPE: raise TypeError("col must be string dtype.") if col.is_integer().all(): diff --git a/python/cudf/cudf/core/window/ewm.py b/python/cudf/cudf/core/window/ewm.py index 3e8a6ab400c..4b94e3e52b1 100644 --- a/python/cudf/cudf/core/window/ewm.py +++ b/python/cudf/cudf/core/window/ewm.py @@ -6,8 +6,8 @@ import numpy as np -from cudf.api.types import is_numeric_dtype from cudf.core.window.rolling import _RollingBase +from cudf.utils.dtypes import is_dtype_obj_numeric if TYPE_CHECKING: from cudf.core.column.column import ColumnBase @@ -184,7 +184,7 @@ def cov( def _apply_agg_column( self, source_column: ColumnBase, agg_name: str ) -> ColumnBase: - if not is_numeric_dtype(source_column.dtype): + if not is_dtype_obj_numeric(source_column.dtype): raise TypeError("No numeric types to aggregate") # libcudf ewm has special casing for nulls only diff --git a/python/cudf/cudf/io/dlpack.py b/python/cudf/cudf/io/dlpack.py index 3b3fd5f7c56..e7b224a40e7 100644 --- a/python/cudf/cudf/io/dlpack.py +++ b/python/cudf/cudf/io/dlpack.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. +# Copyright (c) 2019-2025, NVIDIA CORPORATION. 
from __future__ import annotations import pylibcudf as plc @@ -6,6 +6,7 @@ import cudf from cudf.core.column import ColumnBase from cudf.utils import ioutils +from cudf.utils.dtypes import find_common_type, is_dtype_obj_numeric def from_dlpack(pycapsule_obj) -> cudf.Series | cudf.DataFrame: @@ -83,12 +84,12 @@ def to_dlpack(cudf_obj: cudf.Series | cudf.DataFrame | cudf.BaseIndex): ) if any( - not cudf.api.types._is_non_decimal_numeric_dtype(dtype) + not is_dtype_obj_numeric(dtype, include_decimal=False) for _, dtype in gdf._dtypes # type: ignore[union-attr] ): raise TypeError("non-numeric data not yet supported") - dtype = cudf.utils.dtypes.find_common_type( + dtype = find_common_type( [dtype for _, dtype in gdf._dtypes] # type: ignore[union-attr] ) gdf = gdf.astype(dtype) diff --git a/python/cudf/cudf/testing/testing.py b/python/cudf/cudf/testing/testing.py index 9c20a42d215..e1b0c17eb00 100644 --- a/python/cudf/cudf/testing/testing.py +++ b/python/cudf/cudf/testing/testing.py @@ -10,15 +10,15 @@ from pandas import testing as tm import cudf -from cudf.api.types import is_numeric_dtype, is_string_dtype from cudf.core.missing import NA, NaT +from cudf.utils.dtypes import CUDF_STRING_DTYPE, is_dtype_obj_numeric def dtype_can_compare_equal_to_other(dtype): # return True if values of this dtype can compare # as equal to equal values of a different dtype return not ( - is_string_dtype(dtype) + dtype == CUDF_STRING_DTYPE or isinstance( dtype, ( @@ -218,10 +218,10 @@ def assert_column_equal( elif not ( ( not dtype_can_compare_equal_to_other(left.dtype) - and is_numeric_dtype(right.dtype) + and is_dtype_obj_numeric(right.dtype) ) or ( - is_numeric_dtype(left.dtype) + is_dtype_obj_numeric(left.dtype) and not dtype_can_compare_equal_to_other(right.dtype) ) ): @@ -234,7 +234,7 @@ def assert_column_equal( if ( columns_equal and not check_exact - and is_numeric_dtype(left.dtype) + and is_dtype_obj_numeric(left.dtype) ): # non-null values must be the same columns_equal = 
cp.allclose( diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index 489b804583a..adee17e7bfb 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -612,6 +612,20 @@ def _get_base_dtype(dtype: pd.DatetimeTZDtype) -> np.dtype: return dtype.base +def is_dtype_obj_numeric( + dtype: DtypeObj, include_decimal: bool = True +) -> bool: + """Like is_numeric_dtype but does not introspect argument.""" + is_non_decimal = dtype.kind in set("iufb") + if include_decimal: + return is_non_decimal or isinstance( + dtype, + (cudf.Decimal32Dtype, cudf.Decimal64Dtype, cudf.Decimal128Dtype), + ) + else: + return is_non_decimal + + def dtype_to_pylibcudf_type(dtype) -> plc.DataType: if isinstance(dtype, cudf.ListDtype): return plc.DataType(plc.TypeId.LIST) From 54fc0c708f0d9252a695b57b3cc109aba961a431 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Tue, 4 Mar 2025 00:32:34 -0500 Subject: [PATCH 39/42] Minor typo fix in filling.pxd (#18120) Found this misspelled word while working on other things. Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Matthew Roeschke (https://github.com/mroeschke) - GALI PREM SAGAR (https://github.com/galipremsagar) - Yunsong Wang (https://github.com/PointKernel) - Muhammad Haseeb (https://github.com/mhaseeb123) URL: https://github.com/rapidsai/cudf/pull/18120 --- cpp/examples/interop/interop.cpp | 4 ++-- python/pylibcudf/pylibcudf/libcudf/filling.pxd | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/examples/interop/interop.cpp b/cpp/examples/interop/interop.cpp index 133a4e3a514..b01b04489a6 100644 --- a/cpp/examples/interop/interop.cpp +++ b/cpp/examples/interop/interop.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2025, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -24,7 +24,7 @@ #include #include -// Helper functuons to create StringViews +// Helper functions to create StringViews inline arrow::StringViewType::c_type to_inline_string_view(const void* data, int32_t const& size) { arrow::StringViewType::c_type out; diff --git a/python/pylibcudf/pylibcudf/libcudf/filling.pxd b/python/pylibcudf/pylibcudf/libcudf/filling.pxd index f0bfe8ca80b..d9ae573d23b 100644 --- a/python/pylibcudf/pylibcudf/libcudf/filling.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/filling.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. from libcpp cimport bool from libcpp.memory cimport unique_ptr from pylibcudf.exception_handler cimport libcudf_exception_handler @@ -23,7 +23,7 @@ cdef extern from "cudf/filling.hpp" namespace "cudf" nogil: cdef void fill_in_place( const mutable_column_view & destination, - size_type beign, + size_type begin, size_type end, const scalar & value ) except +libcudf_exception_handler From 1420ef2c792cf56d3c91d7240560c3d0d2cb7629 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Mon, 3 Mar 2025 22:20:34 -0800 Subject: [PATCH 40/42] Add `host_read_async` interfaces to `datasource` (#18018) kvikIO supports asynchronous host reads, but we don't utilize them to optimize host reads such as metadata access. This PR adds the async versions of the `host_read` APIs to allow efficient use of the kvikIO pool for host reads. The `datasource`s that are not backed by kvikIO implement these as deferred calls to the synchronous versions. Authors: - Vukasin Milovanovic (https://github.com/vuule) Approvers: - Mads R. B. 
Kristensen (https://github.com/madsbk) - Tianyu Liu (https://github.com/kingcrimsontianyu) - Yunsong Wang (https://github.com/PointKernel) URL: https://github.com/rapidsai/cudf/pull/18018 --- cpp/include/cudf/io/datasource.hpp | 39 +++++- cpp/src/io/orc/reader_impl_chunking.cu | 36 +++--- cpp/src/io/utilities/datasource.cpp | 163 +++++++++++++------------ 3 files changed, 137 insertions(+), 101 deletions(-) diff --git a/cpp/include/cudf/io/datasource.hpp b/cpp/include/cudf/io/datasource.hpp index 7bec40893fd..92859ec0895 100644 --- a/cpp/include/cudf/io/datasource.hpp +++ b/cpp/include/cudf/io/datasource.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * Copyright (c) 2020-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -67,7 +67,7 @@ class datasource { /** * @brief Base class destructor */ - virtual ~buffer() {} + virtual ~buffer() = default; /** * @brief Factory to construct a datasource buffer object from a container. @@ -156,7 +156,7 @@ class datasource { /** * @brief Base class destructor */ - virtual ~datasource(){}; + virtual ~datasource() = default; /** * @brief Returns a buffer with a subset of data from the source. @@ -168,6 +168,21 @@ class datasource { */ virtual std::unique_ptr host_read(size_t offset, size_t size) = 0; + /** + * @brief Asynchronously reads a specified portion of data from the datasource. + * + * This function initiates an asynchronous read operation that reads `size` bytes of data + * starting from the given `offset` in the datasource. Depending on the concrete datasource + * implementation, the read operation may be deferred until the returned future is waited upon. + * + * @param offset The starting position in the datasource from which to read. + * @param size The number of bytes to read from the datasource. 
+ * @return A std::future that will hold a unique pointer to a datasource::buffer containing + * the read data once the operation completes. + */ + virtual std::future> host_read_async(size_t offset, + size_t size); + /** * @brief Reads a selected range into a preallocated buffer. * @@ -179,6 +194,22 @@ class datasource { */ virtual size_t host_read(size_t offset, size_t size, uint8_t* dst) = 0; + /** + * @brief Asynchronously reads data from the source into the provided host memory buffer. + * + * This function initiates an asynchronous read operation from the data source starting at the + * specified offset and reads the specified number of bytes into the destination buffer. Depending + * on the concrete datasource implementation, the read operation may be deferred and will be + * executed when the returned future is waited upon. + * + * @param offset The starting position in the data source from which to read. + * @param size The number of bytes to read from the data source. + * @param dst Pointer to the destination buffer where the read data will be stored. + * @return A std::future object that will hold the number of bytes read once the operation + * completes. + */ + virtual std::future host_read_async(size_t offset, size_t size, uint8_t* dst); + /** * @brief Whether or not this source supports reading directly into device memory. 
* @@ -296,7 +327,7 @@ class datasource { */ class non_owning_buffer : public buffer { public: - non_owning_buffer() {} + non_owning_buffer() = default; /** * @brief Construct a new non owning buffer object diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu index 5c663950b00..5b0c7ae11a9 100644 --- a/cpp/src/io/orc/reader_impl_chunking.cu +++ b/cpp/src/io/orc/reader_impl_chunking.cu @@ -486,13 +486,11 @@ void reader_impl::load_next_stripe_data(read_mode mode) // Load stripe data into memory: // - // If we load data from sources into host buffers, we need to transfer (async) data to device - // memory. Such host buffers need to be kept alive until we sync the transfers. - std::vector> host_read_buffers; - - // If we load data directly from sources into device memory, the loads are also async. - // Thus, we need to make sure to sync all them at the end. + // Storing the future and the expected size of the read data std::vector, std::size_t>> device_read_tasks; + // Storing the future, the expected size of the read data and the device destination pointer + std::vector>, std::size_t, uint8_t*>> + host_read_tasks; // Range of the read info (offset, length) to read for the current being loaded stripes. 
auto const [read_begin, read_end] = @@ -518,24 +516,22 @@ void reader_impl::load_next_stripe_data(read_mode mode) source_ptr->device_read_async( read_info.offset, read_info.length, dst_base + read_info.dst_pos, _stream), read_info.length); - } else { - auto buffer = source_ptr->host_read(read_info.offset, read_info.length); - CUDF_EXPECTS(buffer->size() == read_info.length, "Unexpected discrepancy in bytes read."); - CUDF_CUDA_TRY(cudaMemcpyAsync(dst_base + read_info.dst_pos, - buffer->data(), - read_info.length, - cudaMemcpyDefault, - _stream.value())); - host_read_buffers.emplace_back(std::move(buffer)); + host_read_tasks.emplace_back(source_ptr->host_read_async(read_info.offset, read_info.length), + read_info.length, + dst_base + read_info.dst_pos); } } - - if (host_read_buffers.size() > 0) { // if there was host read - _stream.synchronize(); - host_read_buffers.clear(); // its data was copied to device memory after stream sync + std::vector> host_read_buffers; + for (auto& [fut, expected_size, dev_dst] : host_read_tasks) { // if there were host reads + host_read_buffers.emplace_back(fut.get()); + auto* host_buffer = host_read_buffers.back().get(); + CUDF_EXPECTS(host_buffer->size() == expected_size, "Unexpected discrepancy in bytes read."); + CUDF_CUDA_TRY(cudaMemcpyAsync( + dev_dst, host_buffer->data(), host_buffer->size(), cudaMemcpyDefault, _stream.value())); } - for (auto& task : device_read_tasks) { // if there was device read + + for (auto& task : device_read_tasks) { // if there were device reads CUDF_EXPECTS(task.first.get() == task.second, "Unexpected discrepancy in bytes read."); } diff --git a/cpp/src/io/utilities/datasource.cpp b/cpp/src/io/utilities/datasource.cpp index 2cb2b303cb3..2f181188fb2 100644 --- a/cpp/src/io/utilities/datasource.cpp +++ b/cpp/src/io/utilities/datasource.cpp @@ -44,37 +44,56 @@ namespace io { namespace { /** - * @brief Base class for file input. Only implements direct device reads. 
+ * @brief Base class for kvikIO-based data sources. */ -class file_source : public datasource { - public: - explicit file_source(char const* filepath) - { - kvikio_integration::set_up_kvikio(); - _kvikio_file = kvikio::FileHandle(filepath, "r"); - CUDF_EXPECTS(!_kvikio_file.closed(), "KvikIO did not open the file successfully."); - CUDF_LOG_INFO("Reading a file using kvikIO, with compatibility mode %s.", - _kvikio_file.get_compat_mode_manager().is_compat_mode_preferred() ? "on" : "off"); - } +template +class kvikio_source : public datasource { + class kvikio_initializer { + public: + kvikio_initializer() { kvikio_integration::set_up_kvikio(); } + }; - std::unique_ptr host_read(size_t offset, size_t size) override + std::pair, std::future> clamped_read_to_vector(size_t offset, + size_t size) { // Clamp length to available data auto const read_size = std::min(size, this->size() - offset); std::vector v(read_size); - CUDF_EXPECTS(_kvikio_file.pread(v.data(), read_size, offset).get() == read_size, "read failed"); + return {std::move(v), _kvikio_handle.pread(v.data(), read_size, offset)}; + } + + public: + kvikio_source(HandleT&& h) : _kvikio_handle(std::move(h)) {} + std::unique_ptr host_read(size_t offset, size_t size) override + { + auto [v, fut] = clamped_read_to_vector(offset, size); + fut.get(); return buffer::create(std::move(v)); } + std::future> host_read_async(size_t offset, + size_t size) override + { + auto clamped_read = clamped_read_to_vector(offset, size); + return std::async(std::launch::deferred, [cr = std::move(clamped_read)]() mutable { + cr.second.get(); + return buffer::create(std::move(cr.first)); + }); + } + size_t host_read(size_t offset, size_t size, uint8_t* dst) override + { + return host_read_async(offset, size, dst).get(); + } + + std::future host_read_async(size_t offset, size_t size, uint8_t* dst) override { // Clamp length to available data auto const read_size = std::min(size, this->size() - offset); - 
CUDF_EXPECTS(_kvikio_file.pread(dst, read_size, offset).get() == read_size, "read failed"); - return read_size; + return _kvikio_handle.pread(dst, read_size, offset); } - ~file_source() override = default; + ~kvikio_source() override = default; [[nodiscard]] bool supports_device_read() const override { return true; } @@ -91,7 +110,7 @@ class file_source : public datasource { CUDF_EXPECTS(supports_device_read(), "Device reads are not supported for this file."); auto const read_size = std::min(size, this->size() - offset); - return _kvikio_file.pread(dst, read_size, offset); + return _kvikio_handle.pread(dst, read_size, offset); } size_t device_read(size_t offset, @@ -113,10 +132,29 @@ class file_source : public datasource { return datasource::buffer::create(std::move(out_data)); } - [[nodiscard]] size_t size() const override { return _kvikio_file.nbytes(); } + [[nodiscard]] size_t size() const override { return _kvikio_handle.nbytes(); } + + kvikio_initializer _; protected: - kvikio::FileHandle _kvikio_file; + HandleT _kvikio_handle; +}; + +/** + * @brief A class representing a file source using kvikIO. + * + * This class is derived from `kvikio_source` and is used to handle file operations + * using kvikIO library. + */ +class file_source : public kvikio_source { + public: + explicit file_source(char const* filepath) : kvikio_source{kvikio::FileHandle(filepath, "r")} + { + CUDF_EXPECTS(!_kvikio_handle.closed(), "KvikIO did not open the file successfully."); + CUDF_LOG_INFO( + "Reading a file using kvikIO, with compatibility mode %s.", + _kvikio_handle.get_compat_mode_manager().is_compat_mode_preferred() ? 
"on" : "off"); + } }; /** @@ -132,7 +170,7 @@ class memory_mapped_source : public file_source { { if (this->size() != 0) { // Memory mapping is not exclusive, so we can include the whole region we expect to read - map(_kvikio_file.fd(), offset, max_size_estimate); + map(_kvikio_handle.fd(), offset, max_size_estimate); } } @@ -331,6 +369,17 @@ class user_datasource_wrapper : public datasource { return source->host_read(offset, size); } + std::future host_read_async(size_t offset, size_t size, uint8_t* dst) override + { + return source->host_read_async(offset, size, dst); + } + + std::future> host_read_async(size_t offset, + size_t size) override + { + return source->host_read_async(offset, size); + } + [[nodiscard]] bool supports_device_read() const override { return source->supports_device_read(); @@ -376,68 +425,18 @@ class user_datasource_wrapper : public datasource { /** * @brief Remote file source backed by KvikIO, which handles S3 filepaths seamlessly. */ -class remote_file_source : public datasource { - static std::unique_ptr create_s3_endpoint(char const* filepath) +class remote_file_source : public kvikio_source { + static auto create_s3_handle(char const* filepath) { auto [bucket_name, bucket_object] = kvikio::S3Endpoint::parse_s3_url(filepath); - return std::make_unique(bucket_name, bucket_object); + return kvikio::RemoteHandle{std::make_unique(bucket_name, bucket_object)}; } public: - explicit remote_file_source(char const* filepath) : _kvikio_file{create_s3_endpoint(filepath)} {} + explicit remote_file_source(char const* filepath) : kvikio_source{create_s3_handle(filepath)} {} ~remote_file_source() override = default; - [[nodiscard]] bool supports_device_read() const override { return true; } - - [[nodiscard]] bool is_device_read_preferred(size_t size) const override { return true; } - - [[nodiscard]] size_t size() const override { return _kvikio_file.nbytes(); } - - std::future device_read_async(size_t offset, - size_t size, - uint8_t* dst, - 
rmm::cuda_stream_view stream) override - { - CUDF_EXPECTS(supports_device_read(), "Device reads are not supported for this file."); - - auto const read_size = std::min(size, this->size() - offset); - return _kvikio_file.pread(dst, read_size, offset); - } - - size_t device_read(size_t offset, - size_t size, - uint8_t* dst, - rmm::cuda_stream_view stream) override - { - return device_read_async(offset, size, dst, stream).get(); - } - - std::unique_ptr device_read(size_t offset, - size_t size, - rmm::cuda_stream_view stream) override - { - rmm::device_buffer out_data(size, stream); - size_t const read = - device_read(offset, size, reinterpret_cast(out_data.data()), stream); - out_data.resize(read, stream); - return datasource::buffer::create(std::move(out_data)); - } - - size_t host_read(size_t offset, size_t size, uint8_t* dst) override - { - auto const read_size = std::min(size, this->size() - offset); - return _kvikio_file.pread(dst, read_size, offset).get(); - } - - std::unique_ptr host_read(size_t offset, size_t size) override - { - auto const count = std::min(size, this->size() - offset); - std::vector h_data(count); - this->host_read(offset, count, h_data.data()); - return datasource::buffer::create(std::move(h_data)); - } - /** * @brief Is `url` referring to a remote file supported by KvikIO? 
* @@ -449,9 +448,6 @@ class remote_file_source : public datasource { static std::regex const pattern{R"(^s3://)", std::regex_constants::icase}; return std::regex_search(url, pattern); } - - private: - kvikio::RemoteHandle _kvikio_file; }; #else /** @@ -509,5 +505,18 @@ std::unique_ptr datasource::create(datasource* source) return std::make_unique(source); } +std::future> datasource::host_read_async(size_t offset, + size_t size) +{ + return std::async(std::launch::deferred, + [this, offset, size] { return host_read(offset, size); }); +} + +std::future datasource::host_read_async(size_t offset, size_t size, uint8_t* dst) +{ + return std::async(std::launch::deferred, + [this, offset, size, dst] { return host_read(offset, size, dst); }); +} + } // namespace io } // namespace cudf From d9e64b2361083f30785d61e5ad03bbd9bc353220 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Tue, 4 Mar 2025 11:12:11 +0100 Subject: [PATCH 41/42] Add `pylibcudf.gpumemoryview` support for `len()`/`nbytes` (#18133) Add support for `len()` and `nbytes` in `pylibcudf.gpumemoryview`. Having those methods is helpful to ensure proper serialization in Dask/Distributed, as utility methods that serialize objects, in this case used by cudf-polars, may use the appropriate method or property to determine the size of the object being transferred. 
Authors: - Peter Andreas Entschev (https://github.com/pentschev) Approvers: - Matthew Murray (https://github.com/Matt711) - Richard (Rick) Zamora (https://github.com/rjzamora) URL: https://github.com/rapidsai/cudf/pull/18133 --- python/pylibcudf/pylibcudf/gpumemoryview.pyi | 3 + python/pylibcudf/pylibcudf/gpumemoryview.pyx | 20 ++++++- .../pylibcudf/tests/test_gpumemoryview.py | 58 +++++++++++++++++++ 3 files changed, 80 insertions(+), 1 deletion(-) create mode 100644 python/pylibcudf/pylibcudf/tests/test_gpumemoryview.py diff --git a/python/pylibcudf/pylibcudf/gpumemoryview.pyi b/python/pylibcudf/pylibcudf/gpumemoryview.pyi index 50f1f39a515..236ff6e56a6 100644 --- a/python/pylibcudf/pylibcudf/gpumemoryview.pyi +++ b/python/pylibcudf/pylibcudf/gpumemoryview.pyi @@ -7,3 +7,6 @@ class gpumemoryview: def __init__(self, data: Any): ... @property def __cuda_array_interface__(self) -> Mapping[str, Any]: ... + def __len__(self) -> int: ... + @property + def nbytes(self) -> int: ... diff --git a/python/pylibcudf/pylibcudf/gpumemoryview.pyx b/python/pylibcudf/pylibcudf/gpumemoryview.pyx index 41316eddb60..954d35a6ce3 100644 --- a/python/pylibcudf/pylibcudf/gpumemoryview.pyx +++ b/python/pylibcudf/pylibcudf/gpumemoryview.pyx @@ -1,4 +1,7 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. +# Copyright (c) 2023-2025, NVIDIA CORPORATION. + +import functools +import operator __all__ = ["gpumemoryview"] @@ -27,4 +30,19 @@ cdef class gpumemoryview: def __cuda_array_interface__(self): return self.obj.__cuda_array_interface__ + def __len__(self): + return self.obj.__cuda_array_interface__["shape"][0] + + @property + def nbytes(self): + cai = self.obj.__cuda_array_interface__ + shape, typestr = cai["shape"], cai["typestr"] + + # Get element size from typestr, format is two character specifying + # the type and the latter part is the number of bytes. 
E.g., ' Date: Tue, 4 Mar 2025 12:16:08 +0000 Subject: [PATCH 42/42] Added polynomials benchmark (#17695) This merge request implements benchmarks for comparing the AST, UDF Transform, and BINARY_OP methods by computing a polynomial. Closes https://github.com/rapidsai/cudf/issues/17561 Authors: - Basit Ayantunde (https://github.com/lamarrr) - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Muhammad Haseeb (https://github.com/mhaseeb123) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/17695 --- cpp/benchmarks/CMakeLists.txt | 11 ++- cpp/benchmarks/ast/polynomials.cpp | 94 +++++++++++++++++++ cpp/benchmarks/binaryop/polynomials.cpp | 101 +++++++++++++++++++++ cpp/benchmarks/transform/polynomials.cpp | 109 +++++++++++++++++++++++ 4 files changed, 313 insertions(+), 2 deletions(-) create mode 100644 cpp/benchmarks/ast/polynomials.cpp create mode 100644 cpp/benchmarks/binaryop/polynomials.cpp create mode 100644 cpp/benchmarks/transform/polynomials.cpp diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 03f11cc957b..549cb8e5d5d 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -344,11 +344,18 @@ ConfigureNVBench(CSV_WRITER_NVBENCH io/csv/csv_writer.cpp) # ################################################################################################## # * ast benchmark --------------------------------------------------------------------------------- -ConfigureNVBench(AST_NVBENCH ast/transform.cpp) +ConfigureNVBench(AST_NVBENCH ast/polynomials.cpp ast/transform.cpp) # ################################################################################################## # * binaryop benchmark ---------------------------------------------------------------------------- -ConfigureNVBench(BINARYOP_NVBENCH binaryop/binaryop.cpp binaryop/compiled_binaryop.cpp) +ConfigureNVBench( + BINARYOP_NVBENCH binaryop/binaryop.cpp binaryop/compiled_binaryop.cpp binaryop/polynomials.cpp 
+) + +# ################################################################################################## +# * transform benchmark +# --------------------------------------------------------------------------------- +ConfigureNVBench(TRANSFORM_NVBENCH transform/polynomials.cpp) # ################################################################################################## # * nvtext benchmark ------------------------------------------------------------------- diff --git a/cpp/benchmarks/ast/polynomials.cpp b/cpp/benchmarks/ast/polynomials.cpp new file mode 100644 index 00000000000..b8e4ca46b72 --- /dev/null +++ b/cpp/benchmarks/ast/polynomials.cpp @@ -0,0 +1,94 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include +#include +#include +#include +#include +#include + +#include + +#include +#include + +#include + +template +static void BM_ast_polynomials(nvbench::state& state) +{ + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const order = static_cast(state.get_int64("order")); + + CUDF_EXPECTS(order > 0, "Polynomial order must be greater than 0"); + + data_profile profile; + profile.set_distribution_params(cudf::type_to_id(), + distribution_id::NORMAL, + static_cast(0), + static_cast(1)); + auto table = create_random_table({cudf::type_to_id()}, row_count{num_rows}, profile); + auto column_view = table->get_column(0); + + std::vector> constants; + { + std::random_device random_device; + std::mt19937 generator; + std::uniform_real_distribution distribution{0, 1}; + + std::transform(thrust::make_counting_iterator(0), + thrust::make_counting_iterator(order + 1), + std::back_inserter(constants), + [&](int) { return distribution(generator); }); + } + + cudf::ast::tree tree{}; + + auto& column_ref = tree.push(cudf::ast::column_reference{0}); + + // computes polynomials: (((ax + b)x + c)x + d)x + e... = ax**4 + bx**3 + cx**2 + dx + e.... 
+ tree.push(cudf::ast::literal{constants[0]}); + + for (cudf::size_type i = 0; i < order; i++) { + auto& product = + tree.push(cudf::ast::operation{cudf::ast::ast_operator::MUL, tree.back(), column_ref}); + auto& constant = tree.push(cudf::ast::literal{constants[i + 1]}); + tree.push(cudf::ast::operation{cudf::ast::ast_operator::ADD, product, constant}); + } + + // Use the number of bytes read from global memory + state.add_global_memory_reads(num_rows); + state.add_global_memory_writes(num_rows); + + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + cudf::scoped_range range{"benchmark_iteration"}; + cudf::compute_column(*table, tree.back(), launch.get_stream().get_stream()); + }); +} + +#define AST_POLYNOMIAL_BENCHMARK_DEFINE(name, key_type) \ + static void name(::nvbench::state& st) { ::BM_ast_polynomials(st); } \ + NVBENCH_BENCH(name) \ + .set_name(#name) \ + .add_int64_axis("num_rows", {100'000, 1'000'000, 10'000'000, 100'000'000}) \ + .add_int64_axis("order", {1, 2, 4, 8, 16, 32}) + +AST_POLYNOMIAL_BENCHMARK_DEFINE(ast_polynomials_float32, float); + +AST_POLYNOMIAL_BENCHMARK_DEFINE(ast_polynomials_float64, double); diff --git a/cpp/benchmarks/binaryop/polynomials.cpp b/cpp/benchmarks/binaryop/polynomials.cpp new file mode 100644 index 00000000000..782ae1db927 --- /dev/null +++ b/cpp/benchmarks/binaryop/polynomials.cpp @@ -0,0 +1,101 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include +#include +#include +#include +#include +#include + +#include + +#include + +#include +#include + +template +static void BM_binaryop_polynomials(nvbench::state& state) +{ + auto const num_rows{static_cast(state.get_int64("num_rows"))}; + auto const order{static_cast(state.get_int64("order"))}; + + CUDF_EXPECTS(order > 0, "Polynomial order must be greater than 0"); + + data_profile profile; + profile.set_distribution_params(cudf::type_to_id(), + distribution_id::NORMAL, + static_cast(0), + static_cast(1)); + auto table = create_random_table({cudf::type_to_id()}, row_count{num_rows}, profile); + auto column_view = table->get_column(0); + + std::vector> constants; + { + std::random_device random_device; + std::mt19937 generator; + std::uniform_real_distribution distribution{0, 1}; + + std::transform(thrust::make_counting_iterator(0), + thrust::make_counting_iterator(order + 1), + std::back_inserter(constants), + [&](int) { return cudf::numeric_scalar(distribution(generator)); }); + } + + // Use the number of bytes read from global memory + state.add_global_memory_reads(num_rows); + state.add_global_memory_writes(num_rows); + + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + // computes polynomials: (((ax + b)x + c)x + d)x + e... = ax**4 + bx**3 + cx**2 + dx + e.... 
+ cudf::scoped_range range{"benchmark_iteration"}; + rmm::cuda_stream_view stream{launch.get_stream().get_stream()}; + std::vector> intermediates; + + auto result = cudf::make_column_from_scalar(constants[0], num_rows, stream); + + for (cudf::size_type i = 0; i < order; i++) { + auto product = cudf::binary_operation(result->view(), + column_view, + cudf::binary_operator::MUL, + cudf::data_type{cudf::type_to_id()}, + stream); + auto sum = cudf::binary_operation(product->view(), + constants[i + 1], + cudf::binary_operator::ADD, + cudf::data_type{cudf::type_to_id()}, + stream); + intermediates.push_back(std::move(product)); + intermediates.push_back(std::move(result)); + result = std::move(sum); + } + }); +} + +#define BINARYOP_POLYNOMIALS_BENCHMARK_DEFINE(name, key_type) \ + \ + static void name(::nvbench::state& st) { ::BM_binaryop_polynomials(st); } \ + NVBENCH_BENCH(name) \ + .set_name(#name) \ + .add_int64_axis("num_rows", {100'000, 1'000'000, 10'000'000, 100'000'000}) \ + .add_int64_axis("order", {1, 2, 4, 8, 16, 32}) + +BINARYOP_POLYNOMIALS_BENCHMARK_DEFINE(binaryop_polynomials_float32, float); + +BINARYOP_POLYNOMIALS_BENCHMARK_DEFINE(binaryop_polynomials_float64, double); diff --git a/cpp/benchmarks/transform/polynomials.cpp b/cpp/benchmarks/transform/polynomials.cpp new file mode 100644 index 00000000000..07f8a47c771 --- /dev/null +++ b/cpp/benchmarks/transform/polynomials.cpp @@ -0,0 +1,109 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include +#include +#include +#include + +#include + +#include + +#include +#include + +template +static void BM_transform_polynomials(nvbench::state& state) +{ + auto const num_rows{static_cast(state.get_int64("num_rows"))}; + auto const order{static_cast(state.get_int64("order"))}; + + CUDF_EXPECTS(order > 0, "Polynomial order must be greater than 0"); + + data_profile profile; + profile.set_distribution_params(cudf::type_to_id(), + distribution_id::NORMAL, + static_cast(0), + static_cast(1)); + auto column = create_random_column(cudf::type_to_id(), row_count{num_rows}, profile); + + std::vector> constants; + + std::transform( + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(order + 1), + std::back_inserter(constants), + [&](int) { return create_random_column(cudf::type_to_id(), row_count{1}, profile); }); + + // Use the number of bytes read from global memory + state.add_global_memory_reads(num_rows); + state.add_global_memory_writes(num_rows); + + std::vector inputs{*column}; + std::transform(constants.begin(), + constants.end(), + std::back_inserter(inputs), + [](auto& col) -> cudf::column_view { return *col; }); + + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + // computes polynomials: (((ax + b)x + c)x + d)x + e... = ax**4 + bx**3 + cx**2 + dx + e.... 
+ + cudf::scoped_range range{"benchmark_iteration"}; + + std::string type = cudf::type_to_name(cudf::data_type{cudf::type_to_id()}); + + std::string params_decl = type + " c0"; + std::string expr = "c0"; + + for (cudf::size_type i = 1; i < order + 1; i++) { + expr = "( " + expr + " ) * x + c" + std::to_string(i); + params_decl += ", " + type + " c" + std::to_string(i); + } + + static_assert(std::is_same_v || std::is_same_v); + + // clang-format off + std::string udf = + "__device__ inline void compute_polynomial(" + type + "* out, " + type + " x, " + params_decl + ")" + +"{ " +" *out = " + expr + ";" +"}"; + + // clang-format on + + cudf::transform(inputs, + udf, + cudf::data_type{cudf::type_to_id()}, + false, + launch.get_stream().get_stream()); + }); +} + +#define TRANSFORM_POLYNOMIALS_BENCHMARK_DEFINE(name, key_type) \ + \ + static void name(::nvbench::state& st) { ::BM_transform_polynomials(st); } \ + NVBENCH_BENCH(name) \ + .set_name(#name) \ + .add_int64_axis("num_rows", {100'000, 1'000'000, 10'000'000, 100'000'000}) \ + .add_int64_axis("order", {1, 2, 4, 8, 16, 32}) + +TRANSFORM_POLYNOMIALS_BENCHMARK_DEFINE(transform_polynomials_float32, float); + +TRANSFORM_POLYNOMIALS_BENCHMARK_DEFINE(transform_polynomials_float64, double);