diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 38b890893d0..2c583598f54 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -24,7 +24,6 @@ jobs: - conda-python-cudf-tests - conda-python-other-tests - conda-java-tests - - static-configure - conda-notebook-tests - docs-build - wheel-build-libcudf @@ -192,16 +191,6 @@ jobs: arch: "amd64" container_image: "rapidsai/ci-conda:latest" run_script: "ci/test_java.sh" - static-configure: - needs: checks - secrets: inherit - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.04 - with: - build_type: pull-request - # Use the wheel container so we can skip conda solves and since our - # primary static consumers (Spark) are not in conda anyway. - container_image: "rapidsai/ci-wheel:latest" - run_script: "ci/configure_cpp_static.sh" conda-notebook-tests: needs: [conda-python-build, changed-files] secrets: inherit diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 12f6d751493..7046fd0e5dc 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -168,3 +168,14 @@ jobs: date: ${{ inputs.date }} sha: ${{ inputs.sha }} script: "ci/test_cudf_polars_polars_tests.sh" + narwhals-tests: + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-25.04 + with: + build_type: ${{ inputs.build_type }} + branch: ${{ inputs.branch }} + date: ${{ inputs.date }} + sha: ${{ inputs.sha }} + node_type: "gpu-l4-latest-1" + container_image: "rapidsai/ci-conda:latest" + run_script: ci/test_narwhals.sh diff --git a/ci/build_cpp.sh b/ci/build_cpp.sh index 3d06eacf9ff..0c324d01cdf 100755 --- a/ci/build_cpp.sh +++ b/ci/build_cpp.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright (c) 2022-2024, NVIDIA CORPORATION. +# Copyright (c) 2022-2025, NVIDIA CORPORATION. set -euo pipefail @@ -18,7 +18,7 @@ rapids-logger "Begin cpp build" sccache --zero-stats # With boa installed conda build forward to boa -RAPIDS_PACKAGE_VERSION=$(rapids-generate-version) rapids-conda-retry mambabuild \ +RAPIDS_PACKAGE_VERSION=$(rapids-generate-version) rapids-conda-retry build \ conda/recipes/libcudf sccache --show-adv-stats diff --git a/ci/build_python.sh b/ci/build_python.sh index ed90041cc77..abbdc3f3a3b 100755 --- a/ci/build_python.sh +++ b/ci/build_python.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright (c) 2022-2024, NVIDIA CORPORATION. +# Copyright (c) 2022-2025, NVIDIA CORPORATION. 
set -euo pipefail @@ -25,7 +25,7 @@ sccache --zero-stats # node works correctly # With boa installed conda build forwards to the boa builder -RAPIDS_PACKAGE_VERSION=$(head -1 ./VERSION) rapids-conda-retry mambabuild \ +RAPIDS_PACKAGE_VERSION=$(head -1 ./VERSION) rapids-conda-retry build \ --no-test \ --channel "${CPP_CHANNEL}" \ conda/recipes/pylibcudf @@ -33,7 +33,7 @@ RAPIDS_PACKAGE_VERSION=$(head -1 ./VERSION) rapids-conda-retry mambabuild \ sccache --show-adv-stats sccache --zero-stats -RAPIDS_PACKAGE_VERSION=$(head -1 ./VERSION) rapids-conda-retry mambabuild \ +RAPIDS_PACKAGE_VERSION=$(head -1 ./VERSION) rapids-conda-retry build \ --no-test \ --channel "${CPP_CHANNEL}" \ --channel "${RAPIDS_CONDA_BLD_OUTPUT_DIR}" \ @@ -42,13 +42,13 @@ RAPIDS_PACKAGE_VERSION=$(head -1 ./VERSION) rapids-conda-retry mambabuild \ sccache --show-adv-stats sccache --zero-stats -RAPIDS_PACKAGE_VERSION=$(head -1 ./VERSION) rapids-conda-retry mambabuild \ +RAPIDS_PACKAGE_VERSION=$(head -1 ./VERSION) rapids-conda-retry build \ --no-test \ --channel "${CPP_CHANNEL}" \ --channel "${RAPIDS_CONDA_BLD_OUTPUT_DIR}" \ conda/recipes/dask-cudf -RAPIDS_PACKAGE_VERSION=$(head -1 ./VERSION) rapids-conda-retry mambabuild \ +RAPIDS_PACKAGE_VERSION=$(head -1 ./VERSION) rapids-conda-retry build \ --no-test \ --channel "${CPP_CHANNEL}" \ --channel "${RAPIDS_CONDA_BLD_OUTPUT_DIR}" \ @@ -56,13 +56,13 @@ RAPIDS_PACKAGE_VERSION=$(head -1 ./VERSION) rapids-conda-retry mambabuild \ sccache --show-adv-stats -RAPIDS_PACKAGE_VERSION=$(head -1 ./VERSION) rapids-conda-retry mambabuild \ +RAPIDS_PACKAGE_VERSION=$(head -1 ./VERSION) rapids-conda-retry build \ --no-test \ --channel "${CPP_CHANNEL}" \ --channel "${RAPIDS_CONDA_BLD_OUTPUT_DIR}" \ conda/recipes/custreamz -RAPIDS_PACKAGE_VERSION=$(head -1 ./VERSION) rapids-conda-retry mambabuild \ +RAPIDS_PACKAGE_VERSION=$(head -1 ./VERSION) rapids-conda-retry build \ --no-test \ --channel "${CPP_CHANNEL}" \ --channel "${RAPIDS_CONDA_BLD_OUTPUT_DIR}" \ diff --git a/ci/configure_cpp_static.sh b/ci/configure_cpp_static.sh deleted file mode 100755 index 3d0647a96f6..00000000000 --- a/ci/configure_cpp_static.sh +++ /dev/null @@ -1,21 +0,0 @@ -#!/bin/bash -# Copyright (c) 2024-2025, NVIDIA CORPORATION. 
- -set -euo pipefail - -source rapids-date-string - -rapids-logger "Configure static cpp build" - -ENV_YAML_DIR="$(mktemp -d)" -REQUIREMENTS_FILE="${ENV_YAML_DIR}/requirements.txt" - -rapids-dependency-file-generator \ - --output requirements \ - --file-key test_static_build \ - --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch)" | tee "${REQUIREMENTS_FILE}" - -rapids-pip-retry install -r "${REQUIREMENTS_FILE}" -pyenv rehash - -cmake -S cpp -B build_static -GNinja -DBUILD_SHARED_LIBS=OFF -DCUDF_USE_ARROW_STATIC=ON -DBUILD_TESTS=OFF diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index e7e0cb17606..a23981b4e72 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -54,9 +54,9 @@ dependencies: - nbsphinx - ninja - notebook -- numba-cuda>=0.2.0,<0.3.0a0 -- numba>=0.59.1,<0.61.0a0 -- numpy>=1.23,<3.0a0 +- numba-cuda>=0.4.0,<0.5.0a0 +- numba>=0.59.1,<0.62.0a0 +- numpy>=1.23,<2.1 - numpydoc - nvcc_linux-64=11.8 - nvcomp==4.2.0.11 diff --git a/conda/environments/all_cuda-128_arch-x86_64.yaml b/conda/environments/all_cuda-128_arch-x86_64.yaml index c95b8b10299..e2b9302dc36 100644 --- a/conda/environments/all_cuda-128_arch-x86_64.yaml +++ b/conda/environments/all_cuda-128_arch-x86_64.yaml @@ -53,9 +53,9 @@ dependencies: - nbsphinx - ninja - notebook -- numba-cuda>=0.2.0,<0.3.0a0 -- numba>=0.59.1,<0.61.0a0 -- numpy>=1.23,<3.0a0 +- numba-cuda>=0.4.0,<0.5.0a0 +- numba>=0.59.1,<0.62.0a0 +- numpy>=1.23,<2.1 - numpydoc - nvcomp==4.2.0.11 - nvtx>=0.2.1 diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml index f817bc12c5b..43060ef1c87 100644 --- a/conda/recipes/cudf/meta.yaml +++ b/conda/recipes/cudf/meta.yaml @@ -75,9 +75,9 @@ requirements: - typing_extensions >=4.0.0 - pandas >=2.0,<2.2.4dev0 - cupy >=12.0.0 - - numba-cuda >=0.2.0,<0.3.0a0 - - numba >=0.59.1,<0.61.0a0 - - numpy >=1.23,<3.0a0 + - numba-cuda >=0.4.0,<0.5.0a0 + - numba >=0.59.1,<0.62.0a0 + - numpy >=1.23,<2.1 - pyarrow>=14.0.0,<20.0.0a0 - libcudf ={{ version }} - pylibcudf ={{ version }} diff --git a/conda/recipes/pylibcudf/meta.yaml b/conda/recipes/pylibcudf/meta.yaml index 14e2f31a5a5..ae02cf8d4e5 100644 --- a/conda/recipes/pylibcudf/meta.yaml +++ b/conda/recipes/pylibcudf/meta.yaml @@ -73,7 +73,7 @@ requirements: - python - typing_extensions >=4.0.0 - pandas >=2.0,<2.2.4dev0 - - numpy >=1.23,<3.0a0 + - numpy >=1.23,<2.1 - pyarrow>=14.0.0,<20.0.0a0 - libcudf ={{ version }} - {{ pin_compatible('rmm', max_pin='x.x') }} diff --git a/cpp/benchmarks/text/normalize.cpp b/cpp/benchmarks/text/normalize.cpp index 594dc0de28a..494d5722ae4 100644 --- a/cpp/benchmarks/text/normalize.cpp +++ b/cpp/benchmarks/text/normalize.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2024, NVIDIA CORPORATION. + * Copyright (c) 2021-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -48,8 +48,11 @@ static void bench_normalize(nvbench::state& state)
                [&](nvbench::launch& launch) { auto result = nvtext::normalize_spaces(input); });
   } else {
     bool const to_lower = (normalize_type == "to_lower");
+    // we expect the normalizer to be created once and re-used
+    // so creating it is not measured
+    auto normalizer = nvtext::create_character_normalizer(to_lower);
     state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
-      auto result = nvtext::normalize_characters(input, to_lower);
+      auto result = nvtext::normalize_characters(input, *normalizer);
     });
   }
 }
@@ -57,6 +60,6 @@ static void bench_normalize(nvbench::state& state)
 NVBENCH_BENCH(bench_normalize)
   .set_name("normalize")
   .add_int64_axis("min_width", {0})
-  .add_int64_axis("max_width", {32, 64, 128, 256})
+  .add_int64_axis("max_width", {128, 256})
   .add_int64_axis("num_rows", {32768, 262144, 2097152})
   .add_string_axis("type", {"spaces", "characters", "to_lower"});
diff --git a/cpp/include/cudf/strings/detail/utilities.hpp b/cpp/include/cudf/strings/detail/utilities.hpp
index d276c5df7dc..8fb1f30f961 100644
--- a/cpp/include/cudf/strings/detail/utilities.hpp
+++ b/cpp/include/cudf/strings/detail/utilities.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2025, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -96,5 +96,17 @@ int64_t get_offset_value(cudf::column_view const& offsets,
                          size_type index,
                          rmm::cuda_stream_view stream);
 
+/**
+ * @brief Return the first and last offset in the given strings column
+ *
+ * This accounts for sliced input columns as well.
+ *
+ * @param input Strings column
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @return First and last offset values
+ */
+std::pair<int64_t, int64_t> get_first_and_last_offset(cudf::strings_column_view const& input,
+                                                      rmm::cuda_stream_view stream);
+
 }  // namespace strings::detail
 }  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/nvtext/normalize.hpp b/cpp/include/nvtext/normalize.hpp
index 74325f4a406..70ee7891ad7 100644
--- a/cpp/include/nvtext/normalize.hpp
+++ b/cpp/include/nvtext/normalize.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2025, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -16,6 +16,7 @@
 #pragma once
 
 #include
+#include
 #include
 #include
 #include
@@ -107,5 +108,113 @@ std::unique_ptr<cudf::column> normalize_characters(
   rmm::cuda_stream_view stream = cudf::get_default_stream(),
   rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
+/**
+ * @brief Normalizer object to be used with nvtext::normalize_characters
+ *
+ * Use nvtext::create_character_normalizer to create this object.
+ *
+ * This normalizer includes:
+ *
+ * - adding padding around punctuation (unicode category starts with "P")
+ *   as well as certain ASCII symbols like "^" and "$"
+ * - adding padding around the [CJK Unicode block
+ *   characters](https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block))
+ * - changing whitespace (e.g. `"\t", "\n", "\r"`) to just space `" "`
+ * - removing control characters (unicode categories "Cc" and "Cf")
+ *
+ * The padding process adds a single space before and after the character.
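+ *
+ * For example, `"[a,b]"` is normalized to `" [ a , b ] "` and, with
+ * `do_lower_case=true`, `"P^NP"` becomes `"p ^ np"`.
+ *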
+ * Details on _unicode category_ can be found here:
+ * https://unicodebook.readthedocs.io/unicode.html#categories
+ *
+ * If `do_lower_case = true`, lower-casing also removes any accents. The
+ * accents cannot be removed from upper-case characters without lower-casing
+ * and lower-casing cannot be performed without also removing accents.
+ * However, if the accented character is already lower-case, then only the
+ * accent is removed.
+ *
+ * If `special_tokens` are included, the padding after `[` and before `]` is not
+ * inserted if the characters between them match one of the given tokens.
+ * Also, the `special_tokens` are expected to include the `[]` characters
+ * at the beginning and end of each string.
+ */
+struct character_normalizer {
+  /**
+   * @brief Normalizer object constructor
+   *
+   * This initializes and holds the character normalizing tables and settings.
+   *
+   * @param do_lower_case If true, upper-case characters are converted to
+   *        lower-case and accents are stripped from those characters.
+   *        If false, accented and upper-case characters are not transformed.
+   * @param special_tokens Each row is a token including the `[]` brackets.
+   *        For example: `[BOS]`, `[EOS]`, `[UNK]`, `[SEP]`, `[PAD]`, `[CLS]`, `[MASK]`
+   * @param stream CUDA stream used for device memory operations and kernel launches
+   * @param mr Device memory resource used to allocate the returned column's device memory
+   */
+  character_normalizer(bool do_lower_case,
+                       cudf::strings_column_view const& special_tokens,
+                       rmm::cuda_stream_view stream = cudf::get_default_stream(),
+                       rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
+  ~character_normalizer();
+
+  struct character_normalizer_impl;
+  std::unique_ptr<character_normalizer_impl> _impl;
+};
+
+/**
+ * @brief Create a normalizer object
+ *
+ * Creates a normalizer object which can be reused on multiple calls to
+ * nvtext::normalize_characters
+ *
+ * @see nvtext::character_normalizer
+ *
+ * @param do_lower_case If true, upper-case characters are converted to
+ *        lower-case and accents are stripped from those characters.
+ *        If false, accented and upper-case characters are not transformed.
+ * @param special_tokens Individual tokens including `[]` brackets.
+ *        Default is no special tokens.
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ * @return Object to be used with nvtext::normalize_characters
+ */
+std::unique_ptr<character_normalizer> create_character_normalizer(
+  bool do_lower_case,
+  cudf::strings_column_view const& special_tokens = cudf::strings_column_view(cudf::column_view{
+    cudf::data_type{cudf::type_id::STRING}, 0, nullptr, nullptr, 0}),
+  rmm::cuda_stream_view stream = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
+
+/**
+ * @brief Normalizes the text in the input strings column
+ *
+ * @see nvtext::character_normalizer for details on the normalizer behavior
+ *
+ * @code{.pseudo}
+ * cn = create_character_normalizer(true)
+ * s = ["éâîô\teaio", "ĂĆĖÑÜ", "ACENU", "$24.08", "[a,bb]"]
+ * s1 = normalize_characters(s,cn)
+ * s1 is now ["eaio eaio", "acenu", "acenu", " $ 24 . 08", " [ a , bb ] "]
+ *
+ * cn = create_character_normalizer(false)
+ * s2 = normalize_characters(s,cn)
+ * s2 is now ["éâîô eaio", "ĂĆĖÑÜ", "ACENU", " $ 24 . 08", " [ a , bb ] "]
+ * @endcode
+ *
+ * A null input element at row `i` produces a corresponding null entry
+ * for row `i` in the output column.
+ *
+ * @param input The input strings to normalize
+ * @param normalizer Normalizer to use for this function
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Memory resource to allocate any returned objects
+ * @return Normalized strings column
+ */
+std::unique_ptr<cudf::column> normalize_characters(
+  cudf::strings_column_view const& input,
+  character_normalizer const& normalizer,
+  rmm::cuda_stream_view stream = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
+
 /** @} */  // end of group
 }  // namespace CUDF_EXPORT nvtext
diff --git a/cpp/src/strings/utilities.cu b/cpp/src/strings/utilities.cu
index 45bd4615435..c5d46598d4a 100644
--- a/cpp/src/strings/utilities.cu
+++ b/cpp/src/strings/utilities.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2025, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -180,6 +180,18 @@ int64_t get_offset_value(cudf::column_view const& offsets,
                            : cudf::detail::get_value(offsets, index, stream);
 }
 
+std::pair<int64_t, int64_t> get_first_and_last_offset(cudf::strings_column_view const& input,
+                                                      rmm::cuda_stream_view stream)
+{
+  if (input.is_empty()) { return {0L, 0L}; }
+  auto const first_offset = (input.offset() == 0) ? 0
+                                                  : cudf::strings::detail::get_offset_value(
+                                                      input.offsets(), input.offset(), stream);
+  auto const last_offset =
+    cudf::strings::detail::get_offset_value(input.offsets(), input.size() + input.offset(), stream);
+  return {first_offset, last_offset};
+}
+
 }  // namespace detail
 
 rmm::device_uvector<string_view> create_string_vector_from_column(
diff --git a/cpp/src/text/normalize.cu b/cpp/src/text/normalize.cu
index 7e2b766862d..0e680e98ec5 100644
--- a/cpp/src/text/normalize.cu
+++ b/cpp/src/text/normalize.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2025, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -14,6 +14,7 @@
 * limitations under the License.
 */
 
+#include "text/normalize.cuh"
 #include "text/subword/detail/data_normalizer.hpp"
 #include "text/subword/detail/tokenizer_utils.cuh"
 #include "text/utilities/tokenize_ops.cuh"
@@ -22,10 +23,11 @@
 #include
 #include
 #include
-#include
 #include
 #include
 #include
+#include
+#include
 #include
 #include
 #include
@@ -38,9 +40,13 @@
 #include
 
+#include
+#include
+#include
 #include
 #include
 #include
+#include
 #include
 #include
 
@@ -103,6 +109,12 @@ constexpr uint32_t UTF8_1BYTE = 0x0080;
 constexpr uint32_t UTF8_2BYTE = 0x0800;
 constexpr uint32_t UTF8_3BYTE = 0x01'0000;
 
+__device__ int8_t cp_to_utf8(uint32_t codepoint, char* out)
+{
+  auto utf8 = cudf::strings::detail::codepoint_to_utf8(codepoint);
+  return cudf::strings::detail::from_char_utf8(utf8, out);
+}
+
 /**
 * @brief Convert code-point arrays into UTF-8 bytes for each string.
*/ @@ -148,26 +160,8 @@ struct codepoint_to_utf8_fn { // convert each code-point to 1-4 UTF-8 encoded bytes char* out_ptr = d_chars + d_offsets[idx]; for (uint32_t jdx = 0; jdx < count; ++jdx) { - uint32_t code_point = *str_cps++; - if (code_point < UTF8_1BYTE) // ASCII range - *out_ptr++ = static_cast(code_point); - else if (code_point < UTF8_2BYTE) { // create two-byte UTF-8 - // b00001xxx:byyyyyyyy => b110xxxyy:b10yyyyyy - *out_ptr++ = static_cast((((code_point << 2) & 0x00'1F00) | 0x00'C000) >> 8); - *out_ptr++ = static_cast((code_point & 0x3F) | 0x0080); - } else if (code_point < UTF8_3BYTE) { // create three-byte UTF-8 - // bxxxxxxxx:byyyyyyyy => b1110xxxx:b10xxxxyy:b10yyyyyy - *out_ptr++ = static_cast((((code_point << 4) & 0x0F'0000) | 0x00E0'0000) >> 16); - *out_ptr++ = static_cast((((code_point << 2) & 0x00'3F00) | 0x00'8000) >> 8); - *out_ptr++ = static_cast((code_point & 0x3F) | 0x0080); - } else { // create four-byte UTF-8 - // maximum code-point value is 0x0011'0000 - // b000xxxxx:byyyyyyyy:bzzzzzzzz => b11110xxx:b10xxyyyy:b10yyyyzz:b10zzzzzz - *out_ptr++ = static_cast((((code_point << 6) & 0x0700'0000u) | 0xF000'0000u) >> 24); - *out_ptr++ = static_cast((((code_point << 4) & 0x003F'0000u) | 0x0080'0000u) >> 16); - *out_ptr++ = static_cast((((code_point << 2) & 0x00'3F00u) | 0x00'8000u) >> 8); - *out_ptr++ = static_cast((code_point & 0x3F) | 0x0080); - } + uint32_t codepoint = *str_cps++; + out_ptr += cp_to_utf8(codepoint, out_ptr); } } }; @@ -261,4 +255,361 @@ std::unique_ptr normalize_characters(cudf::strings_column_view con return detail::normalize_characters(input, do_lower_case, stream, mr); } +struct character_normalizer::character_normalizer_impl { + rmm::device_uvector cp_metadata; + rmm::device_uvector aux_table; + bool do_lower_case; + std::unique_ptr special_tokens; + rmm::device_uvector special_tokens_view; + + cudf::device_span get_special_tokens() const + { + return special_tokens_view; + } + + character_normalizer_impl(rmm::device_uvector&& cp_metadata, + rmm::device_uvector&& aux_table, + bool do_lower_case, + std::unique_ptr&& special_tokens, + rmm::device_uvector&& special_tokens_view) + : cp_metadata(std::move(cp_metadata)), + aux_table(std::move(aux_table)), + do_lower_case{do_lower_case}, + special_tokens{std::move(special_tokens)}, + special_tokens_view{std::move(special_tokens_view)} + { + } +}; + +character_normalizer::character_normalizer(bool do_lower_case, + cudf::strings_column_view const& special_tokens, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref) +{ + auto cp_metadata = nvtext::detail::get_codepoint_metadata(stream); + auto aux_table = nvtext::detail::get_aux_codepoint_data(stream); + CUDF_EXPECTS( + !special_tokens.has_nulls(), "special tokens should not have nulls", std::invalid_argument); + + auto sorted = std::move( + cudf::sort(cudf::table_view({special_tokens.parent()}), {}, {}, stream)->release().front()); + if (do_lower_case) { + // lower-case the tokens so they will match the normalized input + sorted = cudf::strings::to_lower(cudf::strings_column_view(sorted->view()), stream); + } + + auto tokens_view = cudf::strings::detail::create_string_vector_from_column( + cudf::strings_column_view(sorted->view()), stream, cudf::get_current_device_resource_ref()); + + _impl = std::make_unique(std::move(cp_metadata), + std::move(aux_table), + do_lower_case, + std::move(sorted), + std::move(tokens_view)); +} + +character_normalizer::~character_normalizer() {} + +std::unique_ptr create_character_normalizer( + bool do_lower_case, + 
cudf::strings_column_view const& special_tokens,
+  rmm::cuda_stream_view stream,
+  rmm::device_async_resource_ref mr)
+{
+  CUDF_FUNC_RANGE();
+  return std::make_unique<character_normalizer>(do_lower_case, special_tokens, stream, mr);
+}
+
+namespace detail {
+namespace {
+
+/**
+ * @brief Kernel handles fixing up the normalized data to account for any special tokens
+ *
+ * This undoes the padding added around the `[]` for patterns matching the strings in the
+ * special_tokens array.
+ *
+ * Launched as a thread per input byte (total_count).
+ *
+ * @param d_normalized The normalized set of UTF-8 characters; 3 uints per input byte
+ * @param total_count Number of bytes represented by d_normalized; len(d_normalized)/3
+ * @param special_tokens Tokens to check against
+ */
+CUDF_KERNEL void special_tokens_kernel(uint32_t* d_normalized,
+                                       int64_t total_count,
+                                       cudf::device_span<cudf::string_view const> special_tokens)
+{
+  auto const idx = cudf::detail::grid_1d::global_thread_id();
+  if (idx >= total_count) { return; }
+  auto const begin = d_normalized + (idx * MAX_NEW_CHARS) + 1;
+  if (*begin != '[') { return; }
+  auto const end   = begin + cuda::std::min(6L, total_count - idx) * MAX_NEW_CHARS;
+  auto const match = thrust::find(thrust::seq, begin, end, static_cast<uint32_t>(']'));
+  if (match == end) { return; }
+  char candidate[8];
+  auto const ch_begin =
+    thrust::transform_iterator(begin, [](auto v) { return static_cast<char>(v); });
+  auto const ch_end = ch_begin + thrust::distance(begin, match + 1);
+  auto last         = thrust::copy_if(
+    thrust::seq, ch_begin, ch_end, candidate, [](auto c) { return c != 0 && c != ' '; });
+  *last = 0;  // only needed for debug
+
+  auto const size  = static_cast<cudf::size_type>(thrust::distance(candidate, last));
+  auto const token = cudf::string_view(candidate, size);
+  // the binary_search expects the special_tokens to be sorted
+  if (!thrust::binary_search(thrust::seq, special_tokens.begin(), special_tokens.end(), token)) {
+    return;
+  }
+
+  // fix up chars to remove the extra spaces
+  *(begin + 1) = 0;  // removes space after '['
+  *(match - 1) = 0;  // removes space before ']'
+}
+
+/**
+ * @brief The normalizer kernel
+ *
+ * Launched as a thread per input byte (total_bytes).
+ *
+ * Converts the input d_chars into codepoints to lookup in the provided tables.
+ * Once processed, the d_output contains 3 uints per input byte each encoded
+ * as output UTF-8. Any zero values are removed by a subsequent kernel call.
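+ *
+ * Bytes that are not the start of a UTF-8 character and characters that are
+ * removed produce all-zero entries, while padded characters use all three
+ * values (space, character, space).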
+ * + * @param d_chars The characters for the input strings column to normalize + * @param total_bytes The number of bytes in the d_chars + * @param cp_metadata First lookup table for codepoint metadata + * @param aux_table Second lookup table containing possible replacement characters + * @param do_lower_case True if the normalization includes lower-casing characters + * @param d_output The output of the normalization (UTF-8 encoded) + */ +CUDF_KERNEL void data_normalizer_kernel(char const* d_chars, + int64_t total_bytes, + codepoint_metadata_type const* cp_metadata, + aux_codepoint_data_type const* aux_table, + bool do_lower_case, + uint32_t* d_output) +{ + uint32_t replacement[MAX_NEW_CHARS] = {0}; + + auto const idx = cudf::detail::grid_1d::global_thread_id(); + + if ((idx < total_bytes) && cudf::strings::detail::is_begin_utf8_char(d_chars[idx])) { + auto const cp = [utf8 = d_chars + idx] { + cudf::char_utf8 ch_utf8 = *utf8; + if (ch_utf8 > 0x7F) { cudf::strings::detail::to_char_utf8(utf8, ch_utf8); } + return cudf::strings::detail::utf8_to_codepoint(ch_utf8); + }(); + auto const metadata = cp_metadata[cp]; + + if (!should_remove_cp(metadata, do_lower_case)) { + int8_t num_new_chars = 1; + // retrieve the normalized value for cp + auto const new_cp = do_lower_case || always_replace(metadata) ? get_first_cp(metadata) : cp; + replacement[0] = new_cp == 0 ? cp : new_cp; + + if (do_lower_case && is_multi_char_transform(metadata)) { + auto const next_cps = aux_table[cp]; + replacement[1] = static_cast(next_cps >> 32); + replacement[2] = static_cast(next_cps & 0xFFFFFFFF); + num_new_chars = 2 + (replacement[2] != 0); + } + + if (should_add_spaces(metadata, do_lower_case) && (num_new_chars == 1)) { + replacement[1] = replacement[0]; + replacement[0] = SPACE_CODE_POINT; // add spaces around the new codepoint + replacement[2] = SPACE_CODE_POINT; + num_new_chars = 3; + } + + // convert codepoints back to UTF-8 in-place + for (int k = 0; k < num_new_chars; ++k) { + auto const new_cp = replacement[k]; + if (new_cp) { cp_to_utf8(new_cp, reinterpret_cast(replacement + k)); } + } + } + } + + // employ an optimized coalesced writer to output replacement as a block of transposed data + using block_store = + cub::BlockStore; + __shared__ typename block_store::TempStorage bs_stg; + auto block_base = d_output + blockIdx.x * blockDim.x * MAX_NEW_CHARS; + block_store(bs_stg).Store(block_base, replacement); +} + +/** + * @brief Computes the output sizes for each row + * + * The input offsets are used with segmented-reduce to count the number of + * non-zero values for each output row. 
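+ * For example, an input row covering bytes `[i, j)` of the chars data is
+ * reduced over `d_normalized[i * MAX_NEW_CHARS, j * MAX_NEW_CHARS)` by
+ * counting the non-zero bytes.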
+ *
+ * @param d_normalized The UTF-8 encoded normalized values
+ * @param offsets These identify the row boundaries
+ * @param offset Only non-zero if the input column has been sliced
+ * @param size The number of output rows (same as the number of input rows)
+ * @param stream Stream used for allocating device memory and launching kernels
+ * @return The sizes of each output row
+ */
+template <typename OffsetType>
+rmm::device_uvector<cudf::size_type> compute_sizes(cudf::device_span<uint32_t const> d_normalized,
+                                                   OffsetType offsets,
+                                                   int64_t offset,
+                                                   cudf::size_type size,
+                                                   rmm::cuda_stream_view stream)
+{
+  auto output_sizes = rmm::device_uvector<cudf::size_type>(size, stream);
+
+  auto d_data = d_normalized.data();
+
+  // counts the non-zero bytes in the d_data array
+  auto d_in = cudf::detail::make_counting_transform_iterator(
+    0, cuda::proclaim_return_type<cudf::size_type>([d_data] __device__(auto idx) {
+      idx = idx * MAX_NEW_CHARS;
+      // transform function counts number of non-zero bytes in uint32_t value
+      auto tfn = [](uint32_t v) -> cudf::size_type {
+        return ((v & 0xFF) > 0) + ((v & 0xFF00) > 0) + ((v & 0xFF0000) > 0) +
+               ((v & 0xFF000000) > 0);
+      };
+      auto const begin = d_data + idx;
+      auto const end   = begin + MAX_NEW_CHARS;
+      return thrust::transform_reduce(thrust::seq, begin, end, tfn, 0, thrust::plus<cudf::size_type>{});
+    }));
+
+  // DeviceSegmentedReduce is used to compute the size of each output row
+  auto d_out = output_sizes.begin();
+  auto temp  = std::size_t{0};
+  if (offset == 0) {
+    cub::DeviceSegmentedReduce::Sum(
+      nullptr, temp, d_in, d_out, size, offsets, offsets + 1, stream.value());
+    auto d_temp = rmm::device_buffer{temp, stream};
+    cub::DeviceSegmentedReduce::Sum(
+      d_temp.data(), temp, d_in, d_out, size, offsets, offsets + 1, stream.value());
+  } else {
+    // offsets need to be normalized for segmented-reduce to work efficiently
+    auto offsets_itr = thrust::transform_iterator(
+      offsets,
+      cuda::proclaim_return_type<int64_t>([offset] __device__(auto o) { return o - offset; }));
+    cub::DeviceSegmentedReduce::Sum(
+      nullptr, temp, d_in, d_out, size, offsets_itr, offsets_itr + 1, stream.value());
+    auto d_temp = rmm::device_buffer{temp, stream};
+    cub::DeviceSegmentedReduce::Sum(
+      d_temp.data(), temp, d_in, d_out, size, offsets_itr, offsets_itr + 1, stream.value());
+  }
+
+  return output_sizes;
+}
+
+// handles ranges above int32 max
+template <typename InputIterator, typename OutputIterator, typename T>
+OutputIterator remove_copy_safe(InputIterator first,
+                                InputIterator last,
+                                OutputIterator result,
+                                T const& value,
+                                rmm::cuda_stream_view stream)
+{
+  auto const copy_size = std::min(static_cast<int64_t>(std::distance(first, last)),
+                                  static_cast<int64_t>(std::numeric_limits<int32_t>::max()));
+
+  auto itr = first;
+  while (itr != last) {
+    auto const copy_end =
+      static_cast<int64_t>(std::distance(itr, last)) <= copy_size ? last : itr + copy_size;
+    result = thrust::remove_copy(rmm::exec_policy(stream), itr, copy_end, result, value);
+    itr    = copy_end;
+  }
+  return result;
+}
+
+// handles ranges above int32 max
+template <typename Iterator, typename T>
+Iterator remove_safe(Iterator first, Iterator last, T const& value, rmm::cuda_stream_view stream)
+{
+  auto const size = std::min(static_cast<int64_t>(std::distance(first, last)),
+                             static_cast<int64_t>(std::numeric_limits<int32_t>::max()));
+
+  auto result = first;
+  auto itr    = first;
+  while (itr != last) {
+    auto end = static_cast<int64_t>(std::distance(itr, last)) <= size ?
last : itr + size; + result = thrust::remove(rmm::exec_policy(stream), itr, end, value); + itr = end; + } + return result; +} +} // namespace + +std::unique_ptr normalize_characters(cudf::strings_column_view const& input, + character_normalizer const& normalizer, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + if (input.is_empty()) { return cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING}); } + + auto [first_offset, last_offset] = + cudf::strings::detail::get_first_and_last_offset(input, stream); + auto const chars_size = last_offset - first_offset; + auto const d_input_chars = input.chars_begin(stream) + first_offset; + + if (chars_size == 0) { return std::make_unique(input.parent(), stream, mr); } + + constexpr int64_t block_size = 256; + cudf::detail::grid_1d grid{chars_size, block_size}; + auto const max_new_char_total = cudf::util::round_up_safe(chars_size, block_size) * MAX_NEW_CHARS; + + auto const& parameters = normalizer._impl; + + auto d_normalized = rmm::device_uvector(max_new_char_total, stream); + data_normalizer_kernel<<>>( + d_input_chars, + chars_size, + parameters->cp_metadata.data(), + parameters->aux_table.data(), + parameters->do_lower_case, + d_normalized.data()); + + // This removes space added around any special tokens in the form of [ttt]. + // An alternate approach is to do a multi-replace of '[ ttt ]' with '[ttt]' right + // before returning the output strings column. + auto const special_tokens = parameters->get_special_tokens(); + if (!special_tokens.empty()) { + special_tokens_kernel<<>>( + d_normalized.data(), chars_size, special_tokens); + } + + // Use segmented-reduce over the non-zero codepoints to get the size of the output rows + auto const input_offsets = + cudf::detail::offsetalator_factory::make_input_iterator(input.offsets(), input.offset()); + auto output_sizes = + compute_sizes(d_normalized, input_offsets, first_offset, input.size(), stream); + + // convert the sizes to offsets + auto [offsets, total_size] = cudf::strings::detail::make_offsets_child_column( + output_sizes.begin(), output_sizes.end(), stream, mr); + + // create output chars by calling remove_copy(0) on the bytes in d_normalized + auto chars = rmm::device_uvector(total_size, stream, mr); + auto const begin = reinterpret_cast(d_normalized.begin()); + // the remove() above speeds up the remove_copy() by roughly 10% + auto const end = + reinterpret_cast(remove_safe(d_normalized.begin(), d_normalized.end(), 0, stream)); + remove_copy_safe(begin, end, chars.data(), 0, stream); + + return cudf::make_strings_column(input.size(), + std::move(offsets), + chars.release(), + input.null_count(), + cudf::detail::copy_bitmask(input.parent(), stream, mr)); +} + +} // namespace detail + +std::unique_ptr normalize_characters(cudf::strings_column_view const& input, + character_normalizer const& normalizer, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + CUDF_FUNC_RANGE(); + return detail::normalize_characters(input, normalizer, stream, mr); +} + } // namespace nvtext diff --git a/cpp/src/text/normalize.cuh b/cpp/src/text/normalize.cuh new file mode 100644 index 00000000000..3972726d536 --- /dev/null +++ b/cpp/src/text/normalize.cuh @@ -0,0 +1,100 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "text/subword/detail/cp_data.h" + +namespace nvtext { +namespace detail { + +/** + * @brief Bit used to filter out invalid code points. + * + * When normalizing characters to code point values, if this bit is set, + * the code point should be filtered out before returning from the normalizer. + */ +constexpr uint32_t FILTER_BIT = 22; + +/** + * @brief Retrieve new code point from metadata value. + * + * @param metadata Value from the codepoint_metadata table. + * @return The replacement character if appropriate. + */ +__device__ constexpr uint32_t get_first_cp(uint32_t metadata) { return metadata & NEW_CP_MASK; } + +/** + * @brief Retrieve token category from the metadata value. + * + * Category values are 0-5: + * 0 - character should be padded + * 1 - pad character if lower-case + * 2 - character should be removed + * 3 - remove character if lower-case + * 4 - whitespace character -- always replace + * 5 - uncategorized + * + * @param metadata Value from the codepoint_metadata table. + * @return Category value. + */ +__device__ constexpr uint32_t extract_token_cat(uint32_t metadata) +{ + return (metadata >> TOKEN_CAT_SHIFT) & TOKEN_CAT_MASK; +} + +/** + * @brief Return true if category of metadata value specifies the character should be replaced. + */ +__device__ constexpr bool should_remove_cp(uint32_t metadata, bool lower_case) +{ + auto const cat = extract_token_cat(metadata); + return (cat == TOKEN_CAT_REMOVE_CHAR) || (lower_case && (cat == TOKEN_CAT_REMOVE_CHAR_IF_LOWER)); +} + +/** + * @brief Return true if category of metadata value specifies the character should be padded. + */ +__device__ constexpr bool should_add_spaces(uint32_t metadata, bool lower_case) +{ + auto const cat = extract_token_cat(metadata); + return (cat == TOKEN_CAT_ADD_SPACE) || (lower_case && (cat == TOKEN_CAT_ADD_SPACE_IF_LOWER)); +} + +/** + * @brief Return true if category of metadata value specifies the character should be replaced. + */ +__device__ constexpr bool always_replace(uint32_t metadata) +{ + return extract_token_cat(metadata) == TOKEN_CAT_ALWAYS_REPLACE; +} + +/** + * @brief Returns true if metadata value includes a multi-character transform bit equal to 1. + */ +__device__ constexpr bool is_multi_char_transform(uint32_t metadata) +{ + return (metadata >> MULTICHAR_SHIFT) & MULTICHAR_MASK; +} + +/** + * @brief Returns true if the byte passed in could be a valid head byte for + * a utf8 character. That is, not binary `10xxxxxx` + */ +__device__ constexpr bool is_head_byte(unsigned char utf8_byte) { return (utf8_byte >> 6) != 2; } + +} // namespace detail +} // namespace nvtext diff --git a/cpp/src/text/subword/data_normalizer.cu b/cpp/src/text/subword/data_normalizer.cu index 7a39199011e..4c54409c41a 100644 --- a/cpp/src/text/subword/data_normalizer.cu +++ b/cpp/src/text/subword/data_normalizer.cu @@ -14,6 +14,7 @@ * limitations under the License. 
*/ +#include "text/normalize.cuh" #include "text/subword/detail/data_normalizer.hpp" #include "text/subword/detail/tokenizer_utils.cuh" @@ -38,81 +39,6 @@ namespace nvtext { namespace detail { namespace { -/** - * @brief Bit used to filter out invalid code points. - * - * When normalizing characters to code point values, if this bit is set, - * the code point should be filtered out before returning from the normalizer. - */ -constexpr uint32_t FILTER_BIT = 22; - -/** - * @brief Retrieve new code point from metadata value. - * - * @param metadata Value from the codepoint_metadata table. - * @return The replacement character if appropriate. - */ -__device__ uint32_t get_first_cp(uint32_t metadata) { return metadata & NEW_CP_MASK; } - -/** - * @brief Retrieve token category from the metadata value. - * - * Category values are 0-5: - * 0 - character should be padded - * 1 - pad character if lower-case - * 2 - character should be removed - * 3 - remove character if lower-case - * 4 - whitespace character -- always replace - * 5 - uncategorized - * - * @param metadata Value from the codepoint_metadata table. - * @return Category value. - */ -__device__ uint32_t extract_token_cat(uint32_t metadata) -{ - return (metadata >> TOKEN_CAT_SHIFT) & TOKEN_CAT_MASK; -} - -/** - * @brief Return true if category of metadata value specifies the character should be replaced. - */ -__device__ bool should_remove_cp(uint32_t metadata, bool lower_case) -{ - auto const cat = extract_token_cat(metadata); - return (cat == TOKEN_CAT_REMOVE_CHAR) || (lower_case && (cat == TOKEN_CAT_REMOVE_CHAR_IF_LOWER)); -} - -/** - * @brief Return true if category of metadata value specifies the character should be padded. - */ -__device__ bool should_add_spaces(uint32_t metadata, bool lower_case) -{ - auto const cat = extract_token_cat(metadata); - return (cat == TOKEN_CAT_ADD_SPACE) || (lower_case && (cat == TOKEN_CAT_ADD_SPACE_IF_LOWER)); -} - -/** - * @brief Return true if category of metadata value specifies the character should be replaced. - */ -__device__ bool always_replace(uint32_t metadata) -{ - return extract_token_cat(metadata) == TOKEN_CAT_ALWAYS_REPLACE; -} - -/** - * @brief Returns true if metadata value includes a multi-character transform bit equal to 1. - */ -__device__ bool is_multi_char_transform(uint32_t metadata) -{ - return (metadata >> MULTICHAR_SHIFT) & MULTICHAR_MASK; -} - -/** - * @brief Returns true if the byte passed in could be a valid head byte for - * a utf8 character. That is, not binary `10xxxxxx` - */ -__device__ bool is_head_byte(unsigned char utf8_byte) { return (utf8_byte >> 6) != 2; } - /** * @brief Converts a UTF-8 character into a unicode code point value. * diff --git a/cpp/tests/text/normalize_tests.cpp b/cpp/tests/text/normalize_tests.cpp index 2515cc917fa..530148eb654 100644 --- a/cpp/tests/text/normalize_tests.cpp +++ b/cpp/tests/text/normalize_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * Copyright (c) 2020-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -74,6 +74,10 @@ TEST_F(TextNormalizeTest, NormalizeEmptyTest) EXPECT_EQ(results->size(), 0); results = nvtext::normalize_characters(strings_view, false); EXPECT_EQ(results->size(), 0); + + auto normalizer = nvtext::create_character_normalizer(true); + results = nvtext::normalize_characters(strings_view, *normalizer); + EXPECT_EQ(results->size(), 0); } TEST_F(TextNormalizeTest, AllNullStrings) @@ -84,6 +88,10 @@ TEST_F(TextNormalizeTest, AllNullStrings) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, strings); results = nvtext::normalize_characters(strings_view, false); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, strings); + + auto normalizer = nvtext::create_character_normalizer(true); + results = nvtext::normalize_characters(strings_view, *normalizer); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, strings); } TEST_F(TextNormalizeTest, SomeNullStrings) @@ -93,27 +101,21 @@ TEST_F(TextNormalizeTest, SomeNullStrings) auto results = nvtext::normalize_characters(strings_view, false); cudf::test::strings_column_wrapper expected({"", " . ", "a"}, {false, true, true}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + + auto normalizer = nvtext::create_character_normalizer(true); + results = nvtext::normalize_characters(strings_view, *normalizer); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } TEST_F(TextNormalizeTest, NormalizeCharacters) { // These include punctuation, accents, whitespace, and CJK characters - std::vector h_strings{"abc£def", - nullptr, - "éè â îô\taeio", - "\tĂĆĖÑ Ü", - "ACEN U", - "P^NP", - "$41.07", - "[a,b]", - "丏丟", - ""}; - auto validity = - thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; }); - cudf::test::strings_column_wrapper strings(h_strings.begin(), h_strings.end(), validity); - cudf::strings_column_view strings_view(strings); + auto input = cudf::test::strings_column_wrapper( + {"abc£def", "", "éè â îô\taeio", "\tĂĆĖÑ Ü", "ACEN U", "P^NP", "$41.07", "[a,b]", "丏丟", ""}, + {1, 0, 1, 1, 1, 1, 1, 1, 1, 1}); + auto sv = cudf::strings_column_view(input); { - auto results = nvtext::normalize_characters(strings_view, true); + auto results = nvtext::normalize_characters(sv, true); cudf::test::strings_column_wrapper expected({"abc£def", "", "ee a io aeio", @@ -124,11 +126,11 @@ TEST_F(TextNormalizeTest, NormalizeCharacters) " [ a , b ] ", " 丏 丟 ", ""}, - validity); + {1, 0, 1, 1, 1, 1, 1, 1, 1, 1}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } { - auto results = nvtext::normalize_characters(strings_view, false); + auto results = nvtext::normalize_characters(sv, false); cudf::test::strings_column_wrapper expected({"abc£def", "", "éè â îô aeio", @@ -139,11 +141,117 @@ TEST_F(TextNormalizeTest, NormalizeCharacters) " [ a , b ] ", " 丏 丟 ", ""}, - validity); + {1, 0, 1, 1, 1, 1, 1, 1, 1, 1}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } } +TEST_F(TextNormalizeTest, WithNormalizer) +{ + auto long_row = + "this entry is intended to pad out past 256 bytes which is currently the block size"; + // the following include punctuation, accents, whitespace, and CJK characters + auto input = cudf::test::strings_column_wrapper({"abc£def", + "", + "éè â îô\taeio", + "\tĂĆĖÑ Ü", + "ACEN U", + "P^NP", + "$41.07", + "[a,b]", + "丏丟", + "", + long_row, + long_row, + long_row}, + {1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}); + + auto const sv = cudf::strings_column_view(input); + + auto normalizer = nvtext::create_character_normalizer(true); + auto results = nvtext::normalize_characters(sv, *normalizer); + auto expected = 
cudf::test::strings_column_wrapper({"abc£def", + "", + "ee a io aeio", + " acen u", + "acen u", + "p ^ np", + " $ 41 . 07", + " [ a , b ] ", + " 丏 丟 ", + "", + long_row, + long_row, + long_row}, + {1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + results = nvtext::normalize_characters(sv, *normalizer); // test normalizer re-use + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + + normalizer = nvtext::create_character_normalizer(false); + results = nvtext::normalize_characters(sv, *normalizer); + expected = cudf::test::strings_column_wrapper({"abc£def", + "", + "éè â îô aeio", + " ĂĆĖÑ Ü", + "ACEN U", + "P ^ NP", + " $ 41 . 07", + " [ a , b ] ", + " 丏 丟 ", + "", + long_row, + long_row, + long_row}, + {1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + results = nvtext::normalize_characters(sv, *normalizer); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); +} + +TEST_F(TextNormalizeTest, SpecialTokens) +{ + auto long_row = + "this entry is intended to pad out past 256 bytes which is currently the block size"; + auto input = + cudf::test::strings_column_wrapper({"[BOS]Some strings with [PAD] special[SEP]tokens[EOS]", + "[bos]these should[sep]work too[eos]", + "some[non]tokens[eol]too", + long_row, + long_row, + long_row}); + + auto sv = cudf::strings_column_view(input); + auto special_tokens = cudf::test::strings_column_wrapper({"[BOS]", "[EOS]", "[SEP]", "[PAD]"}); + auto stv = cudf::strings_column_view(special_tokens); + + auto normalizer = nvtext::create_character_normalizer(true, stv); + auto results = nvtext::normalize_characters(sv, *normalizer); + auto expected = cudf::test::strings_column_wrapper( + {" [bos] some strings with [pad] special [sep] tokens [eos] ", + " [bos] these should [sep] work too [eos] ", + "some [ non ] tokens [ eol ] too", + long_row, + long_row, + long_row}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + results = nvtext::normalize_characters(sv, *normalizer); // and again + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + + normalizer = nvtext::create_character_normalizer(false, stv); + results = nvtext::normalize_characters(sv, *normalizer); + expected = cudf::test::strings_column_wrapper( + {" [BOS] Some strings with [PAD] special [SEP] tokens [EOS] ", + " [ bos ] these should [ sep ] work too [ eos ] ", + "some [ non ] tokens [ eol ] too", + long_row, + long_row, + long_row}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + results = nvtext::normalize_characters(sv, *normalizer); // and again + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); +} + TEST_F(TextNormalizeTest, NormalizeSlicedColumn) { cudf::test::strings_column_wrapper strings( @@ -151,10 +259,21 @@ TEST_F(TextNormalizeTest, NormalizeSlicedColumn) std::vector sliced = cudf::split(strings, {4}); auto results = nvtext::normalize_characters(cudf::strings_column_view(sliced.front()), true); - cudf::test::strings_column_wrapper expected({"abc£def", "ee a io aeio", "acen u", "p ^ np"}); + auto expected = + cudf::test::strings_column_wrapper({"abc£def", "ee a io aeio", "acen u", "p ^ np"}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + + results = nvtext::normalize_characters(cudf::strings_column_view(sliced[1]), false); + expected = cudf::test::strings_column_wrapper({" $ 41 . 
07", " [ a , b ] ", " 丏 丟 "}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + + auto normalizer = nvtext::create_character_normalizer(true); + results = nvtext::normalize_characters(cudf::strings_column_view(sliced.front()), *normalizer); + expected = cudf::test::strings_column_wrapper({"abc£def", "ee a io aeio", "acen u", "p ^ np"}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); - results = nvtext::normalize_characters(cudf::strings_column_view(sliced[1]), false); - cudf::test::strings_column_wrapper expected2({" $ 41 . 07", " [ a , b ] ", " 丏 丟 "}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected2); + normalizer = nvtext::create_character_normalizer(false); + results = nvtext::normalize_characters(cudf::strings_column_view(sliced[1]), *normalizer); + expected = cudf::test::strings_column_wrapper({" $ 41 . 07", " [ a , b ] ", " 丏 丟 "}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } diff --git a/dependencies.yaml b/dependencies.yaml index c5216a9cb1f..1578dadc793 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -723,7 +723,7 @@ dependencies: - output_types: [conda, requirements, pyproject] packages: - fsspec>=0.6.0 - - &numpy numpy>=1.23,<3.0a0 + - &numpy numpy>=1.23,<2.1 - pandas>=2.0,<2.2.4dev0 run_pylibcudf: common: @@ -753,8 +753,8 @@ dependencies: - output_types: [conda, requirements, pyproject] packages: - cachetools - - &numba-cuda-dep numba-cuda>=0.2.0,<0.3.0a0 - - &numba-dep numba>=0.59.1,<0.61.0a0 + - &numba-cuda-dep numba-cuda>=0.4.0,<0.5.0a0 + - &numba-dep numba>=0.59.1,<0.62.0a0 - nvtx>=0.2.1 - packaging - rich @@ -885,7 +885,8 @@ dependencies: matrices: - matrix: {dependencies: "oldest"} packages: - - numba-cuda==0.2.0 + - numba-cuda==0.4.0 + - numba==0.59.1 - pandas==2.0.* - matrix: {dependencies: "latest"} packages: diff --git a/python/cudf/CMakeLists.txt b/python/cudf/CMakeLists.txt index 2a17bc5dbb7..090e475471d 100644 --- a/python/cudf/CMakeLists.txt +++ b/python/cudf/CMakeLists.txt @@ -37,7 +37,3 @@ rapids_cython_init() add_subdirectory(cudf/_lib) add_subdirectory(udf_cpp) - -if(DEFINED cython_lib_dir) - rapids_cython_add_rpath_entries(TARGET cudf PATHS "${cython_lib_dir}") -endif() diff --git a/python/cudf/cudf/core/character_normalizer.py b/python/cudf/cudf/core/character_normalizer.py new file mode 100644 index 00000000000..1240c0e1eb7 --- /dev/null +++ b/python/cudf/cudf/core/character_normalizer.py @@ -0,0 +1,46 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + +from __future__ import annotations + +import pylibcudf as plc + +import cudf + + +class CharacterNormalizer: + """ + A normalizer object used to normalize input text. + + Parameters + ---------- + do_lower : bool + If True, the normalizer should also lower-case + while normalizing. + special_tokens : cudf.Series + Series of special tokens. + """ + + def __init__( + self, + do_lower: bool, + special_tokens: cudf.Series = cudf.Series([], dtype="object"), + ) -> None: + self.normalizer = plc.nvtext.normalize.CharacterNormalizer( + do_lower, special_tokens._column.to_pylibcudf(mode="read") + ) + + def normalize(self, text: cudf.Series) -> cudf.Series: + """ + Parameters + ---------- + text : cudf.Series + The strings to be normalized. 
+ + Returns + ------- + cudf.Series + Normalized strings + """ + result = text._column.normalize_characters(self.normalizer) + + return cudf.Series._from_column(result) diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 04a72017c33..c0ad33ec7d6 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -4679,8 +4679,10 @@ def normalize_characters(self, do_lower: bool = True) -> SeriesOrIndex: r""" Normalizes strings characters for tokenizing. - This uses the normalizer that is built into the - subword_tokenize function which includes: + .. deprecated:: 25.04 + Use `CharacterNormalizer` instead. + + The normalizer function includes: - adding padding around punctuation (unicode category starts with "P") as well as certain ASCII symbols like "^" and "$" @@ -4720,8 +4722,13 @@ def normalize_characters(self, do_lower: bool = True) -> SeriesOrIndex: 2 $ 99 dtype: object """ + warnings.warn( + "normalize_characters is deprecated and will be removed in a future " + "version. Use CharacterNormalizer instead.", + FutureWarning, + ) return self._return_or_inplace( - self._column.normalize_characters(do_lower) + self._column.characters_normalize(do_lower) ) def tokenize(self, delimiter: str = " ") -> SeriesOrIndex: @@ -6256,14 +6263,25 @@ def normalize_spaces(self) -> Self: ) @acquire_spill_lock() - def normalize_characters(self, do_lower: bool = True) -> Self: + def characters_normalize(self, do_lower: bool = True) -> Self: return ColumnBase.from_pylibcudf( # type: ignore[return-value] - plc.nvtext.normalize.normalize_characters( + plc.nvtext.normalize.characters_normalize( self.to_pylibcudf(mode="read"), do_lower, ) ) + @acquire_spill_lock() + def normalize_characters( + self, normalizer: plc.nvtext.normalize.CharacterNormalizer + ) -> Self: + return ColumnBase.from_pylibcudf( # type: ignore[return-value] + plc.nvtext.normalize.normalize_characters( + self.to_pylibcudf(mode="read"), + normalizer, + ) + ) + @acquire_spill_lock() def replace_tokens( self, targets: Self, replacements: Self, delimiter: plc.Scalar diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 9c48b31a309..211d161696e 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -3308,9 +3308,13 @@ def _split(self, splits, keep_index: bool = True) -> list[Self]: splits, ) + @acquire_spill_lock() + def split_from_pylibcudf(split: list[plc.Column]) -> list[ColumnBase]: + return [ColumnBase.from_pylibcudf(col) for col in split] + return [ self._from_columns_like_self( - [ColumnBase.from_pylibcudf(col) for col in split], + split_from_pylibcudf(split), self._column_names, self.index.names if keep_index else None, ) diff --git a/python/cudf/cudf/tests/test_spilling.py b/python/cudf/cudf/tests/test_spilling.py index 13d98e43ddc..08226dd7f6d 100644 --- a/python/cudf/cudf/tests/test_spilling.py +++ b/python/cudf/cudf/tests/test_spilling.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022-2024, NVIDIA CORPORATION. +# Copyright (c) 2022-2025, NVIDIA CORPORATION. 
from __future__ import annotations import contextlib @@ -784,3 +784,12 @@ def test_spilling_and_copy_on_write(manager: SpillManager): assert not a.is_spilled assert a.owner.exposed assert not b.owner.exposed + + +def test_scatter_by_map(): + data = range(10) + with cudf.option_context("spill", True): + df = cudf.DataFrame(data) + result = df.scatter_by_map(data) + for i, res in zip(data, result): + assert_eq(res, cudf.DataFrame([i], index=[i])) diff --git a/python/cudf/cudf/tests/text/test_text_methods.py b/python/cudf/cudf/tests/text/test_text_methods.py index 86e1e46c1a2..dc45827d2e8 100644 --- a/python/cudf/cudf/tests/text/test_text_methods.py +++ b/python/cudf/cudf/tests/text/test_text_methods.py @@ -8,6 +8,7 @@ import cudf from cudf.core.byte_pair_encoding import BytePairEncoder +from cudf.core.character_normalizer import CharacterNormalizer from cudf.core.tokenize_vocabulary import TokenizeVocabulary from cudf.testing import assert_eq @@ -251,7 +252,8 @@ def test_normalize_characters(): ] ) - actual = strings.str.normalize_characters() + normalizer_lower = CharacterNormalizer(True) + actual = normalizer_lower.normalize(strings.str) assert type(expected) is type(actual) assert_eq(expected, actual) @@ -265,7 +267,9 @@ def test_normalize_characters(): "Stock ^ $ 1", ] ) - actual = strings.str.normalize_characters(do_lower=False) + + normalizer = CharacterNormalizer(False) + actual = normalizer.normalize(strings.str) assert type(expected) is type(actual) assert_eq(expected, actual) diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml index 16cd97677ef..8b8abe90ac9 100644 --- a/python/cudf/pyproject.toml +++ b/python/cudf/pyproject.toml @@ -24,9 +24,9 @@ dependencies = [ "cupy-cuda11x>=12.0.0", "fsspec>=0.6.0", "libcudf==25.4.*,>=0.0.0a0", - "numba-cuda>=0.2.0,<0.3.0a0", - "numba>=0.59.1,<0.61.0a0", - "numpy>=1.23,<3.0a0", + "numba-cuda>=0.4.0,<0.5.0a0", + "numba>=0.59.1,<0.62.0a0", + "numpy>=1.23,<2.1", "nvtx>=0.2.1", "packaging", "pandas>=2.0,<2.2.4dev0", diff --git a/python/cudf_kafka/CMakeLists.txt b/python/cudf_kafka/CMakeLists.txt index 3e12eb6aa41..13b859bc33b 100644 --- a/python/cudf_kafka/CMakeLists.txt +++ b/python/cudf_kafka/CMakeLists.txt @@ -35,7 +35,3 @@ include(rapids-cython-core) rapids_cython_init() add_subdirectory(cudf_kafka/_lib) - -if(DEFINED cython_lib_dir) - rapids_cython_add_rpath_entries(TARGET cudf_kafka PATHS "${cython_lib_dir}") -endif() diff --git a/python/cudf_polars/pyproject.toml b/python/cudf_polars/pyproject.toml index f8c0a7d9d4b..e9fc054efc2 100644 --- a/python/cudf_polars/pyproject.toml +++ b/python/cudf_polars/pyproject.toml @@ -35,7 +35,7 @@ classifiers = [ [project.optional-dependencies] test = [ - "numpy>=1.23,<3.0a0", + "numpy>=1.23,<2.1", "pytest-cov", "pytest-xdist", "pytest<8", diff --git a/python/cudf_polars/tests/test_scan.py b/python/cudf_polars/tests/test_scan.py index 9c58a24c065..8ff0db084b1 100644 --- a/python/cudf_polars/tests/test_scan.py +++ b/python/cudf_polars/tests/test_scan.py @@ -1,9 +1,7 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. 
# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations -import os - import pytest import polars as pl @@ -203,8 +201,11 @@ def test_scan_csv_multi(tmp_path, filename, glob, nrows_skiprows): f.write("""foo,bar,baz\n1,2,3\n3,4,5""") with (tmp_path / "test*.csv").open("w") as f: f.write("""foo,bar,baz\n1,2,3\n3,4,5""") - os.chdir(tmp_path) - q = pl.scan_csv(filename, glob=glob, n_rows=n_rows, skip_rows=skiprows) + if isinstance(filename, list): + source = [tmp_path / fn for fn in filename] + else: + source = tmp_path / filename + q = pl.scan_csv(source, glob=glob, n_rows=n_rows, skip_rows=skiprows) assert_gpu_result_equal(q) diff --git a/python/dask_cudf/dask_cudf/_legacy/io/parquet.py b/python/dask_cudf/dask_cudf/_legacy/io/parquet.py index c0792663c7e..c0b9d71653c 100644 --- a/python/dask_cudf/dask_cudf/_legacy/io/parquet.py +++ b/python/dask_cudf/dask_cudf/_legacy/io/parquet.py @@ -434,18 +434,12 @@ def set_object_dtypes_from_pa_schema(df, schema): # pyarrow schema. if schema: for col_name, col in df._data.items(): - if col_name is None: - # Pyarrow cannot handle `None` as a field name. - # However, this should be a simple range index that - # we can ignore anyway - continue - typ = cudf_dtype_from_pa_type(schema.field(col_name).type) - if ( - col_name in schema.names - and not isinstance(typ, (cudf.ListDtype, cudf.StructDtype)) - and isinstance(col, cudf.core.column.StringColumn) - ): - df._data[col_name] = col.astype(typ) + if col_name in schema.names: + typ = cudf_dtype_from_pa_type(schema.field(col_name).type) + if not isinstance( + typ, (cudf.ListDtype, cudf.StructDtype) + ) and isinstance(col, cudf.core.column.StringColumn): + df._data[col_name] = col.astype(typ) to_parquet = dd.to_parquet diff --git a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py index 9f7031f4d2a..3a88668e6d2 100644 --- a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py +++ b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py @@ -6,6 +6,7 @@ import numpy as np import pandas as pd +import pyarrow as pa import pytest import dask @@ -486,6 +487,52 @@ def test_create_metadata_file_inconsistent_schema(tmpdir): dd.assert_eq(ddf1.compute(), ddf2.compute()) +@pytest.mark.parametrize("specify_schema", [True, False]) +def test_read_inconsistent_schema(tmpdir, specify_schema): + if specify_schema: + # If we specify the expected schema, + # we also need to specify the partitioning. 
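+        # Setting partitioning=None below keeps pyarrow from inferring
+        # hive partitioning from the file paths.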
diff --git a/python/dask_cudf/dask_cudf/_legacy/io/parquet.py b/python/dask_cudf/dask_cudf/_legacy/io/parquet.py
index c0792663c7e..c0b9d71653c 100644
--- a/python/dask_cudf/dask_cudf/_legacy/io/parquet.py
+++ b/python/dask_cudf/dask_cudf/_legacy/io/parquet.py
@@ -434,18 +434,12 @@ def set_object_dtypes_from_pa_schema(df, schema):
     # pyarrow schema.
     if schema:
         for col_name, col in df._data.items():
-            if col_name is None:
-                # Pyarrow cannot handle `None` as a field name.
-                # However, this should be a simple range index that
-                # we can ignore anyway
-                continue
-            typ = cudf_dtype_from_pa_type(schema.field(col_name).type)
-            if (
-                col_name in schema.names
-                and not isinstance(typ, (cudf.ListDtype, cudf.StructDtype))
-                and isinstance(col, cudf.core.column.StringColumn)
-            ):
-                df._data[col_name] = col.astype(typ)
+            if col_name in schema.names:
+                typ = cudf_dtype_from_pa_type(schema.field(col_name).type)
+                if not isinstance(
+                    typ, (cudf.ListDtype, cudf.StructDtype)
+                ) and isinstance(col, cudf.core.column.StringColumn):
+                    df._data[col_name] = col.astype(typ)
 
 
 to_parquet = dd.to_parquet
diff --git a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py
index 9f7031f4d2a..3a88668e6d2 100644
--- a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py
+++ b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py
@@ -6,6 +6,7 @@
 
 import numpy as np
 import pandas as pd
+import pyarrow as pa
 import pytest
 
 import dask
@@ -486,6 +487,52 @@ def test_create_metadata_file_inconsistent_schema(tmpdir):
     dd.assert_eq(ddf1.compute(), ddf2.compute())
 
 
+@pytest.mark.parametrize("specify_schema", [True, False])
+def test_read_inconsistent_schema(tmpdir, specify_schema):
+    if specify_schema:
+        # If we specify the expected schema,
+        # we also need to specify the partitioning.
+        kwargs = {
+            "dataset": {
+                "schema": pa.schema(
+                    [
+                        ("id", pa.int64()),
+                        ("text", pa.string()),
+                        ("meta1", pa.struct([("field1", pa.string())])),
+                    ]
+                ),
+                "partitioning": None,
+            },
+        }
+    else:
+        kwargs = {}
+
+    records = [
+        {"id": 123, "text": "foo"},
+        {
+            "text": "bar",
+            "meta1": [{"field1": "cat"}],
+            "id": 456,
+        },
+    ]
+    columns = ["text", "id"]
+    pd.DataFrame(records[:1]).to_parquet(tmpdir / "part.0.parquet")
+    pd.DataFrame(records[1:]).to_parquet(tmpdir / "part.1.parquet")
+    # Check that cuDF and Dask cuDF match
+    dd.assert_eq(
+        cudf.read_parquet(
+            tmpdir, columns=columns, allow_mismatched_pq_schemas=True
+        ),
+        dask_cudf.read_parquet(tmpdir, columns=columns, **kwargs),
+        check_index=False,
+    )
+    # Check that "pandas" and "cudf" backends match
+    dd.assert_eq(
+        dd.read_parquet(tmpdir, columns=columns),
+        dask_cudf.read_parquet(tmpdir, columns=columns, **kwargs),
+    )
+
+
 @pytest.mark.parametrize(
     "data",
     [
@@ -526,7 +573,6 @@ def test_cudf_list_struct_write(tmpdir):
 
 
 def test_null_partition(tmpdir):
-    import pyarrow as pa
     from pyarrow.dataset import HivePartitioning
 
     ids = pd.Series([0, 1, None], dtype="Int64")
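The new test exercises two ways of coping with parquet files whose schemas differ from file to file: cuDF's reader takes `allow_mismatched_pq_schemas=True`, while Dask cuDF can be handed an explicit pyarrow schema through the `dataset` kwargs, in which case `partitioning` must be pinned to `None` so pyarrow does not also try to infer one. A hedged sketch of the Dask cuDF side, with illustrative file and column names:

```python
import pandas as pd
import pyarrow as pa

import dask_cudf

# Two files whose schemas disagree: the second has an extra column.
pd.DataFrame([{"id": 1, "text": "foo"}]).to_parquet("part.0.parquet")
pd.DataFrame([{"id": 2, "text": "bar", "extra": 3.0}]).to_parquet(
    "part.1.parquet"
)

ddf = dask_cudf.read_parquet(
    ".",
    columns=["id", "text"],
    # Forwarded to pyarrow: a unified schema plus partitioning=None,
    # since an explicit schema disables partitioning inference.
    dataset={
        "schema": pa.schema(
            [
                ("id", pa.int64()),
                ("text", pa.string()),
                ("extra", pa.float64()),
            ]
        ),
        "partitioning": None,
    },
)
print(ddf.compute())
```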
diff --git a/python/dask_cudf/pyproject.toml b/python/dask_cudf/pyproject.toml
index 87bf282f376..83493d7f2a4 100644
--- a/python/dask_cudf/pyproject.toml
+++ b/python/dask_cudf/pyproject.toml
@@ -22,7 +22,7 @@ dependencies = [
     "cudf==25.4.*,>=0.0.0a0",
     "cupy-cuda11x>=12.0.0",
     "fsspec>=0.6.0",
-    "numpy>=1.23,<3.0a0",
+    "numpy>=1.23,<2.1",
     "pandas>=2.0,<2.2.4dev0",
     "pynvml>=12.0.0,<13.0.0a0",
     "rapids-dask-dependency==25.4.*,>=0.0.0a0",
@@ -47,8 +47,8 @@ cudf = "dask_cudf.backends:CudfBackendEntrypoint"
 [project.optional-dependencies]
 test = [
     "dask-cuda==25.4.*,>=0.0.0a0",
-    "numba-cuda>=0.2.0,<0.3.0a0",
-    "numba>=0.59.1,<0.61.0a0",
+    "numba-cuda>=0.4.0,<0.5.0a0",
+    "numba>=0.59.1,<0.62.0a0",
     "pytest-cov",
     "pytest-xdist",
     "pytest<8",
diff --git a/python/pylibcudf/CMakeLists.txt b/python/pylibcudf/CMakeLists.txt
index fe6e73a3f14..153570a4a7e 100644
--- a/python/pylibcudf/CMakeLists.txt
+++ b/python/pylibcudf/CMakeLists.txt
@@ -37,7 +37,3 @@ include(rapids-cython-core)
 rapids_cython_init()
 
 add_subdirectory(pylibcudf)
-
-if(DEFINED cython_lib_dir)
-  rapids_cython_add_rpath_entries(TARGET cudf PATHS "${cython_lib_dir}")
-endif()
diff --git a/python/pylibcudf/pylibcudf/libcudf/nvtext/normalize.pxd b/python/pylibcudf/pylibcudf/libcudf/nvtext/normalize.pxd
index f8b082c8429..2cf2bfb8ac9 100644
--- a/python/pylibcudf/pylibcudf/libcudf/nvtext/normalize.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/nvtext/normalize.pxd
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2024, NVIDIA CORPORATION.
+# Copyright (c) 2020-2025, NVIDIA CORPORATION.
 from libcpp cimport bool
 from libcpp.memory cimport unique_ptr
 from pylibcudf.exception_handler cimport libcudf_exception_handler
@@ -16,3 +16,16 @@ cdef extern from "nvtext/normalize.hpp" namespace "nvtext" nogil:
         const column_view & strings,
         bool do_lower_case
     ) except +libcudf_exception_handler
+
+    cdef struct character_normalizer "nvtext::character_normalizer":
+        pass
+
+    cdef unique_ptr[character_normalizer] create_character_normalizer(
+        bool do_lower_case,
+        const column_view & strings
+    ) except +libcudf_exception_handler
+
+    cdef unique_ptr[column] normalize_characters(
+        const column_view & strings,
+        const character_normalizer & normalizer
+    ) except +libcudf_exception_handler
diff --git a/python/pylibcudf/pylibcudf/nvtext/normalize.pxd b/python/pylibcudf/pylibcudf/nvtext/normalize.pxd
index 90676145afa..e6688e19762 100644
--- a/python/pylibcudf/pylibcudf/nvtext/normalize.pxd
+++ b/python/pylibcudf/pylibcudf/nvtext/normalize.pxd
@@ -1,9 +1,18 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.
+# Copyright (c) 2024-2025, NVIDIA CORPORATION.
 
 from libcpp cimport bool
+from libcpp.memory cimport unique_ptr
 from pylibcudf.column cimport Column
+from pylibcudf.libcudf.nvtext.normalize cimport character_normalizer
+
+
+cdef class CharacterNormalizer:
+    cdef unique_ptr[character_normalizer] c_obj
 
 cpdef Column normalize_spaces(Column input)
 
-cpdef Column normalize_characters(Column input, bool do_lower_case)
+cpdef Column characters_normalize(Column input, bool do_lower_case)
+
+cpdef Column normalize_characters(
+    Column input,
+    CharacterNormalizer normalizer
+)
diff --git a/python/pylibcudf/pylibcudf/nvtext/normalize.pyi b/python/pylibcudf/pylibcudf/nvtext/normalize.pyi
index 1d90a5a8960..d722ef6c79e 100644
--- a/python/pylibcudf/pylibcudf/nvtext/normalize.pyi
+++ b/python/pylibcudf/pylibcudf/nvtext/normalize.pyi
@@ -1,6 +1,12 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.
+# Copyright (c) 2024-2025, NVIDIA CORPORATION.
 
 from pylibcudf.column import Column
 
+class CharacterNormalizer:
+    def __init__(self, do_lower_case: bool, special_tokens: Column): ...
+
 def normalize_spaces(input: Column) -> Column: ...
-def normalize_characters(input: Column, do_lower_case: bool) -> Column: ...
+def characters_normalize(input: Column, do_lower_case: bool) -> Column: ...
+def normalize_characters(
+    input: Column, normalizer: CharacterNormalizer
+) -> Column: ...
diff --git a/python/pylibcudf/pylibcudf/nvtext/normalize.pyx b/python/pylibcudf/pylibcudf/nvtext/normalize.pyx
index b259ccaefa6..6a18c205841 100644
--- a/python/pylibcudf/pylibcudf/nvtext/normalize.pyx
+++ b/python/pylibcudf/pylibcudf/nvtext/normalize.pyx
@@ -1,16 +1,37 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.
+# Copyright (c) 2024-2025, NVIDIA CORPORATION.
 
+from cython.operator cimport dereference
 from libcpp cimport bool
 from libcpp.memory cimport unique_ptr
 from libcpp.utility cimport move
 from pylibcudf.column cimport Column
 from pylibcudf.libcudf.column.column cimport column
-from pylibcudf.libcudf.nvtext.normalize cimport (
-    normalize_characters as cpp_normalize_characters,
-    normalize_spaces as cpp_normalize_spaces,
-)
+from pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.nvtext cimport normalize as cpp_normalize
 
-__all__ = ["normalize_characters", "normalize_spaces"]
+__all__ = [
+    "CharacterNormalizer",
+    "normalize_characters",
+    "normalize_spaces",
+    "characters_normalize"
+]
+
+cdef class CharacterNormalizer:
+    """The normalizer object to be used with ``normalize_characters``.
+
+    For details, see :cpp:class:`nvtext::character_normalizer`.
+    """
+    def __cinit__(self, bool do_lower_case, Column tokens):
+        cdef column_view c_tokens = tokens.view()
+        with nogil:
+            self.c_obj = move(
+                cpp_normalize.create_character_normalizer(
+                    do_lower_case,
+                    c_tokens
+                )
+            )
+
+    __hash__ = None
 
 cpdef Column normalize_spaces(Column input):
     """
@@ -32,12 +53,12 @@ cpdef Column normalize_spaces(Column input):
     cdef unique_ptr[column] c_result
 
     with nogil:
-        c_result = cpp_normalize_spaces(input.view())
+        c_result = cpp_normalize.normalize_spaces(input.view())
 
     return Column.from_libcudf(move(c_result))
 
 
-cpdef Column normalize_characters(Column input, bool do_lower_case):
+cpdef Column characters_normalize(Column input, bool do_lower_case):
     """
     Normalizes strings characters for tokenizing.
 
@@ -60,6 +81,38 @@ cpdef Column normalize_characters(Column input, bool do_lower_case):
     cdef unique_ptr[column] c_result
 
     with nogil:
-        c_result = cpp_normalize_characters(input.view(), do_lower_case)
+        c_result = cpp_normalize.normalize_characters(
+            input.view(),
+            do_lower_case
+        )
+
+    return Column.from_libcudf(move(c_result))
+
+
+cpdef Column normalize_characters(Column input, CharacterNormalizer normalizer):
+    """
+    Normalizes string characters for tokenizing.
+
+    For details, see :cpp:func:`normalize_characters`
+
+    Parameters
+    ----------
+    input : Column
+        Input strings
+    normalizer : CharacterNormalizer
+        Normalizer object used for modifying the input column text
+
+    Returns
+    -------
+    Column
+        Normalized strings column
+    """
+    cdef unique_ptr[column] c_result
+
+    with nogil:
+        c_result = cpp_normalize.normalize_characters(
+            input.view(),
+            dereference(normalizer.c_obj.get())
+        )
 
     return Column.from_libcudf(move(c_result))
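With the Cython pieces above in place, the pylibcudf call sequence is: build a `CharacterNormalizer` once (an empty strings column means no special tokens), then pass it to `normalize_characters` for each column. A short usage sketch, mirroring the tests that follow:

```python
import pyarrow as pa

import pylibcudf as plc

strings = plc.interop.from_arrow(pa.array(["$24.08", "[pad]"]))

# do_lower_case=True; the empty strings column opts out of
# special-token handling.
normalizer = plc.nvtext.normalize.CharacterNormalizer(
    True,
    plc.column_factories.make_empty_column(plc.types.TypeId.STRING),
)
result = plc.nvtext.normalize.normalize_characters(strings, normalizer)
print(plc.interop.to_arrow(result))
```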
diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_normalize.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_normalize.py
index 25b6d1389ec..47bbb191be6 100644
--- a/python/pylibcudf/pylibcudf/tests/test_nvtext_normalize.py
+++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_normalize.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.
+# Copyright (c) 2024-2025, NVIDIA CORPORATION.
 
 import pyarrow as pa
 import pytest
@@ -15,7 +15,7 @@ def norm_spaces_input_data():
 
 @pytest.fixture(scope="module")
 def norm_chars_input_data():
-    arr = ["éâîô\teaio", "ĂĆĖÑÜ", "ACENU", "$24.08", "[a,bb]"]
+    arr = ["éâîô\teaio", "ĂĆĖÑÜ", "ACENU", "$24.08", "[a,bb]", "[pad]"]
     return pa.array(arr)
 
 
@@ -29,15 +29,98 @@ def test_normalize_spaces(norm_spaces_input_data):
 
 @pytest.mark.parametrize("do_lower", [True, False])
 def test_normalize_characters(norm_chars_input_data, do_lower):
-    result = plc.nvtext.normalize.normalize_characters(
+    result = plc.nvtext.normalize.characters_normalize(
         plc.interop.from_arrow(norm_chars_input_data),
         do_lower,
     )
-    expected = pa.array(
-        ["eaio eaio", "acenu", "acenu", " $ 24 . 08", " [ a , bb ] "]
+    if do_lower:
+        expected = pa.array(
+            [
+                "eaio eaio",
+                "acenu",
+                "acenu",
+                " $ 24 . 08",
+                " [ a , bb ] ",
+                " [ pad ] ",
+            ]
+        )
+    else:
+        expected = pa.array(
+            [
+                "éâîô eaio",
+                "ĂĆĖÑÜ",
+                "ACENU",
+                " $ 24 . 08",
+                " [ a , bb ] ",
+                " [ pad ] ",
+            ]
+        )
+    assert_column_eq(result, expected)
+
+
+@pytest.mark.parametrize("do_lower", [True, False])
+def test_normalizer(norm_chars_input_data, do_lower):
+    result = plc.nvtext.normalize.normalize_characters(
+        plc.interop.from_arrow(norm_chars_input_data),
+        plc.nvtext.normalize.CharacterNormalizer(
+            do_lower,
+            plc.column_factories.make_empty_column(plc.types.TypeId.STRING),
+        ),
+    )
+    if do_lower:
+        expected = pa.array(
+            [
+                "eaio eaio",
+                "acenu",
+                "acenu",
+                " $ 24 . 08",
+                " [ a , bb ] ",
+                " [ pad ] ",
+            ]
+        )
+    else:
+        expected = pa.array(
+            [
+                "éâîô eaio",
+                "ĂĆĖÑÜ",
+                "ACENU",
+                " $ 24 . 08",
+                " [ a , bb ] ",
+                " [ pad ] ",
+            ]
+        )
+    assert_column_eq(result, expected)
+
+
+@pytest.mark.parametrize("do_lower", [True, False])
+def test_normalizer_with_special_tokens(norm_chars_input_data, do_lower):
+    special_tokens = pa.array(["[pad]"])
+    result = plc.nvtext.normalize.normalize_characters(
+        plc.interop.from_arrow(norm_chars_input_data),
+        plc.nvtext.normalize.CharacterNormalizer(
+            do_lower, plc.interop.from_arrow(special_tokens)
+        ),
     )
-    if not do_lower:
+    if do_lower:
+        expected = pa.array(
+            [
+                "eaio eaio",
+                "acenu",
+                "acenu",
+                " $ 24 . 08",
+                " [ a , bb ] ",
+                " [pad] ",
+            ]
+        )
+    else:
         expected = pa.array(
-            ["éâîô eaio", "ĂĆĖÑÜ", "ACENU", " $ 24 . 08", " [ a , bb ] "]
+            [
+                "éâîô eaio",
+                "ĂĆĖÑÜ",
+                "ACENU",
+                " $ 24 . 08",
+                " [ a , bb ] ",
+                " [pad] ",
+            ]
         )
     assert_column_eq(result, expected)
diff --git a/python/pylibcudf/pyproject.toml b/python/pylibcudf/pyproject.toml
index 939da65c1ec..e12d1ffdb39 100644
--- a/python/pylibcudf/pyproject.toml
+++ b/python/pylibcudf/pyproject.toml
@@ -42,7 +42,7 @@ classifiers = [
 test = [
     "fastavro>=0.22.9",
     "hypothesis",
-    "numpy>=1.23,<3.0a0",
+    "numpy>=1.23,<2.1",
     "pandas",
     "pytest-cov",
     "pytest-xdist",
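As the expected values in these tests show, the practical difference special tokens make is that a registered token such as `"[pad]"` survives normalization intact (`" [pad] "`) rather than being split on its punctuation (`" [ pad ] "`). A closing sketch of the contrast, reusing only API calls that appear in the tests above:

```python
import pyarrow as pa

import pylibcudf as plc

data = plc.interop.from_arrow(pa.array(["[pad] $24.08"]))
empty = plc.column_factories.make_empty_column(plc.types.TypeId.STRING)
special = plc.interop.from_arrow(pa.array(["[pad]"]))

plain = plc.nvtext.normalize.CharacterNormalizer(True, empty)
aware = plc.nvtext.normalize.CharacterNormalizer(True, special)

# Without special tokens, "[pad]" is split like any other punctuation;
# with them it is kept whole (per the test expectations above).
for normalizer in (plain, aware):
    out = plc.nvtext.normalize.normalize_characters(data, normalizer)
    print(plc.interop.to_arrow(out))
```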