From 29b81eb42aa60c5e41785b3b2eb9e80b964c90ad Mon Sep 17 00:00:00 2001 From: Gil Forsyth Date: Tue, 4 Mar 2025 12:13:36 -0500 Subject: [PATCH 1/4] Combine separate ConfigureNVBench calls to fix cpp conda builds (#18155) Two PRs merged in quick succession each added a separate transform benchmark with the same name, leading CMake to get angry: https://github.com/rapidsai/cudf/commit/5b0a85b5397b69155fe0c740185945a9fe0848ac https://github.com/rapidsai/cudf/commit/8ca4bc43d4650ae364d1f9ee412a5597f310b4f7 Authors: - Gil Forsyth (https://github.com/gforsyth) Approvers: - Bradley Dice (https://github.com/bdice) - Basit Ayantunde (https://github.com/lamarrr) URL: https://github.com/rapidsai/cudf/pull/18155 --- cpp/benchmarks/CMakeLists.txt | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index e82c7517145..b2559208c3c 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -355,11 +355,7 @@ ConfigureNVBench( # ################################################################################################## # * transform benchmark # --------------------------------------------------------------------------------- -ConfigureNVBench(TRANSFORM_NVBENCH transform/polynomials.cpp) - -# ################################################################################################## -# * transform benchmark ---------------------------------------------------------------------------- -ConfigureNVBench(TRANSFORM_NVBENCH transform/transform.cpp) +ConfigureNVBench(TRANSFORM_NVBENCH transform/polynomials.cpp transform/transform.cpp) # ################################################################################################## # * nvtext benchmark ------------------------------------------------------------------- From 68ae48c1929ec4cead60f5386f1d3522daf7e1fc Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 4 Mar 2025 10:30:50 -0800 Subject: [PATCH 2/4] Prune more seldom used dtype utils (#18150) Similar to https://github.com/rapidsai/cudf/pull/18141 Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - https://github.com/brandon-b-miller URL: https://github.com/rapidsai/cudf/pull/18150 --- python/cudf/cudf/core/column/column.py | 3 +- python/cudf/cudf/core/column/datetime.py | 23 ++- python/cudf/cudf/core/column/numerical.py | 34 +++- python/cudf/cudf/core/column/timedelta.py | 2 +- python/cudf/cudf/core/dataframe.py | 4 +- python/cudf/cudf/core/scalar.py | 153 +++++++++++++++- python/cudf/cudf/utils/dtypes.py | 206 ---------------------- 7 files changed, 205 insertions(+), 220 deletions(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 5c72bb74d6a..0f347279956 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -64,7 +64,6 @@ dtype_from_pylibcudf_column, dtype_to_pylibcudf_type, find_common_type, - get_time_unit, is_column_like, is_dtype_obj_numeric, is_mixed_with_object_dtype, @@ -2736,7 +2735,7 @@ def as_column( nan_as_null=nan_as_null, ) elif arbitrary.dtype.kind in "mM": - time_unit = get_time_unit(arbitrary) + time_unit = np.datetime_data(arbitrary.dtype)[0] if time_unit in ("D", "W", "M", "Y"): # TODO: Raise in these cases instead of downcasting to s? 
new_type = f"{arbitrary.dtype.type.__name__}[s]" diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index 64ddcae72a7..e99ce6b0b6e 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -179,13 +179,27 @@ def infer_format(element: str, **kwargs) -> str: return fmt +def _get_time_unit(obj: ColumnBinaryOperand) -> str: + if isinstance( + obj, + ( + cudf.core.column.datetime.DatetimeColumn, + cudf.core.column.timedelta.TimeDeltaColumn, + ), + ): + return obj.time_unit + + time_unit, _ = np.datetime_data(obj.dtype) + return time_unit + + def _resolve_mixed_dtypes( lhs: ColumnBinaryOperand, rhs: ColumnBinaryOperand, base_type: str ) -> Dtype: units = ["s", "ms", "us", "ns"] - lhs_time_unit = cudf.utils.dtypes.get_time_unit(lhs) + lhs_time_unit = _get_time_unit(lhs) lhs_unit = units.index(lhs_time_unit) - rhs_time_unit = cudf.utils.dtypes.get_time_unit(rhs) + rhs_time_unit = _get_time_unit(rhs) rhs_unit = units.index(rhs_time_unit) return np.dtype(f"{base_type}[{units[max(lhs_unit, rhs_unit)]}]") @@ -551,7 +565,7 @@ def normalize_binop_value( # type: ignore[override] if isinstance(other, np.datetime64): if np.isnat(other): - other_time_unit = cudf.utils.dtypes.get_time_unit(other) + other_time_unit = np.datetime_data(other.dtype)[0] if other_time_unit not in {"s", "ms", "ns", "us"}: other_time_unit = "ns" @@ -562,8 +576,7 @@ def normalize_binop_value( # type: ignore[override] other = other.astype(self.dtype) return cudf.Scalar(other) elif isinstance(other, np.timedelta64): - other_time_unit = cudf.utils.dtypes.get_time_unit(other) - + other_time_unit = np.datetime_data(other.dtype)[0] if np.isnat(other): return cudf.Scalar( None, diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index 249afe9aba6..57c42f92222 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -26,8 +26,8 @@ from cudf.utils.dtypes import ( CUDF_STRING_DTYPE, find_common_type, - min_column_type, min_signed_type, + min_unsigned_type, np_dtypes_to_pandas_dtypes, ) @@ -469,6 +469,34 @@ def _process_values_for_isin( def _can_return_nan(self, skipna: bool | None = None) -> bool: return not skipna and self.has_nulls(include_nan=True) + def _min_column_type(self, expected_type: np.dtype) -> np.dtype: + """ + Return the smallest dtype which can represent all elements of self. 
+ """ + if self.null_count == len(self): + return self.dtype + + min_value, max_value = self.min(), self.max() + either_is_inf = np.isinf(min_value) or np.isinf(max_value) + if not either_is_inf and expected_type.kind == "i": + max_bound_dtype = min_signed_type(max_value) + min_bound_dtype = min_signed_type(min_value) + return np.promote_types(max_bound_dtype, min_bound_dtype) + elif not either_is_inf and expected_type.kind == "u": + max_bound_dtype = min_unsigned_type(max_value) + min_bound_dtype = min_unsigned_type(min_value) + return np.promote_types(max_bound_dtype, min_bound_dtype) + elif self.dtype.kind == "f" or expected_type.kind == "f": + return np.promote_types( + expected_type, + np.promote_types( + np.min_scalar_type(float(max_value)), + np.min_scalar_type(float(min_value)), + ), + ) + else: + return self.dtype + def find_and_replace( self, to_replace: ColumnLike, @@ -767,8 +795,8 @@ def _normalize_find_and_replace_input( normalized_column = normalized_column.astype(input_column_dtype) if normalized_column.can_cast_safely(input_column_dtype): return normalized_column.astype(input_column_dtype) - col_to_normalize_dtype = min_column_type( - normalized_column, input_column_dtype + col_to_normalize_dtype = normalized_column._min_column_type( # type: ignore[attr-defined] + input_column_dtype ) # Scalar case if len(col_to_normalize) == 1: diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index 654d2c2b800..d31a0c8b002 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -301,7 +301,7 @@ def normalize_binop_value(self, other) -> ColumnBinaryOperand: other = pd.Timedelta(other).to_timedelta64() if isinstance(other, np.timedelta64): - other_time_unit = cudf.utils.dtypes.get_time_unit(other) + other_time_unit = np.datetime_data(other.dtype)[0] if np.isnat(other): return cudf.Scalar( None, diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index eec0bacd5c8..458706f3567 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -6729,8 +6729,8 @@ def _apply_cupy_method_axis_1(self, method, *args, **kwargs): prepared._data[col] = ( prepared._data[col] .astype( - cudf.utils.dtypes.get_min_float_dtype( - prepared._data[col] + prepared._data[col]._min_column_type( + np.dtype(np.float32) ) if common_dtype.kind != "M" else np.dtype(np.float64) diff --git a/python/cudf/cudf/core/scalar.py b/python/cudf/cudf/core/scalar.py index 8579b7398f0..0e1f17fa45e 100644 --- a/python/cudf/cudf/core/scalar.py +++ b/python/cudf/cudf/core/scalar.py @@ -26,7 +26,6 @@ from cudf.utils.dtypes import ( CUDF_STRING_DTYPE, cudf_dtype_from_pa_type, - get_allowed_combinations_for_operator, to_cudf_compatible_scalar, ) @@ -36,6 +35,158 @@ from cudf._typing import Dtype, ScalarLike +# Type dispatch loops similar to what are found in `np.add.types` +# In NumPy, whether or not an op can be performed between two +# operands is determined by checking to see if NumPy has a c/c++ +# loop specifically for adding those two operands built in. If +# not it will search lists like these for a loop for types that +# the operands can be safely cast to. 
These are those lookups, +# modified slightly for cuDF's rules +_ADD_TYPES = [ + "???", + "BBB", + "HHH", + "III", + "LLL", + "bbb", + "hhh", + "iii", + "lll", + "fff", + "ddd", + "mMM", + "MmM", + "mmm", + "LMM", + "MLM", + "Lmm", + "mLm", +] +_SUB_TYPES = [ + "BBB", + "HHH", + "III", + "LLL", + "bbb", + "hhh", + "iii", + "lll", + "fff", + "ddd", + "???", + "MMm", + "mmm", + "MmM", + "MLM", + "mLm", + "Lmm", +] +_MUL_TYPES = [ + "???", + "BBB", + "HHH", + "III", + "LLL", + "bbb", + "hhh", + "iii", + "lll", + "fff", + "ddd", + "mLm", + "Lmm", + "mlm", + "lmm", +] +_FLOORDIV_TYPES = [ + "bbb", + "BBB", + "HHH", + "III", + "LLL", + "hhh", + "iii", + "lll", + "fff", + "ddd", + "???", + "mqm", + "mdm", + "mmq", +] +_TRUEDIV_TYPES = ["fff", "ddd", "mqm", "mmd", "mLm"] +_MOD_TYPES = [ + "bbb", + "BBB", + "hhh", + "HHH", + "iii", + "III", + "lll", + "LLL", + "fff", + "ddd", + "mmm", +] +_POW_TYPES = [ + "bbb", + "BBB", + "hhh", + "HHH", + "iii", + "III", + "lll", + "LLL", + "fff", + "ddd", +] + + +def get_allowed_combinations_for_operator( + dtype_l: np.dtype, dtype_r: np.dtype, op: str +) -> np.dtype: + error = TypeError( + f"{op} not supported between {dtype_l} and {dtype_r} scalars" + ) + + to_numpy_ops = { + "__add__": _ADD_TYPES, + "__radd__": _ADD_TYPES, + "__sub__": _SUB_TYPES, + "__rsub__": _SUB_TYPES, + "__mul__": _MUL_TYPES, + "__rmul__": _MUL_TYPES, + "__floordiv__": _FLOORDIV_TYPES, + "__rfloordiv__": _FLOORDIV_TYPES, + "__truediv__": _TRUEDIV_TYPES, + "__rtruediv__": _TRUEDIV_TYPES, + "__mod__": _MOD_TYPES, + "__rmod__": _MOD_TYPES, + "__pow__": _POW_TYPES, + "__rpow__": _POW_TYPES, + } + allowed = to_numpy_ops.get(op, op) + + # special rules for string + if dtype_l == "object" or dtype_r == "object": + if (dtype_l == dtype_r == "object") and op == "__add__": + return CUDF_STRING_DTYPE + else: + raise error + + # Check if we can directly operate + + for valid_combo in allowed: + ltype, rtype, outtype = valid_combo # type: ignore[misc] + if np.can_cast(dtype_l.char, ltype) and np.can_cast( # type: ignore[has-type] + dtype_r.char, + rtype, # type: ignore[has-type] + ): + return np.dtype(outtype) # type: ignore[has-type] + + raise error + + def _preprocess_host_value(value, dtype) -> tuple[ScalarLike, Dtype]: """ Preprocess a value and dtype for host-side cudf.Scalar diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index adee17e7bfb..fea146c3201 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -354,47 +354,6 @@ def min_unsigned_type(x: int, min_size: int = 8) -> np.dtype: return np.uint64(x).dtype -def min_column_type(x, expected_type): - """ - Return the smallest dtype which can represent all - elements of the `NumericalColumn` `x` - If the column is not a subtype of `np.signedinteger` or `np.floating` - returns the same dtype as the dtype of `x` without modification - """ - - if not isinstance(x, cudf.core.column.NumericalColumn): - raise TypeError("Argument x must be of type column.NumericalColumn") - if x.null_count == len(x): - return x.dtype - - min_value, max_value = x.min(), x.max() - either_is_inf = np.isinf(min_value) or np.isinf(max_value) - expected_type = cudf.dtype(expected_type) - if not either_is_inf and expected_type.kind in "i": - max_bound_dtype = min_signed_type(max_value) - min_bound_dtype = min_signed_type(min_value) - result_type = np.promote_types(max_bound_dtype, min_bound_dtype) - elif not either_is_inf and expected_type.kind in "u": - max_bound_dtype = min_unsigned_type(max_value) - 
min_bound_dtype = min_unsigned_type(min_value) - result_type = np.promote_types(max_bound_dtype, min_bound_dtype) - elif x.dtype.kind == "f": - return get_min_float_dtype(x) - else: - result_type = x.dtype - - return cudf.dtype(result_type) - - -def get_min_float_dtype(col): - max_bound_dtype = np.min_scalar_type(float(col.max())) - min_bound_dtype = np.min_scalar_type(float(col.min())) - result_type = np.promote_types( - "float32", np.promote_types(max_bound_dtype, min_bound_dtype) - ) - return cudf.dtype(result_type) - - def is_mixed_with_object_dtype(lhs, rhs): if isinstance(lhs.dtype, cudf.CategoricalDtype): return is_mixed_with_object_dtype(lhs.dtype.categories, rhs) @@ -406,20 +365,6 @@ def is_mixed_with_object_dtype(lhs, rhs): ) -def get_time_unit(obj): - if isinstance( - obj, - ( - cudf.core.column.datetime.DatetimeColumn, - cudf.core.column.timedelta.TimeDeltaColumn, - ), - ): - return obj.time_unit - - time_unit, _ = np.datetime_data(obj.dtype) - return time_unit - - def _get_nan_for_dtype(dtype: DtypeObj) -> DtypeObj: if dtype.kind in "mM": time_unit, _ = np.datetime_data(dtype) @@ -430,51 +375,6 @@ def _get_nan_for_dtype(dtype: DtypeObj) -> DtypeObj: return np.float64("nan") -def get_allowed_combinations_for_operator( - dtype_l: np.dtype, dtype_r: np.dtype, op: str -) -> np.dtype: - error = TypeError( - f"{op} not supported between {dtype_l} and {dtype_r} scalars" - ) - - to_numpy_ops = { - "__add__": _ADD_TYPES, - "__radd__": _ADD_TYPES, - "__sub__": _SUB_TYPES, - "__rsub__": _SUB_TYPES, - "__mul__": _MUL_TYPES, - "__rmul__": _MUL_TYPES, - "__floordiv__": _FLOORDIV_TYPES, - "__rfloordiv__": _FLOORDIV_TYPES, - "__truediv__": _TRUEDIV_TYPES, - "__rtruediv__": _TRUEDIV_TYPES, - "__mod__": _MOD_TYPES, - "__rmod__": _MOD_TYPES, - "__pow__": _POW_TYPES, - "__rpow__": _POW_TYPES, - } - allowed = to_numpy_ops.get(op, op) - - # special rules for string - if dtype_l == "object" or dtype_r == "object": - if (dtype_l == dtype_r == "object") and op == "__add__": - return CUDF_STRING_DTYPE - else: - raise error - - # Check if we can directly operate - - for valid_combo in allowed: - ltype, rtype, outtype = valid_combo # type: ignore[misc] - if np.can_cast(dtype_l.char, ltype) and np.can_cast( # type: ignore[has-type] - dtype_r.char, - rtype, # type: ignore[has-type] - ): - return np.dtype(outtype) # type: ignore[has-type] - - raise error - - def find_common_type(dtypes): """ Wrapper over np.find_common_type to handle special cases @@ -715,109 +615,3 @@ def dtype_from_pylibcudf_column(col: plc.Column) -> DtypeObj: SIZE_TYPE_DTYPE = PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[plc.types.SIZE_TYPE_ID] CUDF_STRING_DTYPE = PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[plc.types.TypeId.STRING] - -# Type dispatch loops similar to what are found in `np.add.types` -# In NumPy, whether or not an op can be performed between two -# operands is determined by checking to see if NumPy has a c/c++ -# loop specifically for adding those two operands built in. If -# not it will search lists like these for a loop for types that -# the operands can be safely cast to. 
These are those lookups, -# modified slightly for cuDF's rules -_ADD_TYPES = [ - "???", - "BBB", - "HHH", - "III", - "LLL", - "bbb", - "hhh", - "iii", - "lll", - "fff", - "ddd", - "mMM", - "MmM", - "mmm", - "LMM", - "MLM", - "Lmm", - "mLm", -] -_SUB_TYPES = [ - "BBB", - "HHH", - "III", - "LLL", - "bbb", - "hhh", - "iii", - "lll", - "fff", - "ddd", - "???", - "MMm", - "mmm", - "MmM", - "MLM", - "mLm", - "Lmm", -] -_MUL_TYPES = [ - "???", - "BBB", - "HHH", - "III", - "LLL", - "bbb", - "hhh", - "iii", - "lll", - "fff", - "ddd", - "mLm", - "Lmm", - "mlm", - "lmm", -] -_FLOORDIV_TYPES = [ - "bbb", - "BBB", - "HHH", - "III", - "LLL", - "hhh", - "iii", - "lll", - "fff", - "ddd", - "???", - "mqm", - "mdm", - "mmq", -] -_TRUEDIV_TYPES = ["fff", "ddd", "mqm", "mmd", "mLm"] -_MOD_TYPES = [ - "bbb", - "BBB", - "hhh", - "HHH", - "iii", - "III", - "lll", - "LLL", - "fff", - "ddd", - "mmm", -] -_POW_TYPES = [ - "bbb", - "BBB", - "hhh", - "HHH", - "iii", - "III", - "lll", - "LLL", - "fff", - "ddd", -] From 32bdfb064886229e391d91ed72a7348787fd4f8c Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Tue, 4 Mar 2025 11:38:38 -0800 Subject: [PATCH 3/4] Separate stats filtering helpers to reuse in page pruning (#18034) Contributes to #17896 This PR separates stats based filtering helpers for reuse in page pruning using stats in Parquet PageIndex. Authors: - Muhammad Haseeb (https://github.com/mhaseeb123) Approvers: - Karthikeyan (https://github.com/karthikeyann) - Vukasin Milovanovic (https://github.com/vuule) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/18034 --- cpp/CMakeLists.txt | 1 + cpp/src/io/parquet/predicate_pushdown.cpp | 309 +------------------- cpp/src/io/parquet/stats_filter_helpers.cpp | 144 +++++++++ cpp/src/io/parquet/stats_filter_helpers.hpp | 256 ++++++++++++++++ 4 files changed, 416 insertions(+), 294 deletions(-) create mode 100644 cpp/src/io/parquet/stats_filter_helpers.cpp create mode 100644 cpp/src/io/parquet/stats_filter_helpers.hpp diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 0282282b5f3..8952f11dd23 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -528,6 +528,7 @@ add_library( src/io/parquet/reader_impl_chunking.cu src/io/parquet/reader_impl_helpers.cpp src/io/parquet/reader_impl_preprocess.cu + src/io/parquet/stats_filter_helpers.cpp src/io/parquet/writer_impl.cu src/io/parquet/writer_impl_helpers.cpp src/io/parquet/decode_fixed.cu diff --git a/cpp/src/io/parquet/predicate_pushdown.cpp b/cpp/src/io/parquet/predicate_pushdown.cpp index e1d7dbb03b3..c2d1495a5d9 100644 --- a/cpp/src/io/parquet/predicate_pushdown.cpp +++ b/cpp/src/io/parquet/predicate_pushdown.cpp @@ -14,6 +14,7 @@ * limitations under the License. */ #include "reader_impl_helpers.hpp" +#include "stats_filter_helpers.hpp" #include #include @@ -21,7 +22,6 @@ #include #include #include -#include #include #include #include @@ -34,7 +34,6 @@ #include #include -#include #include #include #include @@ -42,89 +41,25 @@ namespace cudf::io::parquet::detail { namespace { + /** - * @brief Converts statistics in column chunks to 2 device columns - min, max values. + * @brief Converts column chunk statistics to 2 device columns - min, max values. + * + * Each column's number of rows equals the total number of row groups. 
* */ -struct stats_caster { +struct row_group_stats_caster : public stats_caster_base { size_type total_row_groups; std::vector const& per_file_metadata; host_span const> row_group_indices; - template - static ToType targetType(FromType const value) - { - if constexpr (cudf::is_timestamp()) { - return static_cast( - typename ToType::duration{static_cast(value)}); - } else if constexpr (std::is_same_v) { - return ToType{nullptr, 0}; - } else { - return static_cast(value); - } - } - - // uses storage type as T - template () or cudf::is_nested())> - static T convert(uint8_t const* stats_val, size_t stats_size, Type const type) - { - CUDF_FAIL("unsupported type for stats casting"); - } - - template ())> - static T convert(uint8_t const* stats_val, size_t stats_size, Type const type) - { - CUDF_EXPECTS(type == BOOLEAN, "Invalid type and stats combination"); - return targetType(*reinterpret_cast(stats_val)); - } - - // integral but not boolean, and fixed_point, and chrono. - template () and !cudf::is_boolean()) or - cudf::is_fixed_point() or cudf::is_chrono())> - static T convert(uint8_t const* stats_val, size_t stats_size, Type const type) - { - switch (type) { - case INT32: return targetType(*reinterpret_cast(stats_val)); - case INT64: return targetType(*reinterpret_cast(stats_val)); - case INT96: // Deprecated in parquet specification - return targetType(static_cast<__int128_t>(reinterpret_cast(stats_val)[0]) - << 32 | - reinterpret_cast(stats_val)[2]); - case BYTE_ARRAY: [[fallthrough]]; - case FIXED_LEN_BYTE_ARRAY: - if (stats_size == sizeof(T)) { - // if type size == length of stats_val. then typecast and return. - if constexpr (cudf::is_chrono()) { - return targetType(*reinterpret_cast(stats_val)); - } else { - return targetType(*reinterpret_cast(stats_val)); - } - } - // unsupported type - default: CUDF_FAIL("Invalid type and stats combination"); - } - } - - template ())> - static T convert(uint8_t const* stats_val, size_t stats_size, Type const type) - { - switch (type) { - case FLOAT: return targetType(*reinterpret_cast(stats_val)); - case DOUBLE: return targetType(*reinterpret_cast(stats_val)); - default: CUDF_FAIL("Invalid type and stats combination"); - } - } - - template )> - static T convert(uint8_t const* stats_val, size_t stats_size, Type const type) + row_group_stats_caster(size_type total_row_groups, + std::vector const& per_file_metadata, + host_span const> row_group_indices) + : total_row_groups{total_row_groups}, + per_file_metadata{per_file_metadata}, + row_group_indices{row_group_indices} { - switch (type) { - case BYTE_ARRAY: [[fallthrough]]; - case FIXED_LEN_BYTE_ARRAY: - return string_view(reinterpret_cast(stats_val), stats_size); - default: CUDF_FAIL("Invalid type and stats combination"); - } } // Creates device columns from column statistics (min, max) @@ -139,82 +74,8 @@ struct stats_caster { if constexpr (cudf::is_compound() && !std::is_same_v) { CUDF_FAIL("Compound types do not have statistics"); } else { - // Local struct to hold host columns - struct host_column { - // using thrust::host_vector because std::vector uses bitmap instead of byte per bool. 
- cudf::detail::host_vector val; - std::vector null_mask; - cudf::size_type null_count = 0; - host_column(size_type total_row_groups, rmm::cuda_stream_view stream) - : val{cudf::detail::make_host_vector(total_row_groups, stream)}, - null_mask( - cudf::util::div_rounding_up_safe( - cudf::bitmask_allocation_size_bytes(total_row_groups), sizeof(bitmask_type)), - ~bitmask_type{0}) - { - } - - void set_index(size_type index, - std::optional> const& binary_value, - Type const type) - { - if (binary_value.has_value()) { - val[index] = convert(binary_value.value().data(), binary_value.value().size(), type); - } - if (not binary_value.has_value()) { - clear_bit_unsafe(null_mask.data(), index); - null_count++; - } - } - - static auto make_strings_children(host_span host_strings, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) - { - auto const total_char_count = std::accumulate( - host_strings.begin(), host_strings.end(), 0, [](auto sum, auto const& str) { - return sum + str.size_bytes(); - }); - auto chars = cudf::detail::make_empty_host_vector(total_char_count, stream); - auto offsets = - cudf::detail::make_empty_host_vector(host_strings.size() + 1, stream); - offsets.push_back(0); - for (auto const& str : host_strings) { - auto tmp = - str.empty() ? std::string_view{} : std::string_view(str.data(), str.size_bytes()); - chars.insert(chars.end(), std::cbegin(tmp), std::cend(tmp)); - offsets.push_back(offsets.back() + tmp.length()); - } - auto d_chars = cudf::detail::make_device_uvector_async(chars, stream, mr); - auto d_offsets = cudf::detail::make_device_uvector_sync(offsets, stream, mr); - return std::tuple{std::move(d_chars), std::move(d_offsets)}; - } - - auto to_device(cudf::data_type dtype, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) - { - if constexpr (std::is_same_v) { - auto [d_chars, d_offsets] = make_strings_children(val, stream, mr); - return cudf::make_strings_column( - val.size(), - std::make_unique(std::move(d_offsets), rmm::device_buffer{}, 0), - d_chars.release(), - null_count, - rmm::device_buffer{ - null_mask.data(), cudf::bitmask_allocation_size_bytes(val.size()), stream, mr}); - } - return std::make_unique( - dtype, - val.size(), - cudf::detail::make_device_uvector_async(val, stream, mr).release(), - rmm::device_buffer{ - null_mask.data(), cudf::bitmask_allocation_size_bytes(val.size()), stream, mr}, - null_count); - } - }; // local struct host_column - host_column min(total_row_groups, stream); - host_column max(total_row_groups, stream); + host_column min(total_row_groups, stream); + host_column max(total_row_groups, stream); size_type stats_idx = 0; for (size_t src_idx = 0; src_idx < row_group_indices.size(); ++src_idx) { for (auto const rg_idx : row_group_indices[src_idx]) { @@ -248,146 +109,6 @@ struct stats_caster { } }; -/** - * @brief Converts AST expression to StatsAST for comparing with column statistics - * This is used in row group filtering based on predicate. 
- * statistics min value of a column is referenced by column_index*2 - * statistics max value of a column is referenced by column_index*2+1 - * - */ -class stats_expression_converter : public ast::detail::expression_transformer { - public: - stats_expression_converter(ast::expression const& expr, size_type const& num_columns) - : _num_columns{num_columns} - { - expr.accept(*this); - } - - /** - * @copydoc ast::detail::expression_transformer::visit(ast::literal const& ) - */ - std::reference_wrapper visit(ast::literal const& expr) override - { - return expr; - } - - /** - * @copydoc ast::detail::expression_transformer::visit(ast::column_reference const& ) - */ - std::reference_wrapper visit(ast::column_reference const& expr) override - { - CUDF_EXPECTS(expr.get_table_source() == ast::table_reference::LEFT, - "Statistics AST supports only left table"); - CUDF_EXPECTS(expr.get_column_index() < _num_columns, - "Column index cannot be more than number of columns in the table"); - return expr; - } - - /** - * @copydoc ast::detail::expression_transformer::visit(ast::column_name_reference const& ) - */ - std::reference_wrapper visit( - ast::column_name_reference const& expr) override - { - CUDF_FAIL("Column name reference is not supported in statistics AST"); - } - - /** - * @copydoc ast::detail::expression_transformer::visit(ast::operation const& ) - */ - std::reference_wrapper visit(ast::operation const& expr) override - { - using cudf::ast::ast_operator; - auto const operands = expr.get_operands(); - auto const op = expr.get_operator(); - - if (auto* v = dynamic_cast(&operands[0].get())) { - // First operand should be column reference, second should be literal. - CUDF_EXPECTS(cudf::ast::detail::ast_operator_arity(op) == 2, - "Only binary operations are supported on column reference"); - CUDF_EXPECTS(dynamic_cast(&operands[1].get()) != nullptr, - "Second operand of binary operation with column reference must be a literal"); - v->accept(*this); - // Push literal into the ast::tree - auto const& literal = - _stats_expr.push(*dynamic_cast(&operands[1].get())); - auto const col_index = v->get_column_index(); - switch (op) { - /* transform to stats conditions. 
op(col, literal) - col1 == val --> vmin <= val && vmax >= val - col1 != val --> !(vmin == val && vmax == val) - col1 > val --> vmax > val - col1 < val --> vmin < val - col1 >= val --> vmax >= val - col1 <= val --> vmin <= val - */ - case ast_operator::EQUAL: { - auto const& vmin = _stats_expr.push(ast::column_reference{col_index * 2}); - auto const& vmax = _stats_expr.push(ast::column_reference{col_index * 2 + 1}); - _stats_expr.push(ast::operation{ - ast::ast_operator::LOGICAL_AND, - _stats_expr.push(ast::operation{ast_operator::GREATER_EQUAL, vmax, literal}), - _stats_expr.push(ast::operation{ast_operator::LESS_EQUAL, vmin, literal})}); - break; - } - case ast_operator::NOT_EQUAL: { - auto const& vmin = _stats_expr.push(ast::column_reference{col_index * 2}); - auto const& vmax = _stats_expr.push(ast::column_reference{col_index * 2 + 1}); - _stats_expr.push(ast::operation{ - ast_operator::LOGICAL_OR, - _stats_expr.push(ast::operation{ast_operator::NOT_EQUAL, vmin, vmax}), - _stats_expr.push(ast::operation{ast_operator::NOT_EQUAL, vmax, literal})}); - break; - } - case ast_operator::LESS: [[fallthrough]]; - case ast_operator::LESS_EQUAL: { - auto const& vmin = _stats_expr.push(ast::column_reference{col_index * 2}); - _stats_expr.push(ast::operation{op, vmin, literal}); - break; - } - case ast_operator::GREATER: [[fallthrough]]; - case ast_operator::GREATER_EQUAL: { - auto const& vmax = _stats_expr.push(ast::column_reference{col_index * 2 + 1}); - _stats_expr.push(ast::operation{op, vmax, literal}); - break; - } - default: CUDF_FAIL("Unsupported operation in Statistics AST"); - }; - } else { - auto new_operands = visit_operands(operands); - if (cudf::ast::detail::ast_operator_arity(op) == 2) { - _stats_expr.push(ast::operation{op, new_operands.front(), new_operands.back()}); - } else if (cudf::ast::detail::ast_operator_arity(op) == 1) { - _stats_expr.push(ast::operation{op, new_operands.front()}); - } - } - return _stats_expr.back(); - } - - /** - * @brief Returns the AST to apply on Column chunk statistics. - * - * @return AST operation expression - */ - [[nodiscard]] std::reference_wrapper get_stats_expr() const - { - return _stats_expr.back(); - } - - private: - std::vector> visit_operands( - cudf::host_span const> operands) - { - std::vector> transformed_operands; - for (auto const& operand : operands) { - auto const new_operand = operand.get().accept(*this); - transformed_operands.push_back(new_operand); - } - return transformed_operands; - } - ast::tree _stats_expr; - size_type _num_columns; -}; } // namespace std::optional>> aggregate_reader_metadata::apply_stats_filters( @@ -404,7 +125,7 @@ std::optional>> aggregate_reader_metadata::ap // where min(col[i]) = columns[i*2], max(col[i])=columns[i*2+1] // For each column, it contains #sources * #column_chunks_per_src rows. std::vector> columns; - stats_caster const stats_col{ + row_group_stats_caster const stats_col{ static_cast(total_row_groups), per_file_metadata, input_row_group_indices}; for (size_t col_idx = 0; col_idx < output_dtypes.size(); col_idx++) { auto const schema_idx = output_column_schemas[col_idx]; diff --git a/cpp/src/io/parquet/stats_filter_helpers.cpp b/cpp/src/io/parquet/stats_filter_helpers.cpp new file mode 100644 index 00000000000..ef022b418bf --- /dev/null +++ b/cpp/src/io/parquet/stats_filter_helpers.cpp @@ -0,0 +1,144 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "stats_filter_helpers.hpp" + +#include "io/parquet/parquet_common.hpp" + +#include +#include +#include +#include + +namespace cudf::io::parquet::detail { + +stats_expression_converter::stats_expression_converter(ast::expression const& expr, + size_type num_columns) + : _num_columns{num_columns} +{ + expr.accept(*this); +} + +std::reference_wrapper stats_expression_converter::visit( + ast::literal const& expr) +{ + return expr; +} + +std::reference_wrapper stats_expression_converter::visit( + ast::column_reference const& expr) +{ + CUDF_EXPECTS(expr.get_table_source() == ast::table_reference::LEFT, + "Statistics AST supports only left table"); + CUDF_EXPECTS(expr.get_column_index() < _num_columns, + "Column index cannot be more than number of columns in the table"); + return expr; +} + +std::reference_wrapper stats_expression_converter::visit( + ast::column_name_reference const& expr) +{ + CUDF_FAIL("Column name reference is not supported in statistics AST"); +} + +std::reference_wrapper stats_expression_converter::visit( + ast::operation const& expr) +{ + using cudf::ast::ast_operator; + auto const operands = expr.get_operands(); + auto const op = expr.get_operator(); + + if (auto* v = dynamic_cast(&operands[0].get())) { + // First operand should be column reference, second should be literal. + CUDF_EXPECTS(cudf::ast::detail::ast_operator_arity(op) == 2, + "Only binary operations are supported on column reference"); + CUDF_EXPECTS(dynamic_cast(&operands[1].get()) != nullptr, + "Second operand of binary operation with column reference must be a literal"); + v->accept(*this); + // Push literal into the ast::tree + auto const& literal = _stats_expr.push(*dynamic_cast(&operands[1].get())); + auto const col_index = v->get_column_index(); + switch (op) { + /* transform to stats conditions. 
op(col, literal) + col1 == val --> vmin <= val && vmax >= val + col1 != val --> !(vmin == val && vmax == val) + col1 > val --> vmax > val + col1 < val --> vmin < val + col1 >= val --> vmax >= val + col1 <= val --> vmin <= val + */ + case ast_operator::EQUAL: { + auto const& vmin = _stats_expr.push(ast::column_reference{col_index * 2}); + auto const& vmax = _stats_expr.push(ast::column_reference{col_index * 2 + 1}); + _stats_expr.push(ast::operation{ + ast::ast_operator::LOGICAL_AND, + _stats_expr.push(ast::operation{ast_operator::GREATER_EQUAL, vmax, literal}), + _stats_expr.push(ast::operation{ast_operator::LESS_EQUAL, vmin, literal})}); + break; + } + case ast_operator::NOT_EQUAL: { + auto const& vmin = _stats_expr.push(ast::column_reference{col_index * 2}); + auto const& vmax = _stats_expr.push(ast::column_reference{col_index * 2 + 1}); + _stats_expr.push( + ast::operation{ast_operator::LOGICAL_OR, + _stats_expr.push(ast::operation{ast_operator::NOT_EQUAL, vmin, vmax}), + _stats_expr.push(ast::operation{ast_operator::NOT_EQUAL, vmax, literal})}); + break; + } + case ast_operator::LESS: [[fallthrough]]; + case ast_operator::LESS_EQUAL: { + auto const& vmin = _stats_expr.push(ast::column_reference{col_index * 2}); + _stats_expr.push(ast::operation{op, vmin, literal}); + break; + } + case ast_operator::GREATER: [[fallthrough]]; + case ast_operator::GREATER_EQUAL: { + auto const& vmax = _stats_expr.push(ast::column_reference{col_index * 2 + 1}); + _stats_expr.push(ast::operation{op, vmax, literal}); + break; + } + default: CUDF_FAIL("Unsupported operation in Statistics AST"); + }; + } else { + auto new_operands = visit_operands(operands); + if (cudf::ast::detail::ast_operator_arity(op) == 2) { + _stats_expr.push(ast::operation{op, new_operands.front(), new_operands.back()}); + } else if (cudf::ast::detail::ast_operator_arity(op) == 1) { + _stats_expr.push(ast::operation{op, new_operands.front()}); + } + } + return _stats_expr.back(); +} + +std::reference_wrapper stats_expression_converter::get_stats_expr() const +{ + return _stats_expr.back(); +} + +std::vector> +stats_expression_converter::visit_operands( + cudf::host_span const> operands) +{ + std::vector> transformed_operands; + std::transform(operands.begin(), + operands.end(), + std::back_inserter(transformed_operands), + [t = this](auto& operand) { return operand.get().accept(*t); }); + + return transformed_operands; +} + +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/stats_filter_helpers.hpp b/cpp/src/io/parquet/stats_filter_helpers.hpp new file mode 100644 index 00000000000..63b57f46539 --- /dev/null +++ b/cpp/src/io/parquet/stats_filter_helpers.hpp @@ -0,0 +1,256 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include "io/parquet/parquet_common.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +namespace cudf::io::parquet::detail { + +/** + * @brief Base utilities for converting and casting stats values + * + * Derived classes handle row group or page-level statistics as needed. + * + */ +class stats_caster_base { + protected: + template + static inline ToType targetType(FromType const value) + { + if constexpr (cudf::is_timestamp()) { + return static_cast( + typename ToType::duration{static_cast(value)}); + } else if constexpr (std::is_same_v) { + return ToType{nullptr, 0}; + } else { + return static_cast(value); + } + } + + // uses storage type as T + template () or cudf::is_nested())> + static inline T convert(uint8_t const* stats_val, size_t stats_size, Type const type) + { + CUDF_FAIL("unsupported type for stats casting"); + } + + template ())> + static inline T convert(uint8_t const* stats_val, size_t stats_size, Type const type) + { + CUDF_EXPECTS(type == BOOLEAN, "Invalid type and stats combination"); + return stats_caster_base::targetType(*reinterpret_cast(stats_val)); + } + + // integral but not boolean, and fixed_point, and chrono. + template () and !cudf::is_boolean()) or + cudf::is_fixed_point() or cudf::is_chrono())> + static inline T convert(uint8_t const* stats_val, size_t stats_size, Type const type) + { + switch (type) { + case INT32: + return stats_caster_base::targetType(*reinterpret_cast(stats_val)); + case INT64: + return stats_caster_base::targetType(*reinterpret_cast(stats_val)); + case INT96: // Deprecated in parquet specification + return stats_caster_base::targetType( + static_cast<__int128_t>(reinterpret_cast(stats_val)[0]) << 32 | + reinterpret_cast(stats_val)[2]); + case BYTE_ARRAY: [[fallthrough]]; + case FIXED_LEN_BYTE_ARRAY: + if (stats_size == sizeof(T)) { + // if type size == length of stats_val. then typecast and return. + if constexpr (cudf::is_chrono()) { + return stats_caster_base::targetType( + *reinterpret_cast(stats_val)); + } else { + return stats_caster_base::targetType(*reinterpret_cast(stats_val)); + } + } + // unsupported type + default: CUDF_FAIL("Invalid type and stats combination"); + } + } + + template ())> + static inline T convert(uint8_t const* stats_val, size_t stats_size, Type const type) + { + switch (type) { + case FLOAT: + return stats_caster_base::targetType(*reinterpret_cast(stats_val)); + case DOUBLE: + return stats_caster_base::targetType(*reinterpret_cast(stats_val)); + default: CUDF_FAIL("Invalid type and stats combination"); + } + } + + template )> + static inline T convert(uint8_t const* stats_val, size_t stats_size, Type const type) + { + switch (type) { + case BYTE_ARRAY: [[fallthrough]]; + case FIXED_LEN_BYTE_ARRAY: + return string_view(reinterpret_cast(stats_val), stats_size); + default: CUDF_FAIL("Invalid type and stats combination"); + } + } + + /** + * @brief Local struct to hold host columns during stats based filtering + * + * @tparam T Type of the column + */ + template + struct host_column { + // using thrust::host_vector because std::vector uses bitmap instead of byte per bool. 
+ cudf::detail::host_vector val; + std::vector null_mask; + cudf::size_type null_count = 0; + + host_column(size_type total_row_groups, rmm::cuda_stream_view stream) + : val{cudf::detail::make_host_vector(total_row_groups, stream)}, + null_mask(cudf::util::div_rounding_up_safe( + cudf::bitmask_allocation_size_bytes(total_row_groups), sizeof(bitmask_type)), + ~bitmask_type{0}) + { + } + + void inline set_index(size_type index, + std::optional> const& binary_value, + Type const type) + { + if (binary_value.has_value()) { + val[index] = stats_caster_base::convert( + binary_value.value().data(), binary_value.value().size(), type); + } + if (not binary_value.has_value()) { + clear_bit_unsafe(null_mask.data(), index); + null_count++; + } + } + static inline std::tuple, rmm::device_uvector> + make_strings_children(host_span host_strings, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) + { + auto const total_char_count = + std::accumulate(host_strings.begin(), host_strings.end(), 0, [](auto sum, auto const& str) { + return sum + str.size_bytes(); + }); + auto chars = cudf::detail::make_empty_host_vector(total_char_count, stream); + auto offsets = + cudf::detail::make_empty_host_vector(host_strings.size() + 1, stream); + offsets.push_back(0); + for (auto const& str : host_strings) { + auto tmp = + str.empty() ? std::string_view{} : std::string_view(str.data(), str.size_bytes()); + chars.insert(chars.end(), std::cbegin(tmp), std::cend(tmp)); + offsets.push_back(offsets.back() + tmp.length()); + } + auto d_chars = cudf::detail::make_device_uvector_async(chars, stream, mr); + auto d_offsets = cudf::detail::make_device_uvector_sync(offsets, stream, mr); + return std::tuple{std::move(d_chars), std::move(d_offsets)}; + } + + std::unique_ptr inline to_device(cudf::data_type dtype, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) + { + if constexpr (std::is_same_v) { + auto [d_chars, d_offsets] = make_strings_children(val, stream, mr); + return cudf::make_strings_column( + val.size(), + std::make_unique(std::move(d_offsets), rmm::device_buffer{}, 0), + d_chars.release(), + null_count, + rmm::device_buffer{ + null_mask.data(), cudf::bitmask_allocation_size_bytes(val.size()), stream, mr}); + } + return std::make_unique( + dtype, + val.size(), + cudf::detail::make_device_uvector_async(val, stream, mr).release(), + rmm::device_buffer{ + null_mask.data(), cudf::bitmask_allocation_size_bytes(val.size()), stream, mr}, + null_count); + } + }; +}; + +/** + * @brief Converts AST expression to StatsAST for comparing with column statistics + * + * This is used in row group filtering based on predicate. 
+ * statistics min value of a column is referenced by column_index*2 + * statistics max value of a column is referenced by column_index*2+1 + */ +class stats_expression_converter : public ast::detail::expression_transformer { + public: + stats_expression_converter(ast::expression const& expr, size_type num_columns); + + /** + * @copydoc ast::detail::expression_transformer::visit(ast::literal const& ) + */ + std::reference_wrapper visit(ast::literal const& expr) override; + + /** + * @copydoc ast::detail::expression_transformer::visit(ast::column_reference const& ) + */ + std::reference_wrapper visit(ast::column_reference const& expr) override; + + /** + * @copydoc ast::detail::expression_transformer::visit(ast::column_name_reference const& ) + */ + std::reference_wrapper visit( + ast::column_name_reference const& expr) override; + + /** + * @copydoc ast::detail::expression_transformer::visit(ast::operation const& ) + */ + std::reference_wrapper visit(ast::operation const& expr) override; + + /** + * @brief Returns the AST to apply on Column chunk statistics. + * + * @return AST operation expression + */ + [[nodiscard]] std::reference_wrapper get_stats_expr() const; + + private: + std::vector> visit_operands( + cudf::host_span const> operands); + + ast::tree _stats_expr; + size_type _num_columns; +}; + +} // namespace cudf::io::parquet::detail From ea2bca33af1f1c084c026aa5c3a4612c1dd3d62c Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Tue, 4 Mar 2025 14:55:09 -0500 Subject: [PATCH 4/4] New nvtext::wordpiece_tokenizer APIs (#17600) Creates a new word-piece-tokenizer which replaces the existing subword-tokenizer in nvtext. The subword-tokenizer logic is to split out and specialized to perform basic tokenizing with the word-piece logic only. The normalizing part is already a separate API. The output will be a lists column of tokens only. The first change is that the new API uses `wordpiece` instead of `subword`. Here are the 2 C++ API declarations: ``` std::unique_ptr load_wordpiece_vocabulary( cudf::strings_column_view const& input, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); ``` The vocabulary is loaded as a strings column and the returned object can be used on multiple calls to the next API: ``` std::unique_ptr wordpiece_tokenize( cudf::strings_column_view const& input, wordpiece_vocabulary const& vocabulary, cudf::size_type max_words_per_row, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); ``` This will return a lists column of integers which represent the tokens for each row. The `max_words_per_row` will stop the tokenizing process for each row once the number of input words (characters delimited by space) has been reached. This means you may get more tokens than `max_words_per_row` for a row if a single word produces multiple tokens. Note, that this API expects the input string to already be normalized -- processed by the `nvtext::normalize_characters` API which is also being reworked in https://github.com/rapidsai/cudf/pull/17818 The Python interface has the following pattern: ``` from cudf.core.wordpiece_tokenize import WordPieceVocabulary input_string = .... # output of the normalizer vocab_file = os.path.join(datadir, "bert_base_cased_sampled/vocab.txt") vc = cudf.read_text(vocab_file, delimiter="\n", strip_delimiters=True) wpt = WordPieceVocabulary(vc) wpr = wpt.tokenize(input_string) ``` The output is a lists column of the tokens and no longer the tensor-data and metadata format. 
If this format is needed, then we can consider a 3rd API that converts the output here to that format. Closes #17507 Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Shruti Shivakumar (https://github.com/shrshi) - Basit Ayantunde (https://github.com/lamarrr) - GALI PREM SAGAR (https://github.com/galipremsagar) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/17600 --- cpp/CMakeLists.txt | 1 + cpp/benchmarks/text/subword.cpp | 41 +- cpp/include/nvtext/wordpiece_tokenize.hpp | 122 +++ cpp/src/text/wordpiece_tokenize.cu | 890 ++++++++++++++++++ cpp/tests/text/subword_tests.cpp | 150 ++- python/cudf/cudf/core/column/string.py | 14 + python/cudf/cudf/core/wordpiece_tokenize.py | 44 + .../cudf/tests/text/test_subword_tokenizer.py | 473 +++++++++- .../libcudf/nvtext/wordpiece_tokenize.pxd | 22 + .../pylibcudf/pylibcudf/nvtext/CMakeLists.txt | 16 +- .../pylibcudf/pylibcudf/nvtext/__init__.pxd | 4 +- python/pylibcudf/pylibcudf/nvtext/__init__.py | 4 +- .../pylibcudf/nvtext/wordpiece_tokenize.pxd | 15 + .../pylibcudf/nvtext/wordpiece_tokenize.pyi | 12 + .../pylibcudf/nvtext/wordpiece_tokenize.pyx | 66 ++ .../tests/test_nvtext_subword_tokenize.py | 36 +- 16 files changed, 1900 insertions(+), 10 deletions(-) create mode 100644 cpp/include/nvtext/wordpiece_tokenize.hpp create mode 100644 cpp/src/text/wordpiece_tokenize.cu create mode 100644 python/cudf/cudf/core/wordpiece_tokenize.py create mode 100644 python/pylibcudf/pylibcudf/libcudf/nvtext/wordpiece_tokenize.pxd create mode 100644 python/pylibcudf/pylibcudf/nvtext/wordpiece_tokenize.pxd create mode 100644 python/pylibcudf/pylibcudf/nvtext/wordpiece_tokenize.pyi create mode 100644 python/pylibcudf/pylibcudf/nvtext/wordpiece_tokenize.pyx diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 8952f11dd23..3018295c9eb 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -757,6 +757,7 @@ add_library( src/text/subword/wordpiece_tokenizer.cu src/text/tokenize.cu src/text/vocabulary_tokenize.cu + src/text/wordpiece_tokenize.cu src/transform/bools_to_mask.cu src/transform/compute_column.cu src/transform/encode.cu diff --git a/cpp/benchmarks/text/subword.cpp b/cpp/benchmarks/text/subword.cpp index 0b4e3bdefa5..d961ff86852 100644 --- a/cpp/benchmarks/text/subword.cpp +++ b/cpp/benchmarks/text/subword.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * Copyright (c) 2020-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -20,6 +20,7 @@ #include #include +#include #include @@ -57,7 +58,10 @@ static void bench_subword_tokenizer(nvbench::state& state) { auto const num_rows = static_cast(state.get_int64("num_rows")); - std::vector h_strings(num_rows, "This is a test "); + std::vector h_strings( + num_rows, + "This is a test This is a test This is a test This is a test This is a test This is a test " + "This is a test This is a test "); cudf::test::strings_column_wrapper strings(h_strings.begin(), h_strings.end()); static std::string hash_file = create_hash_vocab_file(); std::vector offsets{14}; @@ -83,3 +87,36 @@ static void bench_subword_tokenizer(nvbench::state& state) NVBENCH_BENCH(bench_subword_tokenizer) .set_name("subword_tokenize") .add_int64_axis("num_rows", {32768, 262144, 2097152}); + +static void bench_wordpiece_tokenizer(nvbench::state& state) +{ + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const max_words = static_cast(state.get_int64("max_words")); + + auto const h_strings = std::vector( + num_rows, + "This is a test This is a test This is a test This is a test This is a test This is a test " + "This is a test This is a test "); + auto const num_words = 32; // "This is a test" * 8 + auto const d_strings = cudf::test::strings_column_wrapper(h_strings.begin(), h_strings.end()); + auto const input = cudf::strings_column_view{d_strings}; + + auto const vocabulary = + cudf::test::strings_column_wrapper({"", "[UNK]", "This", "is", "a", "test"}); + auto const vocab = nvtext::load_wordpiece_vocabulary(cudf::strings_column_view(vocabulary)); + + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); + auto chars_size = input.chars_size(cudf::get_default_stream()); + state.add_global_memory_reads(chars_size); + auto out_size = num_rows * (max_words > 0 ? std::min(max_words, num_words) : num_words); + state.add_global_memory_writes(out_size); + + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + auto result = nvtext::wordpiece_tokenize(input, *vocab, max_words); + }); +} + +NVBENCH_BENCH(bench_wordpiece_tokenizer) + .set_name("wordpiece_tokenize") + .add_int64_axis("num_rows", {32768, 262144, 2097152}) + .add_int64_axis("max_words", {0, 20, 40}); diff --git a/cpp/include/nvtext/wordpiece_tokenize.hpp b/cpp/include/nvtext/wordpiece_tokenize.hpp new file mode 100644 index 00000000000..85c0f1406bd --- /dev/null +++ b/cpp/include/nvtext/wordpiece_tokenize.hpp @@ -0,0 +1,122 @@ +/* + * Copyright (c) 2024-2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include +#include +#include + +namespace CUDF_EXPORT nvtext { +/** + * @addtogroup nvtext_tokenize + * @{ + * @file + */ + +/** + * @brief Vocabulary object to be used with nvtext::wordpiece_tokenizer + * + * Use nvtext::load_wordpiece_vocabulary to create this object. 
+ */ +struct wordpiece_vocabulary { + /** + * @brief Vocabulary object constructor + * + * Token ids are the row indices within the vocabulary column. + * Each vocabulary entry is expected to be unique otherwise the behavior is undefined. + * + * @throw std::invalid_argument if `vocabulary` contains nulls or is empty + * + * @param input Strings for the vocabulary + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + */ + wordpiece_vocabulary(cudf::strings_column_view const& input, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); + ~wordpiece_vocabulary(); + + struct wordpiece_vocabulary_impl; + std::unique_ptr _impl; +}; + +/** + * @brief Create a tokenize_vocabulary object from a strings column + * + * Token ids are the row indices within the vocabulary column. + * Each vocabulary entry is expected to be unique otherwise the behavior is undefined. + * + * @throw std::invalid_argument if `vocabulary` contains nulls or is empty + * + * @param input Strings for the vocabulary + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return Object to be used with nvtext::tokenize_with_vocabulary + */ +std::unique_ptr load_wordpiece_vocabulary( + cudf::strings_column_view const& input, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); + +/** + * @brief Returns the token ids for the input string a wordpiece tokenizer + * algorithm with the given vocabulary + * + * Example: + * @code{.pseudo} + * vocabulary = ["[UNK]", "a", "have", "I", "new", "GP", "##U", "!"] + * v = load_wordpiece_vocabulary(vocabulary) + * input = ["I have a new GPU now !"] + * t = wordpiece_tokenize(i,v) + * t is now [[3, 2, 1, 4, 5, 6, 0, 7]] + * @endcode + * + * The `max_words_per_row` also optionally limits the output by only processing + * a maximum number of words per row. Here a word is defined as consecutive + * sequence of characters delimited by space character(s). + * + * Example: + * @code{.pseudo} + * vocabulary = ["[UNK]", "a", "have", "I", "new", "GP", "##U", "!"] + * v = load_wordpiece_vocabulary(vocabulary) + * input = ["I have a new GPU now !"] + * t4 = wordpiece_tokenize(i,v,4) + * t4 is now [[3, 2, 1, 4]] + * t5 = wordpiece_tokenize(i,v,5) + * t5 is now [[3, 2, 1, 4, 5, 6]] + * @endcode + * + * Any null row entry results in a corresponding null entry in the output. + * + * @param input Strings column to tokenize + * @param vocabulary Used to lookup tokens within `input` + * @param max_words_per_row Maximum number of words to tokenize for each row. + * Default 0 tokenizes all words. 
+ * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return Lists column of token ids + */ +std::unique_ptr wordpiece_tokenize( + cudf::strings_column_view const& input, + wordpiece_vocabulary const& vocabulary, + cudf::size_type max_words_per_row = 0, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); + +/** @} */ // end of tokenize group +} // namespace CUDF_EXPORT nvtext diff --git a/cpp/src/text/wordpiece_tokenize.cu b/cpp/src/text/wordpiece_tokenize.cu new file mode 100644 index 00000000000..817e7844201 --- /dev/null +++ b/cpp/src/text/wordpiece_tokenize.cu @@ -0,0 +1,890 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace nvtext { +namespace detail { +namespace { + +using string_hasher_type = cudf::hashing::detail::MurmurHash3_x86_32; +using hash_value_type = string_hasher_type::result_type; + +/** + * @brief Hasher used for vocabulary map + */ +struct vocab_hasher { + cudf::column_device_view const d_strings; + string_hasher_type hasher{}; + __device__ hash_value_type operator()(cudf::size_type index) const + { + return hasher(d_strings.element(index)); + } + __device__ hash_value_type operator()(cudf::string_view const& s) const { return hasher(s); } +}; +/** + * @brief Equality operator for vocabulary map + */ +struct vocab_equal { + cudf::column_device_view const d_strings; + __device__ bool operator()(cudf::size_type lhs, cudf::size_type rhs) const noexcept + { + return lhs == rhs; // all rows are expected to be unique + } + __device__ bool operator()(cudf::string_view const& lhs, cudf::size_type rhs) const noexcept + { + return d_strings.element(rhs) == lhs; + } +}; + +using cuco_storage = cuco::storage<1>; +using probe_scheme = cuco::linear_probing<1, vocab_hasher>; +using vocabulary_map_type = cuco::static_map, + cuda::thread_scope_thread, + vocab_equal, + probe_scheme, + cudf::detail::cuco_allocator, + cuco_storage>; + +/** + * @brief Hasher used for the subword vocabulary map + */ +struct sub_vocab_hasher { + cudf::column_device_view const d_strings; + string_hasher_type hasher{}; + __device__ hash_value_type operator()(cudf::size_type index) const + { + auto const d_str = d_strings.element(index); + // skip over the '##' prefix + return hasher(cudf::string_view(d_str.data() + 2, d_str.size_bytes() - 2)); + } + __device__ hash_value_type operator()(cudf::string_view const& s) const { return hasher(s); } +}; +/** + * @brief Equality operator used for the subword 
vocabulary map + * + * The subwords start with '##' prefix in the original vocabulary map + */ +struct sub_vocab_equal { + cudf::column_device_view const d_strings; + __device__ bool operator()(cudf::size_type lhs, cudf::size_type rhs) const noexcept + { + return lhs == rhs; // all rows are expected to be unique + } + __device__ bool operator()(cudf::string_view const& lhs, cudf::size_type rhs) const noexcept + { + auto const d_str = d_strings.element(rhs); + // skip over the '##' prefix + return lhs == cudf::string_view(d_str.data() + 2, d_str.size_bytes() - 2); + } +}; + +// This 2nd subword map helps avoid requiring temporary strings in device code +using sub_probe_scheme = cuco::linear_probing<1, sub_vocab_hasher>; +using sub_vocabulary_map_type = cuco::static_map, + cuda::thread_scope_thread, + sub_vocab_equal, + sub_probe_scheme, + cudf::detail::cuco_allocator, + cuco_storage>; +} // namespace +} // namespace detail + +// since column_device_view::create returns is a little more than +// std::unique_ptr this helper simplifies the return type in a maintainable way +using col_device_view = std::invoke_result_t; + +/** + * @brief Internal class manages all the data held by the vocabulary object + */ +struct wordpiece_vocabulary::wordpiece_vocabulary_impl { + std::unique_ptr const vocabulary; // copy of the original vocabulary input + col_device_view const d_vocabulary; + std::unique_ptr vocabulary_map; + std::unique_ptr vocabulary_sub_map; + cudf::size_type unk_id{}; // resolved [UNK] id from vocabulary + + auto get_map_ref() const { return vocabulary_map->ref(cuco::op::find); } + auto get_sub_map_ref() const { return vocabulary_sub_map->ref(cuco::op::find); } + + wordpiece_vocabulary_impl(std::unique_ptr&& vocab, + col_device_view&& d_vocab, + std::unique_ptr&& map, + std::unique_ptr&& sub_map, + cudf::size_type unk_id) + : vocabulary(std::move(vocab)), + d_vocabulary(std::move(d_vocab)), + vocabulary_map(std::move(map)), + vocabulary_sub_map(std::move(sub_map)), + unk_id{unk_id} + { + } +}; + +namespace { +/** + * @brief Identifies the column indices as the values in the vocabulary map + */ +struct key_pair { + __device__ auto operator()(cudf::size_type idx) const noexcept + { + return cuco::make_pair(idx, idx); + } +}; + +/** + * @brief For filtering the subword ('##' prefixed) entries in the vocabulary + */ +struct copy_pieces_fn { + cudf::column_device_view d_strings; + __device__ bool operator()(cudf::size_type idx) + { + auto const d_str = d_strings.element(idx); + if (d_str.size_bytes() < 2) { return false; } + return (d_str.data()[0] == '#') and (d_str.data()[1] == '#'); + } +}; + +/** + * @brief Resolves the [UNK] entry from the vocabulary + * + * This saves inlining the lookup code in several places in device code. + */ +template +struct resolve_unk_id { + MapRefType d_map; + __device__ cudf::size_type operator()(cudf::size_type idx) + { + // look for both since the normalizer may change the case to match the vocab table + auto const unk = idx == 0 ? cudf::string_view("[UNK]", 5) : cudf::string_view("[unk]", 5); + auto const fnd = d_map.find(unk); + return fnd != d_map.end() ? 
fnd->second : -1; + } +}; + +} // namespace + +wordpiece_vocabulary::wordpiece_vocabulary(cudf::strings_column_view const& input, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + CUDF_EXPECTS(not input.is_empty(), "vocabulary must not be empty", std::invalid_argument); + CUDF_EXPECTS(not input.has_nulls(), "vocabulary must not have nulls", std::invalid_argument); + + // hold a copy of the input (not expected to be very large) + auto vocabulary = std::make_unique(input.parent(), stream, mr); + auto d_vocabulary = cudf::column_device_view::create(vocabulary->view(), stream); + + // build the vocabulary map: each row is a single term and is the key for the map + auto vocab_map = std::make_unique( + static_cast(vocabulary->size() * 2), + cuco::empty_key{-1}, + cuco::empty_value{-1}, + detail::vocab_equal{*d_vocabulary}, + detail::probe_scheme{detail::vocab_hasher{*d_vocabulary}}, + cuco::thread_scope_thread, + detail::cuco_storage{}, + cudf::detail::cuco_allocator{rmm::mr::polymorphic_allocator{}, stream}, + stream.value()); + // the row index is the token id (data value for each key in the map) + auto iter = cudf::detail::make_counting_transform_iterator(0, key_pair{}); + vocab_map->insert_async(iter, iter + vocabulary->size(), stream.value()); + auto const zero_itr = thrust::counting_iterator(0); + + // get the indices of all the ## prefixed entries + auto sub_map_indices = rmm::device_uvector(vocabulary->size(), stream); + auto const end = + thrust::copy_if(rmm::exec_policy(stream), + zero_itr, + thrust::counting_iterator(sub_map_indices.size()), + sub_map_indices.begin(), + copy_pieces_fn{*d_vocabulary}); + sub_map_indices.resize(thrust::distance(sub_map_indices.begin(), end), stream); + + // build a 2nd map with just the ## prefixed items + auto vocab_sub_map = std::make_unique( + sub_map_indices.size() * 2, + cuco::empty_key{-1}, + cuco::empty_value{-1}, + detail::sub_vocab_equal{*d_vocabulary}, + detail::sub_probe_scheme{detail::sub_vocab_hasher{*d_vocabulary}}, + cuco::thread_scope_thread, + detail::cuco_storage{}, + cudf::detail::cuco_allocator{rmm::mr::polymorphic_allocator{}, stream}, + stream.value()); + // insert them without the '##' prefix since that is how they will be looked up + auto iter_sub = thrust::make_transform_iterator(sub_map_indices.begin(), key_pair{}); + vocab_sub_map->insert_async(iter_sub, iter_sub + sub_map_indices.size(), stream.value()); + + // prefetch the [unk] vocab entry + auto unk_ids = rmm::device_uvector(2, stream); + auto d_map = vocab_map->ref(cuco::op::find); + thrust::transform(rmm::exec_policy_nosync(stream), + zero_itr, + zero_itr + unk_ids.size(), + unk_ids.begin(), + resolve_unk_id{d_map}); + auto const id0 = unk_ids.front_element(stream); + auto const id1 = unk_ids.back_element(stream); + auto const unk_id = id0 >= 0 ? 
id0 : id1; + + _impl = std::make_unique(std::move(vocabulary), + std::move(d_vocabulary), + std::move(vocab_map), + std::move(vocab_sub_map), + unk_id); +} + +wordpiece_vocabulary::~wordpiece_vocabulary() {} + +std::unique_ptr load_wordpiece_vocabulary( + cudf::strings_column_view const& input, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + CUDF_FUNC_RANGE(); + return std::make_unique(input, stream, mr); +} + +namespace detail { +namespace { + +constexpr auto block_size = 128; +constexpr auto no_token = cuda::std::numeric_limits::max(); +constexpr auto max_word_size = 200; // words longer than this are not tokenized + +/** + * @brief Returns a new string_view truncating the last character of the input string + * + * This is more efficient than using substr() which is more generic. + */ +__device__ cudf::string_view remove_last_char(cudf::string_view d_str) +{ + if (d_str.size_bytes() < 2) { return cudf::string_view(); } + auto const begin = d_str.data(); + auto end = begin + d_str.size_bytes() - 1; + while ((end > begin) && cudf::strings::detail::is_utf8_continuation_char(*end)) { + --end; + } + auto const size = static_cast(thrust::distance(begin, end)); + return cudf::string_view(begin, size); +} + +/** + * @brief The wordpiece tokenizer + * + * The given word is looked up in the d_map and if found the corresponding + * token (integer) is returned. + * + * If not found, the function will iteratively remove the last character + * from the word and check the substring exists in the d_map until the + * the substring(s) is found. If still not found, the unk_id is returned. + * If found, the characters removed are iteratively checked against + * the d_sub_map until all have been located. If any of these are not found, + * the unk_id is returned. + * + * Example: word="GPU" and d_map contains { ... {"G",10}, {"##U",7}, {"##P",3}, ... } + * which means the d_sub_map contains { ... {"U",7}, {"P",3}, ... } + * Since "GPU" is not found in d_map, the 'U' is removed and rechecked. + * And "GP" is also not found so another character is removed leaving "G". + * The "G" is found in d_map so now the removed characters are processed + * starting with "PU" which is not found in d_sub_map. Removing the 'U' + * again results in "P" which is found in d_sub_map and iterating again + * locates 'U' in d_sub_map as well. The end result is that "GPU" produces + * 3 tokens [10,3,7]. 
+ * + * @param word Word to tokenize + * @param d_map Vocabulary to check for word and sub-words + * @param d_sub_map Partial vocabulary of '##' entries + * @param unk_id The unknown token id returned when no token is found + * @param d_tokens Output token ids are returned here + * @return The number of resolved tokens + */ +template +__device__ cudf::size_type wp_tokenize_fn(cudf::string_view word, + MapRefType const& d_map, + SubMapRefType const& d_sub_map, + cudf::size_type unk_id, + cudf::size_type* d_tokens) +{ + // lookup word in map + auto token_idx = 0; + auto itr = d_map.find(word); + if (itr != d_map.end()) { + d_tokens[token_idx++] = itr->second; + return token_idx; + } + + // reduce word by one character and try again + auto piece = remove_last_char(word); + while (!piece.empty()) { + itr = d_map.find(piece); + if (itr == d_map.end()) { + piece = remove_last_char(piece); + continue; + } + d_tokens[token_idx++] = itr->second; + break; + } + if (piece.empty()) { + // did not find anything; this is not common + d_tokens[token_idx++] = unk_id; + return token_idx; + } + + word = + cudf::string_view(word.data() + piece.size_bytes(), word.size_bytes() - piece.size_bytes()); + piece = word; + while (!piece.empty()) { + auto itr = d_sub_map.find(piece); + if (itr == d_sub_map.end()) { + piece = remove_last_char(piece); + continue; + } + d_tokens[token_idx++] = itr->second; + + word = + cudf::string_view(word.data() + piece.size_bytes(), word.size_bytes() - piece.size_bytes()); + piece = word; + } + if (!word.empty()) { + // very uncommon + d_tokens[0] = unk_id; + // need to reset any previous ids too + for (auto i = 1; i < token_idx; ++i) { + d_tokens[i] = no_token; + } + token_idx = 1; + } + + return token_idx; +} + +/** + * @brief Kernel for tokenizing all words + * + * Launched as a thread per edge value in d_edges. + * + * Each value in d_edge is the beginning of a word. + * The kernel searches for a matching space character (or the next d_edge value) + * to find the end of the word. + * The result is then tokenized using the wp_tokenize_fn utility. + * + * @param d_edges The offset to the beginning of each word + * @param d_chars Pointer to the characters of the input column + * @param offset Maybe non-zero if the input column has been sliced + * @param d_map Lookup table for the wp_tokenize_fn utility + * @param d_sub_map 2nd lookup table for the wp_tokenize_fn utility + * @param unk_id Unknown token id when a token cannot be resolved + * @param d_tokens Output tokens are written here + */ +template +CUDF_KERNEL void tokenize_all_kernel(cudf::device_span d_edges, + char const* d_chars, + // int64_t offset, + MapRefType const d_map, + SubMapRefType const d_sub_map, + cudf::size_type unk_id, + cudf::size_type* d_tokens) +{ + auto const idx = cudf::detail::grid_1d::global_thread_id(); + if (idx >= (d_edges.size() - 1)) { return; } + auto const begin = d_chars + d_edges[idx]; + auto const end = d_chars + d_edges[idx + 1]; + auto const word_end = thrust::find(thrust::seq, begin, end, ' '); + auto const size = static_cast(thrust::distance(begin, word_end)); + if (size == 0) { return; } + auto d_output = d_tokens + d_edges[idx]; // - offset; + if (size >= max_word_size) { + *d_output = unk_id; + return; + } + auto const word = cudf::string_view{begin, size}; + wp_tokenize_fn(word, d_map, d_sub_map, unk_id, d_output); +} + +/** + * @brief Count the number of tokens per output row + * + * Uses segmented-reduce to compute the number of tokens per row. 
+ * + * @param d_tokens The tokens to count + * @param offsets The offsets for the segmented-reduce + * @param offset Maybe non-zero if the input column has been sliced + * @param size The number of output rows (same as the number of input rows) + * @param stream Stream used for device allocations and kernel launches + * @return The number of tokens per row + */ +template +rmm::device_uvector count_tokens(cudf::size_type const* d_tokens, + OffsetType offsets, + int64_t offset, + cudf::size_type size, + rmm::cuda_stream_view stream) +{ + auto d_counts = rmm::device_uvector(size, stream); + + // transform iterator used for counting the number of !no_tokens + auto const d_in = cudf::detail::make_counting_transform_iterator( + 0, cuda::proclaim_return_type([d_tokens] __device__(auto idx) { + return static_cast(d_tokens[idx] != no_token); + })); + + auto const d_offsets = cudf::detail::make_counting_transform_iterator( + 0, cuda::proclaim_return_type([offsets, offset] __device__(auto idx) { + return offsets[idx] - offset; + })); + + auto temp = std::size_t{0}; + auto d_out = d_counts.data(); + cub::DeviceSegmentedReduce::Sum( + nullptr, temp, d_in, d_out, size, d_offsets, d_offsets + 1, stream.value()); + auto d_temp = rmm::device_buffer{temp, stream}; + cub::DeviceSegmentedReduce::Sum( + d_temp.data(), temp, d_in, d_out, size, d_offsets, d_offsets + 1, stream.value()); + + return d_counts; +} + +/** + * @brief Compute all tokens for the input column + * + * @param input Input strings column + * @param first_offset Offset to first row in chars for `input` + * @param last_offset Offset just past the last row in chars for `input` + * @param vocabulary Vocabulary data needed by the tokenizer + * @param stream Stream used for device allocations and kernel launches + * @return The tokens (and non-tokens) for the input + */ +rmm::device_uvector compute_all_tokens( + cudf::strings_column_view const& input, + int64_t first_offset, + int64_t chars_size, + wordpiece_vocabulary::wordpiece_vocabulary_impl const& vocabulary, + rmm::cuda_stream_view stream) +{ + auto const d_input_chars = input.chars_begin(stream) + first_offset; + + // find beginnings of words + auto d_edges = rmm::device_uvector(chars_size / 2L, stream); + // beginning of a word is a non-space preceded by a space + auto edges_end = cudf::detail::copy_if_safe( + thrust::counting_iterator(0), + thrust::counting_iterator(chars_size), + d_edges.begin(), + [d_input_chars] __device__(auto idx) { + if (idx == 0) { return d_input_chars[idx] == ' '; } + return (d_input_chars[idx] != ' ' && d_input_chars[idx - 1] == ' '); + }, + stream); + + auto const edges_count = + input.size() + 1 + static_cast(thrust::distance(d_edges.begin(), edges_end)); + // thrust::merge has an int32 max limit currently + CUDF_EXPECTS(edges_count < std::numeric_limits::max(), "words exceed internal limit"); + + auto const input_offsets = + cudf::detail::offsetalator_factory::make_input_iterator(input.offsets(), input.offset()); + auto const d_offsets = cudf::detail::make_counting_transform_iterator( + 0, cuda::proclaim_return_type([input_offsets, first_offset] __device__(auto idx) { + return input_offsets[idx] - first_offset; + })); + + // merge in the input offsets to identify words starting each row + auto d_all_edges = [&] { + auto d_all_edges = rmm::device_uvector(edges_count, stream); + thrust::merge(rmm::exec_policy_nosync(stream), + d_offsets, + d_offsets + input.size() + 1, + d_edges.begin(), + edges_end, + d_all_edges.begin()); + d_edges.release(); // done with 
this + return d_all_edges; + }(); + + auto const map_ref = vocabulary.get_map_ref(); + auto const sub_map_ref = vocabulary.get_sub_map_ref(); + auto const unk_id = vocabulary.unk_id; + + auto d_tokens = rmm::device_uvector(chars_size, stream); + thrust::uninitialized_fill( + rmm::exec_policy_nosync(stream), d_tokens.begin(), d_tokens.end(), no_token); + + cudf::detail::grid_1d grid{static_cast(d_all_edges.size()), 512}; + tokenize_all_kernel + <<>>( + d_all_edges, d_input_chars, map_ref, sub_map_ref, unk_id, d_tokens.data()); + + return d_tokens; +} + +constexpr cudf::size_type no_word = cuda::std::numeric_limits::max(); +constexpr int64_t no_word64 = cuda::std::numeric_limits::max(); + +/** + * @brief Find word boundaries kernel + * + * Launched as a warp per string in the input column. + * + * Finds the edges of words within each string and stores them into 'starts' and 'sizes'. + * This kernel is used when a maximum number of words are to be processed per row. + * + * @param d_strings Input strings column + * @param d_chars The beginning of the character data for d_strings + * already adjusted for any sliced offset + * @param offsets The offsets for the output arrays: starts and sizes + * @param starts The output offsets within d_chars identifying the beginning of words + * @param sizes The output size of the words corresponding to starts + */ +template +CUDF_KERNEL void find_words_kernel(cudf::column_device_view const d_strings, + char const* d_chars, + int64_t const* offsets, + int64_t* starts, + cudf::size_type* sizes) +{ + auto const idx = cudf::detail::grid_1d::global_thread_id(); + auto const str_idx = idx / tile_size; + if (str_idx >= d_strings.size()) { return; } + if (d_strings.is_null(str_idx)) { return; } + auto const d_str = d_strings.element(str_idx); + if (d_str.empty()) { return; } + auto const str_offset = static_cast(thrust::distance(d_chars, d_str.data())); + + auto const d_start_words = starts + offsets[str_idx]; + auto const d_word_sizes = sizes + offsets[str_idx]; + auto const max_words = static_cast(offsets[str_idx + 1] - offsets[str_idx]); + + constexpr auto bytes_per_thread = 6; // average 5 chars per word plus space + constexpr auto words_size = block_size * bytes_per_thread; + __shared__ cudf::size_type s_start_words[words_size]; + __shared__ cudf::size_type s_end_words[words_size]; + // compiler is not able to find this for some reason so defining it here as well + constexpr auto no_word = cuda::std::numeric_limits::max(); + + namespace cg = cooperative_groups; + auto const block = cg::this_thread_block(); + auto const tile = cg::tiled_partition(block); + + auto const lane_idx = tile.thread_rank(); + auto const warp_idx = tile.meta_group_rank(); + auto const warp_words = words_size / tile.meta_group_size(); + + cudf::size_type word_count = 0; + cudf::size_type byte_count = 0; + + auto first_word = no_word; // only used by lane_idx==0 + auto const begin = d_str.data(); + auto const end = begin + d_str.size_bytes(); + + auto start_words = s_start_words + (warp_idx * warp_words); + auto end_words = s_end_words + (warp_idx * warp_words); + + // continue until all bytes have been consumed or the max word count has been reached + auto itr = begin + lane_idx; + while (word_count < max_words && byte_count < d_str.size_bytes()) { + // initialize all intermediate results + start_words[lane_idx] = lane_idx > 0 ? 
no_word : first_word; + end_words[lane_idx] = no_word; + for (auto j = lane_idx + tile_size; j < warp_words; j += tile_size) { + start_words[j] = no_word; + end_words[j] = no_word; + } + tile.sync(); + + cudf::size_type last_idx = 0; + // each thread processes bytes_per_thread of the d_str + for (auto k = lane_idx; k < warp_words && itr < end; k += tile_size) { + // look for word starts (non-space preceded by a space) + if ((*itr != ' ') && ((itr == begin) || (*(itr - 1) == ' '))) { + last_idx = (k / 2) + 1; + start_words[last_idx] = static_cast(thrust::distance(begin, itr)); + } + // look for word ends (space preceded by non-space) + if (((itr + 1) == end) || ((itr != begin) && (*itr == ' ') && (*(itr - 1) != ' '))) { + auto const adjust = static_cast(*itr != ' '); // edge case + last_idx = (k / 2) + adjust; + end_words[last_idx] = static_cast(thrust::distance(begin, itr)) + adjust; + } + itr += tile_size; + } + // keep track of how much of start_words/end_words we used + last_idx = cg::reduce(tile, last_idx, cg::greater{}) + 1; + + cudf::size_type output_count = 0; + if (lane_idx == 0) { + // compress out the no-words + auto const count = static_cast(thrust::distance( + start_words, thrust::remove(thrust::seq, start_words, start_words + last_idx, no_word))); + auto const words_found = static_cast(thrust::distance( + end_words, thrust::remove(thrust::seq, end_words, end_words + last_idx, no_word))); + // this partially resolved word wraps around for the next iteration + first_word = (count > words_found) ? start_words[words_found] : no_word; + output_count = cuda::std::min(words_found, max_words - word_count); + } + + // copy results to the output + auto out_starts = d_start_words + word_count; + auto out_sizes = d_word_sizes + word_count; + output_count = tile.shfl(output_count, 0); // copy output_count to all threads + for (auto k = lane_idx; k < output_count; k += tile_size) { + auto const start = start_words[k]; + out_starts[k] = start + str_offset; + out_sizes[k] = end_words[k] - start; + } + + word_count += output_count; + byte_count += tile_size * bytes_per_thread; + tile.sync(); + } + + // fill in the remainder of the output + auto out_starts = d_start_words + word_count; + auto out_sizes = d_word_sizes + word_count; + for (auto k = lane_idx; k < (max_words - word_count); k += tile_size) { + out_starts[k] = no_word64; + out_sizes[k] = no_word; + } +} + +/** + * @brief Limiting tokenizing kernel + * + * Launched as a thread per d_starts (and d_sizes) values. + * + * This kernel is provided word boundaries as d_starts and d_sizes. + * The start of the word at index idx is d_start[idx]. + * The size of that word is d_size[idx]. + * + * The wp_tokenize_fn is used to output the tokens for each word + * appropriately into d_tokens. 
+ * + * @param d_starts The start of each word in d_chars + * @param d_sizes The corresponding size of the word pointed to by d_starts + * @param d_chars Points to the beginning of the characters of the input column + * @param d_map Lookup table for the wp_tokenize_fn utility + * @param d_sub_map 2nd lookup table for the wp_tokenize_fn utility + * @param unk_id Unknown token id when a token cannot be resolved + * @param d_tokens Output tokens are written here + */ +template +CUDF_KERNEL void tokenize_kernel(cudf::device_span d_starts, + cudf::device_span d_sizes, + char const* d_chars, + MapRefType const d_map, + SubMapRefType const d_sub_map, + cudf::size_type unk_id, + cudf::size_type* d_tokens) +{ + auto const idx = cudf::detail::grid_1d::global_thread_id(); + if (idx >= d_starts.size()) { return; } + auto const size = d_sizes[idx]; + if (size <= 0 || size == no_word) { return; } + auto const start = d_starts[idx]; + auto const begin = d_chars + start; + auto d_output = d_tokens + start; + if (size >= max_word_size) { + *d_output = unk_id; + return; + } + auto const word = cudf::string_view{begin, size}; + wp_tokenize_fn(word, d_map, d_sub_map, unk_id, d_output); +} + +/** + * @brief Compute tokens limited to `max_words_per_row` + * + * @param input Input strings column + * @param first_offset Offset to first row in chars for `input` + * @param last_offset Offset just past the last row in chars for `input` + * @param max_words_per_row Maximum number of words to tokenize in each row + * @param vocabulary Vocabulary data needed by the tokenizer + * @param stream Stream used for device allocations and kernel launches + * @return The tokens (and non-tokens) for the input + */ +rmm::device_uvector compute_some_tokens( + cudf::strings_column_view const& input, + int64_t first_offset, + int64_t chars_size, + cudf::size_type max_words_per_row, + wordpiece_vocabulary::wordpiece_vocabulary_impl const& vocabulary, + rmm::cuda_stream_view stream) +{ + auto const d_input_chars = input.chars_begin(stream) + first_offset; + + auto const d_strings = cudf::column_device_view::create(input.parent(), stream); + auto max_word_offsets = rmm::device_uvector(input.size() + 1, stream); + + // compute max word counts for each row + thrust::transform(rmm::exec_policy_nosync(stream), + thrust::counting_iterator(0), + thrust::counting_iterator(input.size()), + max_word_offsets.begin(), + cuda::proclaim_return_type( + [d_strings = *d_strings, max_words_per_row] __device__(auto idx) { + if (idx >= d_strings.size()) { return 0; } + if (d_strings.is_null(idx)) { return 0; } + auto const d_str = d_strings.element(idx); + return cuda::std::min(max_words_per_row, d_str.size_bytes() / 2); + })); + + auto const max_size = cudf::detail::sizes_to_offsets( + max_word_offsets.begin(), max_word_offsets.end(), max_word_offsets.begin(), 0, stream); + + auto start_words = rmm::device_uvector(max_size, stream); + auto word_sizes = rmm::device_uvector(max_size, stream); + + // find start/end for each row up to max_words_per_row words; + // store word positions in start_words and sizes in word_sizes + cudf::detail::grid_1d grid_find{input.size() * cudf::detail::warp_size, block_size}; + find_words_kernel + <<>>( + *d_strings, d_input_chars, max_word_offsets.data(), start_words.data(), word_sizes.data()); + + // remove the non-words + auto const end = + thrust::remove(rmm::exec_policy(stream), start_words.begin(), start_words.end(), no_word64); + auto const check = + thrust::remove(rmm::exec_policy(stream), word_sizes.begin(), 
word_sizes.end(), no_word); + + auto const total_words = static_cast(thrust::distance(start_words.begin(), end)); + // this should only trigger if there is a bug in the code above + CUDF_EXPECTS(total_words == static_cast(thrust::distance(word_sizes.begin(), check)), + "error resolving word locations from input column"); + start_words.resize(total_words, stream); // always + word_sizes.resize(total_words, stream); // smaller + + auto const map_ref = vocabulary.get_map_ref(); + auto const sub_map_ref = vocabulary.get_sub_map_ref(); + auto const unk_id = vocabulary.unk_id; + + auto d_tokens = rmm::device_uvector(chars_size, stream); + thrust::uninitialized_fill( + rmm::exec_policy_nosync(stream), d_tokens.begin(), d_tokens.end(), no_token); + + cudf::detail::grid_1d grid{total_words, 512}; + tokenize_kernel + <<>>( + start_words, word_sizes, d_input_chars, map_ref, sub_map_ref, unk_id, d_tokens.data()); + + return d_tokens; +} + +} // namespace + +std::unique_ptr wordpiece_tokenize(cudf::strings_column_view const& input, + wordpiece_vocabulary const& vocabulary, + cudf::size_type max_words_per_row, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + CUDF_EXPECTS(max_words_per_row >= 0, "Invalid value for max_words_per_row argument"); + + auto const output_type = cudf::data_type{cudf::type_to_id()}; + if (input.size() == input.null_count()) { + return input.has_nulls() + ? cudf::lists::detail::make_all_nulls_lists_column( + input.size(), output_type, stream, mr) + : cudf::lists::detail::make_empty_lists_column(output_type, stream, mr); + } + + auto [first_offset, last_offset] = + cudf::strings::detail::get_first_and_last_offset(input, stream); + auto const chars_size = last_offset - first_offset; + + auto d_tokens = + max_words_per_row == 0 + ? 
compute_all_tokens(input, first_offset, chars_size, *(vocabulary._impl), stream) + : compute_some_tokens( + input, first_offset, chars_size, max_words_per_row, *(vocabulary._impl), stream); + + // compute token counts by doing a segmented reduce over valid d_tokens + auto const input_offsets = + cudf::detail::offsetalator_factory::make_input_iterator(input.offsets(), input.offset()); + auto const d_token_counts = + count_tokens(d_tokens.data(), input_offsets, first_offset, input.size(), stream); + + auto [token_offsets, total_count] = cudf::detail::make_offsets_child_column( + d_token_counts.begin(), d_token_counts.end(), stream, mr); + + auto tokens = + cudf::make_numeric_column(output_type, total_count, cudf::mask_state::UNALLOCATED, stream, mr); + auto output = tokens->mutable_view().begin(); + thrust::remove_copy( + rmm::exec_policy_nosync(stream), d_tokens.begin(), d_tokens.end(), output, no_token); + + return cudf::make_lists_column(input.size(), + std::move(token_offsets), + std::move(tokens), + input.null_count(), + cudf::detail::copy_bitmask(input.parent(), stream, mr), + stream, + mr); +} +} // namespace detail + +std::unique_ptr wordpiece_tokenize(cudf::strings_column_view const& input, + wordpiece_vocabulary const& vocabulary, + cudf::size_type max_words_per_row, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + CUDF_FUNC_RANGE(); + return detail::wordpiece_tokenize(input, vocabulary, max_words_per_row, stream, mr); +} + +} // namespace nvtext diff --git a/cpp/tests/text/subword_tests.cpp b/cpp/tests/text/subword_tests.cpp index 782551ad66e..706c704c3d7 100644 --- a/cpp/tests/text/subword_tests.cpp +++ b/cpp/tests/text/subword_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * Copyright (c) 2020-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
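The greedy matching documented above for `wp_tokenize_fn` (shrink the word from the right until the full vocabulary yields a hit, then resolve the remaining characters against the '##' sub-vocabulary) can be illustrated with a small host-side sketch. This is a minimal Python illustration of the same logic, not the library code: the dictionaries, token ids, and the helper name `wp_tokenize_word` are made up, standing in for the row indices of the vocabulary column and its stripped '##' sub-map.

    # Illustrative host-side sketch of the greedy matching in wp_tokenize_fn.
    # Ids are made up; in the library they are row indices of the vocabulary column.
    def wp_tokenize_word(word, vocab, sub_vocab, unk_id):
        if word in vocab:                      # whole word matches directly
            return [vocab[word]]
        piece = word
        while piece and piece not in vocab:    # shrink from the right until a prefix is found
            piece = piece[:-1]
        if not piece:                          # nothing matched at all
            return [unk_id]
        tokens = [vocab[piece]]
        rest = word[len(piece):]
        while rest:                            # resolve the suffix against the '##' sub-vocabulary
            piece = rest
            while piece and piece not in sub_vocab:
                piece = piece[:-1]
            if not piece:                      # unresolved suffix: the whole word becomes [UNK]
                return [unk_id]
            tokens.append(sub_vocab[piece])
            rest = rest[len(piece):]
        return tokens

    vocab = {"G": 10, "##P": 3, "##U": 7}
    sub_vocab = {"P": 3, "U": 7}               # '##' entries with the prefix stripped
    print(wp_tokenize_word("GPU", vocab, sub_vocab, unk_id=0))  # [10, 3, 7]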
@@ -17,11 +17,13 @@ #include #include #include +#include #include #include #include +#include #include #include @@ -438,3 +440,149 @@ TEST(TextSubwordTest, ZeroHashBinCoefficient) CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tensor_attention_mask->view(), expected_attn); CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tensor_metadata->view(), expected_metadata); } + +TEST(TextSubwordTest, WordPiece) +{ + auto vocabulary = cudf::test::strings_column_wrapper( + {"ate", "brown", "cheese", "dog", "fox", "jumped", "lazy", "quick", "over", "the", "[UNK]"}); + auto vocab = nvtext::load_wordpiece_vocabulary(cudf::strings_column_view(vocabulary)); + + auto input = cudf::test::strings_column_wrapper( + {"the quick brown fox jumped over", + "the lazy brown dog", + " ate brown cheese dog fox jumped lazy quick over the [UNK] "}); + auto sv = cudf::strings_column_view(input); + auto results = nvtext::wordpiece_tokenize(sv, *vocab); + + using LCW = cudf::test::lists_column_wrapper; + // clang-format off + auto expected = LCW({LCW{ 9, 7, 1, 4, 5, 8}, + LCW{ 9, 6, 1, 3}, + LCW{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10}}); + // clang-format on + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + + results = nvtext::wordpiece_tokenize(sv, *vocab, 5); + // clang-format off + expected = LCW({LCW{ 9, 7, 1, 4, 5}, + LCW{ 9, 6, 1, 3}, + LCW{ 0, 1, 2, 3, 4}}); + // clang-format on + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); +} + +TEST(TextSubwordTest, WordPieceWithSubwords) +{ + auto vocabulary = + cudf::test::strings_column_wrapper({"", "[UNK]", "!", "a", "I", "G", "have", "##P", "##U"}); + auto vocab = nvtext::load_wordpiece_vocabulary(cudf::strings_column_view(vocabulary)); + + auto input = + cudf::test::strings_column_wrapper({"I have a GPU ! ", "do not have a gpu", "no gpu"}); + auto sv = cudf::strings_column_view(input); + auto results = nvtext::wordpiece_tokenize(sv, *vocab); + + using LCW = cudf::test::lists_column_wrapper; + // clang-format off + auto expected = LCW({LCW{4, 6, 3, 5, 7, 8, 2}, + LCW{1, 1, 6, 3, 1}, + LCW{1, 1}}); + // clang-format on + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + + // max is applied to input words and not output tokens + results = nvtext::wordpiece_tokenize(sv, *vocab, 6); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + + results = nvtext::wordpiece_tokenize(sv, *vocab, 4); + // clang-format off + expected = LCW({LCW{4, 6, 3, 5, 7, 8}, + LCW{1, 1, 6, 3}, + LCW{1, 1}}); + // clang-format on + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); +} + +TEST(TextSubwordTest, WordPieceSliced) +{ + auto vocabulary = cudf::test::strings_column_wrapper( + {"ate", "brown", "cheese", "dog", "fox", "jumped", "lazy", "quick", "over", "the", "[UNK]"}); + auto vocab = nvtext::load_wordpiece_vocabulary(cudf::strings_column_view(vocabulary)); + + auto input = cudf::test::strings_column_wrapper( + {" ate the cheese dog quick over lazy day ", + "the quick brown fox jumped over", + "the lazy brown dog", + " ate brown cheese dog fox jumped lazy quick over the [UNK] ", + " ate the cheese dog quick over lazy day "}); + + auto sliced = cudf::slice(input, {1, 4}); + auto sv = cudf::strings_column_view(sliced.front()); + auto results = nvtext::wordpiece_tokenize(sv, *vocab); + + using LCW = cudf::test::lists_column_wrapper; + // clang-format off + auto expected = LCW({LCW{ 9, 7, 1, 4, 5, 8}, + LCW{ 9, 6, 1, 3}, + LCW{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10}}); + // clang-format on + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + + results = nvtext::wordpiece_tokenize(sv, *vocab, 5); + // 
clang-format off + expected = LCW({LCW{ 9, 7, 1, 4, 5}, + LCW{ 9, 6, 1, 3}, + LCW{ 0, 1, 2, 3, 4}}); + // clang-format on + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); +} + +TEST(TextSubwordTest, WordPieceEmpty) +{ + auto vocabulary = cudf::test::strings_column_wrapper({""}); + auto vocab = nvtext::load_wordpiece_vocabulary(cudf::strings_column_view(vocabulary)); + auto input = cudf::test::strings_column_wrapper(); + auto sv = cudf::strings_column_view(input); + auto results = nvtext::wordpiece_tokenize(sv, *vocab); + EXPECT_EQ(results->size(), 0); + results = nvtext::wordpiece_tokenize(sv, *vocab, 10); + EXPECT_EQ(results->size(), 0); +} + +TEST(TextSubwordTest, WordPieceAllNulls) +{ + auto vocabulary = cudf::test::strings_column_wrapper({""}); + auto vocab = nvtext::load_wordpiece_vocabulary(cudf::strings_column_view(vocabulary)); + auto input = cudf::test::strings_column_wrapper({"", "", ""}, {false, false, false}); + auto sv = cudf::strings_column_view(input); + auto results = nvtext::wordpiece_tokenize(sv, *vocab); + using LCW = cudf::test::lists_column_wrapper; + auto expected = LCW({LCW{}, LCW{}, LCW{}}, cudf::test::iterators::all_nulls()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + results = nvtext::wordpiece_tokenize(sv, *vocab, 10); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); +} + +TEST(TextSubwordTest, WordPieceNoTokens) +{ + auto vocabulary = cudf::test::strings_column_wrapper({"x"}); + auto vocab = nvtext::load_wordpiece_vocabulary(cudf::strings_column_view(vocabulary)); + auto input = cudf::test::strings_column_wrapper({" ", " www ", "xxxx"}); + auto sv = cudf::strings_column_view(input); + auto results = nvtext::wordpiece_tokenize(sv, *vocab); + using LCW = cudf::test::lists_column_wrapper; + LCW expected({LCW{}, LCW{-1}, LCW{-1}}); // -1 indicates [unk] not found in vocabulary + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + results = nvtext::wordpiece_tokenize(sv, *vocab, 10); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); +} + +TEST(TextSubwordTest, WordPieceErrors) +{ + auto empty = cudf::test::strings_column_wrapper(); + EXPECT_THROW(nvtext::load_wordpiece_vocabulary(cudf::strings_column_view(empty)), + std::invalid_argument); + auto nulls = cudf::test::strings_column_wrapper({"", "", ""}, {false, false, false}); + EXPECT_THROW(nvtext::load_wordpiece_vocabulary(cudf::strings_column_view(nulls)), + std::invalid_argument); +} diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 9f3512369a0..7adea963868 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -6527,6 +6527,20 @@ def tokenize_with_vocabulary( ) ) + @acquire_spill_lock() + def wordpiece_tokenize( + self, + vocabulary: plc.nvtext.wordpiece_tokenize.WordPieceVocabulary, + max_words_per_row: int, + ) -> Self: + return type(self).from_pylibcudf( # type: ignore[return-value] + plc.nvtext.wordpiece_tokenize.wordpiece_tokenize( + self.to_pylibcudf(mode="read"), + vocabulary, + max_words_per_row, + ) + ) + @acquire_spill_lock() def detokenize(self, indices: ColumnBase, separator: plc.Scalar) -> Self: return type(self).from_pylibcudf( # type: ignore[return-value] diff --git a/python/cudf/cudf/core/wordpiece_tokenize.py b/python/cudf/cudf/core/wordpiece_tokenize.py new file mode 100644 index 00000000000..7a54a430997 --- /dev/null +++ b/python/cudf/cudf/core/wordpiece_tokenize.py @@ -0,0 +1,44 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. 
+ +from __future__ import annotations + +import pylibcudf as plc + +import cudf + + +class WordPieceVocabulary: + """ + A vocabulary object used to tokenize input text. + + Parameters + ---------- + vocabulary : cudf.Series + Strings column of vocabulary terms + """ + + def __init__(self, vocabulary: cudf.Series) -> None: + self.vocabulary = plc.nvtext.wordpiece_tokenize.WordPieceVocabulary( + vocabulary._column.to_pylibcudf(mode="read") + ) + + def tokenize(self, text, max_words_per_row: int = 0) -> cudf.Series: + """ + Parameters + ---------- + text : cudf.Series + The strings to be tokenized. + max_words_per_row : int + Maximum number of words to tokenize per row. + Default 0 tokenizes all words. + + Returns + ------- + cudf.Series + Token values + """ + result = text._column.wordpiece_tokenize( + self.vocabulary, max_words_per_row + ) + + return cudf.Series._from_column(result) diff --git a/python/cudf/cudf/tests/text/test_subword_tokenizer.py b/python/cudf/cudf/tests/text/test_subword_tokenizer.py index 78b58344374..7927f8777be 100644 --- a/python/cudf/cudf/tests/text/test_subword_tokenizer.py +++ b/python/cudf/cudf/tests/text/test_subword_tokenizer.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. import os import cupy @@ -7,6 +7,7 @@ import cudf from cudf.core.subword_tokenizer import SubwordTokenizer +from cudf.core.wordpiece_tokenize import WordPieceVocabulary from cudf.testing import assert_eq @@ -237,3 +238,473 @@ def test_text_subword_tokenize(tmpdir): ) expected_metadata = expected_metadata.reshape(-1, 3) assert_eq(expected_metadata, metadata) + + +@pytest.mark.parametrize("max_words", [0, 200, 10]) +def test_text_wordpiece_tokenize(max_words, datadir): + s = cudf.Series( + [ + "The British Isles have been ringing for the last few years with the word ' Art ' in its German sense ; ", + "with ' High Art , ' ' Symbolic Art , ' ' Ecclesiastical Art , ' ' Dramatic Art , ' ' Tragic Art , ' and so forth ; ", + "and every well - educated person is expected , nowadays , to know something about Art . " + "Yet in spite of all translations of German ' AEsthetic ' treatises , and ' Kunstnovellen , ' the mass of the British people cares very little about the matter , and sits contented under the imputation of ' bad taste . ", + " ' Our stage , long since dead , does not revive ; our poetry is dying ; our music , like our architecture , only reproduces the past ; ", + "our painting is only first - rate when it handles landscapes and animals , and seems likely so to remain ; ", + "but , meanwhile , nobody cares . 
Some of the deepest and most earnest minds vote the question , in general , a ' sham and a snare , ' and whisper to each other", + ] + ) + vocab_file = os.path.join(datadir, "bert_base_cased_sampled/vocab.txt") + vc = cudf.read_text(vocab_file, delimiter="\n", strip_delimiters=True) + wpt = WordPieceVocabulary(vc) + wpr = wpt.tokenize(s, max_words) + expected = cudf.Series( + [ + cudf.Series( + [ + 1109, + 1418, + 2181, + 2897, + 1138, + 1151, + 3170, + 1158, + 1111, + 1103, + 1314, + 1374, + 1201, + 1114, + 1103, + 1937, + 112, + 2051, + 112, + 1107, + 1157, + 1528, + 2305, + 132, + ], + dtype=np.int32, + ), + cudf.Series( + [ + 1114, + 112, + 1693, + 2051, + 117, + 112, + 112, + 156, + 1183, + 1306, + 1830, + 1186, + 2646, + 1665, + 2051, + 117, + 112, + 112, + 142, + 1665, + 1665, + 2897, + 1465, + 2050, + 1596, + 1348, + 2051, + 117, + 112, + 112, + 1987, + 2312, + 2980, + 1596, + 2051, + 117, + 112, + 112, + 157, + 1611, + 1403, + 1596, + 2051, + 117, + 112, + 1105, + 1177, + 1111, + 1582, + 132, + ], + dtype=np.int32, + ), + cudf.Series( + [ + 1105, + 1451, + 1218, + 118, + 174, + 1181, + 1358, + 2599, + 1906, + 1825, + 1110, + 2637, + 117, + 1208, + 1161, + 1810, + 1183, + 1116, + 117, + 1106, + 1221, + 1380, + 1164, + 2051, + 119, + 162, + 2105, + 1107, + 188, + 1643, + 3150, + 1104, + 1155, + 189, + 1611, + 2316, + 1742, + 2116, + 1116, + 1104, + 1528, + 112, + 138, + 2036, + 2050, + 1324, + 2105, + 1596, + 112, + 189, + 1874, + 2980, + 1548, + 1279, + 117, + 1105, + 112, + 148, + 3488, + 2050, + 2728, + 2707, + 2339, + 1424, + 117, + 112, + 1103, + 3367, + 1104, + 1103, + 1418, + 1234, + 1920, + 1116, + 1304, + 1376, + 1164, + 1103, + 2187, + 117, + 1105, + 3465, + 1116, + 3438, + 1174, + 1223, + 1103, + 178, + 1306, + 1643, + 1358, + 1777, + 2116, + 1104, + 112, + 2213, + 189, + 2225, + 1566, + 119, + ], + dtype=np.int32, + ), + cudf.Series( + [ + 112, + 3458, + 2016, + 117, + 1263, + 1290, + 2044, + 117, + 1674, + 1136, + 1231, + 1964, + 2109, + 132, + 1412, + 185, + 1186, + 2105, + 1616, + 1110, + 173, + 1183, + 1158, + 132, + 1412, + 1390, + 117, + 1176, + 1412, + 170, + 1197, + 1732, + 3150, + 1665, + 1204, + 3313, + 117, + 1178, + 1231, + 1643, + 2180, + 1181, + 1358, + 2093, + 1116, + 1103, + 1763, + 132, + ], + dtype=np.int32, + ), + cudf.Series( + [ + 1412, + 2489, + 1916, + 1110, + 1178, + 1148, + 118, + 2603, + 1165, + 1122, + 1289, + 2897, + 1657, + 1116, + 2599, + 3186, + 1116, + 1105, + 1126, + 1182, + 1918, + 3447, + 117, + 1105, + 3093, + 2620, + 1177, + 1106, + 3118, + 132, + ], + dtype=np.int32, + ), + cudf.Series( + [ + 1133, + 117, + 1928, + 2246, + 3031, + 1513, + 117, + 1185, + 1830, + 1186, + 1181, + 1183, + 1920, + 1116, + 119, + 1789, + 1104, + 1103, + 1996, + 2556, + 1105, + 1211, + 174, + 1813, + 1673, + 2050, + 1713, + 1116, + 2992, + 1103, + 2304, + 117, + 1107, + 1704, + 117, + 170, + 112, + 188, + 2522, + 1105, + 170, + 188, + 1605, + 1874, + 117, + 112, + 1105, + 192, + 3031, + 1116, + 3365, + 1106, + 1296, + 1168, + ], + dtype=np.int32, + ), + ] + ) + if max_words == 10: + expected = cudf.Series( + [ + cudf.Series( + [ + 1109, + 1418, + 2181, + 2897, + 1138, + 1151, + 3170, + 1158, + 1111, + 1103, + 1314, + 1374, + ], + dtype=np.int32, + ), + cudf.Series( + [ + 1114, + 112, + 1693, + 2051, + 117, + 112, + 112, + 156, + 1183, + 1306, + 1830, + 1186, + 2646, + 1665, + 2051, + 117, + ], + dtype=np.int32, + ), + cudf.Series( + [ + 1105, + 1451, + 1218, + 118, + 174, + 1181, + 1358, + 2599, + 1906, + 1825, + 1110, + 2637, + 117, + 1208, + 1161, + 1810, 
+ 1183, + 1116, + ], + dtype=np.int32, + ), + cudf.Series( + [112, 3458, 2016, 117, 1263, 1290, 2044, 117, 1674, 1136], + dtype=np.int32, + ), + cudf.Series( + [ + 1412, + 2489, + 1916, + 1110, + 1178, + 1148, + 118, + 2603, + 1165, + 1122, + 1289, + 2897, + ], + dtype=np.int32, + ), + cudf.Series( + [ + 1133, + 117, + 1928, + 2246, + 3031, + 1513, + 117, + 1185, + 1830, + 1186, + 1181, + 1183, + 1920, + 1116, + 119, + 1789, + 1104, + 1103, + ], + dtype=np.int32, + ), + ] + ) + assert_eq(expected, wpr) diff --git a/python/pylibcudf/pylibcudf/libcudf/nvtext/wordpiece_tokenize.pxd b/python/pylibcudf/pylibcudf/libcudf/nvtext/wordpiece_tokenize.pxd new file mode 100644 index 00000000000..261b86aed47 --- /dev/null +++ b/python/pylibcudf/pylibcudf/libcudf/nvtext/wordpiece_tokenize.pxd @@ -0,0 +1,22 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +from libcpp.memory cimport unique_ptr +from pylibcudf.exception_handler cimport libcudf_exception_handler +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.column.column_view cimport column_view +from pylibcudf.libcudf.types cimport size_type + + +cdef extern from "nvtext/wordpiece_tokenize.hpp" namespace "nvtext" nogil: + + cdef struct wordpiece_vocabulary "nvtext::wordpiece_vocabulary": + pass + + cdef unique_ptr[wordpiece_vocabulary] load_wordpiece_vocabulary( + const column_view & strings + ) except +libcudf_exception_handler + + cdef unique_ptr[column] wordpiece_tokenize( + const column_view & strings, + const wordpiece_vocabulary & vocabulary, + size_type max_tokens_per_row + ) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt b/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt index 93e3fb15259..d8cabbf4d47 100644 --- a/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt +++ b/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. You may obtain a copy of the License at @@ -13,8 +13,18 @@ # ============================================================================= set(cython_sources - edit_distance.pyx generate_ngrams.pyx jaccard.pyx minhash.pyx ngrams_tokenize.pyx normalize.pyx - replace.pyx stemmer.pyx tokenize.pyx byte_pair_encode.pyx subword_tokenize.pyx + byte_pair_encode.pyx + edit_distance.pyx + generate_ngrams.pyx + jaccard.pyx + minhash.pyx + ngrams_tokenize.pyx + normalize.pyx + replace.pyx + stemmer.pyx + tokenize.pyx + subword_tokenize.pyx + wordpiece_tokenize.pyx ) set(linked_libraries cudf::cudf) diff --git a/python/pylibcudf/pylibcudf/nvtext/__init__.pxd b/python/pylibcudf/pylibcudf/nvtext/__init__.pxd index ef837167eb9..eb48ea84dee 100644 --- a/python/pylibcudf/pylibcudf/nvtext/__init__.pxd +++ b/python/pylibcudf/pylibcudf/nvtext/__init__.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. from . cimport ( byte_pair_encode, @@ -12,6 +12,7 @@ from . 
cimport ( stemmer, subword_tokenize, tokenize, + wordpiece_tokenize, ) __all__ = [ @@ -26,4 +27,5 @@ __all__ = [ "stemmer", "subword_tokenize", "tokenize", + "wordpiece_tokenize", ] diff --git a/python/pylibcudf/pylibcudf/nvtext/__init__.py b/python/pylibcudf/pylibcudf/nvtext/__init__.py index d88a7d4b825..07e80abe3a8 100644 --- a/python/pylibcudf/pylibcudf/nvtext/__init__.py +++ b/python/pylibcudf/pylibcudf/nvtext/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. from . import ( byte_pair_encode, @@ -12,6 +12,7 @@ stemmer, subword_tokenize, tokenize, + wordpiece_tokenize, ) __all__ = [ @@ -26,4 +27,5 @@ "stemmer", "subword_tokenize", "tokenize", + "wordpiece_tokenize", ] diff --git a/python/pylibcudf/pylibcudf/nvtext/wordpiece_tokenize.pxd b/python/pylibcudf/pylibcudf/nvtext/wordpiece_tokenize.pxd new file mode 100644 index 00000000000..75e86770e0f --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/wordpiece_tokenize.pxd @@ -0,0 +1,15 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr +from pylibcudf.column cimport Column +from pylibcudf.libcudf.nvtext.wordpiece_tokenize cimport wordpiece_vocabulary +from pylibcudf.libcudf.types cimport size_type + +cdef class WordPieceVocabulary: + cdef unique_ptr[wordpiece_vocabulary] c_obj + +cpdef Column wordpiece_tokenize( + Column input, + WordPieceVocabulary vocabulary, + size_type max_words_per_row +) diff --git a/python/pylibcudf/pylibcudf/nvtext/wordpiece_tokenize.pyi b/python/pylibcudf/pylibcudf/nvtext/wordpiece_tokenize.pyi new file mode 100644 index 00000000000..274bc31f316 --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/wordpiece_tokenize.pyi @@ -0,0 +1,12 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + +from pylibcudf.column import Column + +class WordPieceVocabulary: + def __init__(self, vocab: Column): ... + +def wordpiece_tokenize( + input: Column, + vocabulary: WordPieceVocabulary, + max_words_per_row: int, +) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/nvtext/wordpiece_tokenize.pyx b/python/pylibcudf/pylibcudf/nvtext/wordpiece_tokenize.pyx new file mode 100644 index 00000000000..d43d0c9f814 --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/wordpiece_tokenize.pyx @@ -0,0 +1,66 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + +from cython.operator cimport dereference +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.column.column_view cimport column_view +from pylibcudf.libcudf.nvtext.wordpiece_tokenize cimport ( + load_wordpiece_vocabulary as cpp_load_wordpiece_vocabulary, + wordpiece_tokenize as cpp_wordpiece_tokenize, +) +from pylibcudf.libcudf.types cimport size_type + +__all__ = [ + "WordPieceVocabulary", + "wordpiece_tokenize", +] + +cdef class WordPieceVocabulary: + """The Vocabulary object to be used with ``wordpiece_tokenize``. + + For details, see :cpp:class:`cudf::nvtext::wordpiece_tokenize`. + """ + def __cinit__(self, Column vocab): + cdef column_view c_vocab = vocab.view() + with nogil: + self.c_obj = move(cpp_load_wordpiece_vocabulary(c_vocab)) + + __hash__ = None + +cpdef Column wordpiece_tokenize( + Column input, + WordPieceVocabulary vocabulary, + size_type max_words_per_row +): + """ + Returns the token ids for the input string by looking + up each delimited token in the given vocabulary. 
+ + For details, see cpp:func:`cudf::nvtext::wordpiece_tokenize` + + Parameters + ---------- + input : Column + Strings column to tokenize + vocabulary : WordPieceVocabulary + Used to lookup tokens within ``input`` + max_words_per_row : size_type + Maximum number of words to tokenize per input row + + Returns + ------- + Column + Lists column of token ids + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = cpp_wordpiece_tokenize( + input.view(), + dereference(vocabulary.c_obj.get()), + max_words_per_row + ) + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_subword_tokenize.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_subword_tokenize.py index 516d0f7f78d..679512bb16a 100644 --- a/python/pylibcudf/pylibcudf/tests/test_nvtext_subword_tokenize.py +++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_subword_tokenize.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. import pyarrow as pa import pytest @@ -61,3 +61,37 @@ def test_subword_tokenize( assert_column_eq(tokens, expected_tokens) assert_column_eq(masks, expected_masks) assert_column_eq(metadata, expected_metadata) + + +@pytest.mark.parametrize("max_words", [0, 5]) +def test_wordpiece_tokenize(max_words): + vocab_strs = pa.array(["[unk]", "abc", "def", "gh", "##i"]) + vocab = plc.nvtext.wordpiece_tokenize.WordPieceVocabulary( + plc.interop.from_arrow(vocab_strs) + ) + input = pa.array( + [ + "gh def abc xyz ghi defi abci", + "abc def gh abc def gh abc def gh", + "abc def gh", + ] + ) + result = plc.nvtext.wordpiece_tokenize.wordpiece_tokenize( + plc.interop.from_arrow(input), vocab, max_words + ) + pa_result = plc.interop.to_arrow(result) + if max_words == 5: + expected = pa.array( + [[3, 2, 1, 0, 3, 4], [1, 2, 3, 1, 2], [1, 2, 3]], + type=pa_result.type, + ) + else: + expected = pa.array( + [ + [3, 2, 1, 0, 3, 4, 2, 4, 1, 4], + [1, 2, 3, 1, 2, 3, 1, 2, 3], + [1, 2, 3], + ], + type=pa_result.type, + ) + assert_column_eq(result, expected)
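Taken together, the Python pieces above expose the new tokenizer as `cudf.core.wordpiece_tokenize.WordPieceVocabulary` on top of the pylibcudf binding. A minimal end-to-end sketch, assuming a cudf build that includes this patch and reusing the small vocabulary from the example in `nvtext/wordpiece_tokenize.hpp`:

    # Minimal usage sketch (assumes a cudf build containing this patch).
    # The vocabulary and input mirror the example in nvtext/wordpiece_tokenize.hpp.
    import cudf
    from cudf.core.wordpiece_tokenize import WordPieceVocabulary

    vocab = cudf.Series(["[UNK]", "a", "have", "I", "new", "GP", "##U", "!"])
    text = cudf.Series(["I have a new GPU now !"])

    wpt = WordPieceVocabulary(vocab)
    tokens = wpt.tokenize(text)       # all words: [[3, 2, 1, 4, 5, 6, 0, 7]]
    limited = wpt.tokenize(text, 4)   # first 4 words only: [[3, 2, 1, 4]]
    print(tokens)
    print(limited)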