From 29b81eb42aa60c5e41785b3b2eb9e80b964c90ad Mon Sep 17 00:00:00 2001 From: Gil Forsyth Date: Tue, 4 Mar 2025 12:13:36 -0500 Subject: [PATCH 1/4] Combine separate ConfigureNVBench calls to fix cpp conda builds (#18155) Two PRs merged in quick succession each added a separate transform benchmark with the same name, leading CMake to get angry: https://github.com/rapidsai/cudf/commit/5b0a85b5397b69155fe0c740185945a9fe0848ac https://github.com/rapidsai/cudf/commit/8ca4bc43d4650ae364d1f9ee412a5597f310b4f7 Authors: - Gil Forsyth (https://github.com/gforsyth) Approvers: - Bradley Dice (https://github.com/bdice) - Basit Ayantunde (https://github.com/lamarrr) URL: https://github.com/rapidsai/cudf/pull/18155 --- cpp/benchmarks/CMakeLists.txt | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index e82c7517145..b2559208c3c 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -355,11 +355,7 @@ ConfigureNVBench( # ################################################################################################## # * transform benchmark # --------------------------------------------------------------------------------- -ConfigureNVBench(TRANSFORM_NVBENCH transform/polynomials.cpp) - -# ################################################################################################## -# * transform benchmark ---------------------------------------------------------------------------- -ConfigureNVBench(TRANSFORM_NVBENCH transform/transform.cpp) +ConfigureNVBench(TRANSFORM_NVBENCH transform/polynomials.cpp transform/transform.cpp) # ################################################################################################## # * nvtext benchmark ------------------------------------------------------------------- From 68ae48c1929ec4cead60f5386f1d3522daf7e1fc Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 4 Mar 2025 10:30:50 -0800 Subject: [PATCH 2/4] Prune more seldom used dtype utils (#18150) Similar to https://github.com/rapidsai/cudf/pull/18141 Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - https://github.com/brandon-b-miller URL: https://github.com/rapidsai/cudf/pull/18150 --- python/cudf/cudf/core/column/column.py | 3 +- python/cudf/cudf/core/column/datetime.py | 23 ++- python/cudf/cudf/core/column/numerical.py | 34 +++- python/cudf/cudf/core/column/timedelta.py | 2 +- python/cudf/cudf/core/dataframe.py | 4 +- python/cudf/cudf/core/scalar.py | 153 +++++++++++++++- python/cudf/cudf/utils/dtypes.py | 206 ---------------------- 7 files changed, 205 insertions(+), 220 deletions(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 5c72bb74d6a..0f347279956 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -64,7 +64,6 @@ dtype_from_pylibcudf_column, dtype_to_pylibcudf_type, find_common_type, - get_time_unit, is_column_like, is_dtype_obj_numeric, is_mixed_with_object_dtype, @@ -2736,7 +2735,7 @@ def as_column( nan_as_null=nan_as_null, ) elif arbitrary.dtype.kind in "mM": - time_unit = get_time_unit(arbitrary) + time_unit = np.datetime_data(arbitrary.dtype)[0] if time_unit in ("D", "W", "M", "Y"): # TODO: Raise in these cases instead of downcasting to s? 
new_type = f"{arbitrary.dtype.type.__name__}[s]" diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index 64ddcae72a7..e99ce6b0b6e 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -179,13 +179,27 @@ def infer_format(element: str, **kwargs) -> str: return fmt +def _get_time_unit(obj: ColumnBinaryOperand) -> str: + if isinstance( + obj, + ( + cudf.core.column.datetime.DatetimeColumn, + cudf.core.column.timedelta.TimeDeltaColumn, + ), + ): + return obj.time_unit + + time_unit, _ = np.datetime_data(obj.dtype) + return time_unit + + def _resolve_mixed_dtypes( lhs: ColumnBinaryOperand, rhs: ColumnBinaryOperand, base_type: str ) -> Dtype: units = ["s", "ms", "us", "ns"] - lhs_time_unit = cudf.utils.dtypes.get_time_unit(lhs) + lhs_time_unit = _get_time_unit(lhs) lhs_unit = units.index(lhs_time_unit) - rhs_time_unit = cudf.utils.dtypes.get_time_unit(rhs) + rhs_time_unit = _get_time_unit(rhs) rhs_unit = units.index(rhs_time_unit) return np.dtype(f"{base_type}[{units[max(lhs_unit, rhs_unit)]}]") @@ -551,7 +565,7 @@ def normalize_binop_value( # type: ignore[override] if isinstance(other, np.datetime64): if np.isnat(other): - other_time_unit = cudf.utils.dtypes.get_time_unit(other) + other_time_unit = np.datetime_data(other.dtype)[0] if other_time_unit not in {"s", "ms", "ns", "us"}: other_time_unit = "ns" @@ -562,8 +576,7 @@ def normalize_binop_value( # type: ignore[override] other = other.astype(self.dtype) return cudf.Scalar(other) elif isinstance(other, np.timedelta64): - other_time_unit = cudf.utils.dtypes.get_time_unit(other) - + other_time_unit = np.datetime_data(other.dtype)[0] if np.isnat(other): return cudf.Scalar( None, diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index 249afe9aba6..57c42f92222 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -26,8 +26,8 @@ from cudf.utils.dtypes import ( CUDF_STRING_DTYPE, find_common_type, - min_column_type, min_signed_type, + min_unsigned_type, np_dtypes_to_pandas_dtypes, ) @@ -469,6 +469,34 @@ def _process_values_for_isin( def _can_return_nan(self, skipna: bool | None = None) -> bool: return not skipna and self.has_nulls(include_nan=True) + def _min_column_type(self, expected_type: np.dtype) -> np.dtype: + """ + Return the smallest dtype which can represent all elements of self. 
+ """ + if self.null_count == len(self): + return self.dtype + + min_value, max_value = self.min(), self.max() + either_is_inf = np.isinf(min_value) or np.isinf(max_value) + if not either_is_inf and expected_type.kind == "i": + max_bound_dtype = min_signed_type(max_value) + min_bound_dtype = min_signed_type(min_value) + return np.promote_types(max_bound_dtype, min_bound_dtype) + elif not either_is_inf and expected_type.kind == "u": + max_bound_dtype = min_unsigned_type(max_value) + min_bound_dtype = min_unsigned_type(min_value) + return np.promote_types(max_bound_dtype, min_bound_dtype) + elif self.dtype.kind == "f" or expected_type.kind == "f": + return np.promote_types( + expected_type, + np.promote_types( + np.min_scalar_type(float(max_value)), + np.min_scalar_type(float(min_value)), + ), + ) + else: + return self.dtype + def find_and_replace( self, to_replace: ColumnLike, @@ -767,8 +795,8 @@ def _normalize_find_and_replace_input( normalized_column = normalized_column.astype(input_column_dtype) if normalized_column.can_cast_safely(input_column_dtype): return normalized_column.astype(input_column_dtype) - col_to_normalize_dtype = min_column_type( - normalized_column, input_column_dtype + col_to_normalize_dtype = normalized_column._min_column_type( # type: ignore[attr-defined] + input_column_dtype ) # Scalar case if len(col_to_normalize) == 1: diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index 654d2c2b800..d31a0c8b002 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -301,7 +301,7 @@ def normalize_binop_value(self, other) -> ColumnBinaryOperand: other = pd.Timedelta(other).to_timedelta64() if isinstance(other, np.timedelta64): - other_time_unit = cudf.utils.dtypes.get_time_unit(other) + other_time_unit = np.datetime_data(other.dtype)[0] if np.isnat(other): return cudf.Scalar( None, diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index eec0bacd5c8..458706f3567 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -6729,8 +6729,8 @@ def _apply_cupy_method_axis_1(self, method, *args, **kwargs): prepared._data[col] = ( prepared._data[col] .astype( - cudf.utils.dtypes.get_min_float_dtype( - prepared._data[col] + prepared._data[col]._min_column_type( + np.dtype(np.float32) ) if common_dtype.kind != "M" else np.dtype(np.float64) diff --git a/python/cudf/cudf/core/scalar.py b/python/cudf/cudf/core/scalar.py index 8579b7398f0..0e1f17fa45e 100644 --- a/python/cudf/cudf/core/scalar.py +++ b/python/cudf/cudf/core/scalar.py @@ -26,7 +26,6 @@ from cudf.utils.dtypes import ( CUDF_STRING_DTYPE, cudf_dtype_from_pa_type, - get_allowed_combinations_for_operator, to_cudf_compatible_scalar, ) @@ -36,6 +35,158 @@ from cudf._typing import Dtype, ScalarLike +# Type dispatch loops similar to what are found in `np.add.types` +# In NumPy, whether or not an op can be performed between two +# operands is determined by checking to see if NumPy has a c/c++ +# loop specifically for adding those two operands built in. If +# not it will search lists like these for a loop for types that +# the operands can be safely cast to. 
These are those lookups, +# modified slightly for cuDF's rules +_ADD_TYPES = [ + "???", + "BBB", + "HHH", + "III", + "LLL", + "bbb", + "hhh", + "iii", + "lll", + "fff", + "ddd", + "mMM", + "MmM", + "mmm", + "LMM", + "MLM", + "Lmm", + "mLm", +] +_SUB_TYPES = [ + "BBB", + "HHH", + "III", + "LLL", + "bbb", + "hhh", + "iii", + "lll", + "fff", + "ddd", + "???", + "MMm", + "mmm", + "MmM", + "MLM", + "mLm", + "Lmm", +] +_MUL_TYPES = [ + "???", + "BBB", + "HHH", + "III", + "LLL", + "bbb", + "hhh", + "iii", + "lll", + "fff", + "ddd", + "mLm", + "Lmm", + "mlm", + "lmm", +] +_FLOORDIV_TYPES = [ + "bbb", + "BBB", + "HHH", + "III", + "LLL", + "hhh", + "iii", + "lll", + "fff", + "ddd", + "???", + "mqm", + "mdm", + "mmq", +] +_TRUEDIV_TYPES = ["fff", "ddd", "mqm", "mmd", "mLm"] +_MOD_TYPES = [ + "bbb", + "BBB", + "hhh", + "HHH", + "iii", + "III", + "lll", + "LLL", + "fff", + "ddd", + "mmm", +] +_POW_TYPES = [ + "bbb", + "BBB", + "hhh", + "HHH", + "iii", + "III", + "lll", + "LLL", + "fff", + "ddd", +] + + +def get_allowed_combinations_for_operator( + dtype_l: np.dtype, dtype_r: np.dtype, op: str +) -> np.dtype: + error = TypeError( + f"{op} not supported between {dtype_l} and {dtype_r} scalars" + ) + + to_numpy_ops = { + "__add__": _ADD_TYPES, + "__radd__": _ADD_TYPES, + "__sub__": _SUB_TYPES, + "__rsub__": _SUB_TYPES, + "__mul__": _MUL_TYPES, + "__rmul__": _MUL_TYPES, + "__floordiv__": _FLOORDIV_TYPES, + "__rfloordiv__": _FLOORDIV_TYPES, + "__truediv__": _TRUEDIV_TYPES, + "__rtruediv__": _TRUEDIV_TYPES, + "__mod__": _MOD_TYPES, + "__rmod__": _MOD_TYPES, + "__pow__": _POW_TYPES, + "__rpow__": _POW_TYPES, + } + allowed = to_numpy_ops.get(op, op) + + # special rules for string + if dtype_l == "object" or dtype_r == "object": + if (dtype_l == dtype_r == "object") and op == "__add__": + return CUDF_STRING_DTYPE + else: + raise error + + # Check if we can directly operate + + for valid_combo in allowed: + ltype, rtype, outtype = valid_combo # type: ignore[misc] + if np.can_cast(dtype_l.char, ltype) and np.can_cast( # type: ignore[has-type] + dtype_r.char, + rtype, # type: ignore[has-type] + ): + return np.dtype(outtype) # type: ignore[has-type] + + raise error + + def _preprocess_host_value(value, dtype) -> tuple[ScalarLike, Dtype]: """ Preprocess a value and dtype for host-side cudf.Scalar diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index adee17e7bfb..fea146c3201 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -354,47 +354,6 @@ def min_unsigned_type(x: int, min_size: int = 8) -> np.dtype: return np.uint64(x).dtype -def min_column_type(x, expected_type): - """ - Return the smallest dtype which can represent all - elements of the `NumericalColumn` `x` - If the column is not a subtype of `np.signedinteger` or `np.floating` - returns the same dtype as the dtype of `x` without modification - """ - - if not isinstance(x, cudf.core.column.NumericalColumn): - raise TypeError("Argument x must be of type column.NumericalColumn") - if x.null_count == len(x): - return x.dtype - - min_value, max_value = x.min(), x.max() - either_is_inf = np.isinf(min_value) or np.isinf(max_value) - expected_type = cudf.dtype(expected_type) - if not either_is_inf and expected_type.kind in "i": - max_bound_dtype = min_signed_type(max_value) - min_bound_dtype = min_signed_type(min_value) - result_type = np.promote_types(max_bound_dtype, min_bound_dtype) - elif not either_is_inf and expected_type.kind in "u": - max_bound_dtype = min_unsigned_type(max_value) - 
min_bound_dtype = min_unsigned_type(min_value) - result_type = np.promote_types(max_bound_dtype, min_bound_dtype) - elif x.dtype.kind == "f": - return get_min_float_dtype(x) - else: - result_type = x.dtype - - return cudf.dtype(result_type) - - -def get_min_float_dtype(col): - max_bound_dtype = np.min_scalar_type(float(col.max())) - min_bound_dtype = np.min_scalar_type(float(col.min())) - result_type = np.promote_types( - "float32", np.promote_types(max_bound_dtype, min_bound_dtype) - ) - return cudf.dtype(result_type) - - def is_mixed_with_object_dtype(lhs, rhs): if isinstance(lhs.dtype, cudf.CategoricalDtype): return is_mixed_with_object_dtype(lhs.dtype.categories, rhs) @@ -406,20 +365,6 @@ def is_mixed_with_object_dtype(lhs, rhs): ) -def get_time_unit(obj): - if isinstance( - obj, - ( - cudf.core.column.datetime.DatetimeColumn, - cudf.core.column.timedelta.TimeDeltaColumn, - ), - ): - return obj.time_unit - - time_unit, _ = np.datetime_data(obj.dtype) - return time_unit - - def _get_nan_for_dtype(dtype: DtypeObj) -> DtypeObj: if dtype.kind in "mM": time_unit, _ = np.datetime_data(dtype) @@ -430,51 +375,6 @@ def _get_nan_for_dtype(dtype: DtypeObj) -> DtypeObj: return np.float64("nan") -def get_allowed_combinations_for_operator( - dtype_l: np.dtype, dtype_r: np.dtype, op: str -) -> np.dtype: - error = TypeError( - f"{op} not supported between {dtype_l} and {dtype_r} scalars" - ) - - to_numpy_ops = { - "__add__": _ADD_TYPES, - "__radd__": _ADD_TYPES, - "__sub__": _SUB_TYPES, - "__rsub__": _SUB_TYPES, - "__mul__": _MUL_TYPES, - "__rmul__": _MUL_TYPES, - "__floordiv__": _FLOORDIV_TYPES, - "__rfloordiv__": _FLOORDIV_TYPES, - "__truediv__": _TRUEDIV_TYPES, - "__rtruediv__": _TRUEDIV_TYPES, - "__mod__": _MOD_TYPES, - "__rmod__": _MOD_TYPES, - "__pow__": _POW_TYPES, - "__rpow__": _POW_TYPES, - } - allowed = to_numpy_ops.get(op, op) - - # special rules for string - if dtype_l == "object" or dtype_r == "object": - if (dtype_l == dtype_r == "object") and op == "__add__": - return CUDF_STRING_DTYPE - else: - raise error - - # Check if we can directly operate - - for valid_combo in allowed: - ltype, rtype, outtype = valid_combo # type: ignore[misc] - if np.can_cast(dtype_l.char, ltype) and np.can_cast( # type: ignore[has-type] - dtype_r.char, - rtype, # type: ignore[has-type] - ): - return np.dtype(outtype) # type: ignore[has-type] - - raise error - - def find_common_type(dtypes): """ Wrapper over np.find_common_type to handle special cases @@ -715,109 +615,3 @@ def dtype_from_pylibcudf_column(col: plc.Column) -> DtypeObj: SIZE_TYPE_DTYPE = PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[plc.types.SIZE_TYPE_ID] CUDF_STRING_DTYPE = PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[plc.types.TypeId.STRING] - -# Type dispatch loops similar to what are found in `np.add.types` -# In NumPy, whether or not an op can be performed between two -# operands is determined by checking to see if NumPy has a c/c++ -# loop specifically for adding those two operands built in. If -# not it will search lists like these for a loop for types that -# the operands can be safely cast to. 
These are those lookups, -# modified slightly for cuDF's rules -_ADD_TYPES = [ - "???", - "BBB", - "HHH", - "III", - "LLL", - "bbb", - "hhh", - "iii", - "lll", - "fff", - "ddd", - "mMM", - "MmM", - "mmm", - "LMM", - "MLM", - "Lmm", - "mLm", -] -_SUB_TYPES = [ - "BBB", - "HHH", - "III", - "LLL", - "bbb", - "hhh", - "iii", - "lll", - "fff", - "ddd", - "???", - "MMm", - "mmm", - "MmM", - "MLM", - "mLm", - "Lmm", -] -_MUL_TYPES = [ - "???", - "BBB", - "HHH", - "III", - "LLL", - "bbb", - "hhh", - "iii", - "lll", - "fff", - "ddd", - "mLm", - "Lmm", - "mlm", - "lmm", -] -_FLOORDIV_TYPES = [ - "bbb", - "BBB", - "HHH", - "III", - "LLL", - "hhh", - "iii", - "lll", - "fff", - "ddd", - "???", - "mqm", - "mdm", - "mmq", -] -_TRUEDIV_TYPES = ["fff", "ddd", "mqm", "mmd", "mLm"] -_MOD_TYPES = [ - "bbb", - "BBB", - "hhh", - "HHH", - "iii", - "III", - "lll", - "LLL", - "fff", - "ddd", - "mmm", -] -_POW_TYPES = [ - "bbb", - "BBB", - "hhh", - "HHH", - "iii", - "III", - "lll", - "LLL", - "fff", - "ddd", -] From 32bdfb064886229e391d91ed72a7348787fd4f8c Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Tue, 4 Mar 2025 11:38:38 -0800 Subject: [PATCH 3/4] Separate stats filtering helpers to reuse in page pruning (#18034) Contributes to #17896 This PR separates stats based filtering helpers for reuse in page pruning using stats in Parquet PageIndex. Authors: - Muhammad Haseeb (https://github.com/mhaseeb123) Approvers: - Karthikeyan (https://github.com/karthikeyann) - Vukasin Milovanovic (https://github.com/vuule) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/18034 --- cpp/CMakeLists.txt | 1 + cpp/src/io/parquet/predicate_pushdown.cpp | 309 +------------------- cpp/src/io/parquet/stats_filter_helpers.cpp | 144 +++++++++ cpp/src/io/parquet/stats_filter_helpers.hpp | 256 ++++++++++++++++ 4 files changed, 416 insertions(+), 294 deletions(-) create mode 100644 cpp/src/io/parquet/stats_filter_helpers.cpp create mode 100644 cpp/src/io/parquet/stats_filter_helpers.hpp diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 0282282b5f3..8952f11dd23 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -528,6 +528,7 @@ add_library( src/io/parquet/reader_impl_chunking.cu src/io/parquet/reader_impl_helpers.cpp src/io/parquet/reader_impl_preprocess.cu + src/io/parquet/stats_filter_helpers.cpp src/io/parquet/writer_impl.cu src/io/parquet/writer_impl_helpers.cpp src/io/parquet/decode_fixed.cu diff --git a/cpp/src/io/parquet/predicate_pushdown.cpp b/cpp/src/io/parquet/predicate_pushdown.cpp index e1d7dbb03b3..c2d1495a5d9 100644 --- a/cpp/src/io/parquet/predicate_pushdown.cpp +++ b/cpp/src/io/parquet/predicate_pushdown.cpp @@ -14,6 +14,7 @@ * limitations under the License. */ #include "reader_impl_helpers.hpp" +#include "stats_filter_helpers.hpp" #include #include @@ -21,7 +22,6 @@ #include #include #include -#include #include #include #include @@ -34,7 +34,6 @@ #include #include -#include #include #include #include @@ -42,89 +41,25 @@ namespace cudf::io::parquet::detail { namespace { + /** - * @brief Converts statistics in column chunks to 2 device columns - min, max values. + * @brief Converts column chunk statistics to 2 device columns - min, max values. + * + * Each column's number of rows equals the total number of row groups. 
* */ -struct stats_caster { +struct row_group_stats_caster : public stats_caster_base { size_type total_row_groups; std::vector const& per_file_metadata; host_span const> row_group_indices; - template - static ToType targetType(FromType const value) - { - if constexpr (cudf::is_timestamp()) { - return static_cast( - typename ToType::duration{static_cast(value)}); - } else if constexpr (std::is_same_v) { - return ToType{nullptr, 0}; - } else { - return static_cast(value); - } - } - - // uses storage type as T - template () or cudf::is_nested())> - static T convert(uint8_t const* stats_val, size_t stats_size, Type const type) - { - CUDF_FAIL("unsupported type for stats casting"); - } - - template ())> - static T convert(uint8_t const* stats_val, size_t stats_size, Type const type) - { - CUDF_EXPECTS(type == BOOLEAN, "Invalid type and stats combination"); - return targetType(*reinterpret_cast(stats_val)); - } - - // integral but not boolean, and fixed_point, and chrono. - template () and !cudf::is_boolean()) or - cudf::is_fixed_point() or cudf::is_chrono())> - static T convert(uint8_t const* stats_val, size_t stats_size, Type const type) - { - switch (type) { - case INT32: return targetType(*reinterpret_cast(stats_val)); - case INT64: return targetType(*reinterpret_cast(stats_val)); - case INT96: // Deprecated in parquet specification - return targetType(static_cast<__int128_t>(reinterpret_cast(stats_val)[0]) - << 32 | - reinterpret_cast(stats_val)[2]); - case BYTE_ARRAY: [[fallthrough]]; - case FIXED_LEN_BYTE_ARRAY: - if (stats_size == sizeof(T)) { - // if type size == length of stats_val. then typecast and return. - if constexpr (cudf::is_chrono()) { - return targetType(*reinterpret_cast(stats_val)); - } else { - return targetType(*reinterpret_cast(stats_val)); - } - } - // unsupported type - default: CUDF_FAIL("Invalid type and stats combination"); - } - } - - template ())> - static T convert(uint8_t const* stats_val, size_t stats_size, Type const type) - { - switch (type) { - case FLOAT: return targetType(*reinterpret_cast(stats_val)); - case DOUBLE: return targetType(*reinterpret_cast(stats_val)); - default: CUDF_FAIL("Invalid type and stats combination"); - } - } - - template )> - static T convert(uint8_t const* stats_val, size_t stats_size, Type const type) + row_group_stats_caster(size_type total_row_groups, + std::vector const& per_file_metadata, + host_span const> row_group_indices) + : total_row_groups{total_row_groups}, + per_file_metadata{per_file_metadata}, + row_group_indices{row_group_indices} { - switch (type) { - case BYTE_ARRAY: [[fallthrough]]; - case FIXED_LEN_BYTE_ARRAY: - return string_view(reinterpret_cast(stats_val), stats_size); - default: CUDF_FAIL("Invalid type and stats combination"); - } } // Creates device columns from column statistics (min, max) @@ -139,82 +74,8 @@ struct stats_caster { if constexpr (cudf::is_compound() && !std::is_same_v) { CUDF_FAIL("Compound types do not have statistics"); } else { - // Local struct to hold host columns - struct host_column { - // using thrust::host_vector because std::vector uses bitmap instead of byte per bool. 
- cudf::detail::host_vector val; - std::vector null_mask; - cudf::size_type null_count = 0; - host_column(size_type total_row_groups, rmm::cuda_stream_view stream) - : val{cudf::detail::make_host_vector(total_row_groups, stream)}, - null_mask( - cudf::util::div_rounding_up_safe( - cudf::bitmask_allocation_size_bytes(total_row_groups), sizeof(bitmask_type)), - ~bitmask_type{0}) - { - } - - void set_index(size_type index, - std::optional> const& binary_value, - Type const type) - { - if (binary_value.has_value()) { - val[index] = convert(binary_value.value().data(), binary_value.value().size(), type); - } - if (not binary_value.has_value()) { - clear_bit_unsafe(null_mask.data(), index); - null_count++; - } - } - - static auto make_strings_children(host_span host_strings, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) - { - auto const total_char_count = std::accumulate( - host_strings.begin(), host_strings.end(), 0, [](auto sum, auto const& str) { - return sum + str.size_bytes(); - }); - auto chars = cudf::detail::make_empty_host_vector(total_char_count, stream); - auto offsets = - cudf::detail::make_empty_host_vector(host_strings.size() + 1, stream); - offsets.push_back(0); - for (auto const& str : host_strings) { - auto tmp = - str.empty() ? std::string_view{} : std::string_view(str.data(), str.size_bytes()); - chars.insert(chars.end(), std::cbegin(tmp), std::cend(tmp)); - offsets.push_back(offsets.back() + tmp.length()); - } - auto d_chars = cudf::detail::make_device_uvector_async(chars, stream, mr); - auto d_offsets = cudf::detail::make_device_uvector_sync(offsets, stream, mr); - return std::tuple{std::move(d_chars), std::move(d_offsets)}; - } - - auto to_device(cudf::data_type dtype, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) - { - if constexpr (std::is_same_v) { - auto [d_chars, d_offsets] = make_strings_children(val, stream, mr); - return cudf::make_strings_column( - val.size(), - std::make_unique(std::move(d_offsets), rmm::device_buffer{}, 0), - d_chars.release(), - null_count, - rmm::device_buffer{ - null_mask.data(), cudf::bitmask_allocation_size_bytes(val.size()), stream, mr}); - } - return std::make_unique( - dtype, - val.size(), - cudf::detail::make_device_uvector_async(val, stream, mr).release(), - rmm::device_buffer{ - null_mask.data(), cudf::bitmask_allocation_size_bytes(val.size()), stream, mr}, - null_count); - } - }; // local struct host_column - host_column min(total_row_groups, stream); - host_column max(total_row_groups, stream); + host_column min(total_row_groups, stream); + host_column max(total_row_groups, stream); size_type stats_idx = 0; for (size_t src_idx = 0; src_idx < row_group_indices.size(); ++src_idx) { for (auto const rg_idx : row_group_indices[src_idx]) { @@ -248,146 +109,6 @@ struct stats_caster { } }; -/** - * @brief Converts AST expression to StatsAST for comparing with column statistics - * This is used in row group filtering based on predicate. 
- * statistics min value of a column is referenced by column_index*2 - * statistics max value of a column is referenced by column_index*2+1 - * - */ -class stats_expression_converter : public ast::detail::expression_transformer { - public: - stats_expression_converter(ast::expression const& expr, size_type const& num_columns) - : _num_columns{num_columns} - { - expr.accept(*this); - } - - /** - * @copydoc ast::detail::expression_transformer::visit(ast::literal const& ) - */ - std::reference_wrapper visit(ast::literal const& expr) override - { - return expr; - } - - /** - * @copydoc ast::detail::expression_transformer::visit(ast::column_reference const& ) - */ - std::reference_wrapper visit(ast::column_reference const& expr) override - { - CUDF_EXPECTS(expr.get_table_source() == ast::table_reference::LEFT, - "Statistics AST supports only left table"); - CUDF_EXPECTS(expr.get_column_index() < _num_columns, - "Column index cannot be more than number of columns in the table"); - return expr; - } - - /** - * @copydoc ast::detail::expression_transformer::visit(ast::column_name_reference const& ) - */ - std::reference_wrapper visit( - ast::column_name_reference const& expr) override - { - CUDF_FAIL("Column name reference is not supported in statistics AST"); - } - - /** - * @copydoc ast::detail::expression_transformer::visit(ast::operation const& ) - */ - std::reference_wrapper visit(ast::operation const& expr) override - { - using cudf::ast::ast_operator; - auto const operands = expr.get_operands(); - auto const op = expr.get_operator(); - - if (auto* v = dynamic_cast(&operands[0].get())) { - // First operand should be column reference, second should be literal. - CUDF_EXPECTS(cudf::ast::detail::ast_operator_arity(op) == 2, - "Only binary operations are supported on column reference"); - CUDF_EXPECTS(dynamic_cast(&operands[1].get()) != nullptr, - "Second operand of binary operation with column reference must be a literal"); - v->accept(*this); - // Push literal into the ast::tree - auto const& literal = - _stats_expr.push(*dynamic_cast(&operands[1].get())); - auto const col_index = v->get_column_index(); - switch (op) { - /* transform to stats conditions. 
op(col, literal) - col1 == val --> vmin <= val && vmax >= val - col1 != val --> !(vmin == val && vmax == val) - col1 > val --> vmax > val - col1 < val --> vmin < val - col1 >= val --> vmax >= val - col1 <= val --> vmin <= val - */ - case ast_operator::EQUAL: { - auto const& vmin = _stats_expr.push(ast::column_reference{col_index * 2}); - auto const& vmax = _stats_expr.push(ast::column_reference{col_index * 2 + 1}); - _stats_expr.push(ast::operation{ - ast::ast_operator::LOGICAL_AND, - _stats_expr.push(ast::operation{ast_operator::GREATER_EQUAL, vmax, literal}), - _stats_expr.push(ast::operation{ast_operator::LESS_EQUAL, vmin, literal})}); - break; - } - case ast_operator::NOT_EQUAL: { - auto const& vmin = _stats_expr.push(ast::column_reference{col_index * 2}); - auto const& vmax = _stats_expr.push(ast::column_reference{col_index * 2 + 1}); - _stats_expr.push(ast::operation{ - ast_operator::LOGICAL_OR, - _stats_expr.push(ast::operation{ast_operator::NOT_EQUAL, vmin, vmax}), - _stats_expr.push(ast::operation{ast_operator::NOT_EQUAL, vmax, literal})}); - break; - } - case ast_operator::LESS: [[fallthrough]]; - case ast_operator::LESS_EQUAL: { - auto const& vmin = _stats_expr.push(ast::column_reference{col_index * 2}); - _stats_expr.push(ast::operation{op, vmin, literal}); - break; - } - case ast_operator::GREATER: [[fallthrough]]; - case ast_operator::GREATER_EQUAL: { - auto const& vmax = _stats_expr.push(ast::column_reference{col_index * 2 + 1}); - _stats_expr.push(ast::operation{op, vmax, literal}); - break; - } - default: CUDF_FAIL("Unsupported operation in Statistics AST"); - }; - } else { - auto new_operands = visit_operands(operands); - if (cudf::ast::detail::ast_operator_arity(op) == 2) { - _stats_expr.push(ast::operation{op, new_operands.front(), new_operands.back()}); - } else if (cudf::ast::detail::ast_operator_arity(op) == 1) { - _stats_expr.push(ast::operation{op, new_operands.front()}); - } - } - return _stats_expr.back(); - } - - /** - * @brief Returns the AST to apply on Column chunk statistics. - * - * @return AST operation expression - */ - [[nodiscard]] std::reference_wrapper get_stats_expr() const - { - return _stats_expr.back(); - } - - private: - std::vector> visit_operands( - cudf::host_span const> operands) - { - std::vector> transformed_operands; - for (auto const& operand : operands) { - auto const new_operand = operand.get().accept(*this); - transformed_operands.push_back(new_operand); - } - return transformed_operands; - } - ast::tree _stats_expr; - size_type _num_columns; -}; } // namespace std::optional>> aggregate_reader_metadata::apply_stats_filters( @@ -404,7 +125,7 @@ std::optional>> aggregate_reader_metadata::ap // where min(col[i]) = columns[i*2], max(col[i])=columns[i*2+1] // For each column, it contains #sources * #column_chunks_per_src rows. std::vector> columns; - stats_caster const stats_col{ + row_group_stats_caster const stats_col{ static_cast(total_row_groups), per_file_metadata, input_row_group_indices}; for (size_t col_idx = 0; col_idx < output_dtypes.size(); col_idx++) { auto const schema_idx = output_column_schemas[col_idx]; diff --git a/cpp/src/io/parquet/stats_filter_helpers.cpp b/cpp/src/io/parquet/stats_filter_helpers.cpp new file mode 100644 index 00000000000..ef022b418bf --- /dev/null +++ b/cpp/src/io/parquet/stats_filter_helpers.cpp @@ -0,0 +1,144 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "stats_filter_helpers.hpp" + +#include "io/parquet/parquet_common.hpp" + +#include +#include +#include +#include + +namespace cudf::io::parquet::detail { + +stats_expression_converter::stats_expression_converter(ast::expression const& expr, + size_type num_columns) + : _num_columns{num_columns} +{ + expr.accept(*this); +} + +std::reference_wrapper stats_expression_converter::visit( + ast::literal const& expr) +{ + return expr; +} + +std::reference_wrapper stats_expression_converter::visit( + ast::column_reference const& expr) +{ + CUDF_EXPECTS(expr.get_table_source() == ast::table_reference::LEFT, + "Statistics AST supports only left table"); + CUDF_EXPECTS(expr.get_column_index() < _num_columns, + "Column index cannot be more than number of columns in the table"); + return expr; +} + +std::reference_wrapper stats_expression_converter::visit( + ast::column_name_reference const& expr) +{ + CUDF_FAIL("Column name reference is not supported in statistics AST"); +} + +std::reference_wrapper stats_expression_converter::visit( + ast::operation const& expr) +{ + using cudf::ast::ast_operator; + auto const operands = expr.get_operands(); + auto const op = expr.get_operator(); + + if (auto* v = dynamic_cast(&operands[0].get())) { + // First operand should be column reference, second should be literal. + CUDF_EXPECTS(cudf::ast::detail::ast_operator_arity(op) == 2, + "Only binary operations are supported on column reference"); + CUDF_EXPECTS(dynamic_cast(&operands[1].get()) != nullptr, + "Second operand of binary operation with column reference must be a literal"); + v->accept(*this); + // Push literal into the ast::tree + auto const& literal = _stats_expr.push(*dynamic_cast(&operands[1].get())); + auto const col_index = v->get_column_index(); + switch (op) { + /* transform to stats conditions. 
op(col, literal) + col1 == val --> vmin <= val && vmax >= val + col1 != val --> !(vmin == val && vmax == val) + col1 > val --> vmax > val + col1 < val --> vmin < val + col1 >= val --> vmax >= val + col1 <= val --> vmin <= val + */ + case ast_operator::EQUAL: { + auto const& vmin = _stats_expr.push(ast::column_reference{col_index * 2}); + auto const& vmax = _stats_expr.push(ast::column_reference{col_index * 2 + 1}); + _stats_expr.push(ast::operation{ + ast::ast_operator::LOGICAL_AND, + _stats_expr.push(ast::operation{ast_operator::GREATER_EQUAL, vmax, literal}), + _stats_expr.push(ast::operation{ast_operator::LESS_EQUAL, vmin, literal})}); + break; + } + case ast_operator::NOT_EQUAL: { + auto const& vmin = _stats_expr.push(ast::column_reference{col_index * 2}); + auto const& vmax = _stats_expr.push(ast::column_reference{col_index * 2 + 1}); + _stats_expr.push( + ast::operation{ast_operator::LOGICAL_OR, + _stats_expr.push(ast::operation{ast_operator::NOT_EQUAL, vmin, vmax}), + _stats_expr.push(ast::operation{ast_operator::NOT_EQUAL, vmax, literal})}); + break; + } + case ast_operator::LESS: [[fallthrough]]; + case ast_operator::LESS_EQUAL: { + auto const& vmin = _stats_expr.push(ast::column_reference{col_index * 2}); + _stats_expr.push(ast::operation{op, vmin, literal}); + break; + } + case ast_operator::GREATER: [[fallthrough]]; + case ast_operator::GREATER_EQUAL: { + auto const& vmax = _stats_expr.push(ast::column_reference{col_index * 2 + 1}); + _stats_expr.push(ast::operation{op, vmax, literal}); + break; + } + default: CUDF_FAIL("Unsupported operation in Statistics AST"); + }; + } else { + auto new_operands = visit_operands(operands); + if (cudf::ast::detail::ast_operator_arity(op) == 2) { + _stats_expr.push(ast::operation{op, new_operands.front(), new_operands.back()}); + } else if (cudf::ast::detail::ast_operator_arity(op) == 1) { + _stats_expr.push(ast::operation{op, new_operands.front()}); + } + } + return _stats_expr.back(); +} + +std::reference_wrapper stats_expression_converter::get_stats_expr() const +{ + return _stats_expr.back(); +} + +std::vector> +stats_expression_converter::visit_operands( + cudf::host_span const> operands) +{ + std::vector> transformed_operands; + std::transform(operands.begin(), + operands.end(), + std::back_inserter(transformed_operands), + [t = this](auto& operand) { return operand.get().accept(*t); }); + + return transformed_operands; +} + +} // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/stats_filter_helpers.hpp b/cpp/src/io/parquet/stats_filter_helpers.hpp new file mode 100644 index 00000000000..63b57f46539 --- /dev/null +++ b/cpp/src/io/parquet/stats_filter_helpers.hpp @@ -0,0 +1,256 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include "io/parquet/parquet_common.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +namespace cudf::io::parquet::detail { + +/** + * @brief Base utilities for converting and casting stats values + * + * Derived classes handle row group or page-level statistics as needed. + * + */ +class stats_caster_base { + protected: + template + static inline ToType targetType(FromType const value) + { + if constexpr (cudf::is_timestamp()) { + return static_cast( + typename ToType::duration{static_cast(value)}); + } else if constexpr (std::is_same_v) { + return ToType{nullptr, 0}; + } else { + return static_cast(value); + } + } + + // uses storage type as T + template () or cudf::is_nested())> + static inline T convert(uint8_t const* stats_val, size_t stats_size, Type const type) + { + CUDF_FAIL("unsupported type for stats casting"); + } + + template ())> + static inline T convert(uint8_t const* stats_val, size_t stats_size, Type const type) + { + CUDF_EXPECTS(type == BOOLEAN, "Invalid type and stats combination"); + return stats_caster_base::targetType(*reinterpret_cast(stats_val)); + } + + // integral but not boolean, and fixed_point, and chrono. + template () and !cudf::is_boolean()) or + cudf::is_fixed_point() or cudf::is_chrono())> + static inline T convert(uint8_t const* stats_val, size_t stats_size, Type const type) + { + switch (type) { + case INT32: + return stats_caster_base::targetType(*reinterpret_cast(stats_val)); + case INT64: + return stats_caster_base::targetType(*reinterpret_cast(stats_val)); + case INT96: // Deprecated in parquet specification + return stats_caster_base::targetType( + static_cast<__int128_t>(reinterpret_cast(stats_val)[0]) << 32 | + reinterpret_cast(stats_val)[2]); + case BYTE_ARRAY: [[fallthrough]]; + case FIXED_LEN_BYTE_ARRAY: + if (stats_size == sizeof(T)) { + // if type size == length of stats_val. then typecast and return. + if constexpr (cudf::is_chrono()) { + return stats_caster_base::targetType( + *reinterpret_cast(stats_val)); + } else { + return stats_caster_base::targetType(*reinterpret_cast(stats_val)); + } + } + // unsupported type + default: CUDF_FAIL("Invalid type and stats combination"); + } + } + + template ())> + static inline T convert(uint8_t const* stats_val, size_t stats_size, Type const type) + { + switch (type) { + case FLOAT: + return stats_caster_base::targetType(*reinterpret_cast(stats_val)); + case DOUBLE: + return stats_caster_base::targetType(*reinterpret_cast(stats_val)); + default: CUDF_FAIL("Invalid type and stats combination"); + } + } + + template )> + static inline T convert(uint8_t const* stats_val, size_t stats_size, Type const type) + { + switch (type) { + case BYTE_ARRAY: [[fallthrough]]; + case FIXED_LEN_BYTE_ARRAY: + return string_view(reinterpret_cast(stats_val), stats_size); + default: CUDF_FAIL("Invalid type and stats combination"); + } + } + + /** + * @brief Local struct to hold host columns during stats based filtering + * + * @tparam T Type of the column + */ + template + struct host_column { + // using thrust::host_vector because std::vector uses bitmap instead of byte per bool. 
+ cudf::detail::host_vector val; + std::vector null_mask; + cudf::size_type null_count = 0; + + host_column(size_type total_row_groups, rmm::cuda_stream_view stream) + : val{cudf::detail::make_host_vector(total_row_groups, stream)}, + null_mask(cudf::util::div_rounding_up_safe( + cudf::bitmask_allocation_size_bytes(total_row_groups), sizeof(bitmask_type)), + ~bitmask_type{0}) + { + } + + void inline set_index(size_type index, + std::optional> const& binary_value, + Type const type) + { + if (binary_value.has_value()) { + val[index] = stats_caster_base::convert( + binary_value.value().data(), binary_value.value().size(), type); + } + if (not binary_value.has_value()) { + clear_bit_unsafe(null_mask.data(), index); + null_count++; + } + } + static inline std::tuple, rmm::device_uvector> + make_strings_children(host_span host_strings, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) + { + auto const total_char_count = + std::accumulate(host_strings.begin(), host_strings.end(), 0, [](auto sum, auto const& str) { + return sum + str.size_bytes(); + }); + auto chars = cudf::detail::make_empty_host_vector(total_char_count, stream); + auto offsets = + cudf::detail::make_empty_host_vector(host_strings.size() + 1, stream); + offsets.push_back(0); + for (auto const& str : host_strings) { + auto tmp = + str.empty() ? std::string_view{} : std::string_view(str.data(), str.size_bytes()); + chars.insert(chars.end(), std::cbegin(tmp), std::cend(tmp)); + offsets.push_back(offsets.back() + tmp.length()); + } + auto d_chars = cudf::detail::make_device_uvector_async(chars, stream, mr); + auto d_offsets = cudf::detail::make_device_uvector_sync(offsets, stream, mr); + return std::tuple{std::move(d_chars), std::move(d_offsets)}; + } + + std::unique_ptr inline to_device(cudf::data_type dtype, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) + { + if constexpr (std::is_same_v) { + auto [d_chars, d_offsets] = make_strings_children(val, stream, mr); + return cudf::make_strings_column( + val.size(), + std::make_unique(std::move(d_offsets), rmm::device_buffer{}, 0), + d_chars.release(), + null_count, + rmm::device_buffer{ + null_mask.data(), cudf::bitmask_allocation_size_bytes(val.size()), stream, mr}); + } + return std::make_unique( + dtype, + val.size(), + cudf::detail::make_device_uvector_async(val, stream, mr).release(), + rmm::device_buffer{ + null_mask.data(), cudf::bitmask_allocation_size_bytes(val.size()), stream, mr}, + null_count); + } + }; +}; + +/** + * @brief Converts AST expression to StatsAST for comparing with column statistics + * + * This is used in row group filtering based on predicate. 
+ * statistics min value of a column is referenced by column_index*2 + * statistics max value of a column is referenced by column_index*2+1 + */ +class stats_expression_converter : public ast::detail::expression_transformer { + public: + stats_expression_converter(ast::expression const& expr, size_type num_columns); + + /** + * @copydoc ast::detail::expression_transformer::visit(ast::literal const& ) + */ + std::reference_wrapper visit(ast::literal const& expr) override; + + /** + * @copydoc ast::detail::expression_transformer::visit(ast::column_reference const& ) + */ + std::reference_wrapper visit(ast::column_reference const& expr) override; + + /** + * @copydoc ast::detail::expression_transformer::visit(ast::column_name_reference const& ) + */ + std::reference_wrapper visit( + ast::column_name_reference const& expr) override; + + /** + * @copydoc ast::detail::expression_transformer::visit(ast::operation const& ) + */ + std::reference_wrapper visit(ast::operation const& expr) override; + + /** + * @brief Returns the AST to apply on Column chunk statistics. + * + * @return AST operation expression + */ + [[nodiscard]] std::reference_wrapper get_stats_expr() const; + + private: + std::vector> visit_operands( + cudf::host_span const> operands); + + ast::tree _stats_expr; + size_type _num_columns; +}; + +} // namespace cudf::io::parquet::detail From ea2bca33af1f1c084c026aa5c3a4612c1dd3d62c Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Tue, 4 Mar 2025 14:55:09 -0500 Subject: [PATCH 4/4] New nvtext::wordpiece_tokenizer APIs (#17600) Creates a new word-piece-tokenizer which replaces the existing subword-tokenizer in nvtext. The subword-tokenizer logic is to split out and specialized to perform basic tokenizing with the word-piece logic only. The normalizing part is already a separate API. The output will be a lists column of tokens only. The first change is that the new API uses `wordpiece` instead of `subword`. Here are the 2 C++ API declarations: ``` std::unique_ptr load_wordpiece_vocabulary( cudf::strings_column_view const& input, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); ``` The vocabulary is loaded as a strings column and the returned object can be used on multiple calls to the next API: ``` std::unique_ptr wordpiece_tokenize( cudf::strings_column_view const& input, wordpiece_vocabulary const& vocabulary, cudf::size_type max_words_per_row, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); ``` This will return a lists column of integers which represent the tokens for each row. The `max_words_per_row` will stop the tokenizing process for each row once the number of input words (characters delimited by space) has been reached. This means you may get more tokens than `max_words_per_row` for a row if a single word produces multiple tokens. Note, that this API expects the input string to already be normalized -- processed by the `nvtext::normalize_characters` API which is also being reworked in https://github.com/rapidsai/cudf/pull/17818 The Python interface has the following pattern: ``` from cudf.core.wordpiece_tokenize import WordPieceVocabulary input_string = .... # output of the normalizer vocab_file = os.path.join(datadir, "bert_base_cased_sampled/vocab.txt") vc = cudf.read_text(vocab_file, delimiter="\n", strip_delimiters=True) wpt = WordPieceVocabulary(vc) wpr = wpt.tokenize(input_string) ``` The output is a lists column of the tokens and no longer the tensor-data and metadata format. 
If this format is needed, then we can consider a 3rd API that converts the output here to that format. Closes #17507 Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Shruti Shivakumar (https://github.com/shrshi) - Basit Ayantunde (https://github.com/lamarrr) - GALI PREM SAGAR (https://github.com/galipremsagar) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/17600 --- cpp/CMakeLists.txt | 1 + cpp/benchmarks/text/subword.cpp | 41 +- cpp/include/nvtext/wordpiece_tokenize.hpp | 122 +++ cpp/src/text/wordpiece_tokenize.cu | 890 ++++++++++++++++++ cpp/tests/text/subword_tests.cpp | 150 ++- python/cudf/cudf/core/column/string.py | 14 + python/cudf/cudf/core/wordpiece_tokenize.py | 44 + .../cudf/tests/text/test_subword_tokenizer.py | 473 +++++++++- .../libcudf/nvtext/wordpiece_tokenize.pxd | 22 + .../pylibcudf/pylibcudf/nvtext/CMakeLists.txt | 16 +- .../pylibcudf/pylibcudf/nvtext/__init__.pxd | 4 +- python/pylibcudf/pylibcudf/nvtext/__init__.py | 4 +- .../pylibcudf/nvtext/wordpiece_tokenize.pxd | 15 + .../pylibcudf/nvtext/wordpiece_tokenize.pyi | 12 + .../pylibcudf/nvtext/wordpiece_tokenize.pyx | 66 ++ .../tests/test_nvtext_subword_tokenize.py | 36 +- 16 files changed, 1900 insertions(+), 10 deletions(-) create mode 100644 cpp/include/nvtext/wordpiece_tokenize.hpp create mode 100644 cpp/src/text/wordpiece_tokenize.cu create mode 100644 python/cudf/cudf/core/wordpiece_tokenize.py create mode 100644 python/pylibcudf/pylibcudf/libcudf/nvtext/wordpiece_tokenize.pxd create mode 100644 python/pylibcudf/pylibcudf/nvtext/wordpiece_tokenize.pxd create mode 100644 python/pylibcudf/pylibcudf/nvtext/wordpiece_tokenize.pyi create mode 100644 python/pylibcudf/pylibcudf/nvtext/wordpiece_tokenize.pyx diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 8952f11dd23..3018295c9eb 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -757,6 +757,7 @@ add_library( src/text/subword/wordpiece_tokenizer.cu src/text/tokenize.cu src/text/vocabulary_tokenize.cu + src/text/wordpiece_tokenize.cu src/transform/bools_to_mask.cu src/transform/compute_column.cu src/transform/encode.cu diff --git a/cpp/benchmarks/text/subword.cpp b/cpp/benchmarks/text/subword.cpp index 0b4e3bdefa5..d961ff86852 100644 --- a/cpp/benchmarks/text/subword.cpp +++ b/cpp/benchmarks/text/subword.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * Copyright (c) 2020-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -20,6 +20,7 @@ #include #include +#include #include @@ -57,7 +58,10 @@ static void bench_subword_tokenizer(nvbench::state& state) { auto const num_rows = static_cast(state.get_int64("num_rows")); - std::vector h_strings(num_rows, "This is a test "); + std::vector h_strings( + num_rows, + "This is a test This is a test This is a test This is a test This is a test This is a test " + "This is a test This is a test "); cudf::test::strings_column_wrapper strings(h_strings.begin(), h_strings.end()); static std::string hash_file = create_hash_vocab_file(); std::vector offsets{14}; @@ -83,3 +87,36 @@ static void bench_subword_tokenizer(nvbench::state& state) NVBENCH_BENCH(bench_subword_tokenizer) .set_name("subword_tokenize") .add_int64_axis("num_rows", {32768, 262144, 2097152}); + +static void bench_wordpiece_tokenizer(nvbench::state& state) +{ + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const max_words = static_cast(state.get_int64("max_words")); + + auto const h_strings = std::vector( + num_rows, + "This is a test This is a test This is a test This is a test This is a test This is a test " + "This is a test This is a test "); + auto const num_words = 32; // "This is a test" * 8 + auto const d_strings = cudf::test::strings_column_wrapper(h_strings.begin(), h_strings.end()); + auto const input = cudf::strings_column_view{d_strings}; + + auto const vocabulary = + cudf::test::strings_column_wrapper({"", "[UNK]", "This", "is", "a", "test"}); + auto const vocab = nvtext::load_wordpiece_vocabulary(cudf::strings_column_view(vocabulary)); + + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); + auto chars_size = input.chars_size(cudf::get_default_stream()); + state.add_global_memory_reads(chars_size); + auto out_size = num_rows * (max_words > 0 ? std::min(max_words, num_words) : num_words); + state.add_global_memory_writes(out_size); + + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + auto result = nvtext::wordpiece_tokenize(input, *vocab, max_words); + }); +} + +NVBENCH_BENCH(bench_wordpiece_tokenizer) + .set_name("wordpiece_tokenize") + .add_int64_axis("num_rows", {32768, 262144, 2097152}) + .add_int64_axis("max_words", {0, 20, 40}); diff --git a/cpp/include/nvtext/wordpiece_tokenize.hpp b/cpp/include/nvtext/wordpiece_tokenize.hpp new file mode 100644 index 00000000000..85c0f1406bd --- /dev/null +++ b/cpp/include/nvtext/wordpiece_tokenize.hpp @@ -0,0 +1,122 @@ +/* + * Copyright (c) 2024-2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include +#include +#include + +namespace CUDF_EXPORT nvtext { +/** + * @addtogroup nvtext_tokenize + * @{ + * @file + */ + +/** + * @brief Vocabulary object to be used with nvtext::wordpiece_tokenizer + * + * Use nvtext::load_wordpiece_vocabulary to create this object. 
+ */ +struct wordpiece_vocabulary { + /** + * @brief Vocabulary object constructor + * + * Token ids are the row indices within the vocabulary column. + * Each vocabulary entry is expected to be unique otherwise the behavior is undefined. + * + * @throw std::invalid_argument if `vocabulary` contains nulls or is empty + * + * @param input Strings for the vocabulary + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + */ + wordpiece_vocabulary(cudf::strings_column_view const& input, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); + ~wordpiece_vocabulary(); + + struct wordpiece_vocabulary_impl; + std::unique_ptr _impl; +}; + +/** + * @brief Create a tokenize_vocabulary object from a strings column + * + * Token ids are the row indices within the vocabulary column. + * Each vocabulary entry is expected to be unique otherwise the behavior is undefined. + * + * @throw std::invalid_argument if `vocabulary` contains nulls or is empty + * + * @param input Strings for the vocabulary + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return Object to be used with nvtext::tokenize_with_vocabulary + */ +std::unique_ptr load_wordpiece_vocabulary( + cudf::strings_column_view const& input, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); + +/** + * @brief Returns the token ids for the input string a wordpiece tokenizer + * algorithm with the given vocabulary + * + * Example: + * @code{.pseudo} + * vocabulary = ["[UNK]", "a", "have", "I", "new", "GP", "##U", "!"] + * v = load_wordpiece_vocabulary(vocabulary) + * input = ["I have a new GPU now !"] + * t = wordpiece_tokenize(i,v) + * t is now [[3, 2, 1, 4, 5, 6, 0, 7]] + * @endcode + * + * The `max_words_per_row` also optionally limits the output by only processing + * a maximum number of words per row. Here a word is defined as consecutive + * sequence of characters delimited by space character(s). + * + * Example: + * @code{.pseudo} + * vocabulary = ["[UNK]", "a", "have", "I", "new", "GP", "##U", "!"] + * v = load_wordpiece_vocabulary(vocabulary) + * input = ["I have a new GPU now !"] + * t4 = wordpiece_tokenize(i,v,4) + * t4 is now [[3, 2, 1, 4]] + * t5 = wordpiece_tokenize(i,v,5) + * t5 is now [[3, 2, 1, 4, 5, 6]] + * @endcode + * + * Any null row entry results in a corresponding null entry in the output. + * + * @param input Strings column to tokenize + * @param vocabulary Used to lookup tokens within `input` + * @param max_words_per_row Maximum number of words to tokenize for each row. + * Default 0 tokenizes all words. 
+ * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return Lists column of token ids + */ +std::unique_ptr wordpiece_tokenize( + cudf::strings_column_view const& input, + wordpiece_vocabulary const& vocabulary, + cudf::size_type max_words_per_row = 0, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); + +/** @} */ // end of tokenize group +} // namespace CUDF_EXPORT nvtext diff --git a/cpp/src/text/wordpiece_tokenize.cu b/cpp/src/text/wordpiece_tokenize.cu new file mode 100644 index 00000000000..817e7844201 --- /dev/null +++ b/cpp/src/text/wordpiece_tokenize.cu @@ -0,0 +1,890 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace nvtext { +namespace detail { +namespace { + +using string_hasher_type = cudf::hashing::detail::MurmurHash3_x86_32; +using hash_value_type = string_hasher_type::result_type; + +/** + * @brief Hasher used for vocabulary map + */ +struct vocab_hasher { + cudf::column_device_view const d_strings; + string_hasher_type hasher{}; + __device__ hash_value_type operator()(cudf::size_type index) const + { + return hasher(d_strings.element(index)); + } + __device__ hash_value_type operator()(cudf::string_view const& s) const { return hasher(s); } +}; +/** + * @brief Equality operator for vocabulary map + */ +struct vocab_equal { + cudf::column_device_view const d_strings; + __device__ bool operator()(cudf::size_type lhs, cudf::size_type rhs) const noexcept + { + return lhs == rhs; // all rows are expected to be unique + } + __device__ bool operator()(cudf::string_view const& lhs, cudf::size_type rhs) const noexcept + { + return d_strings.element(rhs) == lhs; + } +}; + +using cuco_storage = cuco::storage<1>; +using probe_scheme = cuco::linear_probing<1, vocab_hasher>; +using vocabulary_map_type = cuco::static_map, + cuda::thread_scope_thread, + vocab_equal, + probe_scheme, + cudf::detail::cuco_allocator, + cuco_storage>; + +/** + * @brief Hasher used for the subword vocabulary map + */ +struct sub_vocab_hasher { + cudf::column_device_view const d_strings; + string_hasher_type hasher{}; + __device__ hash_value_type operator()(cudf::size_type index) const + { + auto const d_str = d_strings.element(index); + // skip over the '##' prefix + return hasher(cudf::string_view(d_str.data() + 2, d_str.size_bytes() - 2)); + } + __device__ hash_value_type operator()(cudf::string_view const& s) const { return hasher(s); } +}; +/** + * @brief Equality operator used for the subword 
vocabulary map + * + * The subwords start with '##' prefix in the original vocabulary map + */ +struct sub_vocab_equal { + cudf::column_device_view const d_strings; + __device__ bool operator()(cudf::size_type lhs, cudf::size_type rhs) const noexcept + { + return lhs == rhs; // all rows are expected to be unique + } + __device__ bool operator()(cudf::string_view const& lhs, cudf::size_type rhs) const noexcept + { + auto const d_str = d_strings.element(rhs); + // skip over the '##' prefix + return lhs == cudf::string_view(d_str.data() + 2, d_str.size_bytes() - 2); + } +}; + +// This 2nd subword map helps avoid requiring temporary strings in device code +using sub_probe_scheme = cuco::linear_probing<1, sub_vocab_hasher>; +using sub_vocabulary_map_type = cuco::static_map, + cuda::thread_scope_thread, + sub_vocab_equal, + sub_probe_scheme, + cudf::detail::cuco_allocator, + cuco_storage>; +} // namespace +} // namespace detail + +// since column_device_view::create returns is a little more than +// std::unique_ptr this helper simplifies the return type in a maintainable way +using col_device_view = std::invoke_result_t; + +/** + * @brief Internal class manages all the data held by the vocabulary object + */ +struct wordpiece_vocabulary::wordpiece_vocabulary_impl { + std::unique_ptr const vocabulary; // copy of the original vocabulary input + col_device_view const d_vocabulary; + std::unique_ptr vocabulary_map; + std::unique_ptr vocabulary_sub_map; + cudf::size_type unk_id{}; // resolved [UNK] id from vocabulary + + auto get_map_ref() const { return vocabulary_map->ref(cuco::op::find); } + auto get_sub_map_ref() const { return vocabulary_sub_map->ref(cuco::op::find); } + + wordpiece_vocabulary_impl(std::unique_ptr&& vocab, + col_device_view&& d_vocab, + std::unique_ptr&& map, + std::unique_ptr&& sub_map, + cudf::size_type unk_id) + : vocabulary(std::move(vocab)), + d_vocabulary(std::move(d_vocab)), + vocabulary_map(std::move(map)), + vocabulary_sub_map(std::move(sub_map)), + unk_id{unk_id} + { + } +}; + +namespace { +/** + * @brief Identifies the column indices as the values in the vocabulary map + */ +struct key_pair { + __device__ auto operator()(cudf::size_type idx) const noexcept + { + return cuco::make_pair(idx, idx); + } +}; + +/** + * @brief For filtering the subword ('##' prefixed) entries in the vocabulary + */ +struct copy_pieces_fn { + cudf::column_device_view d_strings; + __device__ bool operator()(cudf::size_type idx) + { + auto const d_str = d_strings.element(idx); + if (d_str.size_bytes() < 2) { return false; } + return (d_str.data()[0] == '#') and (d_str.data()[1] == '#'); + } +}; + +/** + * @brief Resolves the [UNK] entry from the vocabulary + * + * This saves inlining the lookup code in several places in device code. + */ +template +struct resolve_unk_id { + MapRefType d_map; + __device__ cudf::size_type operator()(cudf::size_type idx) + { + // look for both since the normalizer may change the case to match the vocab table + auto const unk = idx == 0 ? cudf::string_view("[UNK]", 5) : cudf::string_view("[unk]", 5); + auto const fnd = d_map.find(unk); + return fnd != d_map.end() ? 
fnd->second : -1; + } +}; + +} // namespace + +wordpiece_vocabulary::wordpiece_vocabulary(cudf::strings_column_view const& input, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + CUDF_EXPECTS(not input.is_empty(), "vocabulary must not be empty", std::invalid_argument); + CUDF_EXPECTS(not input.has_nulls(), "vocabulary must not have nulls", std::invalid_argument); + + // hold a copy of the input (not expected to be very large) + auto vocabulary = std::make_unique(input.parent(), stream, mr); + auto d_vocabulary = cudf::column_device_view::create(vocabulary->view(), stream); + + // build the vocabulary map: each row is a single term and is the key for the map + auto vocab_map = std::make_unique( + static_cast(vocabulary->size() * 2), + cuco::empty_key{-1}, + cuco::empty_value{-1}, + detail::vocab_equal{*d_vocabulary}, + detail::probe_scheme{detail::vocab_hasher{*d_vocabulary}}, + cuco::thread_scope_thread, + detail::cuco_storage{}, + cudf::detail::cuco_allocator{rmm::mr::polymorphic_allocator{}, stream}, + stream.value()); + // the row index is the token id (data value for each key in the map) + auto iter = cudf::detail::make_counting_transform_iterator(0, key_pair{}); + vocab_map->insert_async(iter, iter + vocabulary->size(), stream.value()); + auto const zero_itr = thrust::counting_iterator(0); + + // get the indices of all the ## prefixed entries + auto sub_map_indices = rmm::device_uvector(vocabulary->size(), stream); + auto const end = + thrust::copy_if(rmm::exec_policy(stream), + zero_itr, + thrust::counting_iterator(sub_map_indices.size()), + sub_map_indices.begin(), + copy_pieces_fn{*d_vocabulary}); + sub_map_indices.resize(thrust::distance(sub_map_indices.begin(), end), stream); + + // build a 2nd map with just the ## prefixed items + auto vocab_sub_map = std::make_unique( + sub_map_indices.size() * 2, + cuco::empty_key{-1}, + cuco::empty_value{-1}, + detail::sub_vocab_equal{*d_vocabulary}, + detail::sub_probe_scheme{detail::sub_vocab_hasher{*d_vocabulary}}, + cuco::thread_scope_thread, + detail::cuco_storage{}, + cudf::detail::cuco_allocator{rmm::mr::polymorphic_allocator{}, stream}, + stream.value()); + // insert them without the '##' prefix since that is how they will be looked up + auto iter_sub = thrust::make_transform_iterator(sub_map_indices.begin(), key_pair{}); + vocab_sub_map->insert_async(iter_sub, iter_sub + sub_map_indices.size(), stream.value()); + + // prefetch the [unk] vocab entry + auto unk_ids = rmm::device_uvector(2, stream); + auto d_map = vocab_map->ref(cuco::op::find); + thrust::transform(rmm::exec_policy_nosync(stream), + zero_itr, + zero_itr + unk_ids.size(), + unk_ids.begin(), + resolve_unk_id{d_map}); + auto const id0 = unk_ids.front_element(stream); + auto const id1 = unk_ids.back_element(stream); + auto const unk_id = id0 >= 0 ? 
id0 : id1; + + _impl = std::make_unique(std::move(vocabulary), + std::move(d_vocabulary), + std::move(vocab_map), + std::move(vocab_sub_map), + unk_id); +} + +wordpiece_vocabulary::~wordpiece_vocabulary() {} + +std::unique_ptr load_wordpiece_vocabulary( + cudf::strings_column_view const& input, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + CUDF_FUNC_RANGE(); + return std::make_unique(input, stream, mr); +} + +namespace detail { +namespace { + +constexpr auto block_size = 128; +constexpr auto no_token = cuda::std::numeric_limits::max(); +constexpr auto max_word_size = 200; // words longer than this are not tokenized + +/** + * @brief Returns a new string_view truncating the last character of the input string + * + * This is more efficient than using substr() which is more generic. + */ +__device__ cudf::string_view remove_last_char(cudf::string_view d_str) +{ + if (d_str.size_bytes() < 2) { return cudf::string_view(); } + auto const begin = d_str.data(); + auto end = begin + d_str.size_bytes() - 1; + while ((end > begin) && cudf::strings::detail::is_utf8_continuation_char(*end)) { + --end; + } + auto const size = static_cast(thrust::distance(begin, end)); + return cudf::string_view(begin, size); +} + +/** + * @brief The wordpiece tokenizer + * + * The given word is looked up in the d_map and if found the corresponding + * token (integer) is returned. + * + * If not found, the function will iteratively remove the last character + * from the word and check the substring exists in the d_map until the + * the substring(s) is found. If still not found, the unk_id is returned. + * If found, the characters removed are iteratively checked against + * the d_sub_map until all have been located. If any of these are not found, + * the unk_id is returned. + * + * Example: word="GPU" and d_map contains { ... {"G",10}, {"##U",7}, {"##P",3}, ... } + * which means the d_sub_map contains { ... {"U",7}, {"P",3}, ... } + * Since "GPU" is not found in d_map, the 'U' is removed and rechecked. + * And "GP" is also not found so another character is removed leaving "G". + * The "G" is found in d_map so now the removed characters are processed + * starting with "PU" which is not found in d_sub_map. Removing the 'U' + * again results in "P" which is found in d_sub_map and iterating again + * locates 'U' in d_sub_map as well. The end result is that "GPU" produces + * 3 tokens [10,3,7]. 
+ * + * @param word Word to tokenize + * @param d_map Vocabulary to check for word and sub-words + * @param d_sub_map Partial vocabulary of '##' entries + * @param unk_id The unknown token id returned when no token is found + * @param d_tokens Output token ids are returned here + * @return The number of resolved tokens + */ +template +__device__ cudf::size_type wp_tokenize_fn(cudf::string_view word, + MapRefType const& d_map, + SubMapRefType const& d_sub_map, + cudf::size_type unk_id, + cudf::size_type* d_tokens) +{ + // lookup word in map + auto token_idx = 0; + auto itr = d_map.find(word); + if (itr != d_map.end()) { + d_tokens[token_idx++] = itr->second; + return token_idx; + } + + // reduce word by one character and try again + auto piece = remove_last_char(word); + while (!piece.empty()) { + itr = d_map.find(piece); + if (itr == d_map.end()) { + piece = remove_last_char(piece); + continue; + } + d_tokens[token_idx++] = itr->second; + break; + } + if (piece.empty()) { + // did not find anything; this is not common + d_tokens[token_idx++] = unk_id; + return token_idx; + } + + word = + cudf::string_view(word.data() + piece.size_bytes(), word.size_bytes() - piece.size_bytes()); + piece = word; + while (!piece.empty()) { + auto itr = d_sub_map.find(piece); + if (itr == d_sub_map.end()) { + piece = remove_last_char(piece); + continue; + } + d_tokens[token_idx++] = itr->second; + + word = + cudf::string_view(word.data() + piece.size_bytes(), word.size_bytes() - piece.size_bytes()); + piece = word; + } + if (!word.empty()) { + // very uncommon + d_tokens[0] = unk_id; + // need to reset any previous ids too + for (auto i = 1; i < token_idx; ++i) { + d_tokens[i] = no_token; + } + token_idx = 1; + } + + return token_idx; +} + +/** + * @brief Kernel for tokenizing all words + * + * Launched as a thread per edge value in d_edges. + * + * Each value in d_edge is the beginning of a word. + * The kernel searches for a matching space character (or the next d_edge value) + * to find the end of the word. + * The result is then tokenized using the wp_tokenize_fn utility. + * + * @param d_edges The offset to the beginning of each word + * @param d_chars Pointer to the characters of the input column + * @param offset Maybe non-zero if the input column has been sliced + * @param d_map Lookup table for the wp_tokenize_fn utility + * @param d_sub_map 2nd lookup table for the wp_tokenize_fn utility + * @param unk_id Unknown token id when a token cannot be resolved + * @param d_tokens Output tokens are written here + */ +template +CUDF_KERNEL void tokenize_all_kernel(cudf::device_span d_edges, + char const* d_chars, + // int64_t offset, + MapRefType const d_map, + SubMapRefType const d_sub_map, + cudf::size_type unk_id, + cudf::size_type* d_tokens) +{ + auto const idx = cudf::detail::grid_1d::global_thread_id(); + if (idx >= (d_edges.size() - 1)) { return; } + auto const begin = d_chars + d_edges[idx]; + auto const end = d_chars + d_edges[idx + 1]; + auto const word_end = thrust::find(thrust::seq, begin, end, ' '); + auto const size = static_cast(thrust::distance(begin, word_end)); + if (size == 0) { return; } + auto d_output = d_tokens + d_edges[idx]; // - offset; + if (size >= max_word_size) { + *d_output = unk_id; + return; + } + auto const word = cudf::string_view{begin, size}; + wp_tokenize_fn(word, d_map, d_sub_map, unk_id, d_output); +} + +/** + * @brief Count the number of tokens per output row + * + * Uses segmented-reduce to compute the number of tokens per row. 
+ * + * @param d_tokens The tokens to count + * @param offsets The offsets for the segmented-reduce + * @param offset Maybe non-zero if the input column has been sliced + * @param size The number of output rows (same as the number of input rows) + * @param stream Stream used for device allocations and kernel launches + * @return The number of tokens per row + */ +template +rmm::device_uvector count_tokens(cudf::size_type const* d_tokens, + OffsetType offsets, + int64_t offset, + cudf::size_type size, + rmm::cuda_stream_view stream) +{ + auto d_counts = rmm::device_uvector(size, stream); + + // transform iterator used for counting the number of !no_tokens + auto const d_in = cudf::detail::make_counting_transform_iterator( + 0, cuda::proclaim_return_type([d_tokens] __device__(auto idx) { + return static_cast(d_tokens[idx] != no_token); + })); + + auto const d_offsets = cudf::detail::make_counting_transform_iterator( + 0, cuda::proclaim_return_type([offsets, offset] __device__(auto idx) { + return offsets[idx] - offset; + })); + + auto temp = std::size_t{0}; + auto d_out = d_counts.data(); + cub::DeviceSegmentedReduce::Sum( + nullptr, temp, d_in, d_out, size, d_offsets, d_offsets + 1, stream.value()); + auto d_temp = rmm::device_buffer{temp, stream}; + cub::DeviceSegmentedReduce::Sum( + d_temp.data(), temp, d_in, d_out, size, d_offsets, d_offsets + 1, stream.value()); + + return d_counts; +} + +/** + * @brief Compute all tokens for the input column + * + * @param input Input strings column + * @param first_offset Offset to first row in chars for `input` + * @param last_offset Offset just past the last row in chars for `input` + * @param vocabulary Vocabulary data needed by the tokenizer + * @param stream Stream used for device allocations and kernel launches + * @return The tokens (and non-tokens) for the input + */ +rmm::device_uvector compute_all_tokens( + cudf::strings_column_view const& input, + int64_t first_offset, + int64_t chars_size, + wordpiece_vocabulary::wordpiece_vocabulary_impl const& vocabulary, + rmm::cuda_stream_view stream) +{ + auto const d_input_chars = input.chars_begin(stream) + first_offset; + + // find beginnings of words + auto d_edges = rmm::device_uvector(chars_size / 2L, stream); + // beginning of a word is a non-space preceded by a space + auto edges_end = cudf::detail::copy_if_safe( + thrust::counting_iterator(0), + thrust::counting_iterator(chars_size), + d_edges.begin(), + [d_input_chars] __device__(auto idx) { + if (idx == 0) { return d_input_chars[idx] == ' '; } + return (d_input_chars[idx] != ' ' && d_input_chars[idx - 1] == ' '); + }, + stream); + + auto const edges_count = + input.size() + 1 + static_cast(thrust::distance(d_edges.begin(), edges_end)); + // thrust::merge has an int32 max limit currently + CUDF_EXPECTS(edges_count < std::numeric_limits::max(), "words exceed internal limit"); + + auto const input_offsets = + cudf::detail::offsetalator_factory::make_input_iterator(input.offsets(), input.offset()); + auto const d_offsets = cudf::detail::make_counting_transform_iterator( + 0, cuda::proclaim_return_type([input_offsets, first_offset] __device__(auto idx) { + return input_offsets[idx] - first_offset; + })); + + // merge in the input offsets to identify words starting each row + auto d_all_edges = [&] { + auto d_all_edges = rmm::device_uvector(edges_count, stream); + thrust::merge(rmm::exec_policy_nosync(stream), + d_offsets, + d_offsets + input.size() + 1, + d_edges.begin(), + edges_end, + d_all_edges.begin()); + d_edges.release(); // done with 
this + return d_all_edges; + }(); + + auto const map_ref = vocabulary.get_map_ref(); + auto const sub_map_ref = vocabulary.get_sub_map_ref(); + auto const unk_id = vocabulary.unk_id; + + auto d_tokens = rmm::device_uvector(chars_size, stream); + thrust::uninitialized_fill( + rmm::exec_policy_nosync(stream), d_tokens.begin(), d_tokens.end(), no_token); + + cudf::detail::grid_1d grid{static_cast(d_all_edges.size()), 512}; + tokenize_all_kernel + <<>>( + d_all_edges, d_input_chars, map_ref, sub_map_ref, unk_id, d_tokens.data()); + + return d_tokens; +} + +constexpr cudf::size_type no_word = cuda::std::numeric_limits::max(); +constexpr int64_t no_word64 = cuda::std::numeric_limits::max(); + +/** + * @brief Find word boundaries kernel + * + * Launched as a warp per string in the input column. + * + * Finds the edges of words within each string and stores them into 'starts' and 'sizes'. + * This kernel is used when a maximum number of words are to be processed per row. + * + * @param d_strings Input strings column + * @param d_chars The beginning of the character data for d_strings + * already adjusted for any sliced offset + * @param offsets The offsets for the output arrays: starts and sizes + * @param starts The output offsets within d_chars identifying the beginning of words + * @param sizes The output size of the words corresponding to starts + */ +template +CUDF_KERNEL void find_words_kernel(cudf::column_device_view const d_strings, + char const* d_chars, + int64_t const* offsets, + int64_t* starts, + cudf::size_type* sizes) +{ + auto const idx = cudf::detail::grid_1d::global_thread_id(); + auto const str_idx = idx / tile_size; + if (str_idx >= d_strings.size()) { return; } + if (d_strings.is_null(str_idx)) { return; } + auto const d_str = d_strings.element(str_idx); + if (d_str.empty()) { return; } + auto const str_offset = static_cast(thrust::distance(d_chars, d_str.data())); + + auto const d_start_words = starts + offsets[str_idx]; + auto const d_word_sizes = sizes + offsets[str_idx]; + auto const max_words = static_cast(offsets[str_idx + 1] - offsets[str_idx]); + + constexpr auto bytes_per_thread = 6; // average 5 chars per word plus space + constexpr auto words_size = block_size * bytes_per_thread; + __shared__ cudf::size_type s_start_words[words_size]; + __shared__ cudf::size_type s_end_words[words_size]; + // compiler is not able to find this for some reason so defining it here as well + constexpr auto no_word = cuda::std::numeric_limits::max(); + + namespace cg = cooperative_groups; + auto const block = cg::this_thread_block(); + auto const tile = cg::tiled_partition(block); + + auto const lane_idx = tile.thread_rank(); + auto const warp_idx = tile.meta_group_rank(); + auto const warp_words = words_size / tile.meta_group_size(); + + cudf::size_type word_count = 0; + cudf::size_type byte_count = 0; + + auto first_word = no_word; // only used by lane_idx==0 + auto const begin = d_str.data(); + auto const end = begin + d_str.size_bytes(); + + auto start_words = s_start_words + (warp_idx * warp_words); + auto end_words = s_end_words + (warp_idx * warp_words); + + // continue until all bytes have been consumed or the max word count has been reached + auto itr = begin + lane_idx; + while (word_count < max_words && byte_count < d_str.size_bytes()) { + // initialize all intermediate results + start_words[lane_idx] = lane_idx > 0 ? 
no_word : first_word; + end_words[lane_idx] = no_word; + for (auto j = lane_idx + tile_size; j < warp_words; j += tile_size) { + start_words[j] = no_word; + end_words[j] = no_word; + } + tile.sync(); + + cudf::size_type last_idx = 0; + // each thread processes bytes_per_thread of the d_str + for (auto k = lane_idx; k < warp_words && itr < end; k += tile_size) { + // look for word starts (non-space preceded by a space) + if ((*itr != ' ') && ((itr == begin) || (*(itr - 1) == ' '))) { + last_idx = (k / 2) + 1; + start_words[last_idx] = static_cast(thrust::distance(begin, itr)); + } + // look for word ends (space preceded by non-space) + if (((itr + 1) == end) || ((itr != begin) && (*itr == ' ') && (*(itr - 1) != ' '))) { + auto const adjust = static_cast(*itr != ' '); // edge case + last_idx = (k / 2) + adjust; + end_words[last_idx] = static_cast(thrust::distance(begin, itr)) + adjust; + } + itr += tile_size; + } + // keep track of how much of start_words/end_words we used + last_idx = cg::reduce(tile, last_idx, cg::greater{}) + 1; + + cudf::size_type output_count = 0; + if (lane_idx == 0) { + // compress out the no-words + auto const count = static_cast(thrust::distance( + start_words, thrust::remove(thrust::seq, start_words, start_words + last_idx, no_word))); + auto const words_found = static_cast(thrust::distance( + end_words, thrust::remove(thrust::seq, end_words, end_words + last_idx, no_word))); + // this partially resolved word wraps around for the next iteration + first_word = (count > words_found) ? start_words[words_found] : no_word; + output_count = cuda::std::min(words_found, max_words - word_count); + } + + // copy results to the output + auto out_starts = d_start_words + word_count; + auto out_sizes = d_word_sizes + word_count; + output_count = tile.shfl(output_count, 0); // copy output_count to all threads + for (auto k = lane_idx; k < output_count; k += tile_size) { + auto const start = start_words[k]; + out_starts[k] = start + str_offset; + out_sizes[k] = end_words[k] - start; + } + + word_count += output_count; + byte_count += tile_size * bytes_per_thread; + tile.sync(); + } + + // fill in the remainder of the output + auto out_starts = d_start_words + word_count; + auto out_sizes = d_word_sizes + word_count; + for (auto k = lane_idx; k < (max_words - word_count); k += tile_size) { + out_starts[k] = no_word64; + out_sizes[k] = no_word; + } +} + +/** + * @brief Limiting tokenizing kernel + * + * Launched as a thread per d_starts (and d_sizes) values. + * + * This kernel is provided word boundaries as d_starts and d_sizes. + * The start of the word at index idx is d_start[idx]. + * The size of that word is d_size[idx]. + * + * The wp_tokenize_fn is used to output the tokens for each word + * appropriately into d_tokens. 
+ * + * @param d_starts The start of each word in d_chars + * @param d_sizes The corresponding size of the word pointed to by d_starts + * @param d_chars Points to the beginning of the characters of the input column + * @param d_map Lookup table for the wp_tokenize_fn utility + * @param d_sub_map 2nd lookup table for the wp_tokenize_fn utility + * @param unk_id Unknown token id when a token cannot be resolved + * @param d_tokens Output tokens are written here + */ +template +CUDF_KERNEL void tokenize_kernel(cudf::device_span d_starts, + cudf::device_span d_sizes, + char const* d_chars, + MapRefType const d_map, + SubMapRefType const d_sub_map, + cudf::size_type unk_id, + cudf::size_type* d_tokens) +{ + auto const idx = cudf::detail::grid_1d::global_thread_id(); + if (idx >= d_starts.size()) { return; } + auto const size = d_sizes[idx]; + if (size <= 0 || size == no_word) { return; } + auto const start = d_starts[idx]; + auto const begin = d_chars + start; + auto d_output = d_tokens + start; + if (size >= max_word_size) { + *d_output = unk_id; + return; + } + auto const word = cudf::string_view{begin, size}; + wp_tokenize_fn(word, d_map, d_sub_map, unk_id, d_output); +} + +/** + * @brief Compute tokens limited to `max_words_per_row` + * + * @param input Input strings column + * @param first_offset Offset to first row in chars for `input` + * @param last_offset Offset just past the last row in chars for `input` + * @param max_words_per_row Maximum number of words to tokenize in each row + * @param vocabulary Vocabulary data needed by the tokenizer + * @param stream Stream used for device allocations and kernel launches + * @return The tokens (and non-tokens) for the input + */ +rmm::device_uvector compute_some_tokens( + cudf::strings_column_view const& input, + int64_t first_offset, + int64_t chars_size, + cudf::size_type max_words_per_row, + wordpiece_vocabulary::wordpiece_vocabulary_impl const& vocabulary, + rmm::cuda_stream_view stream) +{ + auto const d_input_chars = input.chars_begin(stream) + first_offset; + + auto const d_strings = cudf::column_device_view::create(input.parent(), stream); + auto max_word_offsets = rmm::device_uvector(input.size() + 1, stream); + + // compute max word counts for each row + thrust::transform(rmm::exec_policy_nosync(stream), + thrust::counting_iterator(0), + thrust::counting_iterator(input.size()), + max_word_offsets.begin(), + cuda::proclaim_return_type( + [d_strings = *d_strings, max_words_per_row] __device__(auto idx) { + if (idx >= d_strings.size()) { return 0; } + if (d_strings.is_null(idx)) { return 0; } + auto const d_str = d_strings.element(idx); + return cuda::std::min(max_words_per_row, d_str.size_bytes() / 2); + })); + + auto const max_size = cudf::detail::sizes_to_offsets( + max_word_offsets.begin(), max_word_offsets.end(), max_word_offsets.begin(), 0, stream); + + auto start_words = rmm::device_uvector(max_size, stream); + auto word_sizes = rmm::device_uvector(max_size, stream); + + // find start/end for each row up to max_words_per_row words; + // store word positions in start_words and sizes in word_sizes + cudf::detail::grid_1d grid_find{input.size() * cudf::detail::warp_size, block_size}; + find_words_kernel + <<>>( + *d_strings, d_input_chars, max_word_offsets.data(), start_words.data(), word_sizes.data()); + + // remove the non-words + auto const end = + thrust::remove(rmm::exec_policy(stream), start_words.begin(), start_words.end(), no_word64); + auto const check = + thrust::remove(rmm::exec_policy(stream), word_sizes.begin(), 
word_sizes.end(), no_word); + + auto const total_words = static_cast(thrust::distance(start_words.begin(), end)); + // this should only trigger if there is a bug in the code above + CUDF_EXPECTS(total_words == static_cast(thrust::distance(word_sizes.begin(), check)), + "error resolving word locations from input column"); + start_words.resize(total_words, stream); // always + word_sizes.resize(total_words, stream); // smaller + + auto const map_ref = vocabulary.get_map_ref(); + auto const sub_map_ref = vocabulary.get_sub_map_ref(); + auto const unk_id = vocabulary.unk_id; + + auto d_tokens = rmm::device_uvector(chars_size, stream); + thrust::uninitialized_fill( + rmm::exec_policy_nosync(stream), d_tokens.begin(), d_tokens.end(), no_token); + + cudf::detail::grid_1d grid{total_words, 512}; + tokenize_kernel + <<>>( + start_words, word_sizes, d_input_chars, map_ref, sub_map_ref, unk_id, d_tokens.data()); + + return d_tokens; +} + +} // namespace + +std::unique_ptr wordpiece_tokenize(cudf::strings_column_view const& input, + wordpiece_vocabulary const& vocabulary, + cudf::size_type max_words_per_row, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + CUDF_EXPECTS(max_words_per_row >= 0, "Invalid value for max_words_per_row argument"); + + auto const output_type = cudf::data_type{cudf::type_to_id()}; + if (input.size() == input.null_count()) { + return input.has_nulls() + ? cudf::lists::detail::make_all_nulls_lists_column( + input.size(), output_type, stream, mr) + : cudf::lists::detail::make_empty_lists_column(output_type, stream, mr); + } + + auto [first_offset, last_offset] = + cudf::strings::detail::get_first_and_last_offset(input, stream); + auto const chars_size = last_offset - first_offset; + + auto d_tokens = + max_words_per_row == 0 + ? 
compute_all_tokens(input, first_offset, chars_size, *(vocabulary._impl), stream) + : compute_some_tokens( + input, first_offset, chars_size, max_words_per_row, *(vocabulary._impl), stream); + + // compute token counts by doing a segmented reduce over valid d_tokens + auto const input_offsets = + cudf::detail::offsetalator_factory::make_input_iterator(input.offsets(), input.offset()); + auto const d_token_counts = + count_tokens(d_tokens.data(), input_offsets, first_offset, input.size(), stream); + + auto [token_offsets, total_count] = cudf::detail::make_offsets_child_column( + d_token_counts.begin(), d_token_counts.end(), stream, mr); + + auto tokens = + cudf::make_numeric_column(output_type, total_count, cudf::mask_state::UNALLOCATED, stream, mr); + auto output = tokens->mutable_view().begin(); + thrust::remove_copy( + rmm::exec_policy_nosync(stream), d_tokens.begin(), d_tokens.end(), output, no_token); + + return cudf::make_lists_column(input.size(), + std::move(token_offsets), + std::move(tokens), + input.null_count(), + cudf::detail::copy_bitmask(input.parent(), stream, mr), + stream, + mr); +} +} // namespace detail + +std::unique_ptr wordpiece_tokenize(cudf::strings_column_view const& input, + wordpiece_vocabulary const& vocabulary, + cudf::size_type max_words_per_row, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + CUDF_FUNC_RANGE(); + return detail::wordpiece_tokenize(input, vocabulary, max_words_per_row, stream, mr); +} + +} // namespace nvtext diff --git a/cpp/tests/text/subword_tests.cpp b/cpp/tests/text/subword_tests.cpp index 782551ad66e..706c704c3d7 100644 --- a/cpp/tests/text/subword_tests.cpp +++ b/cpp/tests/text/subword_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * Copyright (c) 2020-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
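The greedy matching documented above for `wp_tokenize_fn` (shrink the word from the right until the full vocabulary yields a hit, then resolve the remaining characters against the '##' sub-vocabulary) can be illustrated with a small host-side sketch. This is a minimal Python illustration of the same logic, not the library code: the dictionaries, token ids, and the helper name `wp_tokenize_word` are made up, standing in for the row indices of the vocabulary column and its stripped '##' sub-map.

    # Illustrative host-side sketch of the greedy matching in wp_tokenize_fn.
    # Ids are made up; in the library they are row indices of the vocabulary column.
    def wp_tokenize_word(word, vocab, sub_vocab, unk_id):
        if word in vocab:                      # whole word matches directly
            return [vocab[word]]
        piece = word
        while piece and piece not in vocab:    # shrink from the right until a prefix is found
            piece = piece[:-1]
        if not piece:                          # nothing matched at all
            return [unk_id]
        tokens = [vocab[piece]]
        rest = word[len(piece):]
        while rest:                            # resolve the suffix against the '##' sub-vocabulary
            piece = rest
            while piece and piece not in sub_vocab:
                piece = piece[:-1]
            if not piece:                      # unresolved suffix: the whole word becomes [UNK]
                return [unk_id]
            tokens.append(sub_vocab[piece])
            rest = rest[len(piece):]
        return tokens

    vocab = {"G": 10, "##P": 3, "##U": 7}
    sub_vocab = {"P": 3, "U": 7}               # '##' entries with the prefix stripped
    print(wp_tokenize_word("GPU", vocab, sub_vocab, unk_id=0))  # [10, 3, 7]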
@@ -17,11 +17,13 @@ #include #include #include +#include #include #include #include +#include #include #include @@ -438,3 +440,149 @@ TEST(TextSubwordTest, ZeroHashBinCoefficient) CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tensor_attention_mask->view(), expected_attn); CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tensor_metadata->view(), expected_metadata); } + +TEST(TextSubwordTest, WordPiece) +{ + auto vocabulary = cudf::test::strings_column_wrapper( + {"ate", "brown", "cheese", "dog", "fox", "jumped", "lazy", "quick", "over", "the", "[UNK]"}); + auto vocab = nvtext::load_wordpiece_vocabulary(cudf::strings_column_view(vocabulary)); + + auto input = cudf::test::strings_column_wrapper( + {"the quick brown fox jumped over", + "the lazy brown dog", + " ate brown cheese dog fox jumped lazy quick over the [UNK] "}); + auto sv = cudf::strings_column_view(input); + auto results = nvtext::wordpiece_tokenize(sv, *vocab); + + using LCW = cudf::test::lists_column_wrapper; + // clang-format off + auto expected = LCW({LCW{ 9, 7, 1, 4, 5, 8}, + LCW{ 9, 6, 1, 3}, + LCW{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10}}); + // clang-format on + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + + results = nvtext::wordpiece_tokenize(sv, *vocab, 5); + // clang-format off + expected = LCW({LCW{ 9, 7, 1, 4, 5}, + LCW{ 9, 6, 1, 3}, + LCW{ 0, 1, 2, 3, 4}}); + // clang-format on + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); +} + +TEST(TextSubwordTest, WordPieceWithSubwords) +{ + auto vocabulary = + cudf::test::strings_column_wrapper({"", "[UNK]", "!", "a", "I", "G", "have", "##P", "##U"}); + auto vocab = nvtext::load_wordpiece_vocabulary(cudf::strings_column_view(vocabulary)); + + auto input = + cudf::test::strings_column_wrapper({"I have a GPU ! ", "do not have a gpu", "no gpu"}); + auto sv = cudf::strings_column_view(input); + auto results = nvtext::wordpiece_tokenize(sv, *vocab); + + using LCW = cudf::test::lists_column_wrapper; + // clang-format off + auto expected = LCW({LCW{4, 6, 3, 5, 7, 8, 2}, + LCW{1, 1, 6, 3, 1}, + LCW{1, 1}}); + // clang-format on + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + + // max is applied to input words and not output tokens + results = nvtext::wordpiece_tokenize(sv, *vocab, 6); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + + results = nvtext::wordpiece_tokenize(sv, *vocab, 4); + // clang-format off + expected = LCW({LCW{4, 6, 3, 5, 7, 8}, + LCW{1, 1, 6, 3}, + LCW{1, 1}}); + // clang-format on + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); +} + +TEST(TextSubwordTest, WordPieceSliced) +{ + auto vocabulary = cudf::test::strings_column_wrapper( + {"ate", "brown", "cheese", "dog", "fox", "jumped", "lazy", "quick", "over", "the", "[UNK]"}); + auto vocab = nvtext::load_wordpiece_vocabulary(cudf::strings_column_view(vocabulary)); + + auto input = cudf::test::strings_column_wrapper( + {" ate the cheese dog quick over lazy day ", + "the quick brown fox jumped over", + "the lazy brown dog", + " ate brown cheese dog fox jumped lazy quick over the [UNK] ", + " ate the cheese dog quick over lazy day "}); + + auto sliced = cudf::slice(input, {1, 4}); + auto sv = cudf::strings_column_view(sliced.front()); + auto results = nvtext::wordpiece_tokenize(sv, *vocab); + + using LCW = cudf::test::lists_column_wrapper; + // clang-format off + auto expected = LCW({LCW{ 9, 7, 1, 4, 5, 8}, + LCW{ 9, 6, 1, 3}, + LCW{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10}}); + // clang-format on + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + + results = nvtext::wordpiece_tokenize(sv, *vocab, 5); + // 
clang-format off + expected = LCW({LCW{ 9, 7, 1, 4, 5}, + LCW{ 9, 6, 1, 3}, + LCW{ 0, 1, 2, 3, 4}}); + // clang-format on + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); +} + +TEST(TextSubwordTest, WordPieceEmpty) +{ + auto vocabulary = cudf::test::strings_column_wrapper({""}); + auto vocab = nvtext::load_wordpiece_vocabulary(cudf::strings_column_view(vocabulary)); + auto input = cudf::test::strings_column_wrapper(); + auto sv = cudf::strings_column_view(input); + auto results = nvtext::wordpiece_tokenize(sv, *vocab); + EXPECT_EQ(results->size(), 0); + results = nvtext::wordpiece_tokenize(sv, *vocab, 10); + EXPECT_EQ(results->size(), 0); +} + +TEST(TextSubwordTest, WordPieceAllNulls) +{ + auto vocabulary = cudf::test::strings_column_wrapper({""}); + auto vocab = nvtext::load_wordpiece_vocabulary(cudf::strings_column_view(vocabulary)); + auto input = cudf::test::strings_column_wrapper({"", "", ""}, {false, false, false}); + auto sv = cudf::strings_column_view(input); + auto results = nvtext::wordpiece_tokenize(sv, *vocab); + using LCW = cudf::test::lists_column_wrapper; + auto expected = LCW({LCW{}, LCW{}, LCW{}}, cudf::test::iterators::all_nulls()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + results = nvtext::wordpiece_tokenize(sv, *vocab, 10); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); +} + +TEST(TextSubwordTest, WordPieceNoTokens) +{ + auto vocabulary = cudf::test::strings_column_wrapper({"x"}); + auto vocab = nvtext::load_wordpiece_vocabulary(cudf::strings_column_view(vocabulary)); + auto input = cudf::test::strings_column_wrapper({" ", " www ", "xxxx"}); + auto sv = cudf::strings_column_view(input); + auto results = nvtext::wordpiece_tokenize(sv, *vocab); + using LCW = cudf::test::lists_column_wrapper; + LCW expected({LCW{}, LCW{-1}, LCW{-1}}); // -1 indicates [unk] not found in vocabulary + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + results = nvtext::wordpiece_tokenize(sv, *vocab, 10); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); +} + +TEST(TextSubwordTest, WordPieceErrors) +{ + auto empty = cudf::test::strings_column_wrapper(); + EXPECT_THROW(nvtext::load_wordpiece_vocabulary(cudf::strings_column_view(empty)), + std::invalid_argument); + auto nulls = cudf::test::strings_column_wrapper({"", "", ""}, {false, false, false}); + EXPECT_THROW(nvtext::load_wordpiece_vocabulary(cudf::strings_column_view(nulls)), + std::invalid_argument); +} diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 9f3512369a0..7adea963868 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -6527,6 +6527,20 @@ def tokenize_with_vocabulary( ) ) + @acquire_spill_lock() + def wordpiece_tokenize( + self, + vocabulary: plc.nvtext.wordpiece_tokenize.WordPieceVocabulary, + max_words_per_row: int, + ) -> Self: + return type(self).from_pylibcudf( # type: ignore[return-value] + plc.nvtext.wordpiece_tokenize.wordpiece_tokenize( + self.to_pylibcudf(mode="read"), + vocabulary, + max_words_per_row, + ) + ) + @acquire_spill_lock() def detokenize(self, indices: ColumnBase, separator: plc.Scalar) -> Self: return type(self).from_pylibcudf( # type: ignore[return-value] diff --git a/python/cudf/cudf/core/wordpiece_tokenize.py b/python/cudf/cudf/core/wordpiece_tokenize.py new file mode 100644 index 00000000000..7a54a430997 --- /dev/null +++ b/python/cudf/cudf/core/wordpiece_tokenize.py @@ -0,0 +1,44 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. 
+ +from __future__ import annotations + +import pylibcudf as plc + +import cudf + + +class WordPieceVocabulary: + """ + A vocabulary object used to tokenize input text. + + Parameters + ---------- + vocabulary : cudf.Series + Strings column of vocabulary terms + """ + + def __init__(self, vocabulary: cudf.Series) -> None: + self.vocabulary = plc.nvtext.wordpiece_tokenize.WordPieceVocabulary( + vocabulary._column.to_pylibcudf(mode="read") + ) + + def tokenize(self, text, max_words_per_row: int = 0) -> cudf.Series: + """ + Parameters + ---------- + text : cudf.Series + The strings to be tokenized. + max_words_per_row : int + Maximum number of words to tokenize per row. + Default 0 tokenizes all words. + + Returns + ------- + cudf.Series + Token values + """ + result = text._column.wordpiece_tokenize( + self.vocabulary, max_words_per_row + ) + + return cudf.Series._from_column(result) diff --git a/python/cudf/cudf/tests/text/test_subword_tokenizer.py b/python/cudf/cudf/tests/text/test_subword_tokenizer.py index 78b58344374..7927f8777be 100644 --- a/python/cudf/cudf/tests/text/test_subword_tokenizer.py +++ b/python/cudf/cudf/tests/text/test_subword_tokenizer.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. import os import cupy @@ -7,6 +7,7 @@ import cudf from cudf.core.subword_tokenizer import SubwordTokenizer +from cudf.core.wordpiece_tokenize import WordPieceVocabulary from cudf.testing import assert_eq @@ -237,3 +238,473 @@ def test_text_subword_tokenize(tmpdir): ) expected_metadata = expected_metadata.reshape(-1, 3) assert_eq(expected_metadata, metadata) + + +@pytest.mark.parametrize("max_words", [0, 200, 10]) +def test_text_wordpiece_tokenize(max_words, datadir): + s = cudf.Series( + [ + "The British Isles have been ringing for the last few years with the word ' Art ' in its German sense ; ", + "with ' High Art , ' ' Symbolic Art , ' ' Ecclesiastical Art , ' ' Dramatic Art , ' ' Tragic Art , ' and so forth ; ", + "and every well - educated person is expected , nowadays , to know something about Art . " + "Yet in spite of all translations of German ' AEsthetic ' treatises , and ' Kunstnovellen , ' the mass of the British people cares very little about the matter , and sits contented under the imputation of ' bad taste . ", + " ' Our stage , long since dead , does not revive ; our poetry is dying ; our music , like our architecture , only reproduces the past ; ", + "our painting is only first - rate when it handles landscapes and animals , and seems likely so to remain ; ", + "but , meanwhile , nobody cares . 
Some of the deepest and most earnest minds vote the question , in general , a ' sham and a snare , ' and whisper to each other", + ] + ) + vocab_file = os.path.join(datadir, "bert_base_cased_sampled/vocab.txt") + vc = cudf.read_text(vocab_file, delimiter="\n", strip_delimiters=True) + wpt = WordPieceVocabulary(vc) + wpr = wpt.tokenize(s, max_words) + expected = cudf.Series( + [ + cudf.Series( + [ + 1109, + 1418, + 2181, + 2897, + 1138, + 1151, + 3170, + 1158, + 1111, + 1103, + 1314, + 1374, + 1201, + 1114, + 1103, + 1937, + 112, + 2051, + 112, + 1107, + 1157, + 1528, + 2305, + 132, + ], + dtype=np.int32, + ), + cudf.Series( + [ + 1114, + 112, + 1693, + 2051, + 117, + 112, + 112, + 156, + 1183, + 1306, + 1830, + 1186, + 2646, + 1665, + 2051, + 117, + 112, + 112, + 142, + 1665, + 1665, + 2897, + 1465, + 2050, + 1596, + 1348, + 2051, + 117, + 112, + 112, + 1987, + 2312, + 2980, + 1596, + 2051, + 117, + 112, + 112, + 157, + 1611, + 1403, + 1596, + 2051, + 117, + 112, + 1105, + 1177, + 1111, + 1582, + 132, + ], + dtype=np.int32, + ), + cudf.Series( + [ + 1105, + 1451, + 1218, + 118, + 174, + 1181, + 1358, + 2599, + 1906, + 1825, + 1110, + 2637, + 117, + 1208, + 1161, + 1810, + 1183, + 1116, + 117, + 1106, + 1221, + 1380, + 1164, + 2051, + 119, + 162, + 2105, + 1107, + 188, + 1643, + 3150, + 1104, + 1155, + 189, + 1611, + 2316, + 1742, + 2116, + 1116, + 1104, + 1528, + 112, + 138, + 2036, + 2050, + 1324, + 2105, + 1596, + 112, + 189, + 1874, + 2980, + 1548, + 1279, + 117, + 1105, + 112, + 148, + 3488, + 2050, + 2728, + 2707, + 2339, + 1424, + 117, + 112, + 1103, + 3367, + 1104, + 1103, + 1418, + 1234, + 1920, + 1116, + 1304, + 1376, + 1164, + 1103, + 2187, + 117, + 1105, + 3465, + 1116, + 3438, + 1174, + 1223, + 1103, + 178, + 1306, + 1643, + 1358, + 1777, + 2116, + 1104, + 112, + 2213, + 189, + 2225, + 1566, + 119, + ], + dtype=np.int32, + ), + cudf.Series( + [ + 112, + 3458, + 2016, + 117, + 1263, + 1290, + 2044, + 117, + 1674, + 1136, + 1231, + 1964, + 2109, + 132, + 1412, + 185, + 1186, + 2105, + 1616, + 1110, + 173, + 1183, + 1158, + 132, + 1412, + 1390, + 117, + 1176, + 1412, + 170, + 1197, + 1732, + 3150, + 1665, + 1204, + 3313, + 117, + 1178, + 1231, + 1643, + 2180, + 1181, + 1358, + 2093, + 1116, + 1103, + 1763, + 132, + ], + dtype=np.int32, + ), + cudf.Series( + [ + 1412, + 2489, + 1916, + 1110, + 1178, + 1148, + 118, + 2603, + 1165, + 1122, + 1289, + 2897, + 1657, + 1116, + 2599, + 3186, + 1116, + 1105, + 1126, + 1182, + 1918, + 3447, + 117, + 1105, + 3093, + 2620, + 1177, + 1106, + 3118, + 132, + ], + dtype=np.int32, + ), + cudf.Series( + [ + 1133, + 117, + 1928, + 2246, + 3031, + 1513, + 117, + 1185, + 1830, + 1186, + 1181, + 1183, + 1920, + 1116, + 119, + 1789, + 1104, + 1103, + 1996, + 2556, + 1105, + 1211, + 174, + 1813, + 1673, + 2050, + 1713, + 1116, + 2992, + 1103, + 2304, + 117, + 1107, + 1704, + 117, + 170, + 112, + 188, + 2522, + 1105, + 170, + 188, + 1605, + 1874, + 117, + 112, + 1105, + 192, + 3031, + 1116, + 3365, + 1106, + 1296, + 1168, + ], + dtype=np.int32, + ), + ] + ) + if max_words == 10: + expected = cudf.Series( + [ + cudf.Series( + [ + 1109, + 1418, + 2181, + 2897, + 1138, + 1151, + 3170, + 1158, + 1111, + 1103, + 1314, + 1374, + ], + dtype=np.int32, + ), + cudf.Series( + [ + 1114, + 112, + 1693, + 2051, + 117, + 112, + 112, + 156, + 1183, + 1306, + 1830, + 1186, + 2646, + 1665, + 2051, + 117, + ], + dtype=np.int32, + ), + cudf.Series( + [ + 1105, + 1451, + 1218, + 118, + 174, + 1181, + 1358, + 2599, + 1906, + 1825, + 1110, + 2637, + 117, + 1208, + 1161, + 1810, 
+ 1183, + 1116, + ], + dtype=np.int32, + ), + cudf.Series( + [112, 3458, 2016, 117, 1263, 1290, 2044, 117, 1674, 1136], + dtype=np.int32, + ), + cudf.Series( + [ + 1412, + 2489, + 1916, + 1110, + 1178, + 1148, + 118, + 2603, + 1165, + 1122, + 1289, + 2897, + ], + dtype=np.int32, + ), + cudf.Series( + [ + 1133, + 117, + 1928, + 2246, + 3031, + 1513, + 117, + 1185, + 1830, + 1186, + 1181, + 1183, + 1920, + 1116, + 119, + 1789, + 1104, + 1103, + ], + dtype=np.int32, + ), + ] + ) + assert_eq(expected, wpr) diff --git a/python/pylibcudf/pylibcudf/libcudf/nvtext/wordpiece_tokenize.pxd b/python/pylibcudf/pylibcudf/libcudf/nvtext/wordpiece_tokenize.pxd new file mode 100644 index 00000000000..261b86aed47 --- /dev/null +++ b/python/pylibcudf/pylibcudf/libcudf/nvtext/wordpiece_tokenize.pxd @@ -0,0 +1,22 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +from libcpp.memory cimport unique_ptr +from pylibcudf.exception_handler cimport libcudf_exception_handler +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.column.column_view cimport column_view +from pylibcudf.libcudf.types cimport size_type + + +cdef extern from "nvtext/wordpiece_tokenize.hpp" namespace "nvtext" nogil: + + cdef struct wordpiece_vocabulary "nvtext::wordpiece_vocabulary": + pass + + cdef unique_ptr[wordpiece_vocabulary] load_wordpiece_vocabulary( + const column_view & strings + ) except +libcudf_exception_handler + + cdef unique_ptr[column] wordpiece_tokenize( + const column_view & strings, + const wordpiece_vocabulary & vocabulary, + size_type max_tokens_per_row + ) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt b/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt index 93e3fb15259..d8cabbf4d47 100644 --- a/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt +++ b/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. You may obtain a copy of the License at @@ -13,8 +13,18 @@ # ============================================================================= set(cython_sources - edit_distance.pyx generate_ngrams.pyx jaccard.pyx minhash.pyx ngrams_tokenize.pyx normalize.pyx - replace.pyx stemmer.pyx tokenize.pyx byte_pair_encode.pyx subword_tokenize.pyx + byte_pair_encode.pyx + edit_distance.pyx + generate_ngrams.pyx + jaccard.pyx + minhash.pyx + ngrams_tokenize.pyx + normalize.pyx + replace.pyx + stemmer.pyx + tokenize.pyx + subword_tokenize.pyx + wordpiece_tokenize.pyx ) set(linked_libraries cudf::cudf) diff --git a/python/pylibcudf/pylibcudf/nvtext/__init__.pxd b/python/pylibcudf/pylibcudf/nvtext/__init__.pxd index ef837167eb9..eb48ea84dee 100644 --- a/python/pylibcudf/pylibcudf/nvtext/__init__.pxd +++ b/python/pylibcudf/pylibcudf/nvtext/__init__.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. from . cimport ( byte_pair_encode, @@ -12,6 +12,7 @@ from . 
cimport ( stemmer, subword_tokenize, tokenize, + wordpiece_tokenize, ) __all__ = [ @@ -26,4 +27,5 @@ __all__ = [ "stemmer", "subword_tokenize", "tokenize", + "wordpiece_tokenize", ] diff --git a/python/pylibcudf/pylibcudf/nvtext/__init__.py b/python/pylibcudf/pylibcudf/nvtext/__init__.py index d88a7d4b825..07e80abe3a8 100644 --- a/python/pylibcudf/pylibcudf/nvtext/__init__.py +++ b/python/pylibcudf/pylibcudf/nvtext/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. from . import ( byte_pair_encode, @@ -12,6 +12,7 @@ stemmer, subword_tokenize, tokenize, + wordpiece_tokenize, ) __all__ = [ @@ -26,4 +27,5 @@ "stemmer", "subword_tokenize", "tokenize", + "wordpiece_tokenize", ] diff --git a/python/pylibcudf/pylibcudf/nvtext/wordpiece_tokenize.pxd b/python/pylibcudf/pylibcudf/nvtext/wordpiece_tokenize.pxd new file mode 100644 index 00000000000..75e86770e0f --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/wordpiece_tokenize.pxd @@ -0,0 +1,15 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr +from pylibcudf.column cimport Column +from pylibcudf.libcudf.nvtext.wordpiece_tokenize cimport wordpiece_vocabulary +from pylibcudf.libcudf.types cimport size_type + +cdef class WordPieceVocabulary: + cdef unique_ptr[wordpiece_vocabulary] c_obj + +cpdef Column wordpiece_tokenize( + Column input, + WordPieceVocabulary vocabulary, + size_type max_words_per_row +) diff --git a/python/pylibcudf/pylibcudf/nvtext/wordpiece_tokenize.pyi b/python/pylibcudf/pylibcudf/nvtext/wordpiece_tokenize.pyi new file mode 100644 index 00000000000..274bc31f316 --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/wordpiece_tokenize.pyi @@ -0,0 +1,12 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + +from pylibcudf.column import Column + +class WordPieceVocabulary: + def __init__(self, vocab: Column): ... + +def wordpiece_tokenize( + input: Column, + vocabulary: WordPieceVocabulary, + max_words_per_row: int, +) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/nvtext/wordpiece_tokenize.pyx b/python/pylibcudf/pylibcudf/nvtext/wordpiece_tokenize.pyx new file mode 100644 index 00000000000..d43d0c9f814 --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/wordpiece_tokenize.pyx @@ -0,0 +1,66 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + +from cython.operator cimport dereference +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.column.column_view cimport column_view +from pylibcudf.libcudf.nvtext.wordpiece_tokenize cimport ( + load_wordpiece_vocabulary as cpp_load_wordpiece_vocabulary, + wordpiece_tokenize as cpp_wordpiece_tokenize, +) +from pylibcudf.libcudf.types cimport size_type + +__all__ = [ + "WordPieceVocabulary", + "wordpiece_tokenize", +] + +cdef class WordPieceVocabulary: + """The Vocabulary object to be used with ``wordpiece_tokenize``. + + For details, see :cpp:class:`cudf::nvtext::wordpiece_tokenize`. + """ + def __cinit__(self, Column vocab): + cdef column_view c_vocab = vocab.view() + with nogil: + self.c_obj = move(cpp_load_wordpiece_vocabulary(c_vocab)) + + __hash__ = None + +cpdef Column wordpiece_tokenize( + Column input, + WordPieceVocabulary vocabulary, + size_type max_words_per_row +): + """ + Returns the token ids for the input string by looking + up each delimited token in the given vocabulary. 
+ + For details, see cpp:func:`cudf::nvtext::wordpiece_tokenize` + + Parameters + ---------- + input : Column + Strings column to tokenize + vocabulary : WordPieceVocabulary + Used to lookup tokens within ``input`` + max_words_per_row : size_type + Maximum number of words to tokenize per input row + + Returns + ------- + Column + Lists column of token ids + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = cpp_wordpiece_tokenize( + input.view(), + dereference(vocabulary.c_obj.get()), + max_words_per_row + ) + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_subword_tokenize.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_subword_tokenize.py index 516d0f7f78d..679512bb16a 100644 --- a/python/pylibcudf/pylibcudf/tests/test_nvtext_subword_tokenize.py +++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_subword_tokenize.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. import pyarrow as pa import pytest @@ -61,3 +61,37 @@ def test_subword_tokenize( assert_column_eq(tokens, expected_tokens) assert_column_eq(masks, expected_masks) assert_column_eq(metadata, expected_metadata) + + +@pytest.mark.parametrize("max_words", [0, 5]) +def test_wordpiece_tokenize(max_words): + vocab_strs = pa.array(["[unk]", "abc", "def", "gh", "##i"]) + vocab = plc.nvtext.wordpiece_tokenize.WordPieceVocabulary( + plc.interop.from_arrow(vocab_strs) + ) + input = pa.array( + [ + "gh def abc xyz ghi defi abci", + "abc def gh abc def gh abc def gh", + "abc def gh", + ] + ) + result = plc.nvtext.wordpiece_tokenize.wordpiece_tokenize( + plc.interop.from_arrow(input), vocab, max_words + ) + pa_result = plc.interop.to_arrow(result) + if max_words == 5: + expected = pa.array( + [[3, 2, 1, 0, 3, 4], [1, 2, 3, 1, 2], [1, 2, 3]], + type=pa_result.type, + ) + else: + expected = pa.array( + [ + [3, 2, 1, 0, 3, 4, 2, 4, 1, 4], + [1, 2, 3, 1, 2, 3, 1, 2, 3], + [1, 2, 3], + ], + type=pa_result.type, + ) + assert_column_eq(result, expected)
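Taken together, the Python pieces above expose the new tokenizer as `cudf.core.wordpiece_tokenize.WordPieceVocabulary` on top of the pylibcudf binding. A minimal end-to-end sketch, assuming a cudf build that includes this patch and reusing the small vocabulary from the example in `nvtext/wordpiece_tokenize.hpp`:

    # Minimal usage sketch (assumes a cudf build containing this patch).
    # The vocabulary and input mirror the example in nvtext/wordpiece_tokenize.hpp.
    import cudf
    from cudf.core.wordpiece_tokenize import WordPieceVocabulary

    vocab = cudf.Series(["[UNK]", "a", "have", "I", "new", "GP", "##U", "!"])
    text = cudf.Series(["I have a new GPU now !"])

    wpt = WordPieceVocabulary(vocab)
    tokens = wpt.tokenize(text)       # all words: [[3, 2, 1, 4, 5, 6, 0, 7]]
    limited = wpt.tokenize(text, 4)   # first 4 words only: [[3, 2, 1, 4]]
    print(tokens)
    print(limited)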