rapidsai · rapids-bot · Feb 17, 2025 · Feb 12, 2025 · Feb 12, 2025 · Feb 12, 2025
@@ -4986,7 +4986,7 @@ def character_ngrams(
         return result
 
     def hash_character_ngrams(
-        self, n: int = 5, as_list: bool = False
+        self, n: int = 5, as_list: bool = False, seed: np.uint32 = 0
     ) -> SeriesOrIndex:
         """
         Generate hashes of n-grams from characters in a column of strings.
@@ -5000,12 +5000,14 @@ def hash_character_ngrams(
         as_list : bool
             Set to True to return the hashes in a list column where each
             list element is the hashes for each string.
+        seed : uint32
+            The seed value for the hash algorithm. Default is 0.
 
         Examples
         --------
         >>> import cudf
         >>> str_series = cudf.Series(['abcdefg','stuvwxyz'])
-        >>> str_series.str.hash_character_ngrams(5, True)
+        >>> str_series.str.hash_character_ngrams(n=5, as_list=True)
         0               [3902511862, 570445242, 4202475763]
         1    [556054766, 3166857694, 3760633458, 192452857]
         dtype: list
@@ -5021,7 +5023,7 @@ def hash_character_ngrams(
         """
 
         result = self._return_or_inplace(
-            self._column.hash_character_ngrams(n),
+            self._column.hash_character_ngrams(n, seed),
             retain_index=True,
         )
         if isinstance(result, cudf.Series) and not as_list:
@@ -6176,9 +6178,11 @@ def generate_character_ngrams(self, ngrams: int) -> ListColumn:
         return type(self).from_pylibcudf(result)  # type: ignore[return-value]
 
     @acquire_spill_lock()
-    def hash_character_ngrams(self, ngrams: int) -> ListColumn:
+    def hash_character_ngrams(
+        self, ngrams: int, seed: np.uint32
+    ) -> ListColumn:
         result = plc.nvtext.generate_ngrams.hash_character_ngrams(
-            self.to_pylibcudf(mode="read"), ngrams
+            self.to_pylibcudf(mode="read"), ngrams, seed
         )
         return type(self).from_pylibcudf(result)  # type: ignore[return-value]
 

@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2024, NVIDIA CORPORATION.
+# Copyright (c) 2019-2025, NVIDIA CORPORATION.
 
 import random
 import string
@@ -378,11 +378,11 @@ def test_hash_character_ngrams():
             ),
         ]
     )
-    actual = strings.str.hash_character_ngrams(5, True)
+    actual = strings.str.hash_character_ngrams(n=5, as_list=True)
     assert type(expected) is type(actual)
     assert_eq(expected, actual)
 
-    actual = strings.str.hash_character_ngrams(5)
+    actual = strings.str.hash_character_ngrams(n=5)
     expected = expected.explode()
     assert type(expected) is type(actual)
     assert_eq(expected, actual)

@@ -1,4 +1,5 @@
-# Copyright (c) 2020-2024, NVIDIA CORPORATION.
+# Copyright (c) 2020-2025, NVIDIA CORPORATION.
+from libc.stdint cimport uint32_t
 from libcpp.memory cimport unique_ptr
 from pylibcudf.exception_handler cimport libcudf_exception_handler
 from pylibcudf.libcudf.column.column cimport column
@@ -22,5 +23,6 @@ cdef extern from "nvtext/generate_ngrams.hpp" namespace "nvtext" nogil:
 
     cdef unique_ptr[column] hash_character_ngrams(
         const column_view &strings,
-        size_type ngrams
+        size_type ngrams,
+        uint32_t seed
     ) except +libcudf_exception_handler
@@ -1,5 +1,6 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.
+# Copyright (c) 2024-2025, NVIDIA CORPORATION.
 
+from libc.stdint cimport uint32_t
 from pylibcudf.column cimport Column
 from pylibcudf.libcudf.types cimport size_type
 from pylibcudf.scalar cimport Scalar
@@ -9,4 +10,4 @@ cpdef Column generate_ngrams(Column input, size_type ngrams, Scalar separator)
 
 cpdef Column generate_character_ngrams(Column input, size_type ngrams=*)
 
-cpdef Column hash_character_ngrams(Column input, size_type ngrams=*)
+cpdef Column hash_character_ngrams(Column input, size_type ngrams, uint32_t seed)
@@ -1,4 +1,4 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.
+# Copyright (c) 2024-2025, NVIDIA CORPORATION.
 
 from pylibcudf.column import Column
 from pylibcudf.scalar import Scalar
@@ -7,4 +7,4 @@ def generate_ngrams(
     input: Column, ngrams: int, separator: Scalar
 ) -> Column: ...
 def generate_character_ngrams(input: Column, ngrams: int = 2) -> Column: ...
-def hash_character_ngrams(input: Column, ngrams: int = 2) -> Column: ...
+def hash_character_ngrams(input: Column, ngrams: int, seed: int) -> Column: ...
@@ -1,5 +1,6 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.
+# Copyright (c) 2024-2025, NVIDIA CORPORATION.
 
+from libc.stdint cimport uint32_t
 from libcpp.memory cimport unique_ptr
 from libcpp.utility cimport move
 from pylibcudf.column cimport Column
@@ -81,7 +82,8 @@ cpdef Column generate_character_ngrams(Column input, size_type ngrams = 2):
         )
     return Column.from_libcudf(move(c_result))
 
-cpdef Column hash_character_ngrams(Column input, size_type ngrams = 2):
+
+cpdef Column hash_character_ngrams(Column input, size_type ngrams, uint32_t seed):
     """
     Returns a lists column of hash values of the characters in each string
 
@@ -93,6 +95,8 @@ cpdef Column hash_character_ngrams(Column input, size_type ngrams = 2):
         Input strings
     ngram : size_type
         The ngram number to generate
+    seed : uint32_t
+        Seed used for the hash algorithm
 
     Returns
     -------
@@ -106,5 +110,6 @@ cpdef Column hash_character_ngrams(Column input, size_type ngrams = 2):
         c_result = cpp_hash_character_ngrams(
             c_strings,
             ngrams,
+            seed
         )
     return Column.from_libcudf(move(c_result))
@@ -1,4 +1,4 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.
+# Copyright (c) 2024-2025, NVIDIA CORPORATION.
 
 import pyarrow as pa
 import pytest
@@ -40,10 +40,10 @@ def test_generate_character_ngrams(input_col, ngram):
 
 
 @pytest.mark.parametrize("ngram", [2, 3])
-def test_hash_character_ngrams(input_col, ngram):
+@pytest.mark.parametrize("seed", [0, 3])
+def test_hash_character_ngrams(input_col, ngram, seed):
     result = plc.nvtext.generate_ngrams.hash_character_ngrams(
-        plc.interop.from_arrow(input_col),
-        ngram,
+        plc.interop.from_arrow(input_col), ngram, seed
     )
     pa_result = plc.interop.to_arrow(result)
     assert all(