From 2f66bb118ec61df0d7e150ee4110112f22db6f43 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Wed, 12 Feb 2025 14:19:36 -0500 Subject: [PATCH 1/4] Add seed parameter to cudf hash_character_ngrams --- python/cudf/cudf/core/column/string.py | 12 ++++++++---- python/cudf/cudf/tests/text/test_text_methods.py | 6 +++--- .../pylibcudf/libcudf/nvtext/generate_ngrams.pxd | 6 ++++-- .../pylibcudf/pylibcudf/nvtext/generate_ngrams.pxd | 5 +++-- .../pylibcudf/pylibcudf/nvtext/generate_ngrams.pyi | 4 ++-- .../pylibcudf/pylibcudf/nvtext/generate_ngrams.pyx | 8 ++++++-- .../pylibcudf/tests/test_nvtext_generate_ngrams.py | 8 ++++---- 7 files changed, 30 insertions(+), 19 deletions(-) diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 074da57c470..d0f82812afc 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -4986,7 +4986,7 @@ def character_ngrams( return result def hash_character_ngrams( - self, n: int = 5, as_list: bool = False + self, n: int = 5, seed: np.uint32 = 0, as_list: bool = False ) -> SeriesOrIndex: """ Generate hashes of n-grams from characters in a column of strings. @@ -4997,6 +4997,8 @@ def hash_character_ngrams( n : int The degree of the n-gram (number of consecutive characters). Default is 5. + seed: uint32 + The seed value for the hash algorithm. as_list : bool Set to True to return the hashes in a list column where each list element is the hashes for each string. 
@@ -5021,7 +5023,7 @@ def hash_character_ngrams( """ result = self._return_or_inplace( - self._column.hash_character_ngrams(n), + self._column.hash_character_ngrams(n, seed), retain_index=True, ) if isinstance(result, cudf.Series) and not as_list: @@ -6176,9 +6178,11 @@ def generate_character_ngrams(self, ngrams: int) -> ListColumn: return type(self).from_pylibcudf(result) # type: ignore[return-value] @acquire_spill_lock() - def hash_character_ngrams(self, ngrams: int) -> ListColumn: + def hash_character_ngrams( + self, ngrams: int, seed: np.uint32 + ) -> ListColumn: result = plc.nvtext.generate_ngrams.hash_character_ngrams( - self.to_pylibcudf(mode="read"), ngrams + self.to_pylibcudf(mode="read"), ngrams, seed ) return type(self).from_pylibcudf(result) # type: ignore[return-value] diff --git a/python/cudf/cudf/tests/text/test_text_methods.py b/python/cudf/cudf/tests/text/test_text_methods.py index 9a62285403f..86e1e46c1a2 100644 --- a/python/cudf/cudf/tests/text/test_text_methods.py +++ b/python/cudf/cudf/tests/text/test_text_methods.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. +# Copyright (c) 2019-2025, NVIDIA CORPORATION. 
import random import string @@ -378,11 +378,11 @@ def test_hash_character_ngrams(): ), ] ) - actual = strings.str.hash_character_ngrams(5, True) + actual = strings.str.hash_character_ngrams(n=5, as_list=True) assert type(expected) is type(actual) assert_eq(expected, actual) - actual = strings.str.hash_character_ngrams(5) + actual = strings.str.hash_character_ngrams(n=5) expected = expected.explode() assert type(expected) is type(actual) assert_eq(expected, actual) diff --git a/python/pylibcudf/pylibcudf/libcudf/nvtext/generate_ngrams.pxd b/python/pylibcudf/pylibcudf/libcudf/nvtext/generate_ngrams.pxd index c7bd4da5441..a62361bb190 100644 --- a/python/pylibcudf/pylibcudf/libcudf/nvtext/generate_ngrams.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/nvtext/generate_ngrams.pxd @@ -1,4 +1,5 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. +from libc.stdint cimport uint32_t from libcpp.memory cimport unique_ptr from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.column.column cimport column @@ -22,5 +23,6 @@ cdef extern from "nvtext/generate_ngrams.hpp" namespace "nvtext" nogil: cdef unique_ptr[column] hash_character_ngrams( const column_view &strings, - size_type ngrams + size_type ngrams, + uint32_t seed ) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/nvtext/generate_ngrams.pxd b/python/pylibcudf/pylibcudf/nvtext/generate_ngrams.pxd index f15eb1f25e9..bbeb8f241a1 100644 --- a/python/pylibcudf/pylibcudf/nvtext/generate_ngrams.pxd +++ b/python/pylibcudf/pylibcudf/nvtext/generate_ngrams.pxd @@ -1,5 +1,6 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. 
+from libc.stdint cimport uint32_t from pylibcudf.column cimport Column from pylibcudf.libcudf.types cimport size_type from pylibcudf.scalar cimport Scalar @@ -9,4 +10,4 @@ cpdef Column generate_ngrams(Column input, size_type ngrams, Scalar separator) cpdef Column generate_character_ngrams(Column input, size_type ngrams=*) -cpdef Column hash_character_ngrams(Column input, size_type ngrams=*) +cpdef Column hash_character_ngrams(Column input, size_type ngrams, uint32_t seed) diff --git a/python/pylibcudf/pylibcudf/nvtext/generate_ngrams.pyi b/python/pylibcudf/pylibcudf/nvtext/generate_ngrams.pyi index 2757518379d..a7d4da97d2a 100644 --- a/python/pylibcudf/pylibcudf/nvtext/generate_ngrams.pyi +++ b/python/pylibcudf/pylibcudf/nvtext/generate_ngrams.pyi @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. from pylibcudf.column import Column from pylibcudf.scalar import Scalar @@ -7,4 +7,4 @@ def generate_ngrams( input: Column, ngrams: int, separator: Scalar ) -> Column: ... def generate_character_ngrams(input: Column, ngrams: int = 2) -> Column: ... -def hash_character_ngrams(input: Column, ngrams: int = 2) -> Column: ... +def hash_character_ngrams(input: Column, ngrams: int, seed: int) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/nvtext/generate_ngrams.pyx b/python/pylibcudf/pylibcudf/nvtext/generate_ngrams.pyx index 521bc0ef4a4..896615eaa3c 100644 --- a/python/pylibcudf/pylibcudf/nvtext/generate_ngrams.pyx +++ b/python/pylibcudf/pylibcudf/nvtext/generate_ngrams.pyx @@ -1,5 +1,6 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. 
+from libc.stdint cimport uint32_t from libcpp.memory cimport unique_ptr from libcpp.utility cimport move from pylibcudf.column cimport Column @@ -81,7 +82,7 @@ cpdef Column generate_character_ngrams(Column input, size_type ngrams = 2): ) return Column.from_libcudf(move(c_result)) -cpdef Column hash_character_ngrams(Column input, size_type ngrams = 2): +cpdef Column hash_character_ngrams(Column input, size_type ngrams, uint32_t seed): """ Returns a lists column of hash values of the characters in each string @@ -93,6 +94,8 @@ cpdef Column hash_character_ngrams(Column input, size_type ngrams = 2): Input strings ngram : size_type The ngram number to generate + seed : uint32_t + Seed used for the hash algorithm Returns ------- @@ -106,5 +109,6 @@ cpdef Column hash_character_ngrams(Column input, size_type ngrams = 2): c_result = cpp_hash_character_ngrams( c_strings, ngrams, + seed ) return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_generate_ngrams.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_generate_ngrams.py index fae4685f81b..c8f8ce4f8ff 100644 --- a/python/pylibcudf/pylibcudf/tests/test_nvtext_generate_ngrams.py +++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_generate_ngrams.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. 
import pyarrow as pa import pytest @@ -40,10 +40,10 @@ def test_generate_character_ngrams(input_col, ngram): @pytest.mark.parametrize("ngram", [2, 3]) -def test_hash_character_ngrams(input_col, ngram): +@pytest.mark.parametrize("seed", [0, 3]) +def test_hash_character_ngrams(input_col, ngram, seed): result = plc.nvtext.generate_ngrams.hash_character_ngrams( - plc.interop.from_arrow(input_col), - ngram, + plc.interop.from_arrow(input_col), ngram, seed ) pa_result = plc.interop.to_arrow(result) assert all( From 52dcd5a767cc4dae2aec2032cfd2fc836a9c7366 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Wed, 12 Feb 2025 15:23:05 -0500 Subject: [PATCH 2/4] move new parameter to the end of fn signature --- python/cudf/cudf/core/column/string.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index d0f82812afc..90465751385 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -4986,7 +4986,7 @@ def character_ngrams( return result def hash_character_ngrams( - self, n: int = 5, seed: np.uint32 = 0, as_list: bool = False + self, n: int = 5, as_list: bool = False, seed: np.uint32 = 0 ) -> SeriesOrIndex: """ Generate hashes of n-grams from characters in a column of strings. From cc4b443f030f55b32b99e70ae9907b84c8bb2d4a Mon Sep 17 00:00:00 2001 From: David Wendt Date: Wed, 12 Feb 2025 17:06:09 -0500 Subject: [PATCH 3/4] fix function doc --- python/cudf/cudf/core/column/string.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 90465751385..5d8fa6a90a4 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -4997,17 +4997,17 @@ def hash_character_ngrams( n : int The degree of the n-gram (number of consecutive characters). Default is 5. 
- seed: uint32 - The seed value for the hash algorithm. as_list : bool Set to True to return the hashes in a list column where each list element is the hashes for each string. + seed : uint32 + The seed value for the hash algorithm. Default is 0. Examples -------- >>> import cudf >>> str_series = cudf.Series(['abcdefg','stuvwxyz']) - >>> str_series.str.hash_character_ngrams(5, True) + >>> str_series.str.hash_character_ngrams(n=5, as_list=True) 0 [3902511862, 570445242, 4202475763] 1 [556054766, 3166857694, 3760633458, 192452857] dtype: list From 3b7c4159c687b778eafde7df4b4decbb88c5257c Mon Sep 17 00:00:00 2001 From: David Wendt Date: Thu, 13 Feb 2025 16:34:51 -0500 Subject: [PATCH 4/4] merge again? --- python/pylibcudf/pylibcudf/nvtext/generate_ngrams.pyx | 1 + 1 file changed, 1 insertion(+) diff --git a/python/pylibcudf/pylibcudf/nvtext/generate_ngrams.pyx b/python/pylibcudf/pylibcudf/nvtext/generate_ngrams.pyx index 896615eaa3c..29da693e06f 100644 --- a/python/pylibcudf/pylibcudf/nvtext/generate_ngrams.pyx +++ b/python/pylibcudf/pylibcudf/nvtext/generate_ngrams.pyx @@ -82,6 +82,7 @@ cpdef Column generate_character_ngrams(Column input, size_type ngrams = 2): ) return Column.from_libcudf(move(c_result)) + cpdef Column hash_character_ngrams(Column input, size_type ngrams, uint32_t seed): """ Returns a lists column of hash values of the characters in each string