Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add seed parameter to cudf hash_character_ngrams #17994

Merged
merged 8 commits into from
Feb 17, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 9 additions & 5 deletions python/cudf/cudf/core/column/string.py
Original file line number Diff line number Diff line change
Expand Up @@ -4986,7 +4986,7 @@ def character_ngrams(
return result

def hash_character_ngrams(
self, n: int = 5, as_list: bool = False
self, n: int = 5, as_list: bool = False, seed: np.uint32 = 0
) -> SeriesOrIndex:
"""
Generate hashes of n-grams from characters in a column of strings.
Expand All @@ -5000,12 +5000,14 @@ def hash_character_ngrams(
as_list : bool
Set to True to return the hashes in a list column where each
list element is the hashes for each string.
seed : uint32
The seed value for the hash algorithm. Default is 0.

Examples
--------
>>> import cudf
>>> str_series = cudf.Series(['abcdefg','stuvwxyz'])
>>> str_series.str.hash_character_ngrams(5, True)
>>> str_series.str.hash_character_ngrams(n=5, as_list=True)
0 [3902511862, 570445242, 4202475763]
1 [556054766, 3166857694, 3760633458, 192452857]
dtype: list
Expand All @@ -5021,7 +5023,7 @@ def hash_character_ngrams(
"""

result = self._return_or_inplace(
self._column.hash_character_ngrams(n),
self._column.hash_character_ngrams(n, seed),
retain_index=True,
)
if isinstance(result, cudf.Series) and not as_list:
Expand Down Expand Up @@ -6176,9 +6178,11 @@ def generate_character_ngrams(self, ngrams: int) -> ListColumn:
return type(self).from_pylibcudf(result) # type: ignore[return-value]

@acquire_spill_lock()
def hash_character_ngrams(self, ngrams: int) -> ListColumn:
def hash_character_ngrams(
self, ngrams: int, seed: np.uint32
) -> ListColumn:
result = plc.nvtext.generate_ngrams.hash_character_ngrams(
self.to_pylibcudf(mode="read"), ngrams
self.to_pylibcudf(mode="read"), ngrams, seed
)
return type(self).from_pylibcudf(result) # type: ignore[return-value]

Expand Down
6 changes: 3 additions & 3 deletions python/cudf/cudf/tests/text/test_text_methods.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2019-2024, NVIDIA CORPORATION.
# Copyright (c) 2019-2025, NVIDIA CORPORATION.

import random
import string
Expand Down Expand Up @@ -378,11 +378,11 @@ def test_hash_character_ngrams():
),
]
)
actual = strings.str.hash_character_ngrams(5, True)
actual = strings.str.hash_character_ngrams(n=5, as_list=True)
assert type(expected) is type(actual)
assert_eq(expected, actual)

actual = strings.str.hash_character_ngrams(5)
actual = strings.str.hash_character_ngrams(n=5)
expected = expected.explode()
assert type(expected) is type(actual)
assert_eq(expected, actual)
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
# Copyright (c) 2020-2024, NVIDIA CORPORATION.
# Copyright (c) 2020-2025, NVIDIA CORPORATION.
from libc.stdint cimport uint32_t
from libcpp.memory cimport unique_ptr
from pylibcudf.exception_handler cimport libcudf_exception_handler
from pylibcudf.libcudf.column.column cimport column
Expand All @@ -22,5 +23,6 @@ cdef extern from "nvtext/generate_ngrams.hpp" namespace "nvtext" nogil:

cdef unique_ptr[column] hash_character_ngrams(
const column_view &strings,
size_type ngrams
size_type ngrams,
uint32_t seed
) except +libcudf_exception_handler
5 changes: 3 additions & 2 deletions python/pylibcudf/pylibcudf/nvtext/generate_ngrams.pxd
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# Copyright (c) 2024, NVIDIA CORPORATION.
# Copyright (c) 2024-2025, NVIDIA CORPORATION.

from libc.stdint cimport uint32_t
from pylibcudf.column cimport Column
from pylibcudf.libcudf.types cimport size_type
from pylibcudf.scalar cimport Scalar
Expand All @@ -9,4 +10,4 @@ cpdef Column generate_ngrams(Column input, size_type ngrams, Scalar separator)

cpdef Column generate_character_ngrams(Column input, size_type ngrams=*)

cpdef Column hash_character_ngrams(Column input, size_type ngrams=*)
cpdef Column hash_character_ngrams(Column input, size_type ngrams, uint32_t seed)
4 changes: 2 additions & 2 deletions python/pylibcudf/pylibcudf/nvtext/generate_ngrams.pyi
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2024, NVIDIA CORPORATION.
# Copyright (c) 2024-2025, NVIDIA CORPORATION.

from pylibcudf.column import Column
from pylibcudf.scalar import Scalar
Expand All @@ -7,4 +7,4 @@ def generate_ngrams(
input: Column, ngrams: int, separator: Scalar
) -> Column: ...
def generate_character_ngrams(input: Column, ngrams: int = 2) -> Column: ...
def hash_character_ngrams(input: Column, ngrams: int = 2) -> Column: ...
def hash_character_ngrams(input: Column, ngrams: int, seed: int) -> Column: ...
9 changes: 7 additions & 2 deletions python/pylibcudf/pylibcudf/nvtext/generate_ngrams.pyx
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# Copyright (c) 2024, NVIDIA CORPORATION.
# Copyright (c) 2024-2025, NVIDIA CORPORATION.

from libc.stdint cimport uint32_t
from libcpp.memory cimport unique_ptr
from libcpp.utility cimport move
from pylibcudf.column cimport Column
Expand Down Expand Up @@ -81,7 +82,8 @@ cpdef Column generate_character_ngrams(Column input, size_type ngrams = 2):
)
return Column.from_libcudf(move(c_result))

cpdef Column hash_character_ngrams(Column input, size_type ngrams = 2):

cpdef Column hash_character_ngrams(Column input, size_type ngrams, uint32_t seed):
"""
Returns a lists column of hash values of the characters in each string

Expand All @@ -93,6 +95,8 @@ cpdef Column hash_character_ngrams(Column input, size_type ngrams = 2):
Input strings
ngrams : size_type
The ngram number to generate
seed : uint32_t
Seed used for the hash algorithm

Returns
-------
Expand All @@ -106,5 +110,6 @@ cpdef Column hash_character_ngrams(Column input, size_type ngrams = 2):
c_result = cpp_hash_character_ngrams(
c_strings,
ngrams,
seed
)
return Column.from_libcudf(move(c_result))
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2024, NVIDIA CORPORATION.
# Copyright (c) 2024-2025, NVIDIA CORPORATION.

import pyarrow as pa
import pytest
Expand Down Expand Up @@ -40,10 +40,10 @@ def test_generate_character_ngrams(input_col, ngram):


@pytest.mark.parametrize("ngram", [2, 3])
def test_hash_character_ngrams(input_col, ngram):
@pytest.mark.parametrize("seed", [0, 3])
def test_hash_character_ngrams(input_col, ngram, seed):
result = plc.nvtext.generate_ngrams.hash_character_ngrams(
plc.interop.from_arrow(input_col),
ngram,
plc.interop.from_arrow(input_col), ngram, seed
)
pa_result = plc.interop.to_arrow(result)
assert all(
Expand Down
Loading