From 51e83d6ac18f749fd5bbcbd3a828a6d42fe63ee1 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Wed, 26 Feb 2025 14:22:57 -0500 Subject: [PATCH 1/3] Add nvtext substring deduplication API --- cpp/CMakeLists.txt | 1 + cpp/include/nvtext/dedup.hpp | 53 +++++ cpp/src/text/dedup.cu | 206 ++++++++++++++++++ cpp/tests/CMakeLists.txt | 1 + cpp/tests/text/dedup_tests.cpp | 59 +++++ python/cudf/cudf/core/column/string.py | 28 +++ .../cudf/cudf/tests/text/test_text_methods.py | 22 ++ .../pylibcudf/libcudf/nvtext/dedup.pxd | 13 ++ .../pylibcudf/pylibcudf/nvtext/CMakeLists.txt | 6 +- .../pylibcudf/pylibcudf/nvtext/__init__.pxd | 6 +- python/pylibcudf/pylibcudf/nvtext/__init__.py | 4 +- python/pylibcudf/pylibcudf/nvtext/dedup.pxd | 6 + python/pylibcudf/pylibcudf/nvtext/dedup.pyi | 5 + python/pylibcudf/pylibcudf/nvtext/dedup.pyx | 39 ++++ .../pylibcudf/tests/test_nvtext_dedup.py | 48 ++++ 15 files changed, 491 insertions(+), 6 deletions(-) create mode 100644 cpp/include/nvtext/dedup.hpp create mode 100644 cpp/src/text/dedup.cu create mode 100644 cpp/tests/text/dedup_tests.cpp create mode 100644 python/pylibcudf/pylibcudf/libcudf/nvtext/dedup.pxd create mode 100644 python/pylibcudf/pylibcudf/nvtext/dedup.pxd create mode 100644 python/pylibcudf/pylibcudf/nvtext/dedup.pyi create mode 100644 python/pylibcudf/pylibcudf/nvtext/dedup.pyx create mode 100644 python/pylibcudf/pylibcudf/tests/test_nvtext_dedup.py diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 0282282b5f3..3e8852c7300 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -739,6 +739,7 @@ add_library( src/table/table.cpp src/table/table_device_view.cu src/table/table_view.cpp + src/text/dedup.cu src/text/detokenize.cu src/text/edit_distance.cu src/text/generate_ngrams.cu diff --git a/cpp/include/nvtext/dedup.hpp b/cpp/include/nvtext/dedup.hpp new file mode 100644 index 00000000000..4595bcdef57 --- /dev/null +++ b/cpp/include/nvtext/dedup.hpp @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include +#include + +//! NVText APIs +namespace CUDF_EXPORT nvtext { +/** + * @addtogroup nvtext_replace + * @{ + * @file + */ + +/** + * @brief Returns a duplicate strings found in the given input + * + * The internal implementation creates a suffix array of the input which + * requires ~10x the input size for temporary memory. + * + * The output includes any strings of at least `min_width` bytes that + * appear more than once in the entire input. + * + * @param input Strings column to dedup + * @param min_width Minimum number of bytes must match to specify a duplicate + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New strings column with updated strings + */ +std::unique_ptr substring_deduplicate( + cudf::strings_column_view const& input, + cudf::size_type min_width, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); + +/** @} */ // end of group +} // namespace CUDF_EXPORT nvtext diff --git a/cpp/src/text/dedup.cu b/cpp/src/text/dedup.cu new file mode 100644 index 00000000000..add038e20c8 --- /dev/null +++ b/cpp/src/text/dedup.cu @@ -0,0 +1,206 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include + +#include +#include +#include +#include +#include +#include + +namespace nvtext { +namespace detail { +namespace { + +using string_index = cudf::strings::detail::string_index_pair; + +struct sort_comparator_fn { + char const* d_chars; + int64_t chars_size; + __device__ bool operator()(int64_t lhs, int64_t rhs) const + { + constexpr int64_t max_size = cuda::std::numeric_limits::max(); + + auto const lhs_size = static_cast(cuda::std::min(max_size, chars_size - lhs)); + auto const rhs_size = static_cast(cuda::std::min(max_size, chars_size - rhs)); + auto const lh_str = cudf::string_view(d_chars + lhs, lhs_size); + auto const rh_str = cudf::string_view(d_chars + rhs, rhs_size); + return lh_str < rh_str; + } +}; + +__device__ cudf::size_type count_common_bytes(cudf::string_view lhs, cudf::string_view rhs) +{ + auto const size1 = lhs.size_bytes(); + auto const size2 = rhs.size_bytes(); + auto const* ptr1 = lhs.data(); + auto const* ptr2 = rhs.data(); + + cudf::size_type idx = 0; + for (; (idx < size1) && (idx < size2); ++idx) { + if (*ptr1 != *ptr2) { break; } + ++ptr1; + ++ptr2; + } + return idx; +} + +struct find_duplicates_fn { + char const* d_chars; + int64_t chars_size; + cudf::size_type width; + int64_t const* d_indices; + __device__ int16_t operator()(int64_t idx) const + { + if (idx == 0) { return 0; } + constexpr int64_t max_size = cuda::std::numeric_limits::max(); + + auto const lhs = d_indices[idx - 1]; + auto const rhs = d_indices[idx]; + auto const lhs_size = static_cast(cuda::std::min(max_size, chars_size - lhs)); + auto const rhs_size = static_cast(cuda::std::min(max_size, chars_size - rhs)); + + auto const lh_str = cudf::string_view(d_chars + lhs, lhs_size); + auto const rh_str = cudf::string_view(d_chars + rhs, rhs_size); + + auto const size = + cuda::std::min(count_common_bytes(lh_str, rh_str), + static_cast(cuda::std::numeric_limits::max())); + return size >= width ? static_cast(size) : 0; + } +}; + +struct collapse_overlaps_fn { + char const* d_chars; + int64_t const* d_offsets; + int16_t const* d_sizes; + __device__ string_index operator()(int64_t idx) const + { + auto size = d_sizes[idx]; + auto offset = d_offsets[idx]; + if ((idx > 0) && ((offset - 1) == d_offsets[idx - 1]) && (size < d_sizes[idx - 1])) { + return string_index{nullptr, 0}; + } + auto d_ptr = d_chars + offset; + return string_index(d_ptr, size); + } +}; + +} // namespace + +std::unique_ptr substring_deduplicate(cudf::strings_column_view const& input, + cudf::size_type min_width, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + CUDF_EXPECTS(min_width > 8, "min_width should be at least 8"); + auto d_strings = cudf::column_device_view::create(input.parent(), stream); + + // need to handle slicing + auto d_input_chars = input.chars_begin(stream); + auto chars_size = input.chars_size(stream); + CUDF_EXPECTS(min_width < chars_size, "min_width value larger than the input"); + + auto indices = rmm::device_uvector(chars_size - min_width, stream); + auto sizes = rmm::device_uvector(indices.size(), stream); + + thrust::sequence(rmm::exec_policy_nosync(stream), indices.begin(), indices.end()); + // thrust::sort may be limited to a 32-bit range + thrust::sort(rmm::exec_policy_nosync(stream), + indices.begin(), + indices.end(), + sort_comparator_fn{d_input_chars, chars_size}); + + // locate candidate duplicates within the suffixes produced by sort + thrust::transform(rmm::exec_policy_nosync(stream), + thrust::counting_iterator(0), + thrust::counting_iterator(indices.size()), + sizes.begin(), + find_duplicates_fn{d_input_chars, chars_size, min_width, indices.data()}); + + // remove the non-candidate entries from indices and sizes + thrust::remove_if( + rmm::exec_policy_nosync(stream), + indices.begin(), + indices.end(), + thrust::counting_iterator(0), + [d_sizes = sizes.data()] __device__(int64_t idx) -> bool { return d_sizes[idx] == 0; }); + auto end = thrust::remove(rmm::exec_policy(stream), sizes.begin(), sizes.end(), 0); + sizes.resize(thrust::distance(sizes.begin(), end), stream); + indices.resize(sizes.size(), stream); + + // sort the resulting indices/sizes for overlap filtering + thrust::sort_by_key( + rmm::exec_policy_nosync(stream), indices.begin(), indices.end(), sizes.begin()); + + // produce final duplicates for make_strings_column and collapse any overlapping candidates + auto duplicates = + rmm::device_uvector(indices.size(), stream); + thrust::transform(rmm::exec_policy_nosync(stream), + thrust::counting_iterator(0), + thrust::counting_iterator(indices.size()), + duplicates.begin(), + collapse_overlaps_fn{d_input_chars, indices.data(), sizes.data()}); + + // filter out the remaining non-viable candidates + duplicates.resize( + thrust::distance( + duplicates.begin(), + thrust::remove( + rmm::exec_policy(stream), duplicates.begin(), duplicates.end(), string_index{nullptr, 0})), + stream); + + // sort result by size descending (should be very fast) + thrust::sort(rmm::exec_policy_nosync(stream), + duplicates.begin(), + duplicates.end(), + [] __device__(auto lhs, auto rhs) -> bool { return lhs.second > rhs.second; }); + + return cudf::strings::detail::make_strings_column( + duplicates.begin(), duplicates.end(), stream, mr); +} +} // namespace detail + +std::unique_ptr substring_deduplicate(cudf::strings_column_view const& input, + cudf::size_type min_width, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + CUDF_FUNC_RANGE(); + return detail::substring_deduplicate(input, min_width, stream, mr); +} + +} // namespace nvtext diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index cfc6a0dc425..881238939e4 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -607,6 +607,7 @@ ConfigureTest(STRUCTS_TEST structs/structs_column_tests.cpp structs/utilities_te ConfigureTest( TEXT_TEST text/bpe_tests.cpp + text/dedup_tests.cpp text/edit_distance_tests.cpp text/jaccard_tests.cpp text/minhash_tests.cpp diff --git a/cpp/tests/text/dedup_tests.cpp b/cpp/tests/text/dedup_tests.cpp new file mode 100644 index 00000000000..0bad8ae22ae --- /dev/null +++ b/cpp/tests/text/dedup_tests.cpp @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +#include +#include + +#include + +#include + +struct TextDedupTest : public cudf::test::BaseFixture {}; + +TEST_F(TextDedupTest, StringDedup) +{ + // https://loremipsum.io/generator?n=25&t=p + // clang-format off + auto input = cudf::test::strings_column_wrapper({ + "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ", // 90 + "01234567890123456789 magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ", // 180 + "laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit ", // 270 + "voluptate velit esse cillum dolore eu fugiat nulla pariatur. 01234567890123456789 ", // 360 + "cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. ", // 450 + "Ea esse numquam et recusandae quia et voluptatem sint quo explicabo repudiandae. At nihil ", // 540 + "sunt non architecto doloremque eos dolorem consequuntur. Vel adipisci quod et voluptatum ", // 630 + "quis est fuga tempore qui dignissimos aliquam et sint repellendus ut autem voluptas quo ", // 720 + "deleniti earum? Qui ipsam ipsum hic ratione mollitia aut nobis laboriosam. Eum aspernatur ", // 810 + "dolorem sit voluptatum numquam in iure placeat vel laudantium molestiae? Ad reprehenderit ", // 900 + "quia aut minima deleniti id consequatur sapiente est dolores cupiditate. 012345678901234 ", // 990 + }); + // clang-format on + + auto sv = cudf::strings_column_view(input); + + auto results = nvtext::substring_deduplicate(sv, 20); + auto expected = cudf::test::strings_column_wrapper({" 01234567890123456789 "}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); + + results = nvtext::substring_deduplicate(sv, 15); + expected = cudf::test::strings_column_wrapper( + {" 01234567890123456789 ", ". 012345678901234", " reprehenderit "}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); +} diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index c0ad33ec7d6..af1b4ad7463 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -5343,6 +5343,27 @@ def is_vowel(self, position) -> SeriesOrIndex: self._column.is_letter(True, position) # type: ignore[arg-type] ) + def substring_deduplicate(self, min_width) -> SeriesOrIndex: + """ + + + Parameters + ---------- + min_width : int32 + The minimum number of bytes to determine duplicates + + Returns + ------- + Series of duplicate strings found + + """ + return self._return_or_inplace( + self._column.substring_deduplicate(min_width), # type: ignore[arg-type] + inplace=False, + expand=False, + retain_index=False, + ) + def edit_distance(self, targets) -> SeriesOrIndex: """ The ``targets`` strings are measured against the strings in this @@ -6210,6 +6231,13 @@ def hash_character_ngrams( ) return type(self).from_pylibcudf(result) # type: ignore[return-value] + @acquire_spill_lock() + def substring_deduplicate(self, min_width: int) -> Self: + result = plc.nvtext.dedup.substring_deduplicate( + self.to_pylibcudf(mode="read"), min_width + ) + return type(self).from_pylibcudf(result) # type: ignore[return-value] + @acquire_spill_lock() def edit_distance(self, targets: Self) -> NumericalColumn: result = plc.nvtext.edit_distance.edit_distance( diff --git a/python/cudf/cudf/tests/text/test_text_methods.py b/python/cudf/cudf/tests/text/test_text_methods.py index dc45827d2e8..f0b975d56e2 100644 --- a/python/cudf/cudf/tests/text/test_text_methods.py +++ b/python/cudf/cudf/tests/text/test_text_methods.py @@ -1043,3 +1043,25 @@ def test_byte_pair_encoding(separator, input, results): actual = encoder(strings, separator) assert type(expected) is type(actual) assert_eq(expected, actual) + + +def test_substring_deduplicate(): + text = ( + "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt " # 90 + "01234567890123456789 magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation " # 180 + "laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit " # 270 + "voluptate velit esse cillum dolore eu fugiat nulla pariatur. 01234567890123456789 " # 360 + "cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. " # 450 + "Ea esse numquam et recusandae quia et voluptatem sint quo explicabo repudiandae. At nihil " # 540 + "sunt non architecto doloremque eos dolorem consequuntur. Vel adipisci quod et voluptatum " # 630 + "quis est fuga tempore qui dignissimos aliquam et sint repellendus ut autem voluptas quo " # 720 + "deleniti earum? Qui ipsam ipsum hic ratione mollitia aut nobis laboriosam. Eum aspernatur " # 810 + "dolorem sit voluptatum numquam in iure placeat vel laudantium molestiae? Ad reprehenderit " # 900 + "quia aut minima deleniti id consequatur sapiente est dolores cupiditate. 012345678901234 " # 990 + ) + input = cudf.Series([text]) + actual = input.str.substring_deduplicate(15) + expected = cudf.Series( + [" 01234567890123456789 ", ". 012345678901234", " reprehenderit "] + ) + assert_eq(expected, actual) diff --git a/python/pylibcudf/pylibcudf/libcudf/nvtext/dedup.pxd b/python/pylibcudf/pylibcudf/libcudf/nvtext/dedup.pxd new file mode 100644 index 00000000000..9e38b9b4c51 --- /dev/null +++ b/python/pylibcudf/pylibcudf/libcudf/nvtext/dedup.pxd @@ -0,0 +1,13 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. +from libcpp.memory cimport unique_ptr +from pylibcudf.exception_handler cimport libcudf_exception_handler +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.column.column_view cimport column_view +from pylibcudf.libcudf.types cimport size_type + + +cdef extern from "nvtext/dedup.hpp" namespace "nvtext" nogil: + + cdef unique_ptr[column] substring_deduplicate( + column_view source_strings, + size_type min_width) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt b/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt index 93e3fb15259..0333c84badd 100644 --- a/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt +++ b/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. You may obtain a copy of the License at @@ -13,8 +13,8 @@ # ============================================================================= set(cython_sources - edit_distance.pyx generate_ngrams.pyx jaccard.pyx minhash.pyx ngrams_tokenize.pyx normalize.pyx - replace.pyx stemmer.pyx tokenize.pyx byte_pair_encode.pyx subword_tokenize.pyx + dedup.pyx edit_distance.pyx generate_ngrams.pyx jaccard.pyx minhash.pyx ngrams_tokenize.pyx + normalize.pyx replace.pyx stemmer.pyx tokenize.pyx byte_pair_encode.pyx subword_tokenize.pyx ) set(linked_libraries cudf::cudf) diff --git a/python/pylibcudf/pylibcudf/nvtext/__init__.pxd b/python/pylibcudf/pylibcudf/nvtext/__init__.pxd index ef837167eb9..1b58752d292 100644 --- a/python/pylibcudf/pylibcudf/nvtext/__init__.pxd +++ b/python/pylibcudf/pylibcudf/nvtext/__init__.pxd @@ -1,7 +1,8 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2025, NVIDIA CORPORATION. from . cimport ( byte_pair_encode, + dedup, edit_distance, generate_ngrams, jaccard, @@ -15,11 +16,12 @@ from . cimport ( ) __all__ = [ + "byte_pair_encode", + "dedup", "edit_distance", "generate_ngrams", "jaccard", "minhash", - "byte_pair_encode" "ngrams_tokenize", "normalize", "replace", diff --git a/python/pylibcudf/pylibcudf/nvtext/__init__.py b/python/pylibcudf/pylibcudf/nvtext/__init__.py index d88a7d4b825..0740c8c6e4e 100644 --- a/python/pylibcudf/pylibcudf/nvtext/__init__.py +++ b/python/pylibcudf/pylibcudf/nvtext/__init__.py @@ -1,7 +1,8 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. from . import ( byte_pair_encode, + dedup, edit_distance, generate_ngrams, jaccard, @@ -16,6 +17,7 @@ __all__ = [ "byte_pair_encode", + "dedup", "edit_distance", "generate_ngrams", "jaccard", diff --git a/python/pylibcudf/pylibcudf/nvtext/dedup.pxd b/python/pylibcudf/pylibcudf/nvtext/dedup.pxd new file mode 100644 index 00000000000..5ff85f60b68 --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/dedup.pxd @@ -0,0 +1,6 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + +from pylibcudf.column cimport Column +from pylibcudf.libcudf.types cimport size_type + +cpdef Column substring_deduplicate(Column input, size_type min_width) diff --git a/python/pylibcudf/pylibcudf/nvtext/dedup.pyi b/python/pylibcudf/pylibcudf/nvtext/dedup.pyi new file mode 100644 index 00000000000..c324cfbd9c2 --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/dedup.pyi @@ -0,0 +1,5 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + +from pylibcudf.column import Column + +def substring_deduplicate(input: Column, min_width: int) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/nvtext/dedup.pyx b/python/pylibcudf/pylibcudf/nvtext/dedup.pyx new file mode 100644 index 00000000000..5c65129f380 --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/dedup.pyx @@ -0,0 +1,39 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.nvtext.dedup cimport ( + substring_deduplicate as cpp_substring_deduplicate, +) +from pylibcudf.libcudf.types cimport size_type + +__all__ = ["substring_deduplicate"] + + +cpdef Column substring_deduplicate(Column input, size_type min_width): + """ + Returns duplicate strings found anywhere in the input column + with min_width minimum number of bytes. + + For details, see :cpp:func:`substring_deduplicate` + + Parameters + ---------- + input : Column + Strings column of text + min_width : size_type + Minimum width of bytes to detect duplicates + + Returns + ------- + Column + New column of duplicate strings + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = cpp_substring_deduplicate(input.view(), min_width) + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_dedup.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_dedup.py new file mode 100644 index 00000000000..e7f4a971f08 --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_dedup.py @@ -0,0 +1,48 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +import pyarrow as pa +import pytest +from utils import assert_column_eq + +import pylibcudf as plc + + +@pytest.fixture(scope="module") +def input_col(): + arr = ["trouble", "toy", "syzygy"] + return pa.array(arr) + + +@pytest.mark.parametrize("check_vowels", [True, False]) +@pytest.mark.parametrize("indices", [[3, 1, 4], 1]) +def test_is_letter(input_col, check_vowels, indices): + def is_letter(s, i, check): + vowels = "aeiouy" + return (s[i] in vowels) == check + + result = plc.nvtext.stemmer.is_letter( + plc.interop.from_arrow(input_col), + check_vowels, + plc.interop.from_arrow(pa.array(indices)) + if isinstance(indices, list) + else indices, + ) + expected = pa.array( + [ + is_letter( + s, + indices[i] if isinstance(indices, list) else indices, + check_vowels, + ) + for i, s in enumerate(input_col.to_pylist()) + ] + ) + assert_column_eq(result, expected) + + +def test_porter_stemmer_measure(input_col): + result = plc.nvtext.stemmer.porter_stemmer_measure( + plc.interop.from_arrow(input_col), + ) + expected = pa.array([1, 1, 2], type=pa.int32()) + assert_column_eq(result, expected) From b2025fd8e3895a7d606acd892a911515029fc813 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Tue, 4 Mar 2025 11:48:43 -0500 Subject: [PATCH 2/3] add sliced support; fix pytest --- cpp/src/text/dedup.cu | 24 +++++++---- cpp/tests/text/dedup_tests.cpp | 8 ++++ .../pylibcudf/tests/test_nvtext_dedup.py | 43 +++++-------------- 3 files changed, 34 insertions(+), 41 deletions(-) diff --git a/cpp/src/text/dedup.cu b/cpp/src/text/dedup.cu index add038e20c8..fbce1914461 100644 --- a/cpp/src/text/dedup.cu +++ b/cpp/src/text/dedup.cu @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -96,9 +97,11 @@ struct find_duplicates_fn { auto const lh_str = cudf::string_view(d_chars + lhs, lhs_size); auto const rh_str = cudf::string_view(d_chars + rhs, rhs_size); - auto const size = - cuda::std::min(count_common_bytes(lh_str, rh_str), - static_cast(cuda::std::numeric_limits::max())); + constexpr auto max_run_length = + static_cast(cuda::std::numeric_limits::max()); + + auto const size = cuda::std::min(count_common_bytes(lh_str, rh_str), max_run_length); + return size >= width ? static_cast(size) : 0; } }; @@ -112,6 +115,7 @@ struct collapse_overlaps_fn { auto size = d_sizes[idx]; auto offset = d_offsets[idx]; if ((idx > 0) && ((offset - 1) == d_offsets[idx - 1]) && (size < d_sizes[idx - 1])) { + // TODO: need to handle chains longer than max return string_index{nullptr, 0}; } auto d_ptr = d_chars + offset; @@ -129,16 +133,18 @@ std::unique_ptr substring_deduplicate(cudf::strings_column_view co CUDF_EXPECTS(min_width > 8, "min_width should be at least 8"); auto d_strings = cudf::column_device_view::create(input.parent(), stream); - // need to handle slicing - auto d_input_chars = input.chars_begin(stream); - auto chars_size = input.chars_size(stream); - CUDF_EXPECTS(min_width < chars_size, "min_width value larger than the input"); + auto [first_offset, last_offset] = + cudf::strings::detail::get_first_and_last_offset(input, stream); + + auto d_input_chars = input.chars_begin(stream) + first_offset; + auto chars_size = last_offset - first_offset; + CUDF_EXPECTS(min_width < chars_size, "min_width value cannot exceed the input size"); - auto indices = rmm::device_uvector(chars_size - min_width, stream); + auto indices = rmm::device_uvector(chars_size - min_width + 1, stream); auto sizes = rmm::device_uvector(indices.size(), stream); thrust::sequence(rmm::exec_policy_nosync(stream), indices.begin(), indices.end()); - // thrust::sort may be limited to a 32-bit range + // note: thrust::sort may be limited to a 32-bit range thrust::sort(rmm::exec_policy_nosync(stream), indices.begin(), indices.end(), diff --git a/cpp/tests/text/dedup_tests.cpp b/cpp/tests/text/dedup_tests.cpp index 0bad8ae22ae..6429b70363a 100644 --- a/cpp/tests/text/dedup_tests.cpp +++ b/cpp/tests/text/dedup_tests.cpp @@ -56,4 +56,12 @@ TEST_F(TextDedupTest, StringDedup) expected = cudf::test::strings_column_wrapper( {" 01234567890123456789 ", ". 012345678901234", " reprehenderit "}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); + + // Test with sliced input + auto const sliced_input = cudf::slice(input, {1, 10}).front(); + + sv = cudf::strings_column_view(sliced_input); + results = nvtext::substring_deduplicate(sv, 15); + expected = cudf::test::strings_column_wrapper({"01234567890123456789 ", " reprehenderit "}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); } diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_dedup.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_dedup.py index e7f4a971f08..7987045435c 100644 --- a/python/pylibcudf/pylibcudf/tests/test_nvtext_dedup.py +++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_dedup.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2025, NVIDIA CORPORATION. import pyarrow as pa import pytest @@ -9,40 +9,19 @@ @pytest.fixture(scope="module") def input_col(): - arr = ["trouble", "toy", "syzygy"] + arr = [ + "01234567890123456789", + "01234567890123456789", + "01234567890123456789", + ] return pa.array(arr) -@pytest.mark.parametrize("check_vowels", [True, False]) -@pytest.mark.parametrize("indices", [[3, 1, 4], 1]) -def test_is_letter(input_col, check_vowels, indices): - def is_letter(s, i, check): - vowels = "aeiouy" - return (s[i] in vowels) == check - - result = plc.nvtext.stemmer.is_letter( - plc.interop.from_arrow(input_col), - check_vowels, - plc.interop.from_arrow(pa.array(indices)) - if isinstance(indices, list) - else indices, - ) - expected = pa.array( - [ - is_letter( - s, - indices[i] if isinstance(indices, list) else indices, - check_vowels, - ) - for i, s in enumerate(input_col.to_pylist()) - ] - ) - assert_column_eq(result, expected) - - -def test_porter_stemmer_measure(input_col): - result = plc.nvtext.stemmer.porter_stemmer_measure( +@pytest.mark.parametrize("min_width", [10, 20]) +def test_substring_deduplicate(input_col, min_width): + result = plc.nvtext.dedup.substring_deduplicate( plc.interop.from_arrow(input_col), + min_width, ) - expected = pa.array([1, 1, 2], type=pa.int32()) + expected = pa.array(["01234567890123456789012345678901234567890123456789"]) assert_column_eq(result, expected) From 915a290b5f4253ace582fd29ccb3df05759218ac Mon Sep 17 00:00:00 2001 From: David Wendt Date: Tue, 4 Mar 2025 19:22:08 -0500 Subject: [PATCH 3/3] fix pytest --- cpp/src/text/dedup.cu | 4 +++- .../cudf/cudf/tests/text/test_text_methods.py | 17 ++++++----------- 2 files changed, 9 insertions(+), 12 deletions(-) diff --git a/cpp/src/text/dedup.cu b/cpp/src/text/dedup.cu index fbce1914461..3c59d9121ea 100644 --- a/cpp/src/text/dedup.cu +++ b/cpp/src/text/dedup.cu @@ -115,9 +115,11 @@ struct collapse_overlaps_fn { auto size = d_sizes[idx]; auto offset = d_offsets[idx]; if ((idx > 0) && ((offset - 1) == d_offsets[idx - 1]) && (size < d_sizes[idx - 1])) { - // TODO: need to handle chains longer than max return string_index{nullptr, 0}; } + // TODO: need to handle chains longer than max + // size == d_sizes[idx-1] == max + auto d_ptr = d_chars + offset; return string_index(d_ptr, size); } diff --git a/python/cudf/cudf/tests/text/test_text_methods.py b/python/cudf/cudf/tests/text/test_text_methods.py index 1c3d378ba5c..1463f1d3351 100644 --- a/python/cudf/cudf/tests/text/test_text_methods.py +++ b/python/cudf/cudf/tests/text/test_text_methods.py @@ -1089,17 +1089,12 @@ def test_byte_pair_encoding(separator, input, results): def test_substring_deduplicate(): text = ( - "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt " # 90 - "01234567890123456789 magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation " # 180 - "laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit " # 270 - "voluptate velit esse cillum dolore eu fugiat nulla pariatur. 01234567890123456789 " # 360 - "cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. " # 450 - "Ea esse numquam et recusandae quia et voluptatem sint quo explicabo repudiandae. At nihil " # 540 - "sunt non architecto doloremque eos dolorem consequuntur. Vel adipisci quod et voluptatum " # 630 - "quis est fuga tempore qui dignissimos aliquam et sint repellendus ut autem voluptas quo " # 720 - "deleniti earum? Qui ipsam ipsum hic ratione mollitia aut nobis laboriosam. Eum aspernatur " # 810 - "dolorem sit voluptatum numquam in iure placeat vel laudantium molestiae? Ad reprehenderit " # 900 - "quia aut minima deleniti id consequatur sapiente est dolores cupiditate. 012345678901234 " # 990 + " 01234567890123456789 magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation " + "laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit " + "voluptate velit esse cillum dolore eu fugiat nulla pariatur. 01234567890123456789 " + "deleniti earum? Qui ipsam ipsum hic ratione mollitia aut nobis laboriosam. Eum aspernatur " + "dolorem sit voluptatum numquam in iure placeat vel laudantium molestiae? Ad reprehenderit " + "quia aut minima deleniti id consequatur sapiente est dolores cupiditate. 012345678901234 " ) input = cudf.Series([text]) actual = input.str.substring_deduplicate(15)