diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 3018295c9eb..73c76cd5d3e 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -740,6 +740,7 @@ add_library( src/table/table.cpp src/table/table_device_view.cu src/table/table_view.cpp + src/text/dedup.cu src/text/detokenize.cu src/text/edit_distance.cu src/text/generate_ngrams.cu diff --git a/cpp/include/nvtext/dedup.hpp b/cpp/include/nvtext/dedup.hpp new file mode 100644 index 00000000000..4595bcdef57 --- /dev/null +++ b/cpp/include/nvtext/dedup.hpp @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include +#include + +//! NVText APIs +namespace CUDF_EXPORT nvtext { +/** + * @addtogroup nvtext_replace + * @{ + * @file + */ + +/** + * @brief Returns the duplicate strings found in the given input + * + * The internal implementation creates a suffix array of the input which + * requires ~10x the input size for temporary memory. + * + * The output includes any strings of at least `min_width` bytes that + * appear more than once in the entire input. 
+ * + * @param input Strings column to dedup + * @param min_width Minimum number of bytes that must match to identify a duplicate + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New strings column containing the duplicate strings found + */ +std::unique_ptr substring_deduplicate( + cudf::strings_column_view const& input, + cudf::size_type min_width, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); + +/** @} */ // end of group +} // namespace CUDF_EXPORT nvtext diff --git a/cpp/src/text/dedup.cu b/cpp/src/text/dedup.cu new file mode 100644 index 00000000000..3c59d9121ea --- /dev/null +++ b/cpp/src/text/dedup.cu @@ -0,0 +1,214 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include + +#include +#include +#include +#include +#include +#include + +namespace nvtext { +namespace detail { +namespace { + +using string_index = cudf::strings::detail::string_index_pair; + +struct sort_comparator_fn { + char const* d_chars; + int64_t chars_size; + __device__ bool operator()(int64_t lhs, int64_t rhs) const + { + constexpr int64_t max_size = cuda::std::numeric_limits::max(); + // order two suffixes by their bytes; each runs to the end of the input, clamped to max_size + auto const lhs_size = static_cast(cuda::std::min(max_size, chars_size - lhs)); + auto const rhs_size = static_cast(cuda::std::min(max_size, chars_size - rhs)); + auto const lh_str = cudf::string_view(d_chars + lhs, lhs_size); + auto const rh_str = cudf::string_view(d_chars + rhs, rhs_size); + return lh_str < rh_str; + } +}; + +__device__ cudf::size_type count_common_bytes(cudf::string_view lhs, cudf::string_view rhs) +{ + auto const size1 = lhs.size_bytes(); + auto const size2 = rhs.size_bytes(); + auto const* ptr1 = lhs.data(); + auto const* ptr2 = rhs.data(); + // count matching leading bytes, stopping at the first mismatch or the end of either string + cudf::size_type idx = 0; + for (; (idx < size1) && (idx < size2); ++idx) { + if (*ptr1 != *ptr2) { break; } + ++ptr1; + ++ptr2; + } + return idx; +} + +struct find_duplicates_fn { + char const* d_chars; + int64_t chars_size; + cudf::size_type width; + int64_t const* d_indices; + __device__ int16_t operator()(int64_t idx) const + { + if (idx == 0) { return 0; } + constexpr int64_t max_size = cuda::std::numeric_limits::max(); + // measure the prefix shared by the suffix at sorted position idx and the one before it + auto const lhs = d_indices[idx - 1]; + auto const rhs = d_indices[idx]; + auto const lhs_size = static_cast(cuda::std::min(max_size, chars_size - lhs)); + auto const rhs_size = static_cast(cuda::std::min(max_size, chars_size - rhs)); + + auto const lh_str = cudf::string_view(d_chars + lhs, lhs_size); + auto const rh_str = cudf::string_view(d_chars + rhs, rhs_size); + + constexpr auto max_run_length = 
+ static_cast(cuda::std::numeric_limits::max()); + + auto const size = cuda::std::min(count_common_bytes(lh_str, rh_str), max_run_length); + // runs shorter than width are reported as 0, which marks the entry as a non-candidate + return size >= width ? static_cast(size) : 0; + } +}; + +struct collapse_overlaps_fn { + char const* d_chars; + int64_t const* d_offsets; + int16_t const* d_sizes; + __device__ string_index operator()(int64_t idx) const + { + auto size = d_sizes[idx]; + auto offset = d_offsets[idx]; + if ((idx > 0) && ((offset - 1) == d_offsets[idx - 1]) && (size < d_sizes[idx - 1])) { + return string_index{nullptr, 0}; // starts one byte into the previous, longer candidate + } + // TODO: need to handle chains longer than max + // size == d_sizes[idx-1] == max + + auto d_ptr = d_chars + offset; + return string_index(d_ptr, size); + } +}; + +} // namespace + +std::unique_ptr substring_deduplicate(cudf::strings_column_view const& input, + cudf::size_type min_width, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + CUDF_EXPECTS(min_width > 8, "min_width should be greater than 8"); + + // only the bytes between the first and last offsets belong to the (possibly sliced) input + auto [first_offset, last_offset] = + cudf::strings::detail::get_first_and_last_offset(input, stream); + + auto d_input_chars = input.chars_begin(stream) + first_offset; + auto chars_size = last_offset - first_offset; + CUDF_EXPECTS(min_width < chars_size, "min_width value must be less than the input size"); + + auto indices = rmm::device_uvector(chars_size - min_width + 1, stream); + auto sizes = rmm::device_uvector(indices.size(), stream); + + thrust::sequence(rmm::exec_policy_nosync(stream), indices.begin(), indices.end()); + // note: thrust::sort may be limited to a 32-bit range + thrust::sort(rmm::exec_policy_nosync(stream), + indices.begin(), + indices.end(), + sort_comparator_fn{d_input_chars, chars_size}); + + // locate candidate duplicates within the suffixes produced by sort + thrust::transform(rmm::exec_policy_nosync(stream), + thrust::counting_iterator(0), + thrust::counting_iterator(indices.size()), + sizes.begin(), 
+ find_duplicates_fn{d_input_chars, chars_size, min_width, indices.data()}); + + // remove the non-candidate entries from indices and sizes + thrust::remove_if( + rmm::exec_policy_nosync(stream), + indices.begin(), + indices.end(), + thrust::counting_iterator(0), + [d_sizes = sizes.data()] __device__(int64_t idx) -> bool { return d_sizes[idx] == 0; }); + auto end = thrust::remove(rmm::exec_policy(stream), sizes.begin(), sizes.end(), 0); + sizes.resize(thrust::distance(sizes.begin(), end), stream); + indices.resize(sizes.size(), stream); + + // sort the resulting indices/sizes for overlap filtering + thrust::sort_by_key( + rmm::exec_policy_nosync(stream), indices.begin(), indices.end(), sizes.begin()); + + // produce final duplicates for make_strings_column and collapse any overlapping candidates + auto duplicates = + rmm::device_uvector(indices.size(), stream); + thrust::transform(rmm::exec_policy_nosync(stream), + thrust::counting_iterator(0), + thrust::counting_iterator(indices.size()), + duplicates.begin(), + collapse_overlaps_fn{d_input_chars, indices.data(), sizes.data()}); + + // filter out the remaining non-viable candidates + duplicates.resize( + thrust::distance( + duplicates.begin(), + thrust::remove( + rmm::exec_policy(stream), duplicates.begin(), duplicates.end(), string_index{nullptr, 0})), + stream); + + // sort result by size descending; stable so equal sizes keep their offset order + thrust::stable_sort(rmm::exec_policy_nosync(stream), + duplicates.begin(), + duplicates.end(), + [] __device__(auto lhs, auto rhs) -> bool { return lhs.second > rhs.second; }); + + return cudf::strings::detail::make_strings_column( + duplicates.begin(), duplicates.end(), stream, mr); +} +} // namespace detail + +std::unique_ptr substring_deduplicate(cudf::strings_column_view const& input, + cudf::size_type min_width, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + CUDF_FUNC_RANGE(); + return detail::substring_deduplicate(input, min_width, stream, mr); +} + +} // namespace 
nvtext diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index e3ca8b70b87..9fbe3d04d5d 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -607,6 +607,7 @@ ConfigureTest(STRUCTS_TEST structs/structs_column_tests.cpp structs/utilities_te ConfigureTest( TEXT_TEST text/bpe_tests.cpp + text/dedup_tests.cpp text/edit_distance_tests.cpp text/jaccard_tests.cpp text/minhash_tests.cpp diff --git a/cpp/tests/text/dedup_tests.cpp b/cpp/tests/text/dedup_tests.cpp new file mode 100644 index 00000000000..6429b70363a --- /dev/null +++ b/cpp/tests/text/dedup_tests.cpp @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +#include +#include + +#include + +#include + +struct TextDedupTest : public cudf::test::BaseFixture {}; + +TEST_F(TextDedupTest, StringDedup) +{ + // https://loremipsum.io/generator?n=25&t=p + // clang-format off + auto input = cudf::test::strings_column_wrapper({ + "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ", // 90 + "01234567890123456789 magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ", // 180 + "laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit ", // 270 + "voluptate velit esse cillum dolore eu fugiat nulla pariatur. 
01234567890123456789 ", // 360 + "cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. ", // 450 + "Ea esse numquam et recusandae quia et voluptatem sint quo explicabo repudiandae. At nihil ", // 540 + "sunt non architecto doloremque eos dolorem consequuntur. Vel adipisci quod et voluptatum ", // 630 + "quis est fuga tempore qui dignissimos aliquam et sint repellendus ut autem voluptas quo ", // 720 + "deleniti earum? Qui ipsam ipsum hic ratione mollitia aut nobis laboriosam. Eum aspernatur ", // 810 + "dolorem sit voluptatum numquam in iure placeat vel laudantium molestiae? Ad reprehenderit ", // 900 + "quia aut minima deleniti id consequatur sapiente est dolores cupiditate. 012345678901234 ", // 990 + }); + // clang-format on + + auto sv = cudf::strings_column_view(input); + + auto results = nvtext::substring_deduplicate(sv, 20); + auto expected = cudf::test::strings_column_wrapper({" 01234567890123456789 "}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); + + results = nvtext::substring_deduplicate(sv, 15); + expected = cudf::test::strings_column_wrapper( + {" 01234567890123456789 ", ". 
012345678901234", " reprehenderit "}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); + + // Test with sliced input + auto const sliced_input = cudf::slice(input, {1, 10}).front(); + + sv = cudf::strings_column_view(sliced_input); + results = nvtext::substring_deduplicate(sv, 15); + expected = cudf::test::strings_column_wrapper({"01234567890123456789 ", " reprehenderit "}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected); +} diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 7adea963868..0f9b77aa874 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -5342,6 +5342,27 @@ def is_vowel(self, position) -> SeriesOrIndex: self._column.is_letter(True, position) # type: ignore[arg-type] ) + def substring_deduplicate(self, min_width) -> SeriesOrIndex: + """ + Returns duplicate strings of at least ``min_width`` bytes. + + Parameters + ---------- + min_width : int32 + The minimum number of bytes to determine duplicates + + Returns + ------- + Series of duplicate strings found + + """ + return self._return_or_inplace( + self._column.substring_deduplicate(min_width), # type: ignore[arg-type] + inplace=False, + expand=False, + retain_index=False, + ) + def edit_distance(self, targets) -> SeriesOrIndex: """ The ``targets`` strings are measured against the strings in this @@ -6322,6 +6343,13 @@ def hash_character_ngrams( ) return type(self).from_pylibcudf(result) # type: ignore[return-value] + @acquire_spill_lock() + def substring_deduplicate(self, min_width: int) -> Self: + result = plc.nvtext.dedup.substring_deduplicate( + self.to_pylibcudf(mode="read"), min_width + ) + return type(self).from_pylibcudf(result) # type: ignore[return-value] + @acquire_spill_lock() def edit_distance(self, targets: Self) -> NumericalColumn: result = plc.nvtext.edit_distance.edit_distance( diff --git a/python/cudf/cudf/tests/text/test_text_methods.py b/python/cudf/cudf/tests/text/test_text_methods.py index 47b41bd1e39..1463f1d3351 100644 
--- a/python/cudf/cudf/tests/text/test_text_methods.py +++ b/python/cudf/cudf/tests/text/test_text_methods.py @@ -1085,3 +1085,20 @@ def test_byte_pair_encoding(separator, input, results): actual = encoder(strings, separator) assert type(expected) is type(actual) assert_eq(expected, actual) + + +def test_substring_deduplicate(): + text = ( + " 01234567890123456789 magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation " + "laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit " + "voluptate velit esse cillum dolore eu fugiat nulla pariatur. 01234567890123456789 " + "deleniti earum? Qui ipsam ipsum hic ratione mollitia aut nobis laboriosam. Eum aspernatur " + "dolorem sit voluptatum numquam in iure placeat vel laudantium molestiae? Ad reprehenderit " + "quia aut minima deleniti id consequatur sapiente est dolores cupiditate. 012345678901234 " + ) + input = cudf.Series([text]) + actual = input.str.substring_deduplicate(15) + expected = cudf.Series( + [" 01234567890123456789 ", ". 012345678901234", " reprehenderit "] + ) + assert_eq(expected, actual) diff --git a/python/pylibcudf/pylibcudf/libcudf/nvtext/dedup.pxd b/python/pylibcudf/pylibcudf/libcudf/nvtext/dedup.pxd new file mode 100644 index 00000000000..9e38b9b4c51 --- /dev/null +++ b/python/pylibcudf/pylibcudf/libcudf/nvtext/dedup.pxd @@ -0,0 +1,13 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. 
+from libcpp.memory cimport unique_ptr +from pylibcudf.exception_handler cimport libcudf_exception_handler +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.column.column_view cimport column_view +from pylibcudf.libcudf.types cimport size_type + + +cdef extern from "nvtext/dedup.hpp" namespace "nvtext" nogil: + + cdef unique_ptr[column] substring_deduplicate( + column_view source_strings, + size_type min_width) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt b/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt index d8cabbf4d47..29f350c5611 100644 --- a/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt +++ b/python/pylibcudf/pylibcudf/nvtext/CMakeLists.txt @@ -14,6 +14,7 @@ set(cython_sources byte_pair_encode.pyx + dedup.pyx edit_distance.pyx generate_ngrams.pyx jaccard.pyx diff --git a/python/pylibcudf/pylibcudf/nvtext/__init__.pxd b/python/pylibcudf/pylibcudf/nvtext/__init__.pxd index eb48ea84dee..56506f5f8b6 100644 --- a/python/pylibcudf/pylibcudf/nvtext/__init__.pxd +++ b/python/pylibcudf/pylibcudf/nvtext/__init__.pxd @@ -2,6 +2,7 @@ from . cimport ( byte_pair_encode, + dedup, edit_distance, generate_ngrams, jaccard, @@ -16,11 +17,12 @@ from . cimport ( ) __all__ = [ + "byte_pair_encode", + "dedup", "edit_distance", "generate_ngrams", "jaccard", "minhash", - "byte_pair_encode" "ngrams_tokenize", "normalize", "replace", diff --git a/python/pylibcudf/pylibcudf/nvtext/__init__.py b/python/pylibcudf/pylibcudf/nvtext/__init__.py index 07e80abe3a8..eebab06de3c 100644 --- a/python/pylibcudf/pylibcudf/nvtext/__init__.py +++ b/python/pylibcudf/pylibcudf/nvtext/__init__.py @@ -2,6 +2,7 @@ from . 
import ( byte_pair_encode, + dedup, edit_distance, generate_ngrams, jaccard, @@ -17,6 +18,7 @@ __all__ = [ "byte_pair_encode", + "dedup", "edit_distance", "generate_ngrams", "jaccard", diff --git a/python/pylibcudf/pylibcudf/nvtext/dedup.pxd b/python/pylibcudf/pylibcudf/nvtext/dedup.pxd new file mode 100644 index 00000000000..5ff85f60b68 --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/dedup.pxd @@ -0,0 +1,6 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + +from pylibcudf.column cimport Column +from pylibcudf.libcudf.types cimport size_type + +cpdef Column substring_deduplicate(Column input, size_type min_width) diff --git a/python/pylibcudf/pylibcudf/nvtext/dedup.pyi b/python/pylibcudf/pylibcudf/nvtext/dedup.pyi new file mode 100644 index 00000000000..c324cfbd9c2 --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/dedup.pyi @@ -0,0 +1,5 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + +from pylibcudf.column import Column + +def substring_deduplicate(input: Column, min_width: int) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/nvtext/dedup.pyx b/python/pylibcudf/pylibcudf/nvtext/dedup.pyx new file mode 100644 index 00000000000..5c65129f380 --- /dev/null +++ b/python/pylibcudf/pylibcudf/nvtext/dedup.pyx @@ -0,0 +1,39 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.nvtext.dedup cimport ( + substring_deduplicate as cpp_substring_deduplicate, +) +from pylibcudf.libcudf.types cimport size_type + +__all__ = ["substring_deduplicate"] + + +cpdef Column substring_deduplicate(Column input, size_type min_width): + """ + Returns duplicate strings found anywhere in the input column + with min_width minimum number of bytes. 
+ + For details, see :cpp:func:`substring_deduplicate` + + Parameters + ---------- + input : Column + Strings column of text + min_width : size_type + Minimum width of bytes to detect duplicates + + Returns + ------- + Column + New column of duplicate strings + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = cpp_substring_deduplicate(input.view(), min_width) + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_dedup.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_dedup.py new file mode 100644 index 00000000000..7987045435c --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_dedup.py @@ -0,0 +1,27 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + +import pyarrow as pa +import pytest +from utils import assert_column_eq + +import pylibcudf as plc + + +@pytest.fixture(scope="module") +def input_col(): + arr = [ + "01234567890123456789", + "01234567890123456789", + "01234567890123456789", + ] + return pa.array(arr) + + +@pytest.mark.parametrize("min_width", [10, 20]) +def test_substring_deduplicate(input_col, min_width): + result = plc.nvtext.dedup.substring_deduplicate( + plc.interop.from_arrow(input_col), + min_width, + ) + expected = pa.array(["01234567890123456789012345678901234567890123456789"]) + assert_column_eq(result, expected)