Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add nvtext substring deduplication API #18104

Draft
wants to merge 7 commits into
base: branch-25.04
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -740,6 +740,7 @@ add_library(
src/table/table.cpp
src/table/table_device_view.cu
src/table/table_view.cpp
src/text/dedup.cu
src/text/detokenize.cu
src/text/edit_distance.cu
src/text/generate_ngrams.cu
Expand Down
53 changes: 53 additions & 0 deletions cpp/include/nvtext/dedup.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
/*
* Copyright (c) 2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once

#include <cudf/column/column.hpp>
#include <cudf/strings/strings_column_view.hpp>
#include <cudf/utilities/export.hpp>
#include <cudf/utilities/memory_resource.hpp>

//! NVText APIs
namespace CUDF_EXPORT nvtext {
/**
* @addtogroup nvtext_replace
* @{
* @file
*/

/**
* @brief Returns the duplicate substrings found in the given input
*
* The internal implementation creates a suffix array of the input which
* requires ~10x the input size for temporary memory.
*
* The output includes any strings of at least `min_width` bytes that
* appear more than once in the entire input.
*
* The implementation rejects a `min_width` that is not greater than 8 or
* that is not less than the total number of bytes in `input`.
*
* @param input Strings column to dedup
* @param min_width Minimum number of bytes that must match to specify a duplicate
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned column's device memory
* @return New strings column containing the duplicate substrings found
*/
std::unique_ptr<cudf::column> substring_deduplicate(
cudf::strings_column_view const& input,
cudf::size_type min_width,
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());

/** @} */ // end of group
} // namespace CUDF_EXPORT nvtext
214 changes: 214 additions & 0 deletions cpp/src/text/dedup.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,214 @@
/*
* Copyright (c) 2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <cudf/column/column.hpp>
#include <cudf/column/column_device_view.cuh>
#include <cudf/column/column_factories.hpp>
#include <cudf/detail/indexalator.cuh>
#include <cudf/detail/nvtx/ranges.hpp>
#include <cudf/detail/sorting.hpp>
#include <cudf/strings/detail/strings_column_factories.cuh>
#include <cudf/strings/detail/utilities.cuh>
#include <cudf/strings/detail/utilities.hpp>
#include <cudf/strings/string_view.cuh>
#include <cudf/strings/strings_column_view.hpp>
#include <cudf/table/table_view.hpp>
#include <cudf/utilities/default_stream.hpp>
#include <cudf/utilities/error.hpp>
#include <cudf/utilities/memory_resource.hpp>

#include <nvtext/dedup.hpp>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_uvector.hpp>

#include <cuda/std/functional>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/remove.h>
#include <thrust/sequence.h>
#include <thrust/sort.h>
#include <thrust/unique.h>

namespace nvtext {
namespace detail {
namespace {

using string_index = cudf::strings::detail::string_index_pair;

/**
 * @brief Less-than comparator over suffix start positions
 *
 * Each position identifies the suffix of the chars buffer that begins at that
 * byte. Two positions are ordered by comparing those suffixes as
 * `cudf::string_view` objects.
 */
struct sort_comparator_fn {
  char const* d_chars;  ///< first byte of the chars buffer being sorted
  int64_t chars_size;   ///< number of bytes in the chars buffer

  /**
   * @brief Builds the string_view for the suffix starting at `pos`
   *
   * The length is clamped to the largest value a `cudf::size_type` can hold
   * since that is the size type passed to the `cudf::string_view` constructor.
   */
  __device__ cudf::string_view suffix_at(int64_t pos) const
  {
    constexpr int64_t size_limit = cuda::std::numeric_limits<cudf::size_type>::max();
    auto const remaining         = chars_size - pos;
    auto const length = static_cast<cudf::size_type>(cuda::std::min(size_limit, remaining));
    return cudf::string_view(d_chars + pos, length);
  }

  /**
   * @brief Returns true if the suffix at `lhs` orders before the suffix at `rhs`
   */
  __device__ bool operator()(int64_t lhs, int64_t rhs) const
  {
    return suffix_at(lhs) < suffix_at(rhs);
  }
};

/**
 * @brief Counts how many leading bytes two strings have in common
 *
 * @param lhs First string to compare
 * @param rhs Second string to compare
 * @return Number of bytes, starting from the front, that are identical in both strings
 */
__device__ cudf::size_type count_common_bytes(cudf::string_view lhs, cudf::string_view rhs)
{
  auto const* left  = lhs.data();
  auto const* right = rhs.data();
  // a match can never run past the end of the shorter string
  auto const limit = cuda::std::min(lhs.size_bytes(), rhs.size_bytes());

  cudf::size_type matched = 0;
  while ((matched < limit) && (left[matched] == right[matched])) {
    ++matched;
  }
  return matched;
}

/**
 * @brief Computes the duplicate run length between neighboring sorted suffixes
 *
 * For entry `idx` of the sorted suffix positions, the suffix is compared with
 * the one at `idx - 1`. If they share at least `width` leading bytes, that
 * count (capped at the largest int16_t value) is returned; otherwise 0.
 */
struct find_duplicates_fn {
  char const* d_chars;       ///< first byte of the chars buffer
  int64_t chars_size;        ///< number of bytes in the chars buffer
  cudf::size_type width;     ///< minimum number of matching bytes to report
  int64_t const* d_indices;  ///< suffix start positions in sorted order

  /**
   * @brief Returns the number of matching bytes between suffix `idx` and suffix `idx - 1`
   *
   * @param idx Position within `d_indices`
   * @return Matching byte count if it is at least `width`, 0 otherwise
   */
  __device__ int16_t operator()(int64_t idx) const
  {
    // the first entry has no predecessor to compare against
    if (idx == 0) { return 0; }

    // The result is stored in an int16_t so nothing beyond this many bytes can be
    // reported. Limiting the compared lengths up front produces the same value as
    // clamping afterwards but avoids scanning long runs of identical data.
    constexpr int64_t max_run_length = cuda::std::numeric_limits<int16_t>::max();

    auto const lhs = d_indices[idx - 1];
    auto const rhs = d_indices[idx];
    auto const lhs_size =
      static_cast<cudf::size_type>(cuda::std::min(max_run_length, chars_size - lhs));
    auto const rhs_size =
      static_cast<cudf::size_type>(cuda::std::min(max_run_length, chars_size - rhs));

    auto const lh_str = cudf::string_view(d_chars + lhs, lhs_size);
    auto const rh_str = cudf::string_view(d_chars + rhs, rhs_size);

    // size is at most max_run_length here so the narrowing cast below is safe
    auto const size = count_common_bytes(lh_str, rh_str);

    return size >= width ? static_cast<int16_t>(size) : 0;
  }
};

/**
 * @brief Converts a candidate (offset, size) entry into a string_index pair,
 * dropping candidates that are covered by the preceding candidate
 *
 * The offsets are expected in ascending order. A candidate is dropped (returned
 * as `{nullptr, 0}`) when the preceding candidate starts exactly one byte
 * earlier and is longer, which means this candidate lies entirely within it.
 */
struct collapse_overlaps_fn {
  char const* d_chars;       ///< first byte of the chars buffer
  int64_t const* d_offsets;  ///< candidate start positions
  int16_t const* d_sizes;    ///< candidate lengths, parallel to d_offsets

  /**
   * @brief Returns the string_index for candidate `idx` or `{nullptr, 0}` if it is dropped
   */
  __device__ string_index operator()(int64_t idx) const
  {
    auto const begin  = d_offsets[idx];
    auto const length = d_sizes[idx];

    if (idx > 0) {
      bool const follows_previous = (begin - 1) == d_offsets[idx - 1];
      if (follows_previous && (length < d_sizes[idx - 1])) { return string_index{nullptr, 0}; }
    }
    // TODO: need to handle chains longer than max<int16_t>
    // size == d_sizes[idx-1] == max<int16_t>

    return string_index(d_chars + begin, length);
  }
};

} // namespace

/**
 * @brief Finds the substrings of at least `min_width` bytes that occur more than
 * once in the chars data of the input column
 *
 * The steps are:
 *  1. build the list of suffix start positions and sort them by suffix content
 *  2. compare each sorted suffix with its predecessor to get the matching run length
 *  3. discard entries whose run length is below `min_width`
 *  4. re-sort the survivors by position and drop those covered by the previous one
 *  5. order the remaining candidates by size (largest first) and build the output
 *
 * @param input Strings column to search
 * @param min_width Minimum number of matching bytes; must be greater than 8 and
 *                  less than the number of bytes in `input`
 * @param stream CUDA stream used for device memory operations and kernel launches
 * @param mr Device memory resource used to allocate the returned column's device memory
 * @return New strings column of the duplicate substrings
 */
std::unique_ptr<cudf::column> substring_deduplicate(cudf::strings_column_view const& input,
                                                    cudf::size_type min_width,
                                                    rmm::cuda_stream_view stream,
                                                    rmm::device_async_resource_ref mr)
{
  CUDF_EXPECTS(min_width > 8, "min_width should be greater than 8");

  // byte range of the chars buffer covered by this (possibly sliced) column
  auto [first_offset, last_offset] =
    cudf::strings::detail::get_first_and_last_offset(input, stream);

  auto d_input_chars = input.chars_begin(stream) + first_offset;
  auto chars_size    = last_offset - first_offset;
  CUDF_EXPECTS(min_width < chars_size, "min_width value must be less than the input size");

  // only suffixes with at least min_width bytes can produce a result
  auto indices = rmm::device_uvector<int64_t>(chars_size - min_width + 1, stream);
  auto sizes   = rmm::device_uvector<int16_t>(indices.size(), stream);

  thrust::sequence(rmm::exec_policy_nosync(stream), indices.begin(), indices.end());
  // note: thrust::sort may be limited to a 32-bit range
  thrust::sort(rmm::exec_policy_nosync(stream),
               indices.begin(),
               indices.end(),
               sort_comparator_fn{d_input_chars, chars_size});

  // locate candidate duplicates within the suffixes produced by sort
  thrust::transform(rmm::exec_policy_nosync(stream),
                    thrust::counting_iterator<int64_t>(0),
                    thrust::counting_iterator<int64_t>(indices.size()),
                    sizes.begin(),
                    find_duplicates_fn{d_input_chars, chars_size, min_width, indices.data()});

  // remove the non-candidate entries from indices and sizes;
  // indices must be compacted first since it reads the not-yet-compacted sizes
  thrust::remove_if(
    rmm::exec_policy_nosync(stream),
    indices.begin(),
    indices.end(),
    thrust::counting_iterator<int64_t>(0),
    [d_sizes = sizes.data()] __device__(int64_t idx) -> bool { return d_sizes[idx] == 0; });
  auto end = thrust::remove(rmm::exec_policy(stream), sizes.begin(), sizes.end(), 0);
  sizes.resize(thrust::distance(sizes.begin(), end), stream);
  indices.resize(sizes.size(), stream);

  // sort the resulting indices/sizes for overlap filtering
  thrust::sort_by_key(
    rmm::exec_policy_nosync(stream), indices.begin(), indices.end(), sizes.begin());

  // produce final duplicates for make_strings_column and collapse any overlapping candidates
  auto duplicates =
    rmm::device_uvector<cudf::strings::detail::string_index_pair>(indices.size(), stream);
  thrust::transform(rmm::exec_policy_nosync(stream),
                    thrust::counting_iterator<int64_t>(0),
                    thrust::counting_iterator<int64_t>(indices.size()),
                    duplicates.begin(),
                    collapse_overlaps_fn{d_input_chars, indices.data(), sizes.data()});

  // filter out the remaining non-viable candidates
  duplicates.resize(
    thrust::distance(
      duplicates.begin(),
      thrust::remove(
        rmm::exec_policy(stream), duplicates.begin(), duplicates.end(), string_index{nullptr, 0})),
    stream);

  // sort result by size descending (should be very fast);
  // the stable variant keeps equal-sized results in their input position order
  thrust::stable_sort(rmm::exec_policy_nosync(stream),
                      duplicates.begin(),
                      duplicates.end(),
                      [] __device__(auto lhs, auto rhs) -> bool { return lhs.second > rhs.second; });

  return cudf::strings::detail::make_strings_column(
    duplicates.begin(), duplicates.end(), stream, mr);
}
} // namespace detail

// Public API entry point declared in nvtext/dedup.hpp.
// Forwards all arguments unchanged to detail::substring_deduplicate.
std::unique_ptr<cudf::column> substring_deduplicate(cudf::strings_column_view const& input,
cudf::size_type min_width,
rmm::cuda_stream_view stream,
rmm::device_async_resource_ref mr)
{
// NOTE(review): presumably marks an NVTX range for this call (see cudf/detail/nvtx/ranges.hpp)
CUDF_FUNC_RANGE();
return detail::substring_deduplicate(input, min_width, stream, mr);
}

} // namespace nvtext
1 change: 1 addition & 0 deletions cpp/tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -607,6 +607,7 @@ ConfigureTest(STRUCTS_TEST structs/structs_column_tests.cpp structs/utilities_te
ConfigureTest(
TEXT_TEST
text/bpe_tests.cpp
text/dedup_tests.cpp
text/edit_distance_tests.cpp
text/jaccard_tests.cpp
text/minhash_tests.cpp
Expand Down
67 changes: 67 additions & 0 deletions cpp/tests/text/dedup_tests.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
/*
* Copyright (c) 2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <cudf_test/base_fixture.hpp>
#include <cudf_test/column_utilities.hpp>
#include <cudf_test/column_wrapper.hpp>

#include <cudf/column/column.hpp>
#include <cudf/strings/strings_column_view.hpp>

#include <nvtext/dedup.hpp>

#include <vector>

// Test fixture for the nvtext::substring_deduplicate tests
struct TextDedupTest : public cudf::test::BaseFixture {};

// Checks substring_deduplicate against a small text fixture that contains
// a repeated digit run and a repeated word, for the whole column and for a slice.
TEST_F(TextDedupTest, StringDedup)
{
  // https://loremipsum.io/generator?n=25&t=p
  // clang-format off
  auto input = cudf::test::strings_column_wrapper({
    "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ", // 90
    "01234567890123456789 magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ", // 180
    "laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit ", // 270
    "voluptate velit esse cillum dolore eu fugiat nulla pariatur. 01234567890123456789 ", // 360
    "cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. ", // 450
    "Ea esse numquam et recusandae quia et voluptatem sint quo explicabo repudiandae. At nihil ", // 540
    "sunt non architecto doloremque eos dolorem consequuntur. Vel adipisci quod et voluptatum ", // 630
    "quis est fuga tempore qui dignissimos aliquam et sint repellendus ut autem voluptas quo ", // 720
    "deleniti earum? Qui ipsam ipsum hic ratione mollitia aut nobis laboriosam. Eum aspernatur ", // 810
    "dolorem sit voluptatum numquam in iure placeat vel laudantium molestiae? Ad reprehenderit ", // 900
    "quia aut minima deleniti id consequatur sapiente est dolores cupiditate. 012345678901234 ", // 990
  });
  // clang-format on

  // Whole column, minimum of 20 bytes: only the full digit run qualifies
  {
    auto full_view     = cudf::strings_column_view(input);
    auto dedup_20      = nvtext::substring_deduplicate(full_view, 20);
    auto expected_20   = cudf::test::strings_column_wrapper({" 01234567890123456789 "});
    CUDF_TEST_EXPECT_COLUMNS_EQUAL(dedup_20->view(), expected_20);
  }

  // Whole column, minimum of 15 bytes: the shorter digit run and the repeated word also qualify
  {
    auto full_view   = cudf::strings_column_view(input);
    auto dedup_15    = nvtext::substring_deduplicate(full_view, 15);
    auto expected_15 = cudf::test::strings_column_wrapper(
      {" 01234567890123456789 ", ". 012345678901234", " reprehenderit "});
    CUDF_TEST_EXPECT_COLUMNS_EQUAL(dedup_15->view(), expected_15);
  }

  // Test with sliced input
  {
    auto const sliced_input = cudf::slice(input, {1, 10}).front();

    auto sliced_view     = cudf::strings_column_view(sliced_input);
    auto dedup_sliced    = nvtext::substring_deduplicate(sliced_view, 15);
    auto expected_sliced =
      cudf::test::strings_column_wrapper({"01234567890123456789 ", " reprehenderit "});
    CUDF_TEST_EXPECT_COLUMNS_EQUAL(dedup_sliced->view(), expected_sliced);
  }
}
28 changes: 28 additions & 0 deletions python/cudf/cudf/core/column/string.py
Original file line number Diff line number Diff line change
Expand Up @@ -5342,6 +5342,27 @@ def is_vowel(self, position) -> SeriesOrIndex:
self._column.is_letter(True, position) # type: ignore[arg-type]
)

def substring_deduplicate(self, min_width) -> SeriesOrIndex:
    """
    Find the substrings that appear more than once in these strings.

    Parameters
    ----------
    min_width : int32
        The minimum number of bytes to determine duplicates

    Returns
    -------
    Series of duplicate strings found

    """
    duplicates = self._column.substring_deduplicate(min_width)
    # The result rows do not correspond to the input rows, so the
    # original index is not carried over.
    return self._return_or_inplace(
        duplicates,  # type: ignore[arg-type]
        inplace=False,
        expand=False,
        retain_index=False,
    )

def edit_distance(self, targets) -> SeriesOrIndex:
"""
The ``targets`` strings are measured against the strings in this
Expand Down Expand Up @@ -6322,6 +6343,13 @@ def hash_character_ngrams(
)
return type(self).from_pylibcudf(result) # type: ignore[return-value]

@acquire_spill_lock()
def substring_deduplicate(self, min_width: int) -> Self:
    """
    Return a column of the duplicate substrings found in this column.

    ``min_width`` is passed through unchanged to
    ``plc.nvtext.dedup.substring_deduplicate``.
    """
    plc_column = self.to_pylibcudf(mode="read")
    deduplicated = plc.nvtext.dedup.substring_deduplicate(plc_column, min_width)
    return type(self).from_pylibcudf(deduplicated)  # type: ignore[return-value]

@acquire_spill_lock()
def edit_distance(self, targets: Self) -> NumericalColumn:
result = plc.nvtext.edit_distance.edit_distance(
Expand Down
Loading
Loading