Merge branch 'branch-25.04' into fix_libcudf_pins_cu11
galipremsagar authored Mar 4, 2025
2 parents a0cd0ac + ea2bca3 commit a290f6e
Showing 26 changed files with 2,521 additions and 524 deletions.
2 changes: 2 additions & 0 deletions cpp/CMakeLists.txt
@@ -528,6 +528,7 @@ add_library(
src/io/parquet/reader_impl_chunking.cu
src/io/parquet/reader_impl_helpers.cpp
src/io/parquet/reader_impl_preprocess.cu
src/io/parquet/stats_filter_helpers.cpp
src/io/parquet/writer_impl.cu
src/io/parquet/writer_impl_helpers.cpp
src/io/parquet/decode_fixed.cu
@@ -756,6 +757,7 @@ add_library(
src/text/subword/wordpiece_tokenizer.cu
src/text/tokenize.cu
src/text/vocabulary_tokenize.cu
src/text/wordpiece_tokenize.cu
src/transform/bools_to_mask.cu
src/transform/compute_column.cu
src/transform/encode.cu
41 changes: 39 additions & 2 deletions cpp/benchmarks/text/subword.cpp
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020-2024, NVIDIA CORPORATION.
* Copyright (c) 2020-2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -20,6 +20,7 @@
#include <cudf/strings/strings_column_view.hpp>

#include <nvtext/subword_tokenize.hpp>
#include <nvtext/wordpiece_tokenize.hpp>

#include <nvbench/nvbench.cuh>

@@ -57,7 +58,10 @@ static void bench_subword_tokenizer(nvbench::state& state)
{
auto const num_rows = static_cast<cudf::size_type>(state.get_int64("num_rows"));

std::vector<char const*> h_strings(num_rows, "This is a test ");
std::vector<char const*> h_strings(
num_rows,
"This is a test This is a test This is a test This is a test This is a test This is a test "
"This is a test This is a test ");
cudf::test::strings_column_wrapper strings(h_strings.begin(), h_strings.end());
static std::string hash_file = create_hash_vocab_file();
std::vector<uint32_t> offsets{14};
@@ -83,3 +87,36 @@ static void bench_subword_tokenizer(nvbench::state& state)
NVBENCH_BENCH(bench_subword_tokenizer)
.set_name("subword_tokenize")
.add_int64_axis("num_rows", {32768, 262144, 2097152});

static void bench_wordpiece_tokenizer(nvbench::state& state)
{
auto const num_rows = static_cast<cudf::size_type>(state.get_int64("num_rows"));
auto const max_words = static_cast<cudf::size_type>(state.get_int64("max_words"));

auto const h_strings = std::vector<char const*>(
num_rows,
"This is a test This is a test This is a test This is a test This is a test This is a test "
"This is a test This is a test ");
auto const num_words = 32; // "This is a test" * 8
auto const d_strings = cudf::test::strings_column_wrapper(h_strings.begin(), h_strings.end());
auto const input = cudf::strings_column_view{d_strings};

auto const vocabulary =
cudf::test::strings_column_wrapper({"", "[UNK]", "This", "is", "a", "test"});
auto const vocab = nvtext::load_wordpiece_vocabulary(cudf::strings_column_view(vocabulary));

state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
auto chars_size = input.chars_size(cudf::get_default_stream());
state.add_global_memory_reads<nvbench::int8_t>(chars_size);
auto out_size = num_rows * (max_words > 0 ? std::min(max_words, num_words) : num_words);
state.add_global_memory_writes<nvbench::int32_t>(out_size);

state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
auto result = nvtext::wordpiece_tokenize(input, *vocab, max_words);
});
}

NVBENCH_BENCH(bench_wordpiece_tokenizer)
.set_name("wordpiece_tokenize")
.add_int64_axis("num_rows", {32768, 262144, 2097152})
.add_int64_axis("max_words", {0, 20, 40});
122 changes: 122 additions & 0 deletions cpp/include/nvtext/wordpiece_tokenize.hpp
@@ -0,0 +1,122 @@
/*
* Copyright (c) 2024-2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once

#include <cudf/column/column.hpp>
#include <cudf/scalar/scalar.hpp>
#include <cudf/strings/strings_column_view.hpp>
#include <cudf/utilities/export.hpp>
#include <cudf/utilities/memory_resource.hpp>

namespace CUDF_EXPORT nvtext {
/**
* @addtogroup nvtext_tokenize
* @{
* @file
*/

/**
* @brief Vocabulary object to be used with nvtext::wordpiece_tokenize
*
* Use nvtext::load_wordpiece_vocabulary to create this object.
*/
struct wordpiece_vocabulary {
/**
* @brief Vocabulary object constructor
*
* Token ids are the row indices within the vocabulary column.
* Each vocabulary entry is expected to be unique otherwise the behavior is undefined.
*
* @throw std::invalid_argument if `input` contains nulls or is empty
*
* @param input Strings for the vocabulary
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the vocabulary's device memory
*/
wordpiece_vocabulary(cudf::strings_column_view const& input,
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
~wordpiece_vocabulary();

struct wordpiece_vocabulary_impl;
std::unique_ptr<wordpiece_vocabulary_impl> _impl;
};

/**
* @brief Create a wordpiece_vocabulary object from a strings column
*
* Token ids are the row indices within the vocabulary column.
* Each vocabulary entry is expected to be unique otherwise the behavior is undefined.
*
* @throw std::invalid_argument if `input` contains nulls or is empty
*
* @param input Strings for the vocabulary
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned object's device memory
* @return Object to be used with nvtext::wordpiece_tokenize
*/
std::unique_ptr<wordpiece_vocabulary> load_wordpiece_vocabulary(
cudf::strings_column_view const& input,
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());

/**
* @brief Returns the token ids for the input strings using a wordpiece tokenizer
* algorithm with the given vocabulary
*
* Example:
* @code{.pseudo}
* vocabulary = ["[UNK]", "a", "have", "I", "new", "GP", "##U", "!"]
* v = load_wordpiece_vocabulary(vocabulary)
* input = ["I have a new GPU now !"]
* t = wordpiece_tokenize(input, v)
* t is now [[3, 2, 1, 4, 5, 6, 0, 7]]
* @endcode
*
* The `max_words_per_row` parameter optionally limits the output by processing
* at most that many words per row. Here a word is defined as a consecutive
* sequence of characters delimited by space character(s).
*
* Example:
* @code{.pseudo}
* vocabulary = ["[UNK]", "a", "have", "I", "new", "GP", "##U", "!"]
* v = load_wordpiece_vocabulary(vocabulary)
* input = ["I have a new GPU now !"]
* t4 = wordpiece_tokenize(input, v, 4)
* t4 is now [[3, 2, 1, 4]]
* t5 = wordpiece_tokenize(input, v, 5)
* t5 is now [[3, 2, 1, 4, 5, 6]]
* @endcode
*
* Any null row entry results in a corresponding null entry in the output.
*
* @param input Strings column to tokenize
* @param vocabulary Used to lookup tokens within `input`
* @param max_words_per_row Maximum number of words to tokenize for each row.
* Default 0 tokenizes all words.
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned column's device memory
* @return Lists column of token ids
*/
std::unique_ptr<cudf::column> wordpiece_tokenize(
cudf::strings_column_view const& input,
wordpiece_vocabulary const& vocabulary,
cudf::size_type max_words_per_row = 0,
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());

/** @} */ // end of tokenize group
} // namespace CUDF_EXPORT nvtext
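
For reference, a minimal usage sketch of the API declared above (illustrative only; vocab_col and input_col are hypothetical, pre-built cudf STRING columns, not part of the diff):

#include <cudf/column/column.hpp>
#include <cudf/strings/strings_column_view.hpp>
#include <nvtext/wordpiece_tokenize.hpp>

// Build the vocabulary once and reuse it across tokenize calls.
auto vocab = nvtext::load_wordpiece_vocabulary(cudf::strings_column_view{vocab_col});

// Tokenize every word in each row (max_words_per_row = 0, the default).
auto all_tokens = nvtext::wordpiece_tokenize(cudf::strings_column_view{input_col}, *vocab);

// Tokenize only the first 4 space-delimited words of each row.
auto first_four = nvtext::wordpiece_tokenize(cudf::strings_column_view{input_col}, *vocab, 4);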