New nvtext::wordpiece_tokenizer APIs (#17600)
Creates a new word-piece tokenizer which replaces the existing subword-tokenizer in nvtext. The subword-tokenizer logic is split out and specialized to perform basic tokenizing with the word-piece logic only. The normalizing step is already a separate API. The output is a lists column of tokens only.

The first change is that the new API uses `wordpiece` instead of `subword`. Here are the two C++ API declarations:

```cpp
std::unique_ptr<wordpiece_vocabulary> load_wordpiece_vocabulary(
  cudf::strings_column_view const& input,
  rmm::cuda_stream_view stream,
  rmm::device_async_resource_ref mr);
```

The vocabulary is loaded as a strings column and the returned object can be used on multiple calls to the next API:

```cpp
std::unique_ptr<cudf::column> wordpiece_tokenize(
  cudf::strings_column_view const& input,
  wordpiece_vocabulary const& vocabulary,
  cudf::size_type max_words_per_row,
  rmm::cuda_stream_view stream,
  rmm::device_async_resource_ref mr);
```

This returns a lists column of integers which represent the tokens for each row. The `max_words_per_row` parameter stops the tokenizing process for a row once the given number of input words (character runs delimited by spaces) has been reached. This means a row may produce more tokens than `max_words_per_row` if a single word produces multiple tokens.

Note that this API expects the input strings to already be normalized, i.e. processed by the `nvtext::normalize_characters` API, which is also being reworked in #17818.

The Python interface has the following pattern:

```python
from cudf.core.wordpiece_tokenize import WordPieceVocabulary

input_string = ...  # output of the normalizer
vocab_file = os.path.join(datadir, "bert_base_cased_sampled/vocab.txt")
vc = cudf.read_text(vocab_file, delimiter="\n", strip_delimiters=True)
wpt = WordPieceVocabulary(vc)
wpr = wpt.tokenize(input_string)
```

The output is a lists column of the tokens and no longer the tensor-data and metadata format. If that format is needed, we can consider a 3rd API that converts this output to that format.

Closes #17507

Authors:
- David Wendt (https://github.com/davidwendt)

Approvers:
- Shruti Shivakumar (https://github.com/shrshi)
- Basit Ayantunde (https://github.com/lamarrr)
- GALI PREM SAGAR (https://github.com/galipremsagar)
- Bradley Dice (https://github.com/bdice)

URL: #17600
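As a rough illustration of the C++ pattern, here is a minimal sketch combining the two declarations above. It assumes the new header is exposed as `<nvtext/wordpiece_tokenize.hpp>`; the `tokenize_sketch` wrapper and the cap of 10 words are illustrative, not part of the PR:

```cpp
#include <nvtext/wordpiece_tokenize.hpp>

#include <cudf/column/column.hpp>
#include <cudf/strings/strings_column_view.hpp>

#include <memory>

// `vocab` holds one vocabulary entry per row (e.g. read from a vocab.txt
// file) and `input` holds already-normalized strings.
std::unique_ptr<cudf::column> tokenize_sketch(cudf::strings_column_view const& vocab,
                                              cudf::strings_column_view const& input)
{
  // Load the vocabulary once; the returned object is reusable across calls.
  auto vocabulary = nvtext::load_wordpiece_vocabulary(vocab);

  // Stop after the first 10 space-delimited words of each row; a row can
  // still yield more than 10 tokens if a word splits into multiple pieces.
  return nvtext::wordpiece_tokenize(input, *vocabulary, 10);
}
```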
1 parent 32bdfb0 · commit ea2bca3 · 16 changed files with 1,900 additions and 10 deletions.
```cpp
/*
 * Copyright (c) 2024-2025, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#pragma once

#include <cudf/column/column.hpp>
#include <cudf/scalar/scalar.hpp>
#include <cudf/strings/strings_column_view.hpp>
#include <cudf/utilities/export.hpp>
#include <cudf/utilities/memory_resource.hpp>

namespace CUDF_EXPORT nvtext {
/**
 * @addtogroup nvtext_tokenize
 * @{
 * @file
 */
/**
 * @brief Vocabulary object to be used with nvtext::wordpiece_tokenize
 *
 * Use nvtext::load_wordpiece_vocabulary to create this object.
 */
struct wordpiece_vocabulary {
  /**
   * @brief Vocabulary object constructor
   *
   * Token ids are the row indices within the vocabulary column.
   * Each vocabulary entry is expected to be unique; otherwise the behavior is undefined.
   *
   * @throw std::invalid_argument if `input` contains nulls or is empty
   *
   * @param input Strings for the vocabulary
   * @param stream CUDA stream used for device memory operations and kernel launches
   * @param mr Device memory resource used to allocate the returned column's device memory
   */
  wordpiece_vocabulary(cudf::strings_column_view const& input,
                       rmm::cuda_stream_view stream = cudf::get_default_stream(),
                       rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
  ~wordpiece_vocabulary();

  struct wordpiece_vocabulary_impl;
  std::unique_ptr<wordpiece_vocabulary_impl> _impl;
};
/**
 * @brief Create a wordpiece_vocabulary object from a strings column
 *
 * Token ids are the row indices within the vocabulary column.
 * Each vocabulary entry is expected to be unique; otherwise the behavior is undefined.
 *
 * @throw std::invalid_argument if `input` contains nulls or is empty
 *
 * @param input Strings for the vocabulary
 * @param stream CUDA stream used for device memory operations and kernel launches
 * @param mr Device memory resource used to allocate the returned column's device memory
 * @return Object to be used with nvtext::wordpiece_tokenize
 */
std::unique_ptr<wordpiece_vocabulary> load_wordpiece_vocabulary(
  cudf::strings_column_view const& input,
  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
/**
 * @brief Returns the token ids for the input strings using a wordpiece
 * tokenizer algorithm with the given vocabulary
 *
 * Example:
 * @code{.pseudo}
 * vocabulary = ["[UNK]", "a", "have", "I", "new", "GP", "##U", "!"]
 * v = load_wordpiece_vocabulary(vocabulary)
 * input = ["I have a new GPU now !"]
 * t = wordpiece_tokenize(input, v)
 * t is now [[3, 2, 1, 4, 5, 6, 0, 7]]
 * @endcode
 *
 * The `max_words_per_row` parameter also optionally limits the output by only
 * processing a maximum number of words per row. Here a word is defined as a
 * consecutive sequence of characters delimited by space character(s).
 *
 * Example:
 * @code{.pseudo}
 * vocabulary = ["[UNK]", "a", "have", "I", "new", "GP", "##U", "!"]
 * v = load_wordpiece_vocabulary(vocabulary)
 * input = ["I have a new GPU now !"]
 * t4 = wordpiece_tokenize(input, v, 4)
 * t4 is now [[3, 2, 1, 4]]
 * t5 = wordpiece_tokenize(input, v, 5)
 * t5 is now [[3, 2, 1, 4, 5, 6]]
 * @endcode
 *
 * Any null row entry results in a corresponding null entry in the output.
 *
 * @param input Strings column to tokenize
 * @param vocabulary Used to lookup tokens within `input`
 * @param max_words_per_row Maximum number of words to tokenize for each row.
 *        Default 0 tokenizes all words.
 * @param stream CUDA stream used for device memory operations and kernel launches
 * @param mr Device memory resource used to allocate the returned column's device memory
 * @return Lists column of token ids
 */
std::unique_ptr<cudf::column> wordpiece_tokenize(
  cudf::strings_column_view const& input,
  wordpiece_vocabulary const& vocabulary,
  cudf::size_type max_words_per_row = 0,
  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());

/** @} */  // end of tokenize group
}  // namespace CUDF_EXPORT nvtext
```
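For reference, the doxygen pseudocode above translates to host code along these lines. This is a minimal sketch, using cudf's test column wrapper purely as a convenient way to build small columns; `wordpiece_doc_example` is illustrative only:

```cpp
#include <nvtext/wordpiece_tokenize.hpp>

#include <cudf/strings/strings_column_view.hpp>
#include <cudf_test/column_wrapper.hpp>

void wordpiece_doc_example()
{
  // Vocabulary from the example above; a token id is the entry's row index.
  auto vocab = cudf::test::strings_column_wrapper(
    {"[UNK]", "a", "have", "I", "new", "GP", "##U", "!"});
  auto v = nvtext::load_wordpiece_vocabulary(cudf::strings_column_view{vocab});

  auto input = cudf::test::strings_column_wrapper({"I have a new GPU now !"});
  auto sv    = cudf::strings_column_view{input};

  // "GPU" splits into "GP" + "##U"; "now" is not in the vocabulary and maps
  // to 0 ([UNK]). Expected result: [[3, 2, 1, 4, 5, 6, 0, 7]]
  auto t = nvtext::wordpiece_tokenize(sv, *v);

  // Only the first 4 space-delimited words are processed: [[3, 2, 1, 4]]
  auto t4 = nvtext::wordpiece_tokenize(sv, *v, 4);
}
```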