New nvtext::wordpiece_tokenizer APIs (#17600)
Creates a new word-piece tokenizer which replaces the existing subword-tokenizer in nvtext. The subword-tokenizer logic is split out and specialized to perform basic tokenizing with the word-piece logic only. The normalizing step is already a separate API. The output is a lists column of tokens only.

The first change is that the new API uses `wordpiece` instead of `subword`. Here are the two C++ API declarations:

```cpp
std::unique_ptr<wordpiece_vocabulary> load_wordpiece_vocabulary(
  cudf::strings_column_view const& input,
  rmm::cuda_stream_view stream,
  rmm::device_async_resource_ref mr);
```

The vocabulary is loaded as a strings column and the returned object can be used on multiple calls to the next API:

```cpp
std::unique_ptr<cudf::column> wordpiece_tokenize(
  cudf::strings_column_view const& input,
  wordpiece_vocabulary const& vocabulary,
  cudf::size_type max_words_per_row,
  rmm::cuda_stream_view stream,
  rmm::device_async_resource_ref mr);
```

This returns a lists column of integers which represent the tokens for each row. The `max_words_per_row` parameter stops the tokenizing process for a row once the given number of input words (character runs delimited by spaces) has been reached. This means a row may produce more tokens than `max_words_per_row` if a single word produces multiple tokens.

Note that this API expects the input strings to already be normalized, i.e. processed by the `nvtext::normalize_characters` API, which is also being reworked in #17818.

The Python interface has the following pattern:

```python
from cudf.core.wordpiece_tokenize import WordPieceVocabulary

input_string = ...  # output of the normalizer
vocab_file = os.path.join(datadir, "bert_base_cased_sampled/vocab.txt")
vc = cudf.read_text(vocab_file, delimiter="\n", strip_delimiters=True)
wpt = WordPieceVocabulary(vc)
wpr = wpt.tokenize(input_string)
```

The output is a lists column of the tokens and no longer the tensor-data and metadata format. If that format is needed, we can consider a 3rd API that converts this output to that format.

Closes #17507

Authors:
- David Wendt (https://github.com/davidwendt)

Approvers:
- Shruti Shivakumar (https://github.com/shrshi)
- Basit Ayantunde (https://github.com/lamarrr)
- GALI PREM SAGAR (https://github.com/galipremsagar)
- Bradley Dice (https://github.com/bdice)

URL: #17600
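As a rough illustration of the C++ pattern, here is a minimal sketch combining the two declarations above. It assumes the new header is exposed as `<nvtext/wordpiece_tokenize.hpp>`; the `tokenize_sketch` wrapper and the cap of 10 words are illustrative, not part of the PR:

```cpp
#include <nvtext/wordpiece_tokenize.hpp>

#include <cudf/column/column.hpp>
#include <cudf/strings/strings_column_view.hpp>

#include <memory>

// `vocab` holds one vocabulary entry per row (e.g. read from a vocab.txt
// file) and `input` holds already-normalized strings.
std::unique_ptr<cudf::column> tokenize_sketch(cudf::strings_column_view const& vocab,
                                              cudf::strings_column_view const& input)
{
  // Load the vocabulary once; the returned object is reusable across calls.
  auto vocabulary = nvtext::load_wordpiece_vocabulary(vocab);

  // Stop after the first 10 space-delimited words of each row; a row can
  // still yield more than 10 tokens if a word splits into multiple pieces.
  return nvtext::wordpiece_tokenize(input, *vocabulary, 10);
}
```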
1 parent 32bdfb0 · commit ea2bca3 · 16 changed files with 1,900 additions and 10 deletions.
```cpp
/*
 * Copyright (c) 2024-2025, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#pragma once

#include <cudf/column/column.hpp>
#include <cudf/scalar/scalar.hpp>
#include <cudf/strings/strings_column_view.hpp>
#include <cudf/utilities/export.hpp>
#include <cudf/utilities/memory_resource.hpp>

namespace CUDF_EXPORT nvtext {
/**
 * @addtogroup nvtext_tokenize
 * @{
 * @file
 */
/**
 * @brief Vocabulary object to be used with nvtext::wordpiece_tokenize
 *
 * Use nvtext::load_wordpiece_vocabulary to create this object.
 */
struct wordpiece_vocabulary {
  /**
   * @brief Vocabulary object constructor
   *
   * Token ids are the row indices within the vocabulary column.
   * Each vocabulary entry is expected to be unique; otherwise the behavior is undefined.
   *
   * @throw std::invalid_argument if `input` contains nulls or is empty
   *
   * @param input Strings for the vocabulary
   * @param stream CUDA stream used for device memory operations and kernel launches
   * @param mr Device memory resource used to allocate the returned column's device memory
   */
  wordpiece_vocabulary(cudf::strings_column_view const& input,
                       rmm::cuda_stream_view stream = cudf::get_default_stream(),
                       rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
  ~wordpiece_vocabulary();

  struct wordpiece_vocabulary_impl;
  std::unique_ptr<wordpiece_vocabulary_impl> _impl;
};
/**
 * @brief Create a wordpiece_vocabulary object from a strings column
 *
 * Token ids are the row indices within the vocabulary column.
 * Each vocabulary entry is expected to be unique; otherwise the behavior is undefined.
 *
 * @throw std::invalid_argument if `input` contains nulls or is empty
 *
 * @param input Strings for the vocabulary
 * @param stream CUDA stream used for device memory operations and kernel launches
 * @param mr Device memory resource used to allocate the returned column's device memory
 * @return Object to be used with nvtext::wordpiece_tokenize
 */
std::unique_ptr<wordpiece_vocabulary> load_wordpiece_vocabulary(
  cudf::strings_column_view const& input,
  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
/**
 * @brief Returns the token ids for the input strings using a wordpiece
 * tokenizer algorithm with the given vocabulary
 *
 * Example:
 * @code{.pseudo}
 * vocabulary = ["[UNK]", "a", "have", "I", "new", "GP", "##U", "!"]
 * v = load_wordpiece_vocabulary(vocabulary)
 * input = ["I have a new GPU now !"]
 * t = wordpiece_tokenize(input, v)
 * t is now [[3, 2, 1, 4, 5, 6, 0, 7]]
 * @endcode
 *
 * The `max_words_per_row` parameter also optionally limits the output by only
 * processing a maximum number of words per row. Here a word is defined as a
 * consecutive sequence of characters delimited by space character(s).
 *
 * Example:
 * @code{.pseudo}
 * vocabulary = ["[UNK]", "a", "have", "I", "new", "GP", "##U", "!"]
 * v = load_wordpiece_vocabulary(vocabulary)
 * input = ["I have a new GPU now !"]
 * t4 = wordpiece_tokenize(input, v, 4)
 * t4 is now [[3, 2, 1, 4]]
 * t5 = wordpiece_tokenize(input, v, 5)
 * t5 is now [[3, 2, 1, 4, 5, 6]]
 * @endcode
 *
 * Any null row entry results in a corresponding null entry in the output.
 *
 * @param input Strings column to tokenize
 * @param vocabulary Used to lookup tokens within `input`
 * @param max_words_per_row Maximum number of words to tokenize for each row.
 *        Default 0 tokenizes all words.
 * @param stream CUDA stream used for device memory operations and kernel launches
 * @param mr Device memory resource used to allocate the returned column's device memory
 * @return Lists column of token ids
 */
std::unique_ptr<cudf::column> wordpiece_tokenize(
  cudf::strings_column_view const& input,
  wordpiece_vocabulary const& vocabulary,
  cudf::size_type max_words_per_row = 0,
  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());

/** @} */  // end of tokenize group
}  // namespace CUDF_EXPORT nvtext
```
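For reference, the doxygen pseudocode above translates to host code along these lines. This is a minimal sketch, using cudf's test column wrapper purely as a convenient way to build small columns; `wordpiece_doc_example` is illustrative only:

```cpp
#include <nvtext/wordpiece_tokenize.hpp>

#include <cudf/strings/strings_column_view.hpp>
#include <cudf_test/column_wrapper.hpp>

void wordpiece_doc_example()
{
  // Vocabulary from the example above; a token id is the entry's row index.
  auto vocab = cudf::test::strings_column_wrapper(
    {"[UNK]", "a", "have", "I", "new", "GP", "##U", "!"});
  auto v = nvtext::load_wordpiece_vocabulary(cudf::strings_column_view{vocab});

  auto input = cudf::test::strings_column_wrapper({"I have a new GPU now !"});
  auto sv    = cudf::strings_column_view{input};

  // "GPU" splits into "GP" + "##U"; "now" is not in the vocabulary and maps
  // to 0 ([UNK]). Expected result: [[3, 2, 1, 4, 5, 6, 0, 7]]
  auto t = nvtext::wordpiece_tokenize(sv, *v);

  // Only the first 4 space-delimited words are processed: [[3, 2, 1, 4]]
  auto t4 = nvtext::wordpiece_tokenize(sv, *v, 4);
}
```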