Add voyage3 model #62

Merged · 9 commits · May 7, 2025
11 changes: 6 additions & 5 deletions crates/bpe-openai/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "bpe-openai"
version = "0.2.1"
version = "0.3.0"
edition = "2021"
description = "Prebuilt fast byte-pair encoders for OpenAI."
repository = "https://github.com/github/rust-gems"
@@ -13,18 +13,19 @@ crate-type = ["lib", "staticlib"]
bench = false

[dependencies]
bpe = { version = "0.2.0", path = "../bpe" }
bpe = { version = "0.2", path = "../bpe" }
either = "1.13"
regex-automata = "0.4"
rmp-serde = "1"
unicode-normalization = "0.1"

[dev-dependencies]
bpe = { version = "0.2.0", path = "../bpe", features = ["rand"] }
bpe = { version = "0.2", path = "../bpe", features = ["rand"] }
tiktoken-rs = "0.6"

[build-dependencies]
base64 = "0.22.1"
bpe = { version = "0.2.0", path = "../bpe", features = ["tiktoken"] }
base64 = "0.22"
bpe = { version = "0.2", path = "../bpe", features = ["tiktoken"] }
flate2 = "1.0"
rmp-serde = "1"
serde = "1"
5 changes: 5 additions & 0 deletions crates/bpe-openai/build.rs
@@ -17,6 +17,11 @@ fn main() {
include_bytes!("data/o200k_base.tiktoken.gz"),
17846336922010275747,
);
serialize_tiktoken_bpe(
"voyage3_base",
include_bytes!("data/voyage3_base.tiktoken.gz"),
17846336922010275747,
);
println!("cargo::rerun-if-changed=build.rs");
}

Binary file added crates/bpe-openai/data/voyage3_base.tiktoken.gz
Binary file not shown.
82 changes: 63 additions & 19 deletions crates/bpe-openai/src/lib.rs
@@ -8,6 +8,11 @@ use regex_automata::{
Anchored, Input,
};

pub mod normalizer;

pub use bpe::*;
pub use normalizer::{Normalizable, NormalizedString};

// Note: Below we rewrite the negative look-ahead with a positive pseudo look-ahead.
// The look-ahead character is dropped from the match by the Pretokenizer iterator.
// Note: The negative look-ahead `\\s+(?!\\S)` requires `\\s+\\s` but also `\\s+$` to handle end of file without dropping a character!
@@ -18,7 +18,7 @@ static BPE_CL100K_BASE: LazyLock<Tokenizer> = LazyLock::new(|| {
let pat1 = "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+$";
let pat2 = "\\s+\\s";
let pat3 = "\\s+";
Tokenizer::new_lookahead(bpe, &[(pat1, false), (pat2, true), (pat3, false)])
Tokenizer::new_lookahead(bpe, &[(pat1, false), (pat2, true), (pat3, false)], false)
.expect("valid regex")
});

@@ -35,11 +35,19 @@ static BPE_O200K_BASE: LazyLock<Tokenizer> = LazyLock::new(|| {
].join("|");
let pat2 = "\\s+\\s";
let pat3 = "\\s+";
Tokenizer::new_lookahead(bpe, &[(&pat1, false), (pat2, true), (pat3, false)])
Tokenizer::new_lookahead(bpe, &[(&pat1, false), (pat2, true), (pat3, false)], false)
.expect("valid regex")
});

pub use bpe::*;
static BPE_VOYAGE3_BASE: LazyLock<Tokenizer> = LazyLock::new(|| {
let bytes = include_bytes!(concat!(env!("OUT_DIR"), "/bpe_voyage3_base.dict"));
let bpe = rmp_serde::from_slice(bytes).expect("valid bpe data");
let pat1 = "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+$";
let pat2 = "\\s+\\s";
let pat3 = "\\s+";
Tokenizer::new_lookahead(bpe, &[(pat1, false), (pat2, true), (pat3, false)], true)
.expect("valid regex")
});

/// A byte-pair encoding tokenizer that supports a pre-tokenization regex.
/// The direct methods on this type pre-tokenize the input text and should
@@ -52,6 +65,8 @@ pub struct Tokenizer {
pub bpe: BytePairEncoding,
/// The pattern regex used to split the input.
pub pre: Option<Pretokenizer>,
/// Indicates whether the input should be normalized with NFC.
nfc: bool,
}

pub struct Pretokenizer {
@@ -64,9 +79,9 @@ pub struct Pretokenizer {
impl Tokenizer {
/// Build a tokenizer with an optional pretokenization regex pattern.
#[allow(clippy::result_large_err)]
pub fn new(bpe: BytePairEncoding, pat: Option<&str>) -> Result<Self, BuildError> {
pub fn new(bpe: BytePairEncoding, pat: Option<&str>, nfc: bool) -> Result<Self, BuildError> {
let pre = pat.map(Pretokenizer::new).transpose()?;
Ok(Self { bpe, pre })
Ok(Self { nfc, bpe, pre })
}

/// Build a tokenizer with pretokenization regex patterns. If the boolean for a pattern is true,
@@ -75,34 +90,41 @@ impl Tokenizer {
pub fn new_lookahead(
bpe: BytePairEncoding,
patterns: &[(&str, bool)],
nfc: bool,
) -> Result<Self, BuildError> {
let pre = Some(Pretokenizer::new_lookahead(patterns)?);
Ok(Self { bpe, pre })
Ok(Self { nfc, bpe, pre })
}

/// Count the number of tokens produced when encoding the text. Applies pre-tokenization
/// before counting.
pub fn count(&self, text: &str) -> usize {
self.split(text)
pub fn count<'a, I: Normalizable<'a>>(&self, text: I) -> usize {
let text = self.normalize(text);
self.split(text.as_str())
.map(|piece| self.bpe.count(piece.as_bytes()))
.sum()
}

/// Returns the token count iff the total token count stays below the specified token_limit.
/// Otherwise, it returns `None`. This function can be faster than [`Self::count`] when the
/// token limit is much smaller than the provided text. Applies pre-tokenization before counting.
pub fn count_till_limit(&self, text: &str, token_limit: usize) -> Option<usize> {
self.split(text).try_fold(0, |consumed, piece| {
///
/// Note: This function assumes that the text is already normalized, so that this function can run
/// in roughly O(token_limit) time.
pub fn count_till_limit(&self, text: &NormalizedString, token_limit: usize) -> Option<usize> {
let res: Option<usize> = self.split(text.as_str()).try_fold(0, |consumed, piece| {
self.bpe
.count_till_limit(piece.as_bytes(), token_limit - consumed)
.map(|piece_count| consumed + piece_count)
})
});
res
}

/// Returns the tokens for the encoding of the given text. Applies pre-tokenization before
/// encoding.
pub fn encode(&self, text: &str) -> Vec<u32> {
self.split(text)
pub fn encode<'a, I: Normalizable<'a>>(&self, text: I) -> Vec<u32> {
let text: NormalizedString<'_> = self.normalize(text);
self.split(text.as_str())
.flat_map(|piece| self.bpe.encode_via_backtracking(piece.as_bytes()))
.collect()
}
@@ -114,12 +136,18 @@ impl Tokenizer {

/// Returns an iterator with the text pieces resulting from pre-tokenization. If this
/// tokenizer does not have pre-tokenization, the iterator returns the full text.
pub fn split<'a>(&'a self, text: &'a str) -> impl Iterator<Item = &'a str> + 'a {
pub fn split<'a>(&'a self, text: &'a str) -> impl Iterator<Item = &'a str> {
match &self.pre {
Some(pre) => Either::Left(pre.split(text)),
None => Either::Right(std::iter::once(text)),
}
}

/// Returns the normalized text if the tokenizer requires normalization.
/// If the input was already normalized, this function is a noop.
pub fn normalize<'a, I: Normalizable<'a>>(&self, text: I) -> NormalizedString<'a> {
text.normalize(self.nfc)
}
}

impl Pretokenizer {
@@ -143,7 +171,7 @@ impl Pretokenizer {
}

/// Returns an iterator with the text pieces after splitting with the regular expression.
pub fn split<'a>(&'a self, text: &'a str) -> impl Iterator<Item = &'a str> + 'a {
pub fn split<'a>(&'a self, text: &'a str) -> impl Iterator<Item = &'a str> {
Splits {
pat: &self.pat,
lookahead: &self.lookahead,
@@ -201,6 +229,10 @@ pub fn o200k_base() -> &'static Tokenizer {
&BPE_O200K_BASE
}

pub fn voyage3_base() -> &'static Tokenizer {
&BPE_VOYAGE3_BASE
}

#[cfg(test)]
mod tests {
use bpe::byte_pair_encoding::{create_test_string, select_test_string};
@@ -233,9 +265,21 @@ mod tests {

#[test]
fn test_count_till_limit() {
assert_eq!(cl100k_base().count_till_limit("abc", 3), Some(1));
assert_eq!(cl100k_base().count_till_limit("abcabc", 3), Some(2));
assert_eq!(cl100k_base().count_till_limit("abcabcabc", 3), Some(3));
assert_eq!(cl100k_base().count_till_limit("abcabcabcabc", 3), None);
assert_eq!(
cl100k_base().count_till_limit(&cl100k_base().normalize("abc"), 3),
Some(1)
);
assert_eq!(
cl100k_base().count_till_limit(&cl100k_base().normalize("abcabc"), 3),
Some(2)
);
assert_eq!(
cl100k_base().count_till_limit(&cl100k_base().normalize("abcabcabc"), 3),
Some(3)
);
assert_eq!(
cl100k_base().count_till_limit(&cl100k_base().normalize("abcabcabcabc"), 3),
None
);
}
}
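
A minimal usage sketch of the new tokenizer, assuming the crate is consumed as `bpe_openai` (the example text is arbitrary): `encode` and `count` accept any `Normalizable` input and apply NFC normalization internally, because the voyage3 tokenizer is built with `nfc = true`.

```rust
use bpe_openai::voyage3_base;

fn main() {
    let tok = voyage3_base();

    // Plain &str input is accepted; it is NFC-normalized internally before
    // pre-tokenization and encoding.
    let text = "Héllo, wörld!";
    let tokens = tok.encode(text);
    let count = tok.count(text);

    // Both paths normalize and split the text the same way, so the counts agree.
    assert_eq!(tokens.len(), count);
    println!("{count} tokens: {tokens:?}");
}
```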
58 changes: 58 additions & 0 deletions crates/bpe-openai/src/normalizer.rs
@@ -0,0 +1,58 @@
use std::borrow::Cow;

use unicode_normalization::UnicodeNormalization;

/// Type which represents a normalized string.
Contributor comment — Suggested change:
/// Type which represents a normalized string.
/// Type which represents an NFC normalized string.

/// This is to avoid calling normalize multiple times or forgetting to call normalization!
///
/// TODO: Annotate the type with the normalization type, once there are more than one.
pub struct NormalizedString<'a>(Cow<'a, str>);

impl<'a> NormalizedString<'a> {
/// Returns the normalized inner str buffer.
pub fn as_str(&self) -> &str {
&self.0
}

/// This function is unsafe, since the caller must ensure that the correct normalization
/// was used. The normalization may vary by tokenizer. This is mostly a backdoor which might
/// be handy for certain optimizations or for testing.
///
/// # Safety
/// This is safe if `s` is in fact correctly normalized already. The caller is
/// responsible for ensuring that.
pub unsafe fn from_str(s: &'a str) -> NormalizedString<'a> {
NormalizedString(Cow::Borrowed(s))
}
}

/// Helper trait which converts string types into NormalizedString.
/// Calling normalize on a NormalizedString is a no-op.
pub trait Normalizable<'a> {
fn normalize(self, nfc: bool) -> NormalizedString<'a>;
}

impl<'a> Normalizable<'a> for &'a str {
fn normalize(self, nfc: bool) -> NormalizedString<'a> {
if nfc {
NormalizedString(self.nfc().collect())
} else {
NormalizedString(Cow::Borrowed(self))
}
}
}

impl<'a, T> Normalizable<'a> for &'a T
where
T: AsRef<str>,
{
fn normalize(self, nfc: bool) -> NormalizedString<'a> {
self.as_ref().normalize(nfc)
}
}

impl<'a> Normalizable<'a> for NormalizedString<'a> {
fn normalize(self, _: bool) -> NormalizedString<'a> {
self
}
}
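
A sketch of how the new `NormalizedString` type is intended to flow through the API (the helper name `count_with_limit` is illustrative, not part of the crate): the text is normalized once, `count_till_limit` borrows the result, and `count` can then consume it by value, since re-normalizing a `NormalizedString` is a no-op.

```rust
use bpe_openai::voyage3_base;

// Illustrative helper: count tokens with and without a limit, normalizing once.
fn count_with_limit(text: &str, limit: usize) -> (Option<usize>, usize) {
    let tok = voyage3_base();

    // Normalize once (NFC, since the voyage3 tokenizer sets `nfc = true`);
    // `count_till_limit` only borrows the normalized string.
    let normalized = tok.normalize(text);
    let limited = tok.count_till_limit(&normalized, limit);

    // `NormalizedString` implements `Normalizable` as a no-op, so passing it
    // to `count` by value does not normalize the text a second time.
    let total = tok.count(normalized);
    (limited, total)
}
```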
2 changes: 1 addition & 1 deletion crates/bpe/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "bpe"
version = "0.2.0"
version = "0.2.1"
edition = "2021"
description = "Fast byte-pair encoding implementation."
repository = "https://github.com/github/rust-gems"
2 changes: 1 addition & 1 deletion crates/bpe/benchmarks/Cargo.toml
@@ -18,7 +18,7 @@ path = "equivalence.rs"
test = true

[dependencies]
bpe = { path = "../../bpe" }
bpe = { path = "../../bpe", features = ["rand", "tiktoken"] }
bpe-openai = { path = "../../bpe-openai" }
criterion = "0.5"
rand = "0.9"