huggingface · ArthurZucker · Jan 21, 2025 · Jan 21, 2025 · Jan 21, 2025 · Jan 21, 2025
diff --git a/bindings/python/py_src/tokenizers/decoders/__init__.pyi b/bindings/python/py_src/tokenizers/decoders/__init__.pyi
@@ -57,7 +57,7 @@ class ByteFallback(Decoder):
     ByteFallback Decoder
     ByteFallback is a simple trick which converts tokens looking like `<0x61>`
     to pure bytes, and attempts to make them into a string. If the tokens
-    cannot be decoded you will get � instead for each inconvertible byte token
+    cannot be decoded you will get � instead for each inconvertible byte token
 
     """
     def __init__(self):

diff --git a/bindings/python/py_src/tokenizers/normalizers/__init__.pyi b/bindings/python/py_src/tokenizers/normalizers/__init__.pyi
@@ -389,7 +389,7 @@ class Nmt(Normalizer):
 class Precompiled(Normalizer):
     """
     Precompiled normalizer
-    Don't use manually it is used for compatibility for SentencePiece.
+    Don't use manually it is used for compatibility for SentencePiece.
     """
     def __init__(self, precompiled_charsmap):
         pass

diff --git a/bindings/python/py_src/tokenizers/pre_tokenizers/__init__.pyi b/bindings/python/py_src/tokenizers/pre_tokenizers/__init__.pyi
@@ -48,7 +48,7 @@ class BertPreTokenizer(PreTokenizer):
     BertPreTokenizer
 
     This pre-tokenizer splits tokens on spaces, and also on punctuation.
-    Each occurrence of a punctuation character will be treated separately.
+    Each occurrence of a punctuation character will be treated separately.
     """
     def __init__(self):
         pass
@@ -421,11 +421,11 @@ class Split(PreTokenizer):
 
     Args:
         pattern (:obj:`str` or :class:`~tokenizers.Regex`):
-            A pattern used to split the string. Usually a string or a regex built with `tokenizers.Regex`.
-            If you want to use a regex pattern, it has to be wrapped around a `tokenizers.Regex`,
+            A pattern used to split the string. Usually a string or a regex built with `tokenizers.Regex`.
+            If you want to use a regex pattern, it has to be wrapped around a `tokenizers.Regex`,
             otherwise we consider is as a string pattern. For example `pattern="|"`
             means you want to split on `|` (imagine a csv file for example), while
-            `pattern=tokenizers.Regex("1|2")` means you split on either '1' or '2'.
+            `pattern=tokenizers.Regex("1|2")` means you split on either '1' or '2'.
         behavior (:class:`~tokenizers.SplitDelimiterBehavior`):
             The behavior to use when splitting.
             Choices: "removed", "isolated", "merged_with_previous", "merged_with_next",

diff --git a/bindings/python/src/decoders.rs b/bindings/python/src/decoders.rs
@@ -603,24 +603,6 @@ impl Decoder for PyDecoderWrapper {
     }
 }
 
-/// Decoders Module
-#[pymodule]
-pub fn decoders(m: &Bound<'_, PyModule>) -> PyResult<()> {
-    m.add_class::<PyDecoder>()?;
-    m.add_class::<PyByteLevelDec>()?;
-    m.add_class::<PyReplaceDec>()?;
-    m.add_class::<PyWordPieceDec>()?;
-    m.add_class::<PyByteFallbackDec>()?;
-    m.add_class::<PyFuseDec>()?;
-    m.add_class::<PyStrip>()?;
-    m.add_class::<PyMetaspaceDec>()?;
-    m.add_class::<PyBPEDecoder>()?;
-    m.add_class::<PyCTCDecoder>()?;
-    m.add_class::<PySequenceDecoder>()?;
-    m.add_class::<PyDecodeStream>()?;
-    Ok(())
-}
-
 /// Class needed for streaming decode
 ///
 #[pyclass(module = "tokenizers.decoders", name = "DecodeStream")]
@@ -661,6 +643,13 @@ impl PyDecodeStream {
         }
     }
 
+    #[pyo3(signature = (sequence_ids), text_signature = "(self, sequence_ids)")]
+    fn with_sequence(&mut self, sequence_ids: Vec<u32>) {
+        self.ids = sequence_ids;
+        self.prefix_index = self.ids.len();
+        self.prefix = "".to_string();
+    }
+
     #[pyo3(signature = (tokenizer, id), text_signature = "(self, tokenizer, id)")]
     fn step(&mut self, tokenizer: &PyTokenizer, id: u32) -> PyResult<Option<String>> {
         ToPyResult(tk::tokenizer::step_decode_stream(
@@ -675,6 +664,24 @@ impl PyDecodeStream {
     }
 }
 
+/// Decoders Module
+#[pymodule]
+pub fn decoders(m: &Bound<'_, PyModule>) -> PyResult<()> {
+    m.add_class::<PyDecoder>()?;
+    m.add_class::<PyByteLevelDec>()?;
+    m.add_class::<PyReplaceDec>()?;
+    m.add_class::<PyWordPieceDec>()?;
+    m.add_class::<PyByteFallbackDec>()?;
+    m.add_class::<PyFuseDec>()?;
+    m.add_class::<PyStrip>()?;
+    m.add_class::<PyMetaspaceDec>()?;
+    m.add_class::<PyBPEDecoder>()?;
+    m.add_class::<PyCTCDecoder>()?;
+    m.add_class::<PySequenceDecoder>()?;
+    m.add_class::<PyDecodeStream>()?;
+    Ok(())
+}
+
 #[cfg(test)]
 mod test {
     use std::sync::{Arc, RwLock};

diff --git a/bindings/python/tests/bindings/test_tokenizer.py b/bindings/python/tests/bindings/test_tokenizer.py
@@ -371,6 +371,9 @@ def test_decode(self):
         assert stream.step(tokenizer, 2) == " is"
         assert stream.step(tokenizer, 3) == " john"
 
+        stream.with_sequence([0, 1, 2, 3])
+        assert stream.step(tokenizer, 4) == "my name is john pair"
+
     def test_decode_stream(self):
         vocab = [
             ("<unk>", 0.0),

diff --git a/tokenizers/src/tokenizer/mod.rs b/tokenizers/src/tokenizer/mod.rs
@@ -1072,6 +1072,11 @@ where
             &mut self.prefix_index,
         )
     }
+
+    // Prefills the stream's ids. Not called from Python: the bindings set the stream fields directly.
+    pub fn with_sequence(&mut self, sequence_ids: Vec<u32>) {
+        self.ids = sequence_ids;
+    }
 }
 
 /// Internal function exposed only to bypass python limitations