
Fix typos in strings and comments #1770


Open · wants to merge 1 commit into main
bindings/node/lib/bindings/encoding.test.ts (1 addition, 1 deletion)
@@ -122,7 +122,7 @@ describe('Encoding', () => {
expect(indexes).toEqual([3, 5])
})

- it('returns the corrent indexes with pair sequences', () => {
+ it('returns the correct indexes with pair sequences', () => {
expect(encodingDual.wordToTokens(3, 0)).toEqual([3, 5])
expect(encodingDual.wordToTokens(3, 1)).toEqual([8, 9])
})
bindings/python/Cargo.toml (1 addition, 1 deletion)
@@ -27,4 +27,4 @@ tempfile = "3.10"
pyo3 = { version = "0.23", features = ["auto-initialize"] }

[features]
- defaut = ["pyo3/extension-module"]
+ default = ["pyo3/extension-module"]
bindings/python/scripts/convert.py (1 addition, 1 deletion)
@@ -397,7 +397,7 @@ def main():
"--models",
type=lambda s: s.split(","),
default=pretraineds,
- help=f"The pretrained tokenizers you want to test agains, (default: {pretraineds})",
+ help=f"The pretrained tokenizers you want to test against, (default: {pretraineds})",
)
args = parser.parse_args()

bindings/python/src/decoders.rs (1 addition, 1 deletion)
@@ -404,7 +404,7 @@ impl PyMetaspaceDec {
///
/// Args:
/// suffix (:obj:`str`, `optional`, defaults to :obj:`</w>`):
- /// The suffix that was used to caracterize an end-of-word. This suffix will
+ /// The suffix that was used to characterize an end-of-word. This suffix will
/// be replaced by whitespaces during the decoding
#[pyclass(extends=PyDecoder, module = "tokenizers.decoders", name = "BPEDecoder")]
pub struct PyBPEDecoder {}
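The suffix behavior this docstring describes is easiest to see end to end. Below is a minimal sketch against the Rust crate that the Python binding wraps, assuming the crate's BPEDecoder and Decoder trait; the sample tokens and printed output are illustrative, not taken from this PR:

    use tokenizers::decoders::bpe::BPEDecoder;
    use tokenizers::Decoder;

    fn main() -> tokenizers::Result<()> {
        // "</w>" characterizes an end-of-word subword; decoding turns it back
        // into whitespace, so mid-word pieces join and word-final pieces split.
        let decoder = BPEDecoder::new("</w>".into());
        let tokens = vec!["hello</w>".into(), "wor".into(), "ld</w>".into()];
        // Expected to print something like: hello world
        println!("{}", decoder.decode(tokens)?);
        Ok(())
    }
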
tokenizers/src/models/bpe/model.rs (1 addition, 1 deletion)
@@ -221,7 +221,7 @@ pub struct BPE {
pub unk_token: Option<String>,
/// An optional prefix to use on any subword that exist only behind another one
pub continuing_subword_prefix: Option<String>,
- /// An optional suffix to caracterize and end-of-word subword
+ /// An optional suffix to characterize and end-of-word subword
pub end_of_word_suffix: Option<String>,
/// Do multiple unk tokens get fused
pub fuse_unk: bool,
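For orientation, these struct fields are normally populated through BPE::builder(). A hedged sketch; the token strings are conventional examples, not values mandated by the library:

    use tokenizers::models::bpe::BPE;

    fn main() -> tokenizers::Result<()> {
        // Each builder call corresponds to one of the documented fields above.
        let bpe = BPE::builder()
            .unk_token("[UNK]".into())
            .continuing_subword_prefix("##".into())
            .end_of_word_suffix("</w>".into())
            .fuse_unk(true)
            .build()?;
        println!("unk_token = {:?}", bpe.unk_token);
        Ok(())
    }
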
tokenizers/src/models/bpe/trainer.rs (1 addition, 1 deletion)
@@ -190,7 +190,7 @@ pub struct BpeTrainer {
pub initial_alphabet: HashSet<char>,
/// An optional prefix to use on any subword that exist only behind another one
pub continuing_subword_prefix: Option<String>,
- /// An optional suffix to caracterize and end-of-word subword
+ /// An optional suffix to characterize and end-of-word subword
pub end_of_word_suffix: Option<String>,
/// An optional parameter to limit the max length of any single token
pub max_token_length: Option<usize>,
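The trainer exposes the same knobs at training time. A sketch under the assumption that BpeTrainer::builder() mirrors the fields above; the numbers are arbitrary:

    use tokenizers::models::bpe::BpeTrainer;

    fn main() {
        // max_token_length caps the length of any merged token the trainer emits.
        let trainer = BpeTrainer::builder()
            .vocab_size(30_000)
            .end_of_word_suffix("</w>".into())
            .max_token_length(Some(16))
            .build();
        let _ = trainer;
    }
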
tokenizers/src/models/unigram/trainer.rs (1 addition, 1 deletion)
@@ -401,7 +401,7 @@ impl UnigramTrainer {

let logsum_alt = (sum + freq[id] * (alternatives.len() - 1) as f64).ln();

- // The frequencies of altenatives are increased by freq[i].
+ // The frequencies of alternatives are increased by freq[i].
let mut logprob_alt = 0.0;
for n in &alternatives[id] {
logprob_alt += (freq[*n] + freq[id]).ln() - logsum_alt;
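To unpack the hunk: removing token i hands its count freq[i] to each of its k alternatives, so the normalizer grows from sum to sum + freq[i] * (k - 1), and each alternative n then contributes ln(freq[n] + freq[i]) - ln(sum + freq[i] * (k - 1)), which is exactly what the loop accumulates into logprob_alt.
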
tokenizers/src/models/wordlevel/mod.rs (1 addition, 1 deletion)
@@ -73,7 +73,7 @@ impl WordLevelBuilder {
self
}

- /// Contructs a `WordLevel` model that uses the `WordLevelBuilder`'s configuration.
+ /// Constructs a `WordLevel` model that uses the `WordLevelBuilder`'s configuration.
pub fn build(mut self) -> Result<WordLevel> {
if let Some(vocab) = self.config.files {
self.config.vocab = WordLevel::read_file(&vocab)?;
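As a usage sketch for this constructor (the vocabulary is made up for illustration; WordPieceBuilder in the next file follows the same pattern):

    use std::collections::HashMap;
    use tokenizers::models::wordlevel::WordLevel;

    fn main() -> tokenizers::Result<()> {
        // Tiny in-memory vocab; real vocabularies usually come from `files`.
        let vocab: HashMap<String, u32> =
            [("[UNK]".to_string(), 0), ("hello".to_string(), 1)].into();
        let model = WordLevel::builder()
            .vocab(vocab)
            .unk_token("[UNK]".into())
            .build()?;
        let _ = model;
        Ok(())
    }
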
tokenizers/src/models/wordpiece/mod.rs (1 addition, 1 deletion)
@@ -93,7 +93,7 @@ impl WordPieceBuilder {
self
}

- /// Contructs a `WordPiece` model that uses the `WordPieceBuilder`'s configuration.
+ /// Constructs a `WordPiece` model that uses the `WordPieceBuilder`'s configuration.
pub fn build(mut self) -> Result<WordPiece> {
if let Some(vocab) = self.config.files {
self.config.vocab = WordPiece::read_file(&vocab)?;
tokenizers/src/models/wordpiece/trainer.rs (1 addition, 1 deletion)
@@ -170,7 +170,7 @@ impl WordPieceTrainer {
// Transfer the vocab
model.vocab = new_wordpiece.vocab;
model.vocab_r = new_wordpiece.vocab_r;
- // The continuing_subword_prefix is the only other option to be overriden by the trainer
+ // The continuing_subword_prefix is the only other option to be overridden by the trainer
model.continuing_subword_prefix = new_wordpiece.continuing_subword_prefix;

Ok(special_tokens)
tokenizers/src/normalizers/precompiled.rs (1 addition, 1 deletion)
@@ -12,7 +12,7 @@ fn replace(transformations: &mut Vec<(char, isize)>, old_part: &str, new_part: &
transformations.extend(new_part.chars().map(|c| (c, 0)));

match diff.cmp(&0) {
- // If we are adding some characters, the last DIFF characters shoud be == 1
+ // If we are adding some characters, the last DIFF characters should be == 1
Ordering::Greater => {
transformations
.iter_mut()
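For context on the (char, isize) pairs: judging from this function, each emitted character records the local length change of the replacement, so a same-length substitution pushes zeros, while a substitution that is diff characters longer marks its trailing diff characters with 1, keeping the mapping back to the original string aligned.
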
tokenizers/src/tokenizer/added_vocabulary.rs (1 addition, 1 deletion)
@@ -29,7 +29,7 @@ pub struct AddedToken {
}

impl AddedToken {
- /// Build this token from the given content, specifying if it is intented to be a
+ /// Build this token from the given content, specifying if it is intended to be a
/// special token. Special tokens are not normalized by default.
pub fn from<S: Into<String>>(content: S, special: bool) -> Self {
Self {
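A quick sketch of the constructor this comment documents; the token strings are arbitrary examples:

    use tokenizers::AddedToken;

    fn main() {
        // `special: true` builds a special token, which skips normalization
        // by default; a regular added token is normalized like ordinary text.
        let pad = AddedToken::from("[PAD]", true);
        let word = AddedToken::from("tokenizer", false);
        println!("{} special = {}", pad.content, pad.special);
        let _ = word;
    }
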
tokenizers/src/tokenizer/mod.rs (1 addition, 1 deletion)
@@ -389,7 +389,7 @@ where
self
}

- /// Set the trunaction parameters.
+ /// Set the truncation parameters.
#[must_use]
pub fn with_truncation(mut self, trunc: Option<TruncationParams>) -> Self {
self.truncation = trunc;
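This is the builder-style setter, so it is chained before build(). A sketch, assuming the crate's wrapper types for the components left unset; the model choice and max_length are illustrative:

    use tokenizers::decoders::DecoderWrapper;
    use tokenizers::models::bpe::BPE;
    use tokenizers::normalizers::NormalizerWrapper;
    use tokenizers::pre_tokenizers::PreTokenizerWrapper;
    use tokenizers::processors::PostProcessorWrapper;
    use tokenizers::{TokenizerBuilder, TruncationParams};

    fn main() -> tokenizers::Result<()> {
        // The explicit generics pin down the component types we never set.
        let tokenizer = TokenizerBuilder::<
            BPE,
            NormalizerWrapper,
            PreTokenizerWrapper,
            PostProcessorWrapper,
            DecoderWrapper,
        >::new()
        .with_model(BPE::default())
        .with_truncation(Some(TruncationParams {
            max_length: 512,
            ..Default::default()
        }))
        .build()?;
        let _ = tokenizer;
        Ok(())
    }
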
tokenizers/src/tokenizer/normalizer.rs (2 additions, 2 deletions)
@@ -201,9 +201,9 @@ impl NormalizedString {
});

match (start, end) {
- // Targeting inexistant beginning
+ // Targeting inexistent beginning
(Some(s), None) => Some(s..s),
- // Targeting inexistant end
+ // Targeting inexistent end
(None, Some(e)) => Some(e..e),
// Found the range
(Some(s), Some(e)) => Some(s..e),
tokenizers/src/tokenizer/pattern.rs (1 addition, 1 deletion)
@@ -122,7 +122,7 @@ where
}
}

- /// Invert the `is_match` flags for the wrapped Pattern. This is usefull
+ /// Invert the `is_match` flags for the wrapped Pattern. This is useful
/// for example when we use a regex that matches words instead of a delimiter,
/// and we want to match the delimiter.
pub struct Invert<P: Pattern>(pub P);