Skip to content

Commit 0fd7b9d

Browse files
committed
Remove tokenizers/utils/compact_string.rs
1 parent 86f3e09 commit 0fd7b9d

File tree

6 files changed: 65 additions and 54 deletions.

tokenizers/src/decoders/ctc.rs

Lines changed: 16 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -71,7 +71,6 @@ impl Decoder for CTC {
7171
#[cfg(test)]
7272
mod tests {
7373
use super::*;
74-
use crate::utils::compact_string::to_compact_strings;
7574
use compact_str::ToCompactString;
7675

7776
#[test]
@@ -84,8 +83,10 @@ mod tests {
8483
assert_eq!(
8584
ctc_decoder
8685
.decode_chain(id_to_string_result)
87-
.map(to_compact_strings)
88-
.unwrap(),
86+
.unwrap()
87+
.into_iter()
88+
.map(|t| t.to_compact_string())
89+
.collect::<Vec<_>>(),
8990
vec!["h", "e", "l", "l", "o"]
9091
);
9192
}
@@ -99,8 +100,10 @@ mod tests {
99100
assert_eq!(
100101
ctc_decoder
101102
.decode_chain(id_to_string_result)
102-
.map(to_compact_strings)
103-
.unwrap(),
103+
.unwrap()
104+
.into_iter()
105+
.map(|t| t.to_compact_string())
106+
.collect::<Vec<_>>(),
104107
vec!["h", "e", "l", "l", "o", " ", "w", "o", "r", "l", "d"]
105108
);
106109
}
@@ -111,8 +114,10 @@ mod tests {
111114
assert_eq!(
112115
ctc_decoder
113116
.decode_chain(id_to_string_result)
114-
.map(to_compact_strings)
115-
.unwrap(),
117+
.unwrap()
118+
.into_iter()
119+
.map(|t| t.to_compact_string())
120+
.collect::<Vec<_>>(),
116121
vec![
117122
"A", " ", "M", "A", "N", " ", "S", "A", "I", "D", " ", "T", "O", " ", "T", "H",
118123
"E", " ", "U", "N", "I", "V", "E", "R", "S", "E", " ", "S", "I", "R", " ", "I",
@@ -127,8 +132,10 @@ mod tests {
127132
assert_eq!(
128133
ctc_decoder
129134
.decode_chain(id_to_string_result)
130-
.map(to_compact_strings)
131-
.unwrap(),
135+
.unwrap()
136+
.into_iter()
137+
.map(|t| t.to_compact_string())
138+
.collect::<Vec<_>>(),
132139
vec![
133140
"H", "I", "S", " ", "I", "N", "S", "T", "A", "N", "C", "T", " ", "P", "A", "N",
134141
"I", "C", " ", "W", "A", "S", " ", "F", "O", "L", "L", "O", "W", "E", "D", " ",

tokenizers/src/decoders/mod.rs

Lines changed: 30 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,6 @@ use crate::decoders::wordpiece::WordPiece;
2323
use crate::normalizers::replace::Replace;
2424
use crate::pre_tokenizers::byte_level::ByteLevel;
2525
use crate::pre_tokenizers::metaspace::Metaspace;
26-
use crate::utils::compact_string::to_compact_strings;
2726
use crate::{Decoder, Result};
2827

2928
#[derive(Serialize, Clone, Debug)]
@@ -157,16 +156,36 @@ impl Decoder for DecoderWrapper {
157156
tokens: Vec<T>,
158157
) -> Result<Vec<impl ToCompactString>> {
159158
match self {
160-
Self::BPE(bpe) => bpe.decode_chain(tokens).map(to_compact_strings),
161-
Self::ByteLevel(bl) => bl.decode_chain(tokens).map(to_compact_strings),
162-
Self::Metaspace(ms) => ms.decode_chain(tokens).map(to_compact_strings),
163-
Self::WordPiece(wp) => wp.decode_chain(tokens).map(to_compact_strings),
164-
Self::CTC(ctc) => ctc.decode_chain(tokens).map(to_compact_strings),
165-
Self::Sequence(seq) => seq.decode_chain(tokens).map(to_compact_strings),
166-
Self::Replace(seq) => seq.decode_chain(tokens).map(to_compact_strings),
167-
Self::ByteFallback(bf) => bf.decode_chain(tokens).map(to_compact_strings),
168-
Self::Strip(bf) => bf.decode_chain(tokens).map(to_compact_strings),
169-
Self::Fuse(bf) => bf.decode_chain(tokens).map(to_compact_strings),
159+
Self::BPE(bpe) => bpe
160+
.decode_chain(tokens)
161+
.map(|v| v.into_iter().map(|t| t.to_compact_string()).collect()),
162+
Self::ByteLevel(bl) => bl
163+
.decode_chain(tokens)
164+
.map(|v| v.into_iter().map(|t| t.to_compact_string()).collect()),
165+
Self::Metaspace(ms) => ms
166+
.decode_chain(tokens)
167+
.map(|v| v.into_iter().map(|t| t.to_compact_string()).collect()),
168+
Self::WordPiece(wp) => wp
169+
.decode_chain(tokens)
170+
.map(|v| v.into_iter().map(|t| t.to_compact_string()).collect()),
171+
Self::CTC(ctc) => ctc
172+
.decode_chain(tokens)
173+
.map(|v| v.into_iter().map(|t| t.to_compact_string()).collect()),
174+
Self::Sequence(seq) => seq
175+
.decode_chain(tokens)
176+
.map(|v| v.into_iter().map(|t| t.to_compact_string()).collect()),
177+
Self::Replace(seq) => seq
178+
.decode_chain(tokens)
179+
.map(|v| v.into_iter().map(|t| t.to_compact_string()).collect()),
180+
Self::ByteFallback(bf) => bf
181+
.decode_chain(tokens)
182+
.map(|v| v.into_iter().map(|t| t.to_compact_string()).collect()),
183+
Self::Strip(bf) => bf
184+
.decode_chain(tokens)
185+
.map(|v| v.into_iter().map(|t| t.to_compact_string()).collect()),
186+
Self::Fuse(bf) => bf
187+
.decode_chain(tokens)
188+
.map(|v| v.into_iter().map(|t| t.to_compact_string()).collect()),
170189
}
171190
}
172191
}

tokenizers/src/pre_tokenizers/byte_level.rs

Lines changed: 11 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -249,7 +249,6 @@ mod tests {
249249
Decoder, Encoding, OffsetReferential, OffsetType, PostProcessor, PreTokenizedString,
250250
PreTokenizer,
251251
};
252-
use crate::utils::compact_string::to_compact_strings;
253252
use std::iter::FromIterator;
254253

255254
#[test]
@@ -301,8 +300,10 @@ mod tests {
301300
.decode_chain(vec![
302301
"Hello", "Ġmy", "Ġfriend", ",", "Ġhow", "Ġis", "Ġyour", "Ġday", "Ġgoing", "?"
303302
])
304-
.map(to_compact_strings)
305-
.unwrap(),
303+
.unwrap()
304+
.into_iter()
305+
.map(|t| t.to_compact_string())
306+
.collect::<Vec<_>>(),
306307
vec!["Hello my friend, how is your day going?"]
307308
);
308309
}
@@ -359,8 +360,10 @@ mod tests {
359360
sample,
360361
bytelevel
361362
.decode_chain(separated_tokens)
362-
.map(to_compact_strings)
363363
.unwrap()
364+
.into_iter()
365+
.map(|t| t.to_compact_string())
366+
.collect::<Vec<_>>()
364367
.join("")
365368
);
366369
}
@@ -565,8 +568,10 @@ mod tests {
565568
assert_eq!(
566569
byte_level
567570
.decode_chain(vec!["Hello", "Ġthere", "Ġdear", "Ġfriend!", "Ġ", "[PA D]"])
568-
.map(to_compact_strings)
569-
.unwrap(),
571+
.unwrap()
572+
.into_iter()
573+
.map(|t| t.to_compact_string())
574+
.collect::<Vec<_>>(),
570575
vec!["Hello there dear friend! [PA D]"]
571576
);
572577
}

tokenizers/src/pre_tokenizers/metaspace.rs

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -182,7 +182,6 @@ mod tests {
182182
use regex::Regex;
183183

184184
use super::*;
185-
use crate::utils::compact_string::to_compact_strings;
186185
use crate::{OffsetReferential, OffsetType};
187186

188187
#[test]
@@ -364,15 +363,19 @@ mod tests {
364363
let decoder = Metaspace::new('▁', PrependScheme::Always, true);
365364
let res = decoder
366365
.decode_chain(vec!["▁Hey", "▁friend!"])
367-
.map(to_compact_strings)
368-
.unwrap();
366+
.unwrap()
367+
.into_iter()
368+
.map(|t| t.to_compact_string())
369+
.collect::<Vec<_>>();
369370
assert_eq!(res, vec!["Hey", " friend!"]);
370371

371372
let decoder = Metaspace::new('▁', PrependScheme::Never, true);
372373
let res = decoder
373374
.decode_chain(vec!["▁Hey", "▁friend!"])
374-
.map(to_compact_strings)
375-
.unwrap();
375+
.unwrap()
376+
.into_iter()
377+
.map(|t| t.to_compact_string())
378+
.collect::<Vec<_>>();
376379
assert_eq!(res, vec![" Hey", " friend!"]);
377380
}
378381
}

tokenizers/src/utils/compact_string.rs

Lines changed: 0 additions & 22 deletions
This file was deleted.

tokenizers/src/utils/mod.rs

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,6 @@ mod onig;
1111
#[cfg(not(feature = "unstable_wasm"))]
1212
pub use crate::utils::onig::SysRegex;
1313

14-
pub mod compact_string;
1514
pub mod iter;
1615
pub mod padding;
1716
pub mod parallelism;

0 commit comments

Comments
 (0)