Commit 1e7cd48

remove allocations in split compound words (#2080)
* remove allocations in split compound words
* clear reused data
1 parent 7f51d85 commit 1e7cd48

1 file changed: +14 -8 lines changed


src/tokenizer/split_compound_words.rs

Lines changed: 14 additions & 8 deletions
@@ -86,6 +86,8 @@ impl TokenFilter for SplitCompoundWords {
         SplitCompoundWordsFilter {
             dict: self.dict,
             inner: tokenizer,
+            cuts: Vec::new(),
+            parts: Vec::new(),
         }
     }
 }
@@ -94,29 +96,33 @@
 pub struct SplitCompoundWordsFilter<T> {
     dict: AhoCorasick,
     inner: T,
+    cuts: Vec<usize>,
+    parts: Vec<Token>,
 }

 impl<T: Tokenizer> Tokenizer for SplitCompoundWordsFilter<T> {
-    type TokenStream<'a> = SplitCompoundWordsTokenStream<T::TokenStream<'a>>;
+    type TokenStream<'a> = SplitCompoundWordsTokenStream<'a, T::TokenStream<'a>>;

     fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
+        self.cuts.clear();
+        self.parts.clear();
         SplitCompoundWordsTokenStream {
             dict: self.dict.clone(),
             tail: self.inner.token_stream(text),
-            cuts: Vec::new(),
-            parts: Vec::new(),
+            cuts: &mut self.cuts,
+            parts: &mut self.parts,
         }
     }
 }

-pub struct SplitCompoundWordsTokenStream<T> {
+pub struct SplitCompoundWordsTokenStream<'a, T> {
     dict: AhoCorasick,
     tail: T,
-    cuts: Vec<usize>,
-    parts: Vec<Token>,
+    cuts: &'a mut Vec<usize>,
+    parts: &'a mut Vec<Token>,
 }

-impl<T: TokenStream> SplitCompoundWordsTokenStream<T> {
+impl<'a, T: TokenStream> SplitCompoundWordsTokenStream<'a, T> {
     // Will use `self.cuts` to fill `self.parts` if `self.tail.token()`
     // can fully be split into consecutive matches against `self.dict`.
     fn split(&mut self) {
@@ -152,7 +158,7 @@ impl<T: TokenStream> SplitCompoundWordsTokenStream<T> {
     }
 }

-impl<T: TokenStream> TokenStream for SplitCompoundWordsTokenStream<T> {
+impl<'a, T: TokenStream> TokenStream for SplitCompoundWordsTokenStream<'a, T> {
     fn advance(&mut self) -> bool {
         self.parts.pop();

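The change follows a common allocation-avoidance pattern in Rust: the long-lived SplitCompoundWordsFilter now owns the `cuts` and `parts` buffers, clears them at the start of every `token_stream` call, and lends `&mut` borrows to the short-lived SplitCompoundWordsTokenStream, which is why the stream type gains a `'a` lifetime parameter. Below is a minimal, self-contained sketch of that pattern under simplified assumptions; the `Filter`, `Stream`, `Part`, and `push_part` names are illustrative stand-ins, not tantivy's actual API.

// Standalone sketch of the buffer-reuse pattern from this commit.
// `Filter`, `Stream`, and `Part` are simplified stand-ins, not tantivy's API.

struct Part {
    text: String,
}

// Long-lived filter: owns the scratch buffers so they survive across calls.
struct Filter {
    cuts: Vec<usize>,
    parts: Vec<Part>,
}

// Short-lived stream: only borrows the buffers for the duration of one call,
// which is why it needs the `'a` lifetime (as in the diff above).
struct Stream<'a> {
    cuts: &'a mut Vec<usize>,
    parts: &'a mut Vec<Part>,
}

impl Filter {
    fn new() -> Self {
        Filter {
            cuts: Vec::new(),
            parts: Vec::new(),
        }
    }

    // Mirrors `token_stream<'a>(&'a mut self, ...)`: clear the buffers and
    // lend them out instead of allocating fresh `Vec`s on every call.
    fn stream(&mut self) -> Stream<'_> {
        // `clear` keeps the capacity, so later pushes reuse old allocations.
        self.cuts.clear();
        self.parts.clear();
        Stream {
            cuts: &mut self.cuts,
            parts: &mut self.parts,
        }
    }
}

impl<'a> Stream<'a> {
    // Stand-in for the real splitting logic: record one cut and one part.
    fn push_part(&mut self, word: &str, cut: usize) {
        self.cuts.push(cut);
        self.parts.push(Part {
            text: word[..cut].to_string(),
        });
    }
}

fn main() {
    let mut filter = Filter::new();
    // Each iteration reuses the buffers grown by the previous one.
    for word in ["nutcracker", "doghouse"] {
        let mut stream = filter.stream();
        stream.push_part(word, 3);
        if let Some(part) = stream.parts.last() {
            println!("{word}: first part = {:?}", part.text);
        }
    }
}

Because `Vec::clear` retains the existing capacity, pushes in later calls mostly land in memory allocated by earlier calls, so steady-state tokenization performs no per-call allocations once the buffers have grown large enough.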