Skip to content

Commit

Permalink
remove allocations in split compound words (#2080)
Browse files Browse the repository at this point in the history
* remove allocations in split compound words

* clear reused data
  • Loading branch information
PSeitz authored Jul 13, 2023
1 parent 7f51d85 commit 1e7cd48
Showing 1 changed file with 14 additions and 8 deletions.
22 changes: 14 additions & 8 deletions src/tokenizer/split_compound_words.rs
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,8 @@ impl TokenFilter for SplitCompoundWords {
SplitCompoundWordsFilter {
dict: self.dict,
inner: tokenizer,
cuts: Vec::new(),
parts: Vec::new(),
}
}
}
Expand All @@ -94,29 +96,33 @@ impl TokenFilter for SplitCompoundWords {
pub struct SplitCompoundWordsFilter<T> {
/// Aho-Corasick automaton; a clone of it is handed to every stream built by `token_stream`.
dict: AhoCorasick,
/// Wrapped tokenizer; its `token_stream(text)` output becomes the `tail` of the splitting stream.
inner: T,
/// Reusable scratch buffer: cleared at the start of each `token_stream` call and then
/// mutably borrowed by the returned stream, so a fresh `Vec` is not created per call.
cuts: Vec<usize>,
/// Reusable scratch buffer of `Token`s, cleared and lent to the stream the same way as `cuts`.
parts: Vec<Token>,
}

impl<T: Tokenizer> Tokenizer for SplitCompoundWordsFilter<T> {
type TokenStream<'a> = SplitCompoundWordsTokenStream<T::TokenStream<'a>>;
type TokenStream<'a> = SplitCompoundWordsTokenStream<'a, T::TokenStream<'a>>;

fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
self.cuts.clear();
self.parts.clear();
SplitCompoundWordsTokenStream {
dict: self.dict.clone(),
tail: self.inner.token_stream(text),
cuts: Vec::new(),
parts: Vec::new(),
cuts: &mut self.cuts,
parts: &mut self.parts,
}
}
}

pub struct SplitCompoundWordsTokenStream<T> {
pub struct SplitCompoundWordsTokenStream<'a, T> {
dict: AhoCorasick,
tail: T,
cuts: Vec<usize>,
parts: Vec<Token>,
cuts: &'a mut Vec<usize>,
parts: &'a mut Vec<Token>,
}

impl<T: TokenStream> SplitCompoundWordsTokenStream<T> {
impl<'a, T: TokenStream> SplitCompoundWordsTokenStream<'a, T> {
// Will use `self.cuts` to fill `self.parts` if `self.tail.token()`
// can fully be split into consecutive matches against `self.dict`.
fn split(&mut self) {
Expand Down Expand Up @@ -152,7 +158,7 @@ impl<T: TokenStream> SplitCompoundWordsTokenStream<T> {
}
}

impl<T: TokenStream> TokenStream for SplitCompoundWordsTokenStream<T> {
impl<'a, T: TokenStream> TokenStream for SplitCompoundWordsTokenStream<'a, T> {
fn advance(&mut self) -> bool {
self.parts.pop();

Expand Down

0 comments on commit 1e7cd48

Please sign in to comment.