Skip to content

Commit

Permalink
feat: [sc-47267] Add LowerCaser to kapiche_tokenizer (quickwit-oss#193)
Browse files Browse the repository at this point in the history
  • Loading branch information
Sidhant29 authored Mar 12, 2024
1 parent a096a83 commit a671ba8
Show file tree
Hide file tree
Showing 2 changed files with 3 additions and 2 deletions.
1 change: 1 addition & 0 deletions src/index.rs
Original file line number Diff line number Diff line change
Expand Up @@ -302,6 +302,7 @@ pub(crate) struct Index {
// and a PossessiveContractionFilter.
fn get_kapiche_tokenizer() -> TextAnalyzer {
TextAnalyzer::builder(WhitespaceTokenizer::default())
+ .filter(LowerCaser)
.filter(OuterPunctuationFilter::new(vec!['#', '@']))
.filter(PossessiveContractionFilter)
.build()
Expand Down
4 changes: 2 additions & 2 deletions tests/test_stat_collector.py
Original file line number Diff line number Diff line change
Expand Up @@ -154,5 +154,5 @@ def test_stat_searcher_memory():

result = index.stat_searcher().search(query)
items = sorted(result.unique_docs_frames)
- assert len(items) == 439
- assert items[:4] == [(0, 0), (2, 2), (11, 11), (18, 18)]
+ assert len(items) == 441
+ assert items[:4] == [(0, 0), (2, 2), (8, 8), (11, 11)]

0 comments on commit a671ba8

Please sign in to comment.