diff --git a/examples/aggregation.rs b/examples/aggregation.rs
index eb03e7815e..7397f9aa9b 100644
--- a/examples/aggregation.rs
+++ b/examples/aggregation.rs
@@ -37,7 +37,7 @@ fn main() -> tantivy::Result<()> {
                 .set_index_option(IndexRecordOption::WithFreqs)
                 .set_tokenizer("raw"),
         )
-        .set_fast(None)
+        .set_fast("default")
         .set_stored();
     schema_builder.add_text_field("category", text_fieldtype);
     schema_builder.add_f64_field("stock", FAST);
diff --git a/src/aggregation/bucket/term_agg.rs b/src/aggregation/bucket/term_agg.rs
index adfa01673e..5a9ae5e7aa 100644
--- a/src/aggregation/bucket/term_agg.rs
+++ b/src/aggregation/bucket/term_agg.rs
@@ -1293,13 +1293,13 @@ mod tests {
         // searching for terma, but min_doc_count will return all terms
         let res = exec_request_with_query(agg_req, &index, Some(("string2", "hit")))?;
 
-        assert_eq!(res["my_texts"]["buckets"][0]["key"], "A");
+        assert_eq!(res["my_texts"]["buckets"][0]["key"], "a");
         assert_eq!(res["my_texts"]["buckets"][0]["doc_count"], 2);
         assert_eq!(
             res["my_texts"]["buckets"][0]["elhistogram"]["buckets"],
             json!([{ "doc_count": 1, "key": 1.0 }, { "doc_count": 1, "key": 2.0 } ])
         );
-        assert_eq!(res["my_texts"]["buckets"][1]["key"], "B");
+        assert_eq!(res["my_texts"]["buckets"][1]["key"], "b");
         assert_eq!(res["my_texts"]["buckets"][1]["doc_count"], 1);
         assert_eq!(
             res["my_texts"]["buckets"][1]["elhistogram"]["buckets"],
@@ -1421,10 +1421,10 @@ mod tests {
         let res = exec_request_with_query(agg_req, &index, None).unwrap();
 
         println!("{}", serde_json::to_string_pretty(&res).unwrap());
-        assert_eq!(res["my_texts"]["buckets"][0]["key"], "Hallo Hallo");
+        assert_eq!(res["my_texts"]["buckets"][0]["key"], "hallo hallo");
         assert_eq!(res["my_texts"]["buckets"][0]["doc_count"], 1);
-        assert_eq!(res["my_texts"]["buckets"][1]["key"], "Hello Hello");
+        assert_eq!(res["my_texts"]["buckets"][1]["key"], "hello hello");
         assert_eq!(res["my_texts"]["buckets"][1]["doc_count"], 1);
 
         Ok(())
diff --git a/src/aggregation/mod.rs b/src/aggregation/mod.rs
index 93116e0d9c..fc09e15bb8 100644
--- a/src/aggregation/mod.rs
+++ b/src/aggregation/mod.rs
@@ -411,7 +411,7 @@ mod tests {
                     .set_index_option(IndexRecordOption::Basic)
                     .set_fieldnorms(false),
             )
-            .set_fast(None)
+            .set_fast("default")
             .set_stored();
         let text_field = schema_builder.add_text_field("text", text_fieldtype.clone());
         let text_field_id = schema_builder.add_text_field("text_id", text_fieldtype);
@@ -466,7 +466,7 @@ mod tests {
             .set_indexing_options(
                 TextFieldIndexing::default().set_index_option(IndexRecordOption::WithFreqs),
             )
-            .set_fast(None)
+            .set_fast("default")
             .set_stored();
         let text_field = schema_builder.add_text_field("text", text_fieldtype);
         let date_field = schema_builder.add_date_field("date", FAST);
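The aggregation test expectations above change from "A" / "Hallo Hallo" to "a" / "hallo hallo" because the default fast-field tokenizer introduced by this patch lowercases terms. A minimal sketch of that behavior, assuming a tantivy build that includes this patch:

```rust
use tantivy::tokenizer::TokenizerManager;

fn main() {
    // The "default" analyzer of the fast-field manager is a raw tokenizer
    // followed by a lowercasing filter, so term-agg bucket keys come back
    // lowercased.
    let mut analyzer = TokenizerManager::default_for_fast_fields()
        .get("default")
        .unwrap();
    let mut terms: Vec<String> = Vec::new();
    analyzer
        .token_stream("Hallo Hallo")
        .process(&mut |token| terms.push(token.text.clone()));
    assert_eq!(terms, vec!["hallo hallo".to_string()]);
}
```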
diff --git a/src/core/index.rs b/src/core/index.rs
index dcc49b9022..bd41bbf3b3 100644
--- a/src/core/index.rs
+++ b/src/core/index.rs
@@ -120,8 +120,8 @@ impl IndexBuilder {
         Self {
             schema: None,
             index_settings: IndexSettings::default(),
-            tokenizer_manager: TokenizerManager::default(),
-            fast_field_tokenizer_manager: TokenizerManager::default(),
+            tokenizer_manager: TokenizerManager::default_for_indexing(),
+            fast_field_tokenizer_manager: TokenizerManager::default_for_fast_fields(),
         }
     }
@@ -400,8 +400,8 @@ impl Index {
             settings: metas.index_settings.clone(),
             directory,
             schema,
-            tokenizers: TokenizerManager::default(),
-            fast_field_tokenizers: TokenizerManager::default(),
+            tokenizers: TokenizerManager::default_for_indexing(),
+            fast_field_tokenizers: TokenizerManager::default_for_fast_fields(),
             executor: Arc::new(Executor::single_thread()),
             inventory,
         }
diff --git a/src/fastfield/mod.rs b/src/fastfield/mod.rs
index d450e3e593..beb0da1a6f 100644
--- a/src/fastfield/mod.rs
+++ b/src/fastfield/mod.rs
@@ -446,7 +446,8 @@ mod tests {
     #[test]
     fn test_text_fastfield() {
         let mut schema_builder = Schema::builder();
-        let text_field = schema_builder.add_text_field("text", TEXT | FAST);
+        let text_options: TextOptions = TextOptions::from(TEXT).set_fast("raw");
+        let text_field = schema_builder.add_text_field("text", text_options);
         let schema = schema_builder.build();
         let index = Index::create_in_ram(schema);
@@ -1082,7 +1083,7 @@ mod tests {
     #[test]
     fn test_fast_field_in_json_field_expand_dots_disabled() {
         let mut schema_builder = Schema::builder();
-        let json_option = JsonObjectOptions::default().set_fast(None);
+        let json_option = JsonObjectOptions::default().set_fast("default");
         let json = schema_builder.add_json_field("json", json_option);
         let schema = schema_builder.build();
         let index = Index::create_in_ram(schema);
@@ -1108,7 +1109,7 @@ mod tests {
     #[test]
     fn test_fast_field_in_json_field_with_tokenizer() {
         let mut schema_builder = Schema::builder();
-        let json_option = JsonObjectOptions::default().set_fast(Some("default"));
+        let json_option = JsonObjectOptions::default().set_fast("default");
         let json = schema_builder.add_json_field("json", json_option);
         let schema = schema_builder.build();
         let index = Index::create_in_ram(schema);
@@ -1134,7 +1135,7 @@ fn test_fast_field_in_json_field_expand_dots_enabled() {
         let mut schema_builder = Schema::builder();
         let json_option = JsonObjectOptions::default()
-            .set_fast(None)
+            .set_fast("default")
             .set_expand_dots_enabled();
         let json = schema_builder.add_json_field("json", json_option);
         let schema = schema_builder.build();
@@ -1202,10 +1203,10 @@ mod tests {
     #[test]
     fn test_fast_field_tokenizer() {
         let mut schema_builder = Schema::builder();
-        let opt = TextOptions::default().set_fast(Some("custom_lowercase"));
+        let opt = TextOptions::default().set_fast("custom_lowercase");
         let text_field = schema_builder.add_text_field("text", opt);
         let schema = schema_builder.build();
-        let ff_tokenizer_manager = TokenizerManager::default();
+        let ff_tokenizer_manager = TokenizerManager::default_for_fast_fields();
         ff_tokenizer_manager.register(
             "custom_lowercase",
             TextAnalyzer::builder(RawTokenizer::default())
@@ -1238,7 +1239,7 @@ mod tests {
                 .set_index_option(crate::schema::IndexRecordOption::WithFreqs)
                 .set_tokenizer("raw"),
         )
-        .set_fast(Some("default"))
+        .set_fast("default")
         .set_stored();
 
     let log_field = schema_builder.add_text_field("log_level", text_fieldtype);
@@ -1271,7 +1272,7 @@ fn test_shadowing_fast_field_with_expand_dots() {
         let mut schema_builder = Schema::builder();
         let json_option = JsonObjectOptions::default()
-            .set_fast(None)
+            .set_fast("default")
             .set_expand_dots_enabled();
         let json_field = schema_builder.add_json_field("jsonfield", json_option.clone());
         let shadowing_json_field = schema_builder.add_json_field("jsonfield.attr", json_option);
diff --git a/src/fastfield/readers.rs b/src/fastfield/readers.rs
index c8e8b1d4a1..6495e39de3 100644
--- a/src/fastfield/readers.rs
+++ b/src/fastfield/readers.rs
@@ -349,7 +349,7 @@ mod tests {
         schema_builder.add_json_field(
             "json_expand_dots_enabled",
             JsonObjectOptions::default()
-                .set_fast(None)
+                .set_fast("default")
                 .set_expand_dots_enabled(),
         );
         let dynamic_field = schema_builder.add_json_field("_dyna", FAST);
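From the caller side, the signature change means a fast text field always names its tokenizer. A minimal schema-building sketch against a build with this patch (the field names are illustrative):

```rust
use tantivy::schema::{Schema, TextOptions, TEXT};
use tantivy::Index;

fn main() -> tantivy::Result<()> {
    let mut schema_builder = Schema::builder();
    // `set_fast` now takes the fast-field tokenizer name directly instead of
    // an `Option<&str>`; former `set_fast(None)` callers become
    // `set_fast("default")`.
    let category_options = TextOptions::from(TEXT).set_fast("default").set_stored();
    schema_builder.add_text_field("category", category_options);
    // "raw" keeps the original string as a single, unnormalized term.
    let id_options = TextOptions::from(TEXT).set_fast("raw");
    schema_builder.add_text_field("id", id_options);
    let _index = Index::create_in_ram(schema_builder.build());
    Ok(())
}
```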
diff --git a/src/fastfield/writer.rs b/src/fastfield/writer.rs
index 6389ce8a2c..98bd087589 100644
--- a/src/fastfield/writer.rs
+++ b/src/fastfield/writer.rs
@@ -18,6 +18,8 @@ const JSON_DEPTH_LIMIT: usize = 20;
 pub struct FastFieldsWriter {
     columnar_writer: ColumnarWriter,
     fast_field_names: Vec<Option<String>>, //< TODO see if we can hash the field name hash too.
+    // Field -> Fast field tokenizer mapping.
+    // All text fast fields should have a tokenizer.
     per_field_tokenizer: Vec<Option<TextAnalyzer>>,
     date_precisions: Vec<DatePrecision>,
     expand_dots: Vec<bool>,
@@ -61,7 +63,7 @@ impl FastFieldsWriter {
             if let Some(tokenizer_name) = json_object_options.get_fast_field_tokenizer_name() {
                 let text_analyzer = tokenizer_manager.get(tokenizer_name).ok_or_else(|| {
                     TantivyError::InvalidArgument(format!(
-                        "Tokenizer {tokenizer_name:?} not found"
+                        "Tokenizer `{tokenizer_name}` not found"
                     ))
                 })?;
                 per_field_tokenizer[field_id.field_id() as usize] = Some(text_analyzer);
@@ -157,9 +159,6 @@ impl FastFieldsWriter {
                             &token.text,
                         );
                     })
-                } else {
-                    self.columnar_writer
-                        .record_str(doc_id, field_name.as_str(), text_val);
                 }
             }
             Value::Bytes(bytes_val) => {
@@ -201,18 +200,20 @@ impl FastFieldsWriter {
                 self.json_path_buffer.clear();
                 self.json_path_buffer.push_str(field_name);
 
-                let text_analyzer =
+                let text_analyzer_opt =
                     &mut self.per_field_tokenizer[field_value.field().field_id() as usize];
 
-                record_json_obj_to_columnar_writer(
-                    doc_id,
-                    json_obj,
-                    expand_dots,
-                    JSON_DEPTH_LIMIT,
-                    &mut self.json_path_buffer,
-                    &mut self.columnar_writer,
-                    text_analyzer,
-                );
+                if let Some(text_analyzer) = text_analyzer_opt {
+                    record_json_obj_to_columnar_writer(
+                        doc_id,
+                        json_obj,
+                        expand_dots,
+                        JSON_DEPTH_LIMIT,
+                        &mut self.json_path_buffer,
+                        &mut self.columnar_writer,
+                        text_analyzer,
+                    );
+                }
             }
             Value::IpAddr(ip_addr) => {
                 self.columnar_writer
@@ -263,7 +264,7 @@ fn record_json_obj_to_columnar_writer(
     remaining_depth_limit: usize,
     json_path_buffer: &mut String,
     columnar_writer: &mut columnar::ColumnarWriter,
-    tokenizer: &mut Option<TextAnalyzer>,
+    text_analyzer: &mut TextAnalyzer,
 ) {
     for (key, child) in json_obj {
         let len_path = json_path_buffer.len();
@@ -288,7 +289,7 @@ fn record_json_obj_to_columnar_writer(
             remaining_depth_limit,
             json_path_buffer,
             columnar_writer,
-            tokenizer,
+            text_analyzer,
         );
         // popping our sub path.
         json_path_buffer.truncate(len_path);
@@ -302,7 +303,7 @@ fn record_json_value_to_columnar_writer(
     mut remaining_depth_limit: usize,
     json_path_writer: &mut String,
     columnar_writer: &mut columnar::ColumnarWriter,
-    tokenizer: &mut Option<TextAnalyzer>,
+    text_analyzer: &mut TextAnalyzer,
 ) {
     if remaining_depth_limit == 0 {
         return;
     }
@@ -321,14 +322,10 @@ fn record_json_value_to_columnar_writer(
             }
         }
         serde_json::Value::String(text) => {
-            if let Some(text_analyzer) = tokenizer.as_mut() {
-                let mut token_stream = text_analyzer.token_stream(text);
-                token_stream.process(&mut |token| {
-                    columnar_writer.record_str(doc, json_path_writer.as_str(), &token.text);
-                })
-            } else {
-                columnar_writer.record_str(doc, json_path_writer.as_str(), text);
-            }
+            let mut token_stream = text_analyzer.token_stream(text);
+            token_stream.process(&mut |token| {
+                columnar_writer.record_str(doc, json_path_writer.as_str(), &token.text);
+            });
         }
         serde_json::Value::Array(arr) => {
             for el in arr {
@@ -339,7 +336,7 @@ fn record_json_value_to_columnar_writer(
                     remaining_depth_limit,
                     json_path_writer,
                     columnar_writer,
-                    tokenizer,
+                    text_analyzer,
                 );
             }
         }
@@ -351,7 +348,7 @@ fn record_json_value_to_columnar_writer(
                 remaining_depth_limit,
                 json_path_writer,
                 columnar_writer,
-                tokenizer,
+                text_analyzer,
             );
         }
     }
@@ -371,6 +368,9 @@ mod tests {
     ) -> ColumnarReader {
         let mut columnar_writer = ColumnarWriter::default();
         let mut json_path = String::new();
+        let mut text_analyzer = crate::tokenizer::TokenizerManager::default_for_fast_fields()
+            .get(crate::schema::DEFAULT_FAST_FIELD_TOKENIZER)
+            .unwrap();
         for (doc, json_doc) in json_docs.iter().enumerate() {
             record_json_value_to_columnar_writer(
                 doc as u32,
@@ -379,7 +379,7 @@ mod tests {
                 JSON_DEPTH_LIMIT,
                 &mut json_path,
                 &mut columnar_writer,
-                &mut None,
+                &mut text_analyzer,
             );
         }
         let mut buffer = Vec::new();
@@ -399,6 +399,7 @@ mod tests {
         });
         let columnar_reader = test_columnar_from_jsons_aux(&[json_doc], false);
         let columns = columnar_reader.list_columns().unwrap();
+        assert_eq!(columns.len(), 5);
         {
             assert_eq!(columns[0].0, "arr");
             let column_arr_opt: Option<Column<i64>> = columns[0].1.open().unwrap().into();
@@ -434,7 +435,9 @@ mod tests {
         {
             assert_eq!(columns[4].0, "text");
             let column_text_opt: Option<StrColumn> = columns[4].1.open().unwrap().into();
-            assert!(column_text_opt.unwrap().term_ords(0).eq([0].into_iter()));
+            let column_text = column_text_opt.unwrap();
+            let term_ords: Vec<u64> = column_text.term_ords(0).collect();
+            assert_eq!(&term_ords[..], &[0]);
         }
     }
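For readers unfamiliar with the traversal touched above: with the `else` branch gone, every text value goes through an analyzer, while the recursion itself is unchanged. The sketch below mirrors its shape with `serde_json` alone, a shared path buffer truncated on the way back up plus a depth limit, and prints leaf strings instead of calling `record_str`. It is a simplification: the real writer also handles `expand_dots`, decrements the limit once per call, and uses its own path separator.

```rust
use serde_json::Value;

// Simplified sketch of `record_json_value_to_columnar_writer`.
fn record_json_value(value: &Value, remaining_depth: usize, path: &mut String) {
    if remaining_depth == 0 {
        return; // same guard as JSON_DEPTH_LIMIT in the writer
    }
    match value {
        Value::String(text) => println!("{path} -> {text}"),
        Value::Array(arr) => {
            for el in arr {
                // arrays do not extend the path, mirroring the writer
                record_json_value(el, remaining_depth - 1, path);
            }
        }
        Value::Object(obj) => {
            for (key, child) in obj {
                let len = path.len();
                if !path.is_empty() {
                    path.push('.');
                }
                path.push_str(key);
                record_json_value(child, remaining_depth - 1, path);
                path.truncate(len); // popping our sub path
            }
        }
        _ => {}
    }
}

fn main() {
    let doc = serde_json::json!({"attr": {"color": ["red", "blue"]}});
    let mut path = String::new();
    record_json_value(&doc, 20, &mut path);
}
```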
diff --git a/src/query/query_parser/query_parser.rs b/src/query/query_parser/query_parser.rs
index 4a8b864699..612212e2cc 100644
--- a/src/query/query_parser/query_parser.rs
+++ b/src/query/query_parser/query_parser.rs
@@ -956,7 +956,7 @@ mod test {
             .iter()
             .flat_map(|field_name| schema.get_field(field_name))
             .collect();
-        let tokenizer_manager = TokenizerManager::default();
+        let tokenizer_manager = TokenizerManager::default_for_indexing();
         tokenizer_manager.register(
             "en_with_stop_words",
             TextAnalyzer::builder(SimpleTokenizer::default())
@@ -1447,7 +1447,7 @@ mod test {
         let title = schema_builder.add_text_field("title", text_options);
         let schema = schema_builder.build();
         let default_fields = vec![title];
-        let tokenizer_manager = TokenizerManager::default();
+        let tokenizer_manager = TokenizerManager::default_for_indexing();
         let query_parser = QueryParser::new(schema, default_fields, tokenizer_manager);
 
         assert_matches!(
@@ -1622,7 +1622,8 @@ mod test {
         let mut schema_builder = Schema::builder();
         schema_builder.add_text_field(r#"a\.b"#, STRING);
         let schema = schema_builder.build();
-        let query_parser = QueryParser::new(schema, Vec::new(), TokenizerManager::default());
+        let query_parser =
+            QueryParser::new(schema, Vec::new(), TokenizerManager::default_for_indexing());
         let query = query_parser.parse_query(r#"a\.b:hello"#).unwrap();
         assert_eq!(
             format!("{query:?}"),
@@ -1639,8 +1640,11 @@ mod test {
         schema_builder.add_text_field("first.toto.titi", STRING);
         schema_builder.add_text_field("third.a.b.c", STRING);
         let schema = schema_builder.build();
-        let query_parser =
-            QueryParser::new(schema.clone(), Vec::new(), TokenizerManager::default());
+        let query_parser = QueryParser::new(
+            schema.clone(),
+            Vec::new(),
+            TokenizerManager::default_for_indexing(),
+        );
         assert_eq!(
             query_parser.split_full_path("first.toto"),
             Some((schema.get_field("first.toto").unwrap(), ""))
diff --git a/src/schema/json_object_options.rs b/src/schema/json_object_options.rs
index eee3618a8e..09ec877fd7 100644
--- a/src/schema/json_object_options.rs
+++ b/src/schema/json_object_options.rs
@@ -4,7 +4,7 @@ use serde::{Deserialize, Serialize};
 
 use super::text_options::{FastFieldTextOptions, TokenizerName};
 use crate::schema::flags::{FastFlag, SchemaFlagList, StoredFlag};
-use crate::schema::{TextFieldIndexing, TextOptions};
+use crate::schema::{TextFieldIndexing, TextOptions, DEFAULT_FAST_FIELD_TOKENIZER};
 
 /// The `JsonObjectOptions` make it possible to
 /// configure how a json object field should be indexed and stored.
@@ -58,20 +58,19 @@ impl JsonObjectOptions {
     /// Returns true if and only if the json object fields are
     /// to be treated as fast fields.
     pub fn is_fast(&self) -> bool {
-        matches!(self.fast, FastFieldTextOptions::IsEnabled(true))
-            || matches!(
-                &self.fast,
-                FastFieldTextOptions::EnabledWithTokenizer { with_tokenizer: _ }
-            )
+        match self.fast {
+            FastFieldTextOptions::Disabled => false,
+            FastFieldTextOptions::Enabled { .. } => true,
+        }
     }
 
     /// Returns true if and only if the value is a fast field.
     pub fn get_fast_field_tokenizer_name(&self) -> Option<&str> {
         match &self.fast {
-            FastFieldTextOptions::IsEnabled(true) | FastFieldTextOptions::IsEnabled(false) => None,
-            FastFieldTextOptions::EnabledWithTokenizer {
-                with_tokenizer: tokenizer,
-            } => Some(tokenizer.name()),
+            FastFieldTextOptions::Disabled => None,
+            FastFieldTextOptions::Enabled {
+                tokenizer: with_tokenizer,
+            } => Some(with_tokenizer.name()),
         }
     }
@@ -130,15 +129,11 @@ impl JsonObjectOptions {
     /// [`TermDictionary::ord_to_term()`](crate::termdict::TermDictionary::ord_to_term)
     /// from the dictionary.
     #[must_use]
-    pub fn set_fast(mut self, tokenizer_name: Option<&str>) -> Self {
-        if let Some(tokenizer) = tokenizer_name {
-            let tokenizer = TokenizerName::from_name(tokenizer);
-            self.fast = FastFieldTextOptions::EnabledWithTokenizer {
-                with_tokenizer: tokenizer,
-            }
-        } else {
-            self.fast = FastFieldTextOptions::IsEnabled(true);
-        }
+    pub fn set_fast(mut self, tokenizer_name: &str) -> Self {
+        let with_tokenizer = TokenizerName::from_name(tokenizer_name);
+        self.fast = FastFieldTextOptions::Enabled {
+            tokenizer: with_tokenizer,
+        };
         self
     }
@@ -166,7 +161,9 @@ impl From<FastFlag> for JsonObjectOptions {
         JsonObjectOptions {
             stored: false,
             indexing: None,
-            fast: FastFieldTextOptions::IsEnabled(true),
+            fast: FastFieldTextOptions::Enabled {
+                tokenizer: TokenizerName::from_static(DEFAULT_FAST_FIELD_TOKENIZER),
+            },
             expand_dots_enabled: false,
         }
     }
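The same signature change applies to JSON fields; a short usage sketch, assuming this patch ("attributes" is an illustrative field name):

```rust
use tantivy::schema::{JsonObjectOptions, Schema};

fn main() {
    // With the new signature, enabling fast fields on a JSON field always
    // names a tokenizer; `is_fast()` and `get_fast_field_tokenizer_name()`
    // then reflect the single `Enabled { tokenizer }` state.
    let json_options = JsonObjectOptions::default()
        .set_fast("default")
        .set_expand_dots_enabled();
    assert!(json_options.is_fast());
    assert_eq!(json_options.get_fast_field_tokenizer_name(), Some("default"));

    let mut schema_builder = Schema::builder();
    schema_builder.add_json_field("attributes", json_options);
    let _schema = schema_builder.build();
}
```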
diff --git a/src/schema/mod.rs b/src/schema/mod.rs
index f8de6dd9e1..044ab7116e 100644
--- a/src/schema/mod.rs
+++ b/src/schema/mod.rs
@@ -1,7 +1,7 @@
 //! Schema definition for tantivy's indices.
-//!
 //! # Setting your schema in Tantivy
 //!
+//!
 //! Tantivy has a very strict schema.
 //! The schema defines information about the fields your index contains, that is, for each field:
 //!
@@ -153,6 +153,8 @@ pub use self::term::{Term, ValueBytes, JSON_END_OF_PATH};
 pub use self::text_options::{TextFieldIndexing, TextOptions, STRING, TEXT};
 pub use self::value::Value;
 
+pub(crate) const DEFAULT_FAST_FIELD_TOKENIZER: &str = "default";
+
 /// Validator for a potential `field_name`.
 /// Returns true if the name can be use for a field name.
 ///
diff --git a/src/schema/text_options.rs b/src/schema/text_options.rs
index 4519fb59ac..d53fe8d07f 100644
--- a/src/schema/text_options.rs
+++ b/src/schema/text_options.rs
@@ -24,19 +24,68 @@ pub struct TextOptions {
 }
 
 #[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
-#[serde(untagged)]
+#[serde(
+    into = "FastFieldTextOptionsForSerialization",
+    from = "FastFieldTextOptionsForSerialization"
+)]
 /// Enum to control how the fast field setting of a text field.
+#[derive(Default)]
 pub(crate) enum FastFieldTextOptions {
-    /// Flag to enable/disable
-    IsEnabled(bool),
+    /// Fast field disabled
+    #[default]
+    Disabled,
     /// Enable with tokenizer. The tokenizer must be available on the fast field tokenizer manager.
     /// `Index::fast_field_tokenizer`.
-    EnabledWithTokenizer { with_tokenizer: TokenizerName },
+    Enabled { tokenizer: TokenizerName },
 }
 
-impl Default for FastFieldTextOptions {
-    fn default() -> Self {
-        FastFieldTextOptions::IsEnabled(false)
+/// Enum used to control the way we serialize fast field text options.
+///
+/// For backward compatibility reasons, we follow the format introduced in tantivy 0.19.
+/// `false` -> Disabled
+/// `true` -> Enabled with default tokenizer
+/// `{ tokenizer: "something" }` -> Enabled with a specific tokenizer.
+#[derive(Serialize, Deserialize)]
+#[serde(untagged)]
+enum FastFieldTextOptionsForSerialization {
+    IsEnabled(bool),
+    EnabledWithTokenizer {
+        #[serde(alias = "with_tokenizer")]
+        tokenizer: TokenizerName,
+    },
+}
+
+impl From<FastFieldTextOptionsForSerialization> for FastFieldTextOptions {
+    fn from(value: FastFieldTextOptionsForSerialization) -> Self {
+        match value {
+            FastFieldTextOptionsForSerialization::IsEnabled(enabled) => {
+                if enabled {
+                    FastFieldTextOptions::Enabled {
+                        tokenizer: TokenizerName::from_static(
+                            crate::schema::DEFAULT_FAST_FIELD_TOKENIZER,
+                        ),
+                    }
+                } else {
+                    FastFieldTextOptions::Disabled
+                }
+            }
+            FastFieldTextOptionsForSerialization::EnabledWithTokenizer { tokenizer } => {
+                FastFieldTextOptions::Enabled { tokenizer }
+            }
+        }
+    }
+}
+
+impl From<FastFieldTextOptions> for FastFieldTextOptionsForSerialization {
+    fn from(value: FastFieldTextOptions) -> Self {
+        match value {
+            FastFieldTextOptions::Disabled => {
+                FastFieldTextOptionsForSerialization::IsEnabled(false)
+            }
+            FastFieldTextOptions::Enabled { tokenizer } => {
+                FastFieldTextOptionsForSerialization::EnabledWithTokenizer { tokenizer }
+            }
+        }
     }
 }
@@ -45,23 +94,13 @@ impl BitOr for FastFieldTextOptions {
 
     fn bitor(self, other: FastFieldTextOptions) -> FastFieldTextOptions {
         match (self, other) {
-            (
-                FastFieldTextOptions::EnabledWithTokenizer {
-                    with_tokenizer: tokenizer,
-                },
-                _,
-            )
-            | (
-                _,
-                FastFieldTextOptions::EnabledWithTokenizer {
-                    with_tokenizer: tokenizer,
-                },
-            ) => FastFieldTextOptions::EnabledWithTokenizer {
-                with_tokenizer: tokenizer,
-            },
-            (FastFieldTextOptions::IsEnabled(true), _)
-            | (_, FastFieldTextOptions::IsEnabled(true)) => FastFieldTextOptions::IsEnabled(true),
-            (_, FastFieldTextOptions::IsEnabled(false)) => FastFieldTextOptions::IsEnabled(false),
+            (FastFieldTextOptions::Enabled { tokenizer }, _)
+            | (_, FastFieldTextOptions::Enabled { tokenizer }) => {
+                FastFieldTextOptions::Enabled { tokenizer }
+            }
+            (FastFieldTextOptions::Disabled, FastFieldTextOptions::Disabled) => {
+                FastFieldTextOptions::Disabled
+            }
         }
     }
 }
@@ -83,20 +122,17 @@ impl TextOptions {
 
     /// Returns true if and only if the value is a fast field.
     pub fn is_fast(&self) -> bool {
-        matches!(self.fast, FastFieldTextOptions::IsEnabled(true))
-            || matches!(
-                &self.fast,
-                FastFieldTextOptions::EnabledWithTokenizer { with_tokenizer: _ }
-            )
+        match &self.fast {
+            FastFieldTextOptions::Disabled => false,
+            FastFieldTextOptions::Enabled { .. } => true,
+        }
     }
 
     /// Returns true if and only if the value is a fast field.
     pub fn get_fast_field_tokenizer_name(&self) -> Option<&str> {
         match &self.fast {
-            FastFieldTextOptions::IsEnabled(true) | FastFieldTextOptions::IsEnabled(false) => None,
-            FastFieldTextOptions::EnabledWithTokenizer {
-                with_tokenizer: tokenizer,
-            } => Some(tokenizer.name()),
+            FastFieldTextOptions::Disabled => None,
+            FastFieldTextOptions::Enabled { tokenizer } => Some(tokenizer.name()),
         }
     }
@@ -121,15 +157,9 @@ impl TextOptions {
     /// [`TermDictionary::ord_to_term()`](crate::termdict::TermDictionary::ord_to_term)
     /// from the dictionary.
     #[must_use]
-    pub fn set_fast(mut self, tokenizer_name: Option<&str>) -> TextOptions {
-        if let Some(tokenizer) = tokenizer_name {
-            let tokenizer = TokenizerName::from_name(tokenizer);
-            self.fast = FastFieldTextOptions::EnabledWithTokenizer {
-                with_tokenizer: tokenizer,
-            }
-        } else {
-            self.fast = FastFieldTextOptions::IsEnabled(true);
-        }
+    pub fn set_fast(mut self, tokenizer_name: &str) -> TextOptions {
+        let tokenizer = TokenizerName::from_name(tokenizer_name);
+        self.fast = FastFieldTextOptions::Enabled { tokenizer };
         self
     }
@@ -263,7 +293,7 @@ pub const STRING: TextOptions = TextOptions {
         record: IndexRecordOption::Basic,
     }),
     stored: false,
-    fast: FastFieldTextOptions::IsEnabled(false),
+    fast: FastFieldTextOptions::Disabled,
     coerce: false,
 };
 
@@ -276,7 +306,7 @@ pub const TEXT: TextOptions = TextOptions {
     }),
     stored: false,
     coerce: false,
-    fast: FastFieldTextOptions::IsEnabled(false),
+    fast: FastFieldTextOptions::Disabled,
 };
 
 impl<T: Into<TextOptions>> BitOr<T> for TextOptions {
@@ -326,7 +356,9 @@ impl From<FastFlag> for TextOptions {
         TextOptions {
             indexing: None,
             stored: false,
-            fast: FastFieldTextOptions::IsEnabled(true),
+            fast: FastFieldTextOptions::Enabled {
+                tokenizer: TokenizerName::from_static(crate::schema::DEFAULT_FAST_FIELD_TOKENIZER),
+            },
             coerce: false,
         }
     }
@@ -392,21 +424,21 @@ mod tests {
     #[test]
     fn serde_fast_field_tokenizer() {
         let json = r#" {
-            "fast": { "with_tokenizer": "default" }
+            "fast": { "tokenizer": "default" }
         } "#;
         let options: TextOptions = serde_json::from_str(json).unwrap();
         assert_eq!(
             options.fast,
-            FastFieldTextOptions::EnabledWithTokenizer {
-                with_tokenizer: TokenizerName::from_static("default")
+            FastFieldTextOptions::Enabled {
+                tokenizer: TokenizerName::from_static("default")
             }
         );
         let options: TextOptions =
            serde_json::from_str(&serde_json::to_string(&options).unwrap()).unwrap();
         assert_eq!(
             options.fast,
-            FastFieldTextOptions::EnabledWithTokenizer {
-                with_tokenizer: TokenizerName::from_static("default")
+            FastFieldTextOptions::Enabled {
+                tokenizer: TokenizerName::from_static("default")
             }
         );
 
@@ -414,18 +446,28 @@ mod tests {
             "fast": true
         } "#;
         let options: TextOptions = serde_json::from_str(json).unwrap();
-        assert_eq!(options.fast, FastFieldTextOptions::IsEnabled(true));
+        assert_eq!(
+            options.fast,
+            FastFieldTextOptions::Enabled {
+                tokenizer: TokenizerName::from_static(DEFAULT_FAST_FIELD_TOKENIZER)
+            }
+        );
         let options: TextOptions =
             serde_json::from_str(&serde_json::to_string(&options).unwrap()).unwrap();
-        assert_eq!(options.fast, FastFieldTextOptions::IsEnabled(true));
+        assert_eq!(
+            options.fast,
+            FastFieldTextOptions::Enabled {
+                tokenizer: TokenizerName::from_static(DEFAULT_FAST_FIELD_TOKENIZER)
+            }
+        );
 
         let json = r#" {
             "fast": false
         } "#;
         let options: TextOptions = serde_json::from_str(json).unwrap();
-        assert_eq!(options.fast, FastFieldTextOptions::IsEnabled(false));
+        assert_eq!(options.fast, FastFieldTextOptions::Disabled);
         let options: TextOptions =
             serde_json::from_str(&serde_json::to_string(&options).unwrap()).unwrap();
-        assert_eq!(options.fast, FastFieldTextOptions::IsEnabled(false));
+        assert_eq!(options.fast, FastFieldTextOptions::Disabled);
     }
 }
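The serialization enum above accepts three JSON shapes for the `fast` key. A sketch of the observable behavior through `TextOptions`, assuming this patch and `serde_json` on hand, as in the tests above:

```rust
use tantivy::schema::TextOptions;

fn main() {
    // `false` -> Disabled.
    let opts: TextOptions = serde_json::from_str(r#"{ "fast": false }"#).unwrap();
    assert!(!opts.is_fast());

    // `true` now maps to `Enabled` with the default fast-field tokenizer.
    let opts: TextOptions = serde_json::from_str(r#"{ "fast": true }"#).unwrap();
    assert_eq!(opts.get_fast_field_tokenizer_name(), Some("default"));

    // A map names the tokenizer explicitly; the legacy `with_tokenizer`
    // key is accepted via the serde alias.
    let opts: TextOptions =
        serde_json::from_str(r#"{ "fast": { "tokenizer": "raw" } }"#).unwrap();
    assert_eq!(opts.get_fast_field_tokenizer_name(), Some("raw"));
}
```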
diff --git a/src/tokenizer/mod.rs b/src/tokenizer/mod.rs
index 23f7893d29..82edcf8746 100644
--- a/src/tokenizer/mod.rs
+++ b/src/tokenizer/mod.rs
@@ -189,7 +189,7 @@ pub mod tests {
 
     #[test]
     fn test_raw_tokenizer2() {
-        let tokenizer_manager = TokenizerManager::default();
+        let tokenizer_manager = TokenizerManager::default_for_indexing();
         let mut en_tokenizer = tokenizer_manager.get("raw").unwrap();
         let mut tokens: Vec<Token> = vec![];
         {
@@ -206,7 +206,7 @@ pub mod tests {
 
     #[test]
     fn test_en_tokenizer() {
-        let tokenizer_manager = TokenizerManager::default();
+        let tokenizer_manager = TokenizerManager::default_for_indexing();
         assert!(tokenizer_manager.get("en_doesnotexist").is_none());
         let mut en_tokenizer = tokenizer_manager.get("en_stem").unwrap();
         let mut tokens: Vec<Token> = vec![];
@@ -228,7 +228,7 @@ pub mod tests {
 
     #[test]
     fn test_non_en_tokenizer() {
-        let tokenizer_manager = TokenizerManager::default();
+        let tokenizer_manager = TokenizerManager::default_for_indexing();
         tokenizer_manager.register(
             "el_stem",
             TextAnalyzer::builder(SimpleTokenizer::default())
@@ -256,7 +256,7 @@ pub mod tests {
 
     #[test]
     fn test_tokenizer_empty() {
-        let tokenizer_manager = TokenizerManager::default();
+        let tokenizer_manager = TokenizerManager::default_for_indexing();
         let mut en_tokenizer = tokenizer_manager.get("en_stem").unwrap();
         {
             let mut tokens: Vec<Token> = vec![];
@@ -282,7 +282,7 @@ pub mod tests {
 
     #[test]
     fn test_whitespace_tokenizer() {
-        let tokenizer_manager = TokenizerManager::default();
+        let tokenizer_manager = TokenizerManager::default_for_indexing();
         let mut ws_tokenizer = tokenizer_manager.get("whitespace").unwrap();
         let mut tokens: Vec<Token> = vec![];
         {
diff --git a/src/tokenizer/tokenizer_manager.rs b/src/tokenizer/tokenizer_manager.rs
index a2be123903..3825084c55 100644
--- a/src/tokenizer/tokenizer_manager.rs
+++ b/src/tokenizer/tokenizer_manager.rs
@@ -27,6 +27,7 @@ pub struct TokenizerManager {
 
 impl TokenizerManager {
     /// Creates an empty tokenizer manager.
+    #[allow(clippy::new_without_default)]
     pub fn new() -> Self {
         Self {
             tokenizers: Arc::new(RwLock::new(HashMap::new())),
@@ -51,12 +52,10 @@ impl TokenizerManager {
             .get(tokenizer_name)
             .cloned()
     }
-}
 
-impl Default for TokenizerManager {
     /// Creates an `TokenizerManager` prepopulated with
     /// the default pre-configured tokenizers of `tantivy`.
-    fn default() -> TokenizerManager {
+    pub fn default_for_indexing() -> TokenizerManager {
         let manager = TokenizerManager::new();
         manager.register("raw", RawTokenizer::default());
         manager.register(
@@ -77,4 +76,28 @@ impl Default for TokenizerManager {
         manager.register("whitespace", WhitespaceTokenizer::default());
         manager
     }
+
+    /// Creates a `TokenizerManager` prepopulated with
+    /// the default pre-configured tokenizers of `tantivy`
+    /// for fast fields.
+    ///
+    /// Fast fields usually do not really tokenize the text.
+    /// It is however very useful to filter / normalize the text.
+    pub fn default_for_fast_fields() -> TokenizerManager {
+        let manager = TokenizerManager::new();
+        let raw_tokenizer = TextAnalyzer::builder(RawTokenizer::default())
+            .filter(RemoveLongFilter::limit(255))
+            .build();
+        let lower_tokenizer = TextAnalyzer::builder(RawTokenizer::default())
+            .filter(RemoveLongFilter::limit(255))
+            .filter(LowerCaser)
+            .build();
+        manager.register(
+            crate::schema::DEFAULT_FAST_FIELD_TOKENIZER,
+            lower_tokenizer.clone(),
+        );
+        manager.register("raw", raw_tokenizer);
+        manager.register("lower", lower_tokenizer);
+        manager
+    }
 }
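Taken together, the two constructors give indexing and fast fields separate default tokenizer sets. A closing sketch of how a caller would extend the fast-field set, assuming this patch (`custom_lowercase` mirrors the name used in the tests):

```rust
use tantivy::tokenizer::{
    LowerCaser, RawTokenizer, RemoveLongFilter, TextAnalyzer, TokenizerManager,
};

fn main() {
    // The fast-field manager ships with "default" (raw + lowercase), "raw",
    // and "lower"; any name passed to `set_fast("...")` must be registered here.
    let ff_tokenizer_manager = TokenizerManager::default_for_fast_fields();
    ff_tokenizer_manager.register(
        "custom_lowercase",
        TextAnalyzer::builder(RawTokenizer::default())
            .filter(RemoveLongFilter::limit(40))
            .filter(LowerCaser)
            .build(),
    );
    assert!(ff_tokenizer_manager.get("default").is_some());
    assert!(ff_tokenizer_manager.get("custom_lowercase").is_some());

    // The indexing-side manager keeps full analyzers such as "en_stem".
    let indexing_manager = TokenizerManager::default_for_indexing();
    assert!(indexing_manager.get("en_stem").is_some());
}
```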