diff --git a/common/src/bounds.rs b/common/src/bounds.rs new file mode 100644 index 0000000000..712c77852f --- /dev/null +++ b/common/src/bounds.rs @@ -0,0 +1,130 @@ +use std::io; +use std::ops::Bound; + +#[derive(Clone, Debug)] +pub struct BoundsRange { + pub lower_bound: Bound, + pub upper_bound: Bound, +} +impl BoundsRange { + pub fn new(lower_bound: Bound, upper_bound: Bound) -> Self { + BoundsRange { + lower_bound, + upper_bound, + } + } + pub fn is_unbounded(&self) -> bool { + matches!(self.lower_bound, Bound::Unbounded) && matches!(self.upper_bound, Bound::Unbounded) + } + pub fn map_bound(&self, transform: impl Fn(&T) -> TTo) -> BoundsRange { + BoundsRange { + lower_bound: map_bound(&self.lower_bound, &transform), + upper_bound: map_bound(&self.upper_bound, &transform), + } + } + + pub fn map_bound_res( + &self, + transform: impl Fn(&T) -> Result, + ) -> Result, Err> { + Ok(BoundsRange { + lower_bound: map_bound_res(&self.lower_bound, &transform)?, + upper_bound: map_bound_res(&self.upper_bound, &transform)?, + }) + } + + pub fn transform_inner( + &self, + transform_lower: impl Fn(&T) -> TransformBound, + transform_upper: impl Fn(&T) -> TransformBound, + ) -> BoundsRange { + BoundsRange { + lower_bound: transform_bound_inner(&self.lower_bound, &transform_lower), + upper_bound: transform_bound_inner(&self.upper_bound, &transform_upper), + } + } + + /// Returns the first set inner value + pub fn get_inner(&self) -> Option<&T> { + inner_bound(&self.lower_bound).or(inner_bound(&self.upper_bound)) + } +} + +pub enum TransformBound { + /// Overwrite the bounds + NewBound(Bound), + /// Use Existing bounds with new value + Existing(T), +} + +/// Takes a bound and transforms the inner value into a new bound via a closure. +/// The bound variant may change depending on the value returned from the closure. 
+pub fn transform_bound_inner_res( + bound: &Bound, + transform: impl Fn(&TFrom) -> io::Result>, +) -> io::Result> { + use self::Bound::*; + Ok(match bound { + Excluded(ref from_val) => match transform(from_val)? { + TransformBound::NewBound(new_val) => new_val, + TransformBound::Existing(new_val) => Excluded(new_val), + }, + Included(ref from_val) => match transform(from_val)? { + TransformBound::NewBound(new_val) => new_val, + TransformBound::Existing(new_val) => Included(new_val), + }, + Unbounded => Unbounded, + }) +} + +/// Takes a bound and transforms the inner value into a new bound via a closure. +/// The bound variant may change depending on the value returned from the closure. +pub fn transform_bound_inner( + bound: &Bound, + transform: impl Fn(&TFrom) -> TransformBound, +) -> Bound { + use self::Bound::*; + match bound { + Excluded(ref from_val) => match transform(from_val) { + TransformBound::NewBound(new_val) => new_val, + TransformBound::Existing(new_val) => Excluded(new_val), + }, + Included(ref from_val) => match transform(from_val) { + TransformBound::NewBound(new_val) => new_val, + TransformBound::Existing(new_val) => Included(new_val), + }, + Unbounded => Unbounded, + } +} + +/// Returns the inner value of a `Bound` +pub fn inner_bound(val: &Bound) -> Option<&T> { + match val { + Bound::Included(term) | Bound::Excluded(term) => Some(term), + Bound::Unbounded => None, + } +} + +pub fn map_bound( + bound: &Bound, + transform: impl Fn(&TFrom) -> TTo, +) -> Bound { + use self::Bound::*; + match bound { + Excluded(ref from_val) => Bound::Excluded(transform(from_val)), + Included(ref from_val) => Bound::Included(transform(from_val)), + Unbounded => Unbounded, + } +} + +pub fn map_bound_res( + bound: &Bound, + transform: impl Fn(&TFrom) -> Result, +) -> Result, Err> { + use self::Bound::*; + Ok(match bound { + Excluded(ref from_val) => Excluded(transform(from_val)?), + Included(ref from_val) => Included(transform(from_val)?), + Unbounded => Unbounded, + }) 
+} diff --git a/common/src/lib.rs b/common/src/lib.rs index bfbccecd93..0a51f91fe3 100644 --- a/common/src/lib.rs +++ b/common/src/lib.rs @@ -5,6 +5,7 @@ use std::ops::Deref; pub use byteorder::LittleEndian as Endianness; mod bitset; +pub mod bounds; mod byte_count; mod datetime; pub mod file_slice; diff --git a/src/core/json_utils.rs b/src/core/json_utils.rs index 2767cd6d93..5f40339b7b 100644 --- a/src/core/json_utils.rs +++ b/src/core/json_utils.rs @@ -4,7 +4,7 @@ use rustc_hash::FxHashMap; use crate::postings::{IndexingContext, IndexingPosition, PostingsWriter}; use crate::schema::document::{ReferenceValue, ReferenceValueLeaf, Value}; -use crate::schema::Type; +use crate::schema::{Type, DATE_TIME_PRECISION_INDEXED}; use crate::time::format_description::well_known::Rfc3339; use crate::time::{OffsetDateTime, UtcOffset}; use crate::tokenizer::TextAnalyzer; @@ -189,6 +189,7 @@ pub(crate) fn index_json_value<'a, V: Value<'a>>( ctx.path_to_unordered_id .get_or_allocate_unordered_id(json_path_writer.as_str()), ); + let val = val.truncate(DATE_TIME_PRECISION_INDEXED); term_buffer.append_type_and_fast_value(val); postings_writer.subscribe(doc, 0u32, term_buffer, ctx); } @@ -239,7 +240,11 @@ pub(crate) fn index_json_value<'a, V: Value<'a>>( /// Tries to infer a JSON type from a string and append it to the term. /// /// The term must be json + JSON path. 
-pub fn convert_to_fast_value_and_append_to_json_term(mut term: Term, phrase: &str) -> Option { +pub fn convert_to_fast_value_and_append_to_json_term( + mut term: Term, + phrase: &str, + truncate_date_for_search: bool, +) -> Option { assert_eq!( term.value() .as_json_value_bytes() @@ -250,8 +255,11 @@ pub fn convert_to_fast_value_and_append_to_json_term(mut term: Term, phrase: &st "JSON value bytes should be empty" ); if let Ok(dt) = OffsetDateTime::parse(phrase, &Rfc3339) { - let dt_utc = dt.to_offset(UtcOffset::UTC); - term.append_type_and_fast_value(DateTime::from_utc(dt_utc)); + let mut dt = DateTime::from_utc(dt.to_offset(UtcOffset::UTC)); + if truncate_date_for_search { + dt = dt.truncate(DATE_TIME_PRECISION_INDEXED); + } + term.append_type_and_fast_value(dt); return Some(term); } if let Ok(i64_val) = str::parse::(phrase) { diff --git a/src/indexer/merger.rs b/src/indexer/merger.rs index 63ec869aa3..1af64607bd 100644 --- a/src/indexer/merger.rs +++ b/src/indexer/merger.rs @@ -673,7 +673,7 @@ mod tests { ] ); assert_eq!( - get_doc_ids(vec![Term::from_field_date( + get_doc_ids(vec![Term::from_field_date_for_search( date_field, DateTime::from_utc(curr_time) )])?, diff --git a/src/indexer/segment_writer.rs b/src/indexer/segment_writer.rs index 439f46aee8..1ab320006d 100644 --- a/src/indexer/segment_writer.rs +++ b/src/indexer/segment_writer.rs @@ -64,9 +64,9 @@ impl SegmentWriter { /// /// The arguments are defined as follows /// - /// - memory_budget: most of the segment writer data (terms, and postings lists recorders) - /// is stored in a memory arena. This makes it possible for the user to define - /// the flushing behavior as a memory limit. + /// - memory_budget: most of the segment writer data (terms, and postings lists recorders) is + /// stored in a memory arena. This makes it possible for the user to define the flushing + /// behavior as a memory limit. 
/// - segment: The segment being written /// - schema pub fn for_segment(memory_budget_in_bytes: usize, segment: Segment) -> crate::Result { @@ -431,7 +431,7 @@ mod tests { use crate::query::{PhraseQuery, QueryParser}; use crate::schema::{ Document, IndexRecordOption, OwnedValue, Schema, TextFieldIndexing, TextOptions, Value, - STORED, STRING, TEXT, + DATE_TIME_PRECISION_INDEXED, STORED, STRING, TEXT, }; use crate::store::{Compressor, StoreReader, StoreWriter}; use crate::time::format_description::well_known::Rfc3339; @@ -651,7 +651,8 @@ mod tests { set_fast_val( DateTime::from_utc( OffsetDateTime::parse("1985-04-12T23:20:50.52Z", &Rfc3339).unwrap(), - ), + ) + .truncate(DATE_TIME_PRECISION_INDEXED), term ) .serialized_value_bytes() diff --git a/src/query/mod.rs b/src/query/mod.rs index d57d3eeab1..1736a2fe4f 100644 --- a/src/query/mod.rs +++ b/src/query/mod.rs @@ -54,7 +54,7 @@ pub use self::phrase_prefix_query::PhrasePrefixQuery; pub use self::phrase_query::PhraseQuery; pub use self::query::{EnableScoring, Query, QueryClone}; pub use self::query_parser::{QueryParser, QueryParserError}; -pub use self::range_query::{FastFieldRangeWeight, RangeQuery}; +pub use self::range_query::*; pub use self::regex_query::RegexQuery; pub use self::reqopt_scorer::RequiredOptionalScorer; pub use self::score_combiner::{ diff --git a/src/query/more_like_this/more_like_this.rs b/src/query/more_like_this/more_like_this.rs index 043d081df4..71c34cecda 100644 --- a/src/query/more_like_this/more_like_this.rs +++ b/src/query/more_like_this/more_like_this.rs @@ -241,7 +241,7 @@ impl MoreLikeThis { let timestamp = value.as_datetime().ok_or_else(|| { TantivyError::InvalidArgument("invalid value".to_string()) })?; - let term = Term::from_field_date(field, timestamp); + let term = Term::from_field_date_for_search(field, timestamp); *term_frequencies.entry(term).or_insert(0) += 1; } } diff --git a/src/query/phrase_prefix_query/phrase_prefix_query.rs 
b/src/query/phrase_prefix_query/phrase_prefix_query.rs index d6efe388d5..858a28034b 100644 --- a/src/query/phrase_prefix_query/phrase_prefix_query.rs +++ b/src/query/phrase_prefix_query/phrase_prefix_query.rs @@ -2,7 +2,7 @@ use std::ops::Bound; use super::{prefix_end, PhrasePrefixWeight}; use crate::query::bm25::Bm25Weight; -use crate::query::{EnableScoring, Query, RangeQuery, Weight}; +use crate::query::{EnableScoring, InvertedIndexRangeWeight, Query, Weight}; use crate::schema::{Field, IndexRecordOption, Term}; const DEFAULT_MAX_EXPANSIONS: u32 = 50; @@ -145,9 +145,15 @@ impl Query for PhrasePrefixQuery { Bound::Unbounded }; - let mut range_query = RangeQuery::new(Bound::Included(self.prefix.1.clone()), end_term); - range_query.limit(self.max_expansions as u64); - range_query.weight(enable_scoring) + let lower_bound = Bound::Included(self.prefix.1.clone()); + let upper_bound = end_term; + + Ok(Box::new(InvertedIndexRangeWeight::new( + self.field, + &lower_bound, + &upper_bound, + Some(self.max_expansions as u64), + ))) } } diff --git a/src/query/query_parser/query_parser.rs b/src/query/query_parser/query_parser.rs index 81115d2878..ac6d297f47 100644 --- a/src/query/query_parser/query_parser.rs +++ b/src/query/query_parser/query_parser.rs @@ -137,7 +137,7 @@ fn trim_ast(logical_ast: LogicalAst) -> Option { /// so-called default fields (as set up in the constructor). /// /// Assuming that the default fields are `body` and `title`, and the query parser is set with -/// conjunction as a default, our query will be interpreted as. +/// conjunction as a default, our query will be interpreted as. /// `(body:Barack OR title:Barack) AND (title:Obama OR body:Obama)`. /// By default, all tokenized and indexed fields are default fields. /// @@ -148,8 +148,7 @@ fn trim_ast(logical_ast: LogicalAst) -> Option { /// `body:Barack OR (body:Barack OR text:Obama)` . /// /// * boolean operators `AND`, `OR`. 
`AND` takes precedence over `OR`, so that `a AND b OR c` is -/// interpreted -/// as `(a AND b) OR c`. +/// interpreted as `(a AND b) OR c`. /// /// * In addition to the boolean operators, the `-`, `+` can help define. These operators are /// sufficient to express all queries using boolean operators. For instance `x AND y OR z` can be @@ -272,8 +271,7 @@ impl QueryParser { /// Creates a `QueryParser`, given /// * an index - /// * a set of default fields used to search if no field is specifically defined - /// in the query. + /// * a set of default fields used to search if no field is specifically defined in the query. pub fn for_index(index: &Index, default_fields: Vec) -> QueryParser { QueryParser::new(index.schema(), default_fields, index.tokenizers().clone()) } @@ -482,16 +480,33 @@ impl QueryParser { }); if terms.len() != 1 { return Err(QueryParserError::UnsupportedQuery(format!( - "Range query boundary cannot have multiple tokens: {phrase:?}." + "Range query boundary cannot have multiple tokens: {phrase:?} [{terms:?}]." ))); } Ok(terms.into_iter().next().unwrap()) } - FieldType::JsonObject(_) => { - // Json range are not supported. 
- Err(QueryParserError::UnsupportedQuery( - "Range query are not supported on json field.".to_string(), - )) + FieldType::JsonObject(ref json_options) => { + let get_term_with_path = || { + Term::from_field_json_path( + field, + json_path, + json_options.is_expand_dots_enabled(), + ) + }; + if let Some(term) = + // Try to convert the phrase to a fast value + convert_to_fast_value_and_append_to_json_term( + get_term_with_path(), + phrase, + false, + ) + { + Ok(term) + } else { + let mut term = get_term_with_path(); + term.append_type_and_str(phrase); + Ok(term) + } } FieldType::Facet(_) => match Facet::from_text(phrase) { Ok(facet) => Ok(Term::from_facet(field, &facet)), @@ -553,7 +568,7 @@ impl QueryParser { } FieldType::Date(_) => { let dt = OffsetDateTime::parse(phrase, &Rfc3339)?; - let dt_term = Term::from_field_date(field, DateTime::from_utc(dt)); + let dt_term = Term::from_field_date_for_search(field, DateTime::from_utc(dt)); Ok(vec![LogicalLiteral::Term(dt_term)]) } FieldType::Str(ref str_options) => { @@ -685,8 +700,8 @@ impl QueryParser { /// /// The terms are identified by a triplet: /// - tantivy field - /// - field_path: tantivy has JSON fields. It is possible to target a member of a JSON - /// object by naturally extending the json field name with a "." separated field_path + /// - field_path: tantivy has JSON fields. It is possible to target a member of a JSON object by + /// naturally extending the json field name with a "." separated field_path /// - field_phrase: the phrase that is being searched. 
/// /// The literal identifies the targeted field by a so-called *full field path*, @@ -949,7 +964,8 @@ fn generate_literals_for_json_object( || Term::from_field_json_path(field, json_path, json_options.is_expand_dots_enabled()); // Try to convert the phrase to a fast value - if let Some(term) = convert_to_fast_value_and_append_to_json_term(get_term_with_path(), phrase) + if let Some(term) = + convert_to_fast_value_and_append_to_json_term(get_term_with_path(), phrase, true) { logical_literals.push(LogicalLiteral::Term(term)); } @@ -1123,8 +1139,8 @@ mod test { let query = make_query_parser().parse_query("title:[A TO B]").unwrap(); assert_eq!( format!("{query:?}"), - "RangeQuery { lower_bound: Included(Term(field=0, type=Str, \"a\")), upper_bound: \ - Included(Term(field=0, type=Str, \"b\")), limit: None }" + "RangeQuery { bounds: BoundsRange { lower_bound: Included(Term(field=0, type=Str, \ + \"a\")), upper_bound: Included(Term(field=0, type=Str, \"b\")) } }" ); } diff --git a/src/query/range_query/mod.rs b/src/query/range_query/mod.rs index 40effb85b1..e41cb87685 100644 --- a/src/query/range_query/mod.rs +++ b/src/query/range_query/mod.rs @@ -1,40 +1,19 @@ -use std::ops::Bound; - use crate::schema::Type; mod fast_field_range_doc_set; mod range_query; mod range_query_u64_fastfield; -pub use self::range_query::RangeQuery; +pub use self::range_query::*; pub use self::range_query_u64_fastfield::FastFieldRangeWeight; // TODO is this correct? 
pub(crate) fn is_type_valid_for_fastfield_range_query(typ: Type) -> bool { match typ { - Type::Str | Type::U64 | Type::I64 | Type::F64 | Type::Bool | Type::Date => true, + Type::Str | Type::U64 | Type::I64 | Type::F64 | Type::Bool | Type::Date | Type::Json => { + true + } Type::IpAddr => true, - Type::Facet | Type::Bytes | Type::Json => false, - } -} - -fn map_bound(bound: &Bound, transform: impl Fn(&TFrom) -> TTo) -> Bound { - use self::Bound::*; - match bound { - Excluded(ref from_val) => Excluded(transform(from_val)), - Included(ref from_val) => Included(transform(from_val)), - Unbounded => Unbounded, + Type::Facet | Type::Bytes => false, } } - -fn map_bound_res( - bound: &Bound, - transform: impl Fn(&TFrom) -> Result, -) -> Result, Err> { - use self::Bound::*; - Ok(match bound { - Excluded(ref from_val) => Excluded(transform(from_val)?), - Included(ref from_val) => Included(transform(from_val)?), - Unbounded => Unbounded, - }) -} diff --git a/src/query/range_query/range_query.rs b/src/query/range_query/range_query.rs index 4b27714c3d..bf2090e90f 100644 --- a/src/query/range_query/range_query.rs +++ b/src/query/range_query/range_query.rs @@ -1,9 +1,9 @@ use std::io; use std::ops::Bound; +use common::bounds::{map_bound, BoundsRange}; use common::BitSet; -use super::map_bound; use super::range_query_u64_fastfield::FastFieldRangeWeight; use crate::index::SegmentReader; use crate::query::explanation::does_not_match; @@ -69,17 +69,7 @@ use crate::{DocId, Score}; /// ``` #[derive(Clone, Debug)] pub struct RangeQuery { - lower_bound: Bound, - upper_bound: Bound, - limit: Option, -} - -/// Returns the inner value of a `Bound` -pub(crate) fn inner_bound(val: &Bound) -> Option<&Term> { - match val { - Bound::Included(term) | Bound::Excluded(term) => Some(term), - Bound::Unbounded => None, - } + bounds: BoundsRange, } impl RangeQuery { @@ -89,9 +79,7 @@ impl RangeQuery { /// the `Weight` object is created. 
pub fn new(lower_bound: Bound, upper_bound: Bound) -> RangeQuery { RangeQuery { - lower_bound, - upper_bound, - limit: None, + bounds: BoundsRange::new(lower_bound, upper_bound), } } @@ -106,18 +94,10 @@ impl RangeQuery { } pub(crate) fn get_term(&self) -> &Term { - inner_bound(&self.lower_bound) - .or(inner_bound(&self.upper_bound)) + self.bounds + .get_inner() .expect("At least one bound must be set") } - - /// Limit the number of term the `RangeQuery` will go through. - /// - /// This does not limit the number of matching document, only the number of - /// different terms that get matched. - pub(crate) fn limit(&mut self, limit: u64) { - self.limit = Some(limit); - } } impl Query for RangeQuery { @@ -126,31 +106,90 @@ impl Query for RangeQuery { let field_type = schema.get_field_entry(self.field()).field_type(); if field_type.is_fast() && is_type_valid_for_fastfield_range_query(self.value_type()) { - Ok(Box::new(FastFieldRangeWeight::new( + Ok(Box::new(FastFieldRangeWeight::new(self.bounds.clone()))) + } else { + if field_type.is_json() { + return Err(crate::TantivyError::InvalidArgument( + "RangeQuery on JSON is only supported for fast fields currently".to_string(), + )); + } + Ok(Box::new(InvertedIndexRangeWeight::new( self.field(), - self.lower_bound.clone(), - self.upper_bound.clone(), + &self.bounds.lower_bound, + &self.bounds.upper_bound, + None, ))) - } else { - let verify_and_unwrap_term = |val: &Term| val.serialized_value_bytes().to_owned(); - Ok(Box::new(RangeWeight { - field: self.field(), - lower_bound: map_bound(&self.lower_bound, verify_and_unwrap_term), - upper_bound: map_bound(&self.upper_bound, verify_and_unwrap_term), - limit: self.limit, - })) } } } -pub struct RangeWeight { +#[derive(Clone, Debug)] +/// `InvertedIndexRangeQuery` is the same as [RangeQuery] but only uses the inverted index +pub struct InvertedIndexRangeQuery { + bounds: BoundsRange, + limit: Option, +} +impl InvertedIndexRangeQuery { + /// Create new `InvertedIndexRangeQuery` 
+ pub fn new(lower_bound: Bound, upper_bound: Bound) -> InvertedIndexRangeQuery { + InvertedIndexRangeQuery { + bounds: BoundsRange::new(lower_bound, upper_bound), + limit: None, + } + } + /// Limit the number of terms the `InvertedIndexRangeQuery` will go through. + /// + /// This does not limit the number of matching documents, only the number of + /// different terms that get matched. + pub fn limit(&mut self, limit: u64) { + self.limit = Some(limit); + } +} + +impl Query for InvertedIndexRangeQuery { + fn weight(&self, _enable_scoring: EnableScoring<'_>) -> crate::Result> { + let field = self + .bounds + .get_inner() + .expect("At least one bound must be set") + .field(); + + Ok(Box::new(InvertedIndexRangeWeight::new( + field, + &self.bounds.lower_bound, + &self.bounds.upper_bound, + self.limit, + ))) + } +} + +/// Range weight on the inverted index +pub struct InvertedIndexRangeWeight { field: Field, lower_bound: Bound>, upper_bound: Bound>, limit: Option, } -impl RangeWeight { +impl InvertedIndexRangeWeight { + /// Creates a new `InvertedIndexRangeWeight` + /// + /// Note: The limit is only enabled with the quickwit feature flag. 
+ pub fn new( + field: Field, + lower_bound: &Bound, + upper_bound: &Bound, + limit: Option, + ) -> Self { + let verify_and_unwrap_term = |val: &Term| val.serialized_value_bytes().to_owned(); + Self { + field, + lower_bound: map_bound(lower_bound, verify_and_unwrap_term), + upper_bound: map_bound(upper_bound, verify_and_unwrap_term), + limit, + } + } + fn term_range<'a>(&self, term_dict: &'a TermDictionary) -> io::Result> { use std::ops::Bound::*; let mut term_stream_builder = term_dict.range(); @@ -172,7 +211,7 @@ impl RangeWeight { } } -impl Weight for RangeWeight { +impl Weight for InvertedIndexRangeWeight { fn scorer(&self, reader: &SegmentReader, boost: Score) -> crate::Result> { let max_doc = reader.max_doc(); let mut doc_bitset = BitSet::with_max_value(max_doc); @@ -227,6 +266,7 @@ mod tests { use super::RangeQuery; use crate::collector::{Count, TopDocs}; use crate::indexer::NoMergePolicy; + use crate::query::range_query::range_query::InvertedIndexRangeQuery; use crate::query::QueryParser; use crate::schema::{ Field, IntoIpv6Addr, Schema, TantivyDocument, FAST, INDEXED, STORED, TEXT, @@ -253,7 +293,7 @@ mod tests { let reader = index.reader()?; let searcher = reader.searcher(); - let docs_in_the_sixties = RangeQuery::new( + let docs_in_the_sixties = InvertedIndexRangeQuery::new( Bound::Included(Term::from_field_u64(year_field, 1960)), Bound::Excluded(Term::from_field_u64(year_field, 1970)), ); @@ -287,7 +327,7 @@ mod tests { let reader = index.reader()?; let searcher = reader.searcher(); - let mut docs_in_the_sixties = RangeQuery::new( + let mut docs_in_the_sixties = InvertedIndexRangeQuery::new( Bound::Included(Term::from_field_u64(year_field, 1960)), Bound::Excluded(Term::from_field_u64(year_field, 1970)), ); diff --git a/src/query/range_query/range_query_u64_fastfield.rs b/src/query/range_query/range_query_u64_fastfield.rs index 918d524ccf..9ae0c5dce6 100644 --- a/src/query/range_query/range_query_u64_fastfield.rs +++ 
b/src/query/range_query/range_query_u64_fastfield.rs @@ -5,32 +5,27 @@ use std::net::Ipv6Addr; use std::ops::{Bound, RangeInclusive}; -use columnar::{Column, MonotonicallyMappableToU128, MonotonicallyMappableToU64, StrColumn}; -use common::BinarySerializable; +use columnar::{ + Column, ColumnType, MonotonicallyMappableToU128, MonotonicallyMappableToU64, NumericalType, + StrColumn, +}; +use common::bounds::{BoundsRange, TransformBound}; use super::fast_field_range_doc_set::RangeDocSet; -use super::{map_bound, map_bound_res}; -use crate::query::range_query::range_query::inner_bound; use crate::query::{AllScorer, ConstScorer, EmptyScorer, Explanation, Query, Scorer, Weight}; -use crate::schema::{Field, Type}; +use crate::schema::{Type, ValueBytes}; use crate::{DocId, DocSet, Score, SegmentReader, TantivyError, Term}; /// `FastFieldRangeWeight` uses the fast field to execute range queries. #[derive(Clone, Debug)] pub struct FastFieldRangeWeight { - lower_bound: Bound, - upper_bound: Bound, - field: Field, + bounds: BoundsRange, } impl FastFieldRangeWeight { /// Create a new FastFieldRangeWeight - pub fn new(field: Field, lower_bound: Bound, upper_bound: Bound) -> Self { - Self { - lower_bound, - upper_bound, - field, - } + pub(crate) fn new(bounds: BoundsRange) -> Self { + Self { bounds } } } @@ -46,15 +41,15 @@ impl Query for FastFieldRangeWeight { impl Weight for FastFieldRangeWeight { fn scorer(&self, reader: &SegmentReader, boost: Score) -> crate::Result> { // Check if both bounds are Bound::Unbounded - if self.lower_bound == Bound::Unbounded && self.upper_bound == Bound::Unbounded { + if self.bounds.is_unbounded() { return Ok(Box::new(AllScorer::new(reader.max_doc()))); } - let field_name = reader.schema().get_field_name(self.field); - let field_type = reader.schema().get_field_entry(self.field).field_type(); - let term = inner_bound(&self.lower_bound) - .or(inner_bound(&self.upper_bound)) + let term = self + .bounds + .get_inner() .expect("At least one bound must 
be set"); + let field_type = reader.schema().get_field_entry(term.field()).field_type(); assert_eq!( term.typ(), field_type.value_type(), @@ -62,83 +57,153 @@ impl Weight for FastFieldRangeWeight { field_type, term.typ() ); - if field_type.is_ip_addr() { + let field_name = term.get_full_path(reader.schema()); + + let get_value_bytes = |term: &Term| term.value().value_bytes_payload(); + + let term_value = term.value(); + if field_type.is_json() { + let bounds = self + .bounds + .map_bound(|term| term.value().as_json_value_bytes().unwrap().to_owned()); + // Unlike with other field types JSON may have multiple columns of different types + // under the same name + // + // In the JSON case the provided type in term may not exactly match the column type, + // especially with the numeric type interpolation + let json_value_bytes = term_value + .as_json_value_bytes() + .expect("expected json type in term"); + let typ = json_value_bytes.typ(); + + match typ { + Type::Str => { + let Some(str_dict_column): Option = + reader.fast_fields().str(&field_name)? + else { + return Ok(Box::new(EmptyScorer)); + }; + let dict = str_dict_column.dictionary(); + + let bounds = self.bounds.map_bound(get_value_bytes); + // Get term ids for terms + let (lower_bound, upper_bound) = + dict.term_bounds_to_ord(bounds.lower_bound, bounds.upper_bound)?; + let fast_field_reader = reader.fast_fields(); + let Some((column, _col_type)) = fast_field_reader + .u64_lenient_for_type(Some(&[ColumnType::Str]), &field_name)? + else { + return Ok(Box::new(EmptyScorer)); + }; + search_on_u64_ff(column, boost, BoundsRange::new(lower_bound, upper_bound)) + } + Type::U64 | Type::I64 | Type::F64 => { + search_on_json_numerical_field(reader, &field_name, typ, bounds, boost) + } + Type::Date => { + let fast_field_reader = reader.fast_fields(); + let Some((column, _col_type)) = fast_field_reader + .u64_lenient_for_type(Some(&[ColumnType::DateTime]), &field_name)? 
+ else { + return Ok(Box::new(EmptyScorer)); + }; + let bounds = bounds.map_bound(|term| term.as_date().unwrap().to_u64()); + search_on_u64_ff( + column, + boost, + BoundsRange::new(bounds.lower_bound, bounds.upper_bound), + ) + } + Type::Bool | Type::Facet | Type::Bytes | Type::Json | Type::IpAddr => { + Err(crate::TantivyError::InvalidArgument(format!( + "unsupported value bytes type in json term value_bytes {:?}", + term_value.typ() + ))) + } + } + } else if field_type.is_ip_addr() { let parse_ip_from_bytes = |term: &Term| { term.value().as_ip_addr().ok_or_else(|| { crate::TantivyError::InvalidArgument("Expected ip address".to_string()) }) }; - let lower_bound = map_bound_res(&self.lower_bound, parse_ip_from_bytes)?; - let upper_bound = map_bound_res(&self.upper_bound, parse_ip_from_bytes)?; + let bounds: BoundsRange = self.bounds.map_bound_res(parse_ip_from_bytes)?; let Some(ip_addr_column): Option> = - reader.fast_fields().column_opt(field_name)? + reader.fast_fields().column_opt(&field_name)? else { return Ok(Box::new(EmptyScorer)); }; - let value_range = bound_to_value_range_ip( - &lower_bound, - &upper_bound, + let value_range = bound_range_inclusive_ip( + &bounds.lower_bound, + &bounds.upper_bound, ip_addr_column.min_value(), ip_addr_column.max_value(), ); let docset = RangeDocSet::new(value_range, ip_addr_column); Ok(Box::new(ConstScorer::new(docset, boost))) - } else { - let (lower_bound, upper_bound) = if field_type.is_str() { - let Some(str_dict_column): Option = - reader.fast_fields().str(field_name)? 
- else { - return Ok(Box::new(EmptyScorer)); - }; - let dict = str_dict_column.dictionary(); - - let lower_bound = map_bound(&self.lower_bound, |term| { - term.serialized_value_bytes().to_vec() - }); - let upper_bound = map_bound(&self.upper_bound, |term| { - term.serialized_value_bytes().to_vec() - }); - // Get term ids for terms - let (lower_bound, upper_bound) = - dict.term_bounds_to_ord(lower_bound, upper_bound)?; - (lower_bound, upper_bound) - } else { - assert!( - maps_to_u64_fastfield(field_type.value_type()), - "{:?}", - field_type - ); - let parse_from_bytes = |term: &Term| { - u64::from_be( - BinarySerializable::deserialize(&mut &term.serialized_value_bytes()[..]) - .unwrap(), - ) - }; + } else if field_type.is_str() { + let Some(str_dict_column): Option = reader.fast_fields().str(&field_name)? + else { + return Ok(Box::new(EmptyScorer)); + }; + let dict = str_dict_column.dictionary(); - let lower_bound = map_bound(&self.lower_bound, parse_from_bytes); - let upper_bound = map_bound(&self.upper_bound, parse_from_bytes); - (lower_bound, upper_bound) + let bounds = self.bounds.map_bound(get_value_bytes); + // Get term ids for terms + let (lower_bound, upper_bound) = + dict.term_bounds_to_ord(bounds.lower_bound, bounds.upper_bound)?; + let fast_field_reader = reader.fast_fields(); + let Some((column, _col_type)) = + fast_field_reader.u64_lenient_for_type(None, &field_name)? 
+ else { + return Ok(Box::new(EmptyScorer)); }; + search_on_u64_ff(column, boost, BoundsRange::new(lower_bound, upper_bound)) + } else { + assert!( + maps_to_u64_fastfield(field_type.value_type()), + "{:?}", + field_type + ); + + let bounds = self.bounds.map_bound_res(|term| { + let value = term.value(); + let val = if let Some(val) = value.as_u64() { + val + } else if let Some(val) = value.as_i64() { + val.to_u64() + } else if let Some(val) = value.as_f64() { + val.to_u64() + } else if let Some(val) = value.as_date() { + val.to_u64() + } else { + return Err(TantivyError::InvalidArgument(format!( + "Expected term with u64, i64, f64 or date, but got {:?}", + term + ))); + }; + Ok(val) + })?; let fast_field_reader = reader.fast_fields(); - let Some((column, _)) = fast_field_reader.u64_lenient_for_type(None, field_name)? + let Some((column, _col_type)) = fast_field_reader.u64_lenient_for_type( + Some(&[ + ColumnType::U64, + ColumnType::I64, + ColumnType::F64, + ColumnType::DateTime, + ]), + &field_name, + )? else { return Ok(Box::new(EmptyScorer)); }; - #[allow(clippy::reversed_empty_ranges)] - let value_range = bound_to_value_range( - &lower_bound, - &upper_bound, - column.min_value(), - column.max_value(), + search_on_u64_ff( + column, + boost, + BoundsRange::new(bounds.lower_bound, bounds.upper_bound), ) - .unwrap_or(1..=0); // empty range - if value_range.is_empty() { - return Ok(Box::new(EmptyScorer)); - } - let docset = RangeDocSet::new(value_range, column); - Ok(Box::new(ConstScorer::new(docset, boost))) } } @@ -155,6 +220,189 @@ impl Weight for FastFieldRangeWeight { } } +/// On numerical fields the column type may not match the user provided one. +/// +/// Convert into fast field value space and search. 
+fn search_on_json_numerical_field( + reader: &SegmentReader, + field_name: &str, + typ: Type, + bounds: BoundsRange>>, + boost: Score, +) -> crate::Result> { + // Since we don't know which type was interpolated for the internal column we + // have to check for all numeric types (only one exists) + let allowed_column_types: Option<&[ColumnType]> = + Some(&[ColumnType::F64, ColumnType::I64, ColumnType::U64]); + let fast_field_reader = reader.fast_fields(); + let Some((column, col_type)) = + fast_field_reader.u64_lenient_for_type(allowed_column_types, field_name)? + else { + return Ok(Box::new(EmptyScorer)); + }; + let actual_colum_type: NumericalType = col_type.numerical_type().unwrap_or_else(|| { + panic!( + "internal error: couldn't cast to numerical_type: {:?}", + col_type + ) + }); + + let bounds = match typ.numerical_type().unwrap() { + NumericalType::I64 => { + let bounds = bounds.map_bound(|term| (term.as_i64().unwrap())); + match actual_colum_type { + NumericalType::I64 => bounds.map_bound(|&term| term.to_u64()), + NumericalType::U64 => { + bounds.transform_inner( + |&val| { + if val < 0 { + return TransformBound::NewBound(Bound::Unbounded); + } + TransformBound::Existing(val as u64) + }, + |&val| { + if val < 0 { + // no hits case + return TransformBound::NewBound(Bound::Excluded(0)); + } + TransformBound::Existing(val as u64) + }, + ) + } + NumericalType::F64 => bounds.map_bound(|&term| (term as f64).to_u64()), + } + } + NumericalType::U64 => { + let bounds = bounds.map_bound(|term| (term.as_u64().unwrap())); + match actual_colum_type { + NumericalType::U64 => bounds.map_bound(|&term| term.to_u64()), + NumericalType::I64 => { + bounds.transform_inner( + |&val| { + if val > i64::MAX as u64 { + // Actual no hits case + return TransformBound::NewBound(Bound::Excluded(i64::MAX as u64)); + } + TransformBound::Existing((val as i64).to_u64()) + }, + |&val| { + if val > i64::MAX as u64 { + return TransformBound::NewBound(Bound::Unbounded); + } + 
TransformBound::Existing((val as i64).to_u64()) + }, + ) + } + NumericalType::F64 => bounds.map_bound(|&term| (term as f64).to_u64()), + } + } + NumericalType::F64 => { + let bounds = bounds.map_bound(|term| (term.as_f64().unwrap())); + match actual_colum_type { + NumericalType::U64 => transform_from_f64_bounds::(&bounds), + NumericalType::I64 => transform_from_f64_bounds::(&bounds), + NumericalType::F64 => bounds.map_bound(|&term| term.to_u64()), + } + } + }; + search_on_u64_ff( + column, + boost, + BoundsRange::new(bounds.lower_bound, bounds.upper_bound), + ) +} + +trait IntType { + fn min() -> Self; + fn max() -> Self; + fn to_f64(self) -> f64; + fn from_f64(val: f64) -> Self; +} +impl IntType for i64 { + fn min() -> Self { + Self::MIN + } + fn max() -> Self { + Self::MAX + } + fn to_f64(self) -> f64 { + self as f64 + } + fn from_f64(val: f64) -> Self { + val as Self + } +} +impl IntType for u64 { + fn min() -> Self { + Self::MIN + } + fn max() -> Self { + Self::MAX + } + fn to_f64(self) -> f64 { + self as f64 + } + fn from_f64(val: f64) -> Self { + val as Self + } +} + +fn transform_from_f64_bounds( + bounds: &BoundsRange, +) -> BoundsRange { + bounds.transform_inner( + |&lower_bound| { + if lower_bound < T::min().to_f64() { + return TransformBound::NewBound(Bound::Unbounded); + } + if lower_bound > T::max().to_f64() { + // no hits case + return TransformBound::NewBound(Bound::Excluded(u64::MAX)); + } + + if lower_bound.fract() == 0.0 { + TransformBound::Existing(T::from_f64(lower_bound).to_u64()) + } else { + TransformBound::NewBound(Bound::Included(T::from_f64(lower_bound.trunc()).to_u64())) + } + }, + |&upper_bound| { + if upper_bound < T::min().to_f64() { + return TransformBound::NewBound(Bound::Unbounded); + } + if upper_bound > T::max().to_f64() { + // no hits case + return TransformBound::NewBound(Bound::Included(u64::MAX)); + } + if upper_bound.fract() == 0.0 { + TransformBound::Existing(T::from_f64(upper_bound).to_u64()) + } else { + 
TransformBound::NewBound(Bound::Included(T::from_f64(upper_bound.trunc()).to_u64())) + } + }, + ) +} + +fn search_on_u64_ff( + column: Column, + boost: Score, + bounds: BoundsRange, +) -> crate::Result> { + #[allow(clippy::reversed_empty_ranges)] + let value_range = bound_to_value_range( + &bounds.lower_bound, + &bounds.upper_bound, + column.min_value(), + column.max_value(), + ) + .unwrap_or(1..=0); // empty range + if value_range.is_empty() { + return Ok(Box::new(EmptyScorer)); + } + let docset = RangeDocSet::new(value_range, column); + Ok(Box::new(ConstScorer::new(docset, boost))) +} + /// Returns true if the type maps to a u64 fast field pub(crate) fn maps_to_u64_fastfield(typ: Type) -> bool { match typ { @@ -164,7 +412,7 @@ pub(crate) fn maps_to_u64_fastfield(typ: Type) -> bool { } } -fn bound_to_value_range_ip( +fn bound_range_inclusive_ip( lower_bound: &Bound, upper_bound: &Bound, min_value: Ipv6Addr, @@ -212,18 +460,24 @@ fn bound_to_value_range( pub mod tests { use std::ops::{Bound, RangeInclusive}; + use common::bounds::BoundsRange; + use common::DateTime; use proptest::prelude::*; use rand::rngs::StdRng; use rand::seq::SliceRandom; use rand::SeedableRng; + use time::format_description::well_known::Rfc3339; + use time::OffsetDateTime; use crate::collector::{Count, TopDocs}; + use crate::fastfield::FastValue; use crate::query::range_query::range_query_u64_fastfield::FastFieldRangeWeight; - use crate::query::{QueryParser, Weight}; + use crate::query::{QueryParser, RangeQuery, Weight}; use crate::schema::{ - NumericOptions, Schema, SchemaBuilder, FAST, INDEXED, STORED, STRING, TEXT, + DateOptions, Field, NumericOptions, Schema, SchemaBuilder, FAST, INDEXED, STORED, STRING, + TEXT, }; - use crate::{Index, IndexWriter, Term, TERMINATED}; + use crate::{Index, IndexWriter, TantivyDocument, Term, TERMINATED}; #[test] fn test_text_field_ff_range_query() -> crate::Result<()> { @@ -284,6 +538,496 @@ pub mod tests { Ok(()) } + #[test] + fn test_date_range_query() { + 
let mut schema_builder = Schema::builder(); + let options = DateOptions::default() + .set_precision(common::DateTimePrecision::Microseconds) + .set_fast(); + let date_field = schema_builder.add_date_field("date", options); + let schema = schema_builder.build(); + + let index = Index::create_in_ram(schema.clone()); + { + let mut index_writer = index.writer_with_num_threads(1, 50_000_000).unwrap(); + // This is added a string and creates a string column! + index_writer + .add_document(doc!(date_field => DateTime::from_utc( + OffsetDateTime::parse("2022-12-01T00:00:01Z", &Rfc3339).unwrap(), + ))) + .unwrap(); + index_writer + .add_document(doc!(date_field => DateTime::from_utc( + OffsetDateTime::parse("2023-12-01T00:00:01Z", &Rfc3339).unwrap(), + ))) + .unwrap(); + index_writer + .add_document(doc!(date_field => DateTime::from_utc( + OffsetDateTime::parse("2015-02-01T00:00:00.001Z", &Rfc3339).unwrap(), + ))) + .unwrap(); + index_writer.commit().unwrap(); + } + + // Date field + let dt1 = + DateTime::from_utc(OffsetDateTime::parse("2022-12-01T00:00:01Z", &Rfc3339).unwrap()); + let dt2 = + DateTime::from_utc(OffsetDateTime::parse("2023-12-01T00:00:01Z", &Rfc3339).unwrap()); + let dt3 = DateTime::from_utc( + OffsetDateTime::parse("2015-02-01T00:00:00.001Z", &Rfc3339).unwrap(), + ); + let dt4 = DateTime::from_utc( + OffsetDateTime::parse("2015-02-01T00:00:00.002Z", &Rfc3339).unwrap(), + ); + + let reader = index.reader().unwrap(); + let searcher = reader.searcher(); + let query_parser = QueryParser::for_index(&index, vec![date_field]); + let test_query = |query, num_hits| { + let query = query_parser.parse_query(query).unwrap(); + let top_docs = searcher.search(&query, &TopDocs::with_limit(10)).unwrap(); + assert_eq!(top_docs.len(), num_hits); + }; + + test_query( + "date:[2015-02-01T00:00:00.001Z TO 2015-02-01T00:00:00.001Z]", + 1, + ); + test_query( + "date:[2015-02-01T00:00:00.001Z TO 2015-02-01T00:00:00.002Z}", + 1, + ); + test_query( + "date:[2015-02-01T00:00:00.001Z 
TO 2015-02-01T00:00:00.002Z]", + 1, + ); + test_query( + "date:{2015-02-01T00:00:00.001Z TO 2015-02-01T00:00:00.002Z]", + 0, + ); + + let count = |range_query: RangeQuery| searcher.search(&range_query, &Count).unwrap(); + assert_eq!( + count(RangeQuery::new( + Bound::Included(Term::from_field_date(date_field, dt3)), + Bound::Excluded(Term::from_field_date(date_field, dt4)), + )), + 1 + ); + assert_eq!( + count(RangeQuery::new( + Bound::Included(Term::from_field_date(date_field, dt3)), + Bound::Included(Term::from_field_date(date_field, dt4)), + )), + 1 + ); + assert_eq!( + count(RangeQuery::new( + Bound::Included(Term::from_field_date(date_field, dt1)), + Bound::Included(Term::from_field_date(date_field, dt2)), + )), + 2 + ); + assert_eq!( + count(RangeQuery::new( + Bound::Included(Term::from_field_date(date_field, dt1)), + Bound::Excluded(Term::from_field_date(date_field, dt2)), + )), + 1 + ); + assert_eq!( + count(RangeQuery::new( + Bound::Excluded(Term::from_field_date(date_field, dt1)), + Bound::Excluded(Term::from_field_date(date_field, dt2)), + )), + 0 + ); + } + + fn get_json_term(field: Field, path: &str, value: T) -> Term { + let mut term = Term::from_field_json_path(field, path, true); + term.append_type_and_fast_value(value); + term + } + + #[test] + fn mixed_numerical_test() { + let mut schema_builder = Schema::builder(); + schema_builder.add_i64_field("id_i64", STORED | FAST); + schema_builder.add_u64_field("id_u64", STORED | FAST); + schema_builder.add_f64_field("id_f64", STORED | FAST); + let schema = schema_builder.build(); + + fn get_json_term(schema: &Schema, path: &str, value: T) -> Term { + let field = schema.get_field(path).unwrap(); + Term::from_fast_value(field, &value) + // term.append_type_and_fast_value(value); + // term + } + let index = Index::create_in_ram(schema.clone()); + { + let mut index_writer = index.writer_with_num_threads(1, 50_000_000).unwrap(); + + let doc = json!({ + "id_u64": 0, + "id_i64": 50, + }); + let doc = 
TantivyDocument::parse_json(&schema, &serde_json::to_string(&doc).unwrap()) + .unwrap(); + index_writer.add_document(doc).unwrap(); + let doc = json!({ + "id_u64": 10, + "id_i64": 1000, + }); + let doc = TantivyDocument::parse_json(&schema, &serde_json::to_string(&doc).unwrap()) + .unwrap(); + index_writer.add_document(doc).unwrap(); + + index_writer.commit().unwrap(); + } + + let reader = index.reader().unwrap(); + let searcher = reader.searcher(); + let count = |range_query: RangeQuery| searcher.search(&range_query, &Count).unwrap(); + + // u64 on u64 + assert_eq!( + count(RangeQuery::new( + Bound::Included(get_json_term(&schema, "id_u64", 10u64)), + Bound::Included(get_json_term(&schema, "id_u64", 10u64)), + )), + 1 + ); + assert_eq!( + count(RangeQuery::new( + Bound::Included(get_json_term(&schema, "id_u64", 9u64)), + Bound::Excluded(get_json_term(&schema, "id_u64", 10u64)), + )), + 0 + ); + + // i64 on i64 + assert_eq!( + count(RangeQuery::new( + Bound::Included(get_json_term(&schema, "id_i64", 50i64)), + Bound::Included(get_json_term(&schema, "id_i64", 1000i64)), + )), + 2 + ); + assert_eq!( + count(RangeQuery::new( + Bound::Included(get_json_term(&schema, "id_i64", 50i64)), + Bound::Excluded(get_json_term(&schema, "id_i64", 1000i64)), + )), + 1 + ); + } + + #[test] + fn json_range_mixed_val() { + let mut schema_builder = Schema::builder(); + let json_field = schema_builder.add_json_field("json", TEXT | STORED | FAST); + let schema = schema_builder.build(); + + let index = Index::create_in_ram(schema); + { + let mut index_writer = index.writer_with_num_threads(1, 50_000_000).unwrap(); + let doc = json!({ + "mixed_val": 10000, + }); + index_writer.add_document(doc!(json_field => doc)).unwrap(); + let doc = json!({ + "mixed_val": 20000, + }); + index_writer.add_document(doc!(json_field => doc)).unwrap(); + let doc = json!({ + "mixed_val": "1000a", + }); + index_writer.add_document(doc!(json_field => doc)).unwrap(); + let doc = json!({ + "mixed_val": "2000a", + 
}); + index_writer.add_document(doc!(json_field => doc)).unwrap(); + index_writer.commit().unwrap(); + } + let reader = index.reader().unwrap(); + let searcher = reader.searcher(); + let count = |range_query: RangeQuery| searcher.search(&range_query, &Count).unwrap(); + + assert_eq!( + count(RangeQuery::new( + Bound::Included(get_json_term(json_field, "mixed_val", 10000u64)), + Bound::Included(get_json_term(json_field, "mixed_val", 20000u64)), + )), + 2 + ); + fn get_json_term_str(field: Field, path: &str, value: &str) -> Term { + let mut term = Term::from_field_json_path(field, path, true); + term.append_type_and_str(value); + term + } + assert_eq!( + count(RangeQuery::new( + Bound::Included(get_json_term_str(json_field, "mixed_val", "1000a")), + Bound::Included(get_json_term_str(json_field, "mixed_val", "2000b")), + )), + 2 + ); + assert_eq!( + count(RangeQuery::new( + Bound::Included(get_json_term_str(json_field, "mixed_val", "1000")), + Bound::Included(get_json_term_str(json_field, "mixed_val", "2000a")), + )), + 2 + ); + } + + #[test] + fn json_range_test() { + let mut schema_builder = Schema::builder(); + let json_field = schema_builder.add_json_field("json", TEXT | STORED | FAST); + let schema = schema_builder.build(); + + let index = Index::create_in_ram(schema); + let u64_val = u64::MAX - 1; + { + let mut index_writer = index.writer_with_num_threads(1, 50_000_000).unwrap(); + let doc = json!({ + "id_u64": 0, + "id_f64": 10.5, + "id_i64": -100, + "date": "2022-12-01T00:00:01Z" + }); + index_writer.add_document(doc!(json_field => doc)).unwrap(); + let doc = json!({ + "id_u64": u64_val, + "id_f64": 1000.5, + "id_i64": 1000, + "date": "2023-12-01T00:00:01Z" + }); + index_writer.add_document(doc!(json_field => doc)).unwrap(); + let doc = json!({ + "date": "2015-02-01T00:00:00.001Z" + }); + index_writer.add_document(doc!(json_field => doc)).unwrap(); + + index_writer.commit().unwrap(); + } + + let reader = index.reader().unwrap(); + let searcher = 
reader.searcher(); + let count = |range_query: RangeQuery| searcher.search(&range_query, &Count).unwrap(); + + // u64 on u64 + assert_eq!( + count(RangeQuery::new( + Bound::Included(get_json_term(json_field, "id_u64", u64_val)), + Bound::Included(get_json_term(json_field, "id_u64", u64_val)), + )), + 1 + ); + assert_eq!( + count(RangeQuery::new( + Bound::Included(get_json_term(json_field, "id_u64", u64_val)), + Bound::Excluded(get_json_term(json_field, "id_u64", u64_val)), + )), + 0 + ); + // f64 on u64 field + assert_eq!( + count(RangeQuery::new( + // We need to subtract since there is some inaccuracy + Bound::Included(get_json_term( + json_field, + "id_u64", + (u64_val - 10000) as f64 + )), + Bound::Included(get_json_term(json_field, "id_u64", (u64_val) as f64)), + )), + 1 + ); + // i64 on u64 + assert_eq!( + count(RangeQuery::new( + Bound::Included(get_json_term(json_field, "id_u64", 0_i64)), + Bound::Included(get_json_term(json_field, "id_u64", 0_i64)), + )), + 1 + ); + assert_eq!( + count(RangeQuery::new( + Bound::Included(get_json_term(json_field, "id_u64", 1_i64)), + Bound::Included(get_json_term(json_field, "id_u64", 1_i64)), + )), + 0 + ); + // u64 on f64 + assert_eq!( + count(RangeQuery::new( + Bound::Included(get_json_term(json_field, "id_f64", 10_u64)), + Bound::Included(get_json_term(json_field, "id_f64", 11_u64)), + )), + 1 + ); + assert_eq!( + count(RangeQuery::new( + Bound::Included(get_json_term(json_field, "id_f64", 10_u64)), + Bound::Included(get_json_term(json_field, "id_f64", 2000_u64)), + )), + 2 + ); + // i64 on f64 + assert_eq!( + count(RangeQuery::new( + Bound::Included(get_json_term(json_field, "id_f64", 10_i64)), + Bound::Included(get_json_term(json_field, "id_f64", 2000_i64)), + )), + 2 + ); + + // i64 on i64 + assert_eq!( + count(RangeQuery::new( + Bound::Included(get_json_term(json_field, "id_i64", -1000i64)), + Bound::Included(get_json_term(json_field, "id_i64", 1000i64)), + )), + 2 + ); + assert_eq!( + count(RangeQuery::new( + 
Bound::Included(get_json_term(json_field, "id_i64", 1000i64)), + Bound::Excluded(get_json_term(json_field, "id_i64", 1001i64)), + )), + 1 + ); + + // u64 on i64 + assert_eq!( + count(RangeQuery::new( + Bound::Included(get_json_term(json_field, "id_i64", 0_u64)), + Bound::Included(get_json_term(json_field, "id_i64", 1000u64)), + )), + 1 + ); + assert_eq!( + count(RangeQuery::new( + Bound::Included(get_json_term(json_field, "id_i64", 0_u64)), + Bound::Included(get_json_term(json_field, "id_i64", 999u64)), + )), + 0 + ); + // f64 on i64 field + assert_eq!( + count(RangeQuery::new( + Bound::Included(get_json_term(json_field, "id_i64", -1000.0)), + Bound::Included(get_json_term(json_field, "id_i64", 1000.0)), + )), + 2 + ); + assert_eq!( + count(RangeQuery::new( + Bound::Included(get_json_term(json_field, "id_i64", -1000.0f64)), + Bound::Excluded(get_json_term(json_field, "id_i64", 1000.0f64)), + )), + 1 + ); + assert_eq!( + count(RangeQuery::new( + Bound::Included(get_json_term(json_field, "id_i64", -1000.0f64)), + Bound::Included(get_json_term(json_field, "id_i64", 1000.0f64)), + )), + 2 + ); + assert_eq!( + count(RangeQuery::new( + Bound::Included(get_json_term(json_field, "id_i64", -1000.0f64)), + Bound::Excluded(get_json_term(json_field, "id_i64", 1000.01f64)), + )), + 2 + ); + assert_eq!( + count(RangeQuery::new( + Bound::Included(get_json_term(json_field, "id_i64", -1000.0f64)), + Bound::Included(get_json_term(json_field, "id_i64", 999.99f64)), + )), + 1 + ); + assert_eq!( + count(RangeQuery::new( + Bound::Excluded(get_json_term(json_field, "id_i64", 999.9)), + Bound::Excluded(get_json_term(json_field, "id_i64", 1000.1)), + )), + 1 + ); + + let reader = index.reader().unwrap(); + let searcher = reader.searcher(); + let query_parser = QueryParser::for_index(&index, vec![json_field]); + let test_query = |query, num_hits| { + let query = query_parser.parse_query(query).unwrap(); + let top_docs = searcher.search(&query, &TopDocs::with_limit(10)).unwrap(); + 
assert_eq!(top_docs.len(), num_hits); + }; + + test_query( + "json.date:[2015-02-01T00:00:00.001Z TO 2015-02-01T00:00:00.001Z]", + 1, + ); + test_query( + "json.date:[2015-02-01T00:00:00.001Z TO 2015-02-01T00:00:00.002Z}", + 1, + ); + test_query( + "json.date:[2015-02-01T00:00:00.001Z TO 2015-02-01T00:00:00.002Z]", + 1, + ); + test_query( + "json.date:{2015-02-01T00:00:00.001Z TO 2015-02-01T00:00:00.002Z]", + 0, + ); + + // Date field + let dt1 = + DateTime::from_utc(OffsetDateTime::parse("2022-12-01T00:00:01Z", &Rfc3339).unwrap()); + let dt2 = + DateTime::from_utc(OffsetDateTime::parse("2023-12-01T00:00:01Z", &Rfc3339).unwrap()); + + assert_eq!( + count(RangeQuery::new( + Bound::Included(get_json_term(json_field, "date", dt1)), + Bound::Included(get_json_term(json_field, "date", dt2)), + )), + 2 + ); + assert_eq!( + count(RangeQuery::new( + Bound::Included(get_json_term(json_field, "date", dt1)), + Bound::Excluded(get_json_term(json_field, "date", dt2)), + )), + 1 + ); + assert_eq!( + count(RangeQuery::new( + Bound::Excluded(get_json_term(json_field, "date", dt1)), + Bound::Excluded(get_json_term(json_field, "date", dt2)), + )), + 0 + ); + // Date precision test. 
We don't want to truncate the precision + let dt3 = DateTime::from_utc( + OffsetDateTime::parse("2015-02-01T00:00:00.001Z", &Rfc3339).unwrap(), + ); + let dt4 = DateTime::from_utc( + OffsetDateTime::parse("2015-02-01T00:00:00.002Z", &Rfc3339).unwrap(), + ); + let query = RangeQuery::new( + Bound::Included(get_json_term(json_field, "date", dt3)), + Bound::Excluded(get_json_term(json_field, "date", dt4)), + ); + assert_eq!(count(query), 1); + } + #[derive(Clone, Debug)] pub struct Doc { pub id_name: String, @@ -326,6 +1070,12 @@ pub mod tests { assert!(test_id_range_for_docs(ops).is_ok()); } + #[test] + fn range_regression1_test_json() { + let ops = vec![doc_from_id_1(0)]; + assert!(test_id_range_for_docs_json(ops).is_ok()); + } + #[test] fn test_range_regression2() { let ops = vec![ @@ -354,11 +1104,10 @@ pub mod tests { writer.add_document(doc!(field=>52_000u64)).unwrap(); writer.commit().unwrap(); let searcher = index.reader().unwrap().searcher(); - let range_query = FastFieldRangeWeight::new( - field, + let range_query = FastFieldRangeWeight::new(BoundsRange::new( Bound::Included(Term::from_field_u64(field, 50_000)), Bound::Included(Term::from_field_u64(field, 50_002)), - ); + )); let scorer = range_query .scorer(searcher.segment_reader(0), 1.0f32) .unwrap(); @@ -377,56 +1126,94 @@ pub mod tests { assert!(test_id_range_for_docs(ops).is_ok()); } - pub fn create_index_from_docs(docs: &[Doc]) -> Index { + pub fn create_index_from_docs(docs: &[Doc], json_field: bool) -> Index { let mut schema_builder = Schema::builder(); - let id_u64_field = schema_builder.add_u64_field("id", INDEXED | STORED | FAST); - let ids_u64_field = - schema_builder.add_u64_field("ids", NumericOptions::default().set_fast().set_indexed()); - - let id_f64_field = schema_builder.add_f64_field("id_f64", INDEXED | STORED | FAST); - let ids_f64_field = schema_builder.add_f64_field( - "ids_f64", - NumericOptions::default().set_fast().set_indexed(), - ); + if json_field { + let json_field = 
schema_builder.add_json_field("json", TEXT | STORED | FAST); + let schema = schema_builder.build(); - let id_i64_field = schema_builder.add_i64_field("id_i64", INDEXED | STORED | FAST); - let ids_i64_field = schema_builder.add_i64_field( - "ids_i64", - NumericOptions::default().set_fast().set_indexed(), - ); + let index = Index::create_in_ram(schema); - let text_field = schema_builder.add_text_field("id_name", STRING | STORED | FAST); - let text_field2 = schema_builder.add_text_field("id_name_fast", STRING | STORED | FAST); - let schema = schema_builder.build(); - let index = Index::create_in_ram(schema); + { + let mut index_writer = index.writer_with_num_threads(1, 50_000_000).unwrap(); + for doc in docs.iter() { + let doc = json!({ + "ids_i64": doc.id as i64, + "ids_i64": doc.id as i64, + "ids_f64": doc.id as f64, + "ids_f64": doc.id as f64, + "ids": doc.id, + "ids": doc.id, + "id": doc.id, + "id_f64": doc.id as f64, + "id_i64": doc.id as i64, + "id_name": doc.id_name.to_string(), + "id_name_fast": doc.id_name.to_string(), + }); + index_writer.add_document(doc!(json_field => doc)).unwrap(); + } - { - let mut index_writer = index.writer_with_num_threads(1, 50_000_000).unwrap(); - for doc in docs.iter() { - index_writer - .add_document(doc!( - ids_i64_field => doc.id as i64, - ids_i64_field => doc.id as i64, - ids_f64_field => doc.id as f64, - ids_f64_field => doc.id as f64, - ids_u64_field => doc.id, - ids_u64_field => doc.id, - id_u64_field => doc.id, - id_f64_field => doc.id as f64, - id_i64_field => doc.id as i64, - text_field => doc.id_name.to_string(), - text_field2 => doc.id_name.to_string(), - )) - .unwrap(); + index_writer.commit().unwrap(); } + index + } else { + let id_u64_field = schema_builder.add_u64_field("id", INDEXED | STORED | FAST); + let ids_u64_field = schema_builder + .add_u64_field("ids", NumericOptions::default().set_fast().set_indexed()); + + let id_f64_field = schema_builder.add_f64_field("id_f64", INDEXED | STORED | FAST); + let 
ids_f64_field = schema_builder.add_f64_field( + "ids_f64", + NumericOptions::default().set_fast().set_indexed(), + ); - index_writer.commit().unwrap(); + let id_i64_field = schema_builder.add_i64_field("id_i64", INDEXED | STORED | FAST); + let ids_i64_field = schema_builder.add_i64_field( + "ids_i64", + NumericOptions::default().set_fast().set_indexed(), + ); + + let text_field = schema_builder.add_text_field("id_name", STRING | STORED); + let text_field2 = schema_builder.add_text_field("id_name_fast", STRING | STORED | FAST); + let schema = schema_builder.build(); + + let index = Index::create_in_ram(schema); + + { + let mut index_writer = index.writer_with_num_threads(1, 50_000_000).unwrap(); + for doc in docs.iter() { + index_writer + .add_document(doc!( + ids_i64_field => doc.id as i64, + ids_i64_field => doc.id as i64, + ids_f64_field => doc.id as f64, + ids_f64_field => doc.id as f64, + ids_u64_field => doc.id, + ids_u64_field => doc.id, + id_u64_field => doc.id, + id_f64_field => doc.id as f64, + id_i64_field => doc.id as i64, + text_field => doc.id_name.to_string(), + text_field2 => doc.id_name.to_string(), + )) + .unwrap(); + } + + index_writer.commit().unwrap(); + } + index } - index } fn test_id_range_for_docs(docs: Vec) -> crate::Result<()> { - let index = create_index_from_docs(&docs); + test_id_range_for_docs_with_opt(docs, false) + } + fn test_id_range_for_docs_json(docs: Vec) -> crate::Result<()> { + test_id_range_for_docs_with_opt(docs, true) + } + + fn test_id_range_for_docs_with_opt(docs: Vec, json: bool) -> crate::Result<()> { + let index = create_index_from_docs(&docs, json); let reader = index.reader().unwrap(); let searcher = reader.searcher(); @@ -439,11 +1226,29 @@ pub mod tests { .unwrap() }; + let field_path = |field: &str| { + if json { + format!("json.{}", field) + } else { + field.to_string() + } + }; + let gen_query_inclusive = |field: &str, range: RangeInclusive| { - format!("{}:[{} TO {}]", field, range.start(), range.end()) + 
format!( + "{}:[{} TO {}]", + field_path(field), + range.start(), + range.end() + ) }; let gen_query_exclusive = |field: &str, range: RangeInclusive| { - format!("{}:{{{} TO {}}}", field, range.start(), range.end()) + format!( + "{}:{{{} TO {}}}", + field_path(field), + range.start(), + range.end() + ) }; let test_sample = |sample_docs: Vec| { @@ -470,7 +1275,12 @@ pub mod tests { .iter() .filter(|doc| (id_names[0]..=id_names[1]).contains(&doc.id_name.as_str())) .count(); - let query = format!("{}:[{} TO {}]", field_name, id_names[0], id_names[1]); + let query = format!( + "{}:[{} TO {}]", + field_path(field_name), + id_names[0], + id_names[1] + ); assert_eq!(get_num_hits(query_from_text(&query)), expected_num_hits); }; @@ -499,20 +1309,23 @@ pub mod tests { .filter(|doc| (ids[0]..=ids[1]).contains(&doc.id) && doc.id_name == id_filter) .count(); let query = format!( - "{} AND id_name:{}", + "{} AND {}:{}", gen_query_inclusive("id", ids[0]..=ids[1]), + field_path("id_name"), &id_filter ); assert_eq!(get_num_hits(query_from_text(&query)), expected_num_hits); let query = format!( - "{} AND id_name:{}", + "{} AND {}:{}", gen_query_inclusive("id_f64", ids[0]..=ids[1]), + field_path("id_name"), &id_filter ); assert_eq!(get_num_hits(query_from_text(&query)), expected_num_hits); let query = format!( - "{} AND id_name:{}", + "{} AND {}:{}", gen_query_inclusive("id_i64", ids[0]..=ids[1]), + field_path("id_name"), &id_filter ); assert_eq!(get_num_hits(query_from_text(&query)), expected_num_hits); @@ -520,20 +1333,23 @@ pub mod tests { // Intersection search on multivalue id field let id_filter = sample_docs[0].id_name.to_string(); let query = format!( - "{} AND id_name:{}", + "{} AND {}:{}", gen_query_inclusive("ids", ids[0]..=ids[1]), + field_path("id_name"), &id_filter ); assert_eq!(get_num_hits(query_from_text(&query)), expected_num_hits); let query = format!( - "{} AND id_name:{}", + "{} AND {}:{}", gen_query_inclusive("ids_f64", ids[0]..=ids[1]), + field_path("id_name"), 
&id_filter ); assert_eq!(get_num_hits(query_from_text(&query)), expected_num_hits); let query = format!( - "{} AND id_name:{}", + "{} AND {}:{}", gen_query_inclusive("ids_i64", ids[0]..=ids[1]), + field_path("id_name"), &id_filter ); assert_eq!(get_num_hits(query_from_text(&query)), expected_num_hits); @@ -647,11 +1463,10 @@ pub mod ip_range_tests { } writer.commit().unwrap(); let searcher = index.reader().unwrap().searcher(); - let range_weight = FastFieldRangeWeight::new( - ips_field, + let range_weight = FastFieldRangeWeight::new(BoundsRange::new( Bound::Included(Term::from_field_ip_addr(ips_field, ip_addrs[1])), Bound::Included(Term::from_field_ip_addr(ips_field, ip_addrs[2])), - ); + )); let count = crate::query::weight::Weight::count(&range_weight, searcher.segment_reader(0)).unwrap(); @@ -783,7 +1598,7 @@ mod bench { }) .collect(); - create_index_from_docs(&docs) + create_index_from_docs(&docs, false) } fn get_90_percent() -> RangeInclusive { @@ -1158,7 +1973,6 @@ mod bench_ip { #[bench] fn bench_ip_range_hit_1_percent_intersect_with_90_percent_multi(bench: &mut Bencher) { let index = get_index_0_to_100(); - bench.iter(|| excute_query("ips", get_1_percent(), "AND id:many", &index)); } diff --git a/src/schema/field_type.rs b/src/schema/field_type.rs index cc7daf8b0e..8b203f5b37 100644 --- a/src/schema/field_type.rs +++ b/src/schema/field_type.rs @@ -3,7 +3,7 @@ use std::str::FromStr; use base64::engine::general_purpose::STANDARD as BASE64; use base64::Engine; -use columnar::ColumnType; +use columnar::{ColumnType, NumericalType}; use serde::{Deserialize, Serialize}; use serde_json::Value as JsonValue; use thiserror::Error; @@ -102,6 +102,18 @@ const ALL_TYPES: [Type; 10] = [ ]; impl Type { + /// Returns the numerical type if applicable + /// It does not do any mapping, e.g. 
Date is None although it's also stored as I64 in the + /// column store + pub fn numerical_type(&self) -> Option { + match self { + Type::I64 => Some(NumericalType::I64), + Type::U64 => Some(NumericalType::U64), + Type::F64 => Some(NumericalType::F64), + _ => None, + } + } + /// Returns an iterator over the different values /// the Type enum can tape. pub fn iter_values() -> impl Iterator { @@ -196,6 +208,11 @@ impl FieldType { } } + /// returns true if this is an json field + pub fn is_json(&self) -> bool { + matches!(self, FieldType::JsonObject(_)) + } + /// returns true if this is an ip address field pub fn is_ip_addr(&self) -> bool { matches!(self, FieldType::IpAddr(_)) diff --git a/src/schema/term.rs b/src/schema/term.rs index a0d1a8f677..0443482634 100644 --- a/src/schema/term.rs +++ b/src/schema/term.rs @@ -2,12 +2,12 @@ use std::hash::{Hash, Hasher}; use std::net::Ipv6Addr; use std::{fmt, str}; -use columnar::{MonotonicallyMappableToU128, MonotonicallyMappableToU64}; +use columnar::MonotonicallyMappableToU128; use common::json_path_writer::{JSON_END_OF_PATH, JSON_PATH_SEGMENT_SEP_STR}; use common::JsonPathWriter; use super::date_time_options::DATE_TIME_PRECISION_INDEXED; -use super::Field; +use super::{Field, Schema}; use crate::fastfield::FastValue; use crate::json_utils::split_json_path; use crate::schema::{Facet, Type}; @@ -57,6 +57,29 @@ impl Term { term } + /// Gets the full path of the field name + optional json path. 
+ pub fn get_full_path(&self, schema: &Schema) -> String { + let field = self.field(); + let mut field = schema.get_field_name(field).to_string(); + if let Some(json_path) = self.get_json_path() { + field.push('.'); + field.push_str(&json_path); + }; + field + } + + /// Gets the json path if the type is JSON + pub fn get_json_path(&self) -> Option { + let value = self.value(); + if let Some((json_path, _)) = value.as_json() { + Some(unsafe { + std::str::from_utf8_unchecked(&json_path[..json_path.len() - 1]).to_string() + }) + } else { + None + } + } + pub(crate) fn with_type_and_field(typ: Type, field: Field) -> Term { let mut term = Self::with_capacity(8); term.set_field_and_type(field, typ); @@ -70,7 +93,7 @@ impl Term { term } - fn from_fast_value(field: Field, val: &T) -> Term { + pub(crate) fn from_fast_value(field: Field, val: &T) -> Term { let mut term = Self::with_type_and_field(T::to_type(), field); term.set_u64(val.to_u64()); term @@ -118,8 +141,20 @@ impl Term { Term::from_fast_value(field, &val) } - /// Builds a term given a field, and a `DateTime` value + /// Builds a term given a field, and a `DateTime` value. + /// + /// The contained value may not match the value, due do the truncation used + /// for indexed data [super::DATE_TIME_PRECISION_INDEXED]. + /// To create a term used for search use `from_field_date_for_search`. pub fn from_field_date(field: Field, val: DateTime) -> Term { + Term::from_fast_value(field, &val) + } + + /// Builds a term given a field, and a `DateTime` value to be used in searching the inverted + /// index. + /// It truncates the `DateTime` to the precision used in the index + /// ([super::DATE_TIME_PRECISION_INDEXED]). + pub fn from_field_date_for_search(field: Field, val: DateTime) -> Term { Term::from_fast_value(field, &val.truncate(DATE_TIME_PRECISION_INDEXED)) } @@ -191,13 +226,7 @@ impl Term { /// It will not clear existing bytes. 
pub fn append_type_and_fast_value(&mut self, val: T) { self.0.push(T::to_type().to_code()); - let value = if T::to_type() == Type::Date { - DateTime::from_u64(val.to_u64()) - .truncate(DATE_TIME_PRECISION_INDEXED) - .to_u64() - } else { - val.to_u64() - }; + let value = val.to_u64(); self.0.extend(value.to_be_bytes().as_ref()); } @@ -324,6 +353,11 @@ where B: AsRef<[u8]> ValueBytes(data) } + /// Wraps a object holding Vec + pub fn to_owned(&self) -> ValueBytes> { + ValueBytes(self.0.as_ref().to_vec()) + } + fn typ_code(&self) -> u8 { self.0.as_ref()[0] } @@ -345,7 +379,7 @@ where B: AsRef<[u8]> if self.typ() != T::to_type() { return None; } - let value_bytes = self.value_bytes(); + let value_bytes = self.raw_value_bytes_payload(); let value_u64 = u64::from_be_bytes(value_bytes.try_into().ok()?); Some(T::from_u64(value_u64)) } @@ -390,7 +424,7 @@ where B: AsRef<[u8]> if self.typ() != Type::Str { return None; } - str::from_utf8(self.value_bytes()).ok() + str::from_utf8(self.raw_value_bytes_payload()).ok() } /// Returns the facet associated with the term. @@ -401,7 +435,7 @@ where B: AsRef<[u8]> if self.typ() != Type::Facet { return None; } - let facet_encode_str = str::from_utf8(self.value_bytes()).ok()?; + let facet_encode_str = str::from_utf8(self.raw_value_bytes_payload()).ok()?; Some(Facet::from_encoded_string(facet_encode_str.to_string())) } @@ -412,7 +446,7 @@ where B: AsRef<[u8]> if self.typ() != Type::Bytes { return None; } - Some(self.value_bytes()) + Some(self.raw_value_bytes_payload()) } /// Returns a `Ipv6Addr` value from the term. 
@@ -420,7 +454,7 @@ where B: AsRef<[u8]> if self.typ() != Type::IpAddr { return None; } - let ip_u128 = u128::from_be_bytes(self.value_bytes().try_into().ok()?); + let ip_u128 = u128::from_be_bytes(self.raw_value_bytes_payload().try_into().ok()?); Some(Ipv6Addr::from_u128(ip_u128)) } @@ -441,7 +475,7 @@ where B: AsRef<[u8]> if self.typ() != Type::Json { return None; } - let bytes = self.value_bytes(); + let bytes = self.raw_value_bytes_payload(); let pos = bytes.iter().cloned().position(|b| b == JSON_END_OF_PATH)?; // split at pos + 1, so that json_path_bytes includes the JSON_END_OF_PATH byte. @@ -456,16 +490,25 @@ where B: AsRef<[u8]> if self.typ() != Type::Json { return None; } - let bytes = self.value_bytes(); + let bytes = self.raw_value_bytes_payload(); let pos = bytes.iter().cloned().position(|b| b == JSON_END_OF_PATH)?; Some(ValueBytes::wrap(&bytes[pos + 1..])) } - /// Returns the serialized value of ValueBytes without the type. - fn value_bytes(&self) -> &[u8] { + /// Returns the raw payload of this ValueBytes: every byte after the leading type tag. + pub(crate) fn raw_value_bytes_payload(&self) -> &[u8] { &self.0.as_ref()[1..] } + /// Returns an owned copy of the payload without the type tag; for JSON values, that of `as_json_value_bytes()`. + pub(crate) fn value_bytes_payload(&self) -> Vec { + if let Some(value_bytes) = self.as_json_value_bytes() { + value_bytes.raw_value_bytes_payload().to_vec() + } else { + self.raw_value_bytes_payload().to_vec() + } + } + /// Returns the serialized representation of Term. /// /// Do NOT rely on this byte representation in the index.
diff --git a/sstable/src/dictionary.rs b/sstable/src/dictionary.rs index 298a70e838..4cc438713b 100644 --- a/sstable/src/dictionary.rs +++ b/sstable/src/dictionary.rs @@ -4,6 +4,7 @@ use std::marker::PhantomData; use std::ops::{Bound, RangeBounds}; use std::sync::Arc; +use common::bounds::{transform_bound_inner_res, TransformBound}; use common::file_slice::FileSlice; use common::{BinarySerializable, OwnedBytes}; use tantivy_fst::automaton::AlwaysMatch; @@ -56,29 +57,6 @@ impl Dictionary { } } -fn map_bound(bound: &Bound, transform: impl Fn(&TFrom) -> TTo) -> Bound { - use self::Bound::*; - match bound { - Excluded(ref from_val) => Bound::Excluded(transform(from_val)), - Included(ref from_val) => Bound::Included(transform(from_val)), - Unbounded => Unbounded, - } -} - -/// Takes a bound and transforms the inner value into a new bound via a closure. -/// The bound variant may change by the value returned value from the closure. -fn transform_bound_inner( - bound: &Bound, - transform: impl Fn(&TFrom) -> io::Result>, -) -> io::Result> { - use self::Bound::*; - Ok(match bound { - Excluded(ref from_val) => transform(from_val)?, - Included(ref from_val) => transform(from_val)?, - Unbounded => Unbounded, - }) -} - #[derive(Debug, Clone, PartialEq, Eq)] pub enum TermOrdHit { /// Exact term ord hit @@ -409,18 +387,18 @@ impl Dictionary { lower_bound: Bound, upper_bound: Bound, ) -> io::Result<(Bound, Bound)> { - let lower_bound = transform_bound_inner(&lower_bound, |start_bound_bytes| { + let lower_bound = transform_bound_inner_res(&lower_bound, |start_bound_bytes| { let ord = self.term_ord_or_next(start_bound_bytes)?; match ord { - TermOrdHit::Exact(ord) => Ok(map_bound(&lower_bound, |_| ord)), - TermOrdHit::Next(ord) => Ok(Bound::Included(ord)), // Change bounds to included + TermOrdHit::Exact(ord) => Ok(TransformBound::Existing(ord)), + TermOrdHit::Next(ord) => Ok(TransformBound::NewBound(Bound::Included(ord))), /* Change bounds to included */ } })?; - let upper_bound = 
transform_bound_inner(&upper_bound, |end_bound_bytes| { + let upper_bound = transform_bound_inner_res(&upper_bound, |end_bound_bytes| { let ord = self.term_ord_or_next(end_bound_bytes)?; match ord { - TermOrdHit::Exact(ord) => Ok(map_bound(&upper_bound, |_| ord)), - TermOrdHit::Next(ord) => Ok(Bound::Excluded(ord)), // Change bounds to excluded + TermOrdHit::Exact(ord) => Ok(TransformBound::Existing(ord)), + TermOrdHit::Next(ord) => Ok(TransformBound::NewBound(Bound::Excluded(ord))), /* Change bounds to excluded */ } })?; Ok((lower_bound, upper_bound))