support ff range queries on json fields (quickwit-oss#2456)

* support ff range queries on json fields * fix term date truncation * use inverted index range query for phrase prefix queries * rename to InvertedIndexRangeQuery * fix column filter, add mixed column test
paradedb · Aug 31, 2024 · a3243d8 · a3243d8
1 parent be2f941
commit a3243d8
Show file tree

Hide file tree

Showing 15 changed files with 1,317 additions and 284 deletions.
diff --git a/common/src/bounds.rs b/common/src/bounds.rs
@@ -0,0 +1,130 @@
+use std::io;
+use std::ops::Bound;
+
+#[derive(Clone, Debug)]
+pub struct BoundsRange<T> {
+    pub lower_bound: Bound<T>,
+    pub upper_bound: Bound<T>,
+}
+impl<T> BoundsRange<T> {
+    pub fn new(lower_bound: Bound<T>, upper_bound: Bound<T>) -> Self {
+        BoundsRange {
+            lower_bound,
+            upper_bound,
+        }
+    }
+    pub fn is_unbounded(&self) -> bool {
+        matches!(self.lower_bound, Bound::Unbounded) && matches!(self.upper_bound, Bound::Unbounded)
+    }
+    pub fn map_bound<TTo>(&self, transform: impl Fn(&T) -> TTo) -> BoundsRange<TTo> {
+        BoundsRange {
+            lower_bound: map_bound(&self.lower_bound, &transform),
+            upper_bound: map_bound(&self.upper_bound, &transform),
+        }
+    }
+
+    pub fn map_bound_res<TTo, Err>(
+        &self,
+        transform: impl Fn(&T) -> Result<TTo, Err>,
+    ) -> Result<BoundsRange<TTo>, Err> {
+        Ok(BoundsRange {
+            lower_bound: map_bound_res(&self.lower_bound, &transform)?,
+            upper_bound: map_bound_res(&self.upper_bound, &transform)?,
+        })
+    }
+
+    pub fn transform_inner<TTo>(
+        &self,
+        transform_lower: impl Fn(&T) -> TransformBound<TTo>,
+        transform_upper: impl Fn(&T) -> TransformBound<TTo>,
+    ) -> BoundsRange<TTo> {
+        BoundsRange {
+            lower_bound: transform_bound_inner(&self.lower_bound, &transform_lower),
+            upper_bound: transform_bound_inner(&self.upper_bound, &transform_upper),
+        }
+    }
+
+    /// Returns the inner value of the lower bound if it is set, otherwise that of the upper bound.
+    pub fn get_inner(&self) -> Option<&T> {
+        inner_bound(&self.lower_bound).or(inner_bound(&self.upper_bound))
+    }
+}
+
+pub enum TransformBound<T> {
+    /// Overwrite the bounds
+    NewBound(Bound<T>),
+    /// Keep the existing bound variant (included/excluded), but with a new value
+    Existing(T),
+}
+
+/// Takes a bound and transforms the inner value into a new bound via a closure.
+/// The bound variant may change depending on the value returned from the closure.
+pub fn transform_bound_inner_res<TFrom, TTo>(
+    bound: &Bound<TFrom>,
+    transform: impl Fn(&TFrom) -> io::Result<TransformBound<TTo>>,
+) -> io::Result<Bound<TTo>> {
+    use self::Bound::*;
+    Ok(match bound {
+        Excluded(ref from_val) => match transform(from_val)? {
+            TransformBound::NewBound(new_val) => new_val,
+            TransformBound::Existing(new_val) => Excluded(new_val),
+        },
+        Included(ref from_val) => match transform(from_val)? {
+            TransformBound::NewBound(new_val) => new_val,
+            TransformBound::Existing(new_val) => Included(new_val),
+        },
+        Unbounded => Unbounded,
+    })
+}
+
+/// Takes a bound and transforms the inner value into a new bound via a closure.
+/// The bound variant may change depending on the value returned from the closure.
+pub fn transform_bound_inner<TFrom, TTo>(
+    bound: &Bound<TFrom>,
+    transform: impl Fn(&TFrom) -> TransformBound<TTo>,
+) -> Bound<TTo> {
+    use self::Bound::*;
+    match bound {
+        Excluded(ref from_val) => match transform(from_val) {
+            TransformBound::NewBound(new_val) => new_val,
+            TransformBound::Existing(new_val) => Excluded(new_val),
+        },
+        Included(ref from_val) => match transform(from_val) {
+            TransformBound::NewBound(new_val) => new_val,
+            TransformBound::Existing(new_val) => Included(new_val),
+        },
+        Unbounded => Unbounded,
+    }
+}
+
+/// Returns the inner value of a `Bound`
+pub fn inner_bound<T>(val: &Bound<T>) -> Option<&T> {
+    match val {
+        Bound::Included(term) | Bound::Excluded(term) => Some(term),
+        Bound::Unbounded => None,
+    }
+}
+
+pub fn map_bound<TFrom, TTo>(
+    bound: &Bound<TFrom>,
+    transform: impl Fn(&TFrom) -> TTo,
+) -> Bound<TTo> {
+    use self::Bound::*;
+    match bound {
+        Excluded(ref from_val) => Bound::Excluded(transform(from_val)),
+        Included(ref from_val) => Bound::Included(transform(from_val)),
+        Unbounded => Unbounded,
+    }
+}
+
+pub fn map_bound_res<TFrom, TTo, Err>(
+    bound: &Bound<TFrom>,
+    transform: impl Fn(&TFrom) -> Result<TTo, Err>,
+) -> Result<Bound<TTo>, Err> {
+    use self::Bound::*;
+    Ok(match bound {
+        Excluded(ref from_val) => Excluded(transform(from_val)?),
+        Included(ref from_val) => Included(transform(from_val)?),
+        Unbounded => Unbounded,
+    })
+}
diff --git a/common/src/lib.rs b/common/src/lib.rs
@@ -5,6 +5,7 @@ use std::ops::Deref;
 pub use byteorder::LittleEndian as Endianness;
 
 mod bitset;
+pub mod bounds;
 mod byte_count;
 mod datetime;
 pub mod file_slice;

diff --git a/src/core/json_utils.rs b/src/core/json_utils.rs
@@ -4,7 +4,7 @@ use rustc_hash::FxHashMap;
 
 use crate::postings::{IndexingContext, IndexingPosition, PostingsWriter};
 use crate::schema::document::{ReferenceValue, ReferenceValueLeaf, Value};
-use crate::schema::Type;
+use crate::schema::{Type, DATE_TIME_PRECISION_INDEXED};
 use crate::time::format_description::well_known::Rfc3339;
 use crate::time::{OffsetDateTime, UtcOffset};
 use crate::tokenizer::TextAnalyzer;
@@ -189,6 +189,7 @@ pub(crate) fn index_json_value<'a, V: Value<'a>>(
                     ctx.path_to_unordered_id
                         .get_or_allocate_unordered_id(json_path_writer.as_str()),
                 );
+                let val = val.truncate(DATE_TIME_PRECISION_INDEXED);
                 term_buffer.append_type_and_fast_value(val);
                 postings_writer.subscribe(doc, 0u32, term_buffer, ctx);
             }
@@ -239,7 +240,11 @@ pub(crate) fn index_json_value<'a, V: Value<'a>>(
 /// Tries to infer a JSON type from a string and append it to the term.
 ///
 /// The term must be json + JSON path.
-pub fn convert_to_fast_value_and_append_to_json_term(mut term: Term, phrase: &str) -> Option<Term> {
+pub fn convert_to_fast_value_and_append_to_json_term(
+    mut term: Term,
+    phrase: &str,
+    truncate_date_for_search: bool,
+) -> Option<Term> {
     assert_eq!(
         term.value()
             .as_json_value_bytes()
@@ -250,8 +255,11 @@ pub fn convert_to_fast_value_and_append_to_json_term(mut term: Term, phrase: &st
         "JSON value bytes should be empty"
     );
     if let Ok(dt) = OffsetDateTime::parse(phrase, &Rfc3339) {
-        let dt_utc = dt.to_offset(UtcOffset::UTC);
-        term.append_type_and_fast_value(DateTime::from_utc(dt_utc));
+        let mut dt = DateTime::from_utc(dt.to_offset(UtcOffset::UTC));
+        if truncate_date_for_search {
+            dt = dt.truncate(DATE_TIME_PRECISION_INDEXED);
+        }
+        term.append_type_and_fast_value(dt);
         return Some(term);
     }
     if let Ok(i64_val) = str::parse::<i64>(phrase) {

diff --git a/src/indexer/merger.rs b/src/indexer/merger.rs
@@ -673,7 +673,7 @@ mod tests {
                     ]
                 );
                 assert_eq!(
-                    get_doc_ids(vec![Term::from_field_date(
+                    get_doc_ids(vec![Term::from_field_date_for_search(
                         date_field,
                         DateTime::from_utc(curr_time)
                     )])?,

diff --git a/src/indexer/segment_writer.rs b/src/indexer/segment_writer.rs
@@ -64,9 +64,9 @@ impl SegmentWriter {
     ///
     /// The arguments are defined as follows
     ///
-    /// - memory_budget: most of the segment writer data (terms, and postings lists recorders)
-    /// is stored in a memory arena. This makes it possible for the user to define
-    /// the flushing behavior as a memory limit.
+    /// - memory_budget: most of the segment writer data (terms, and postings lists recorders) is
+    ///   stored in a memory arena. This makes it possible for the user to define the flushing
+    ///   behavior as a memory limit.
     /// - segment: The segment being written
     /// - schema
     pub fn for_segment(memory_budget_in_bytes: usize, segment: Segment) -> crate::Result<Self> {
@@ -431,7 +431,7 @@ mod tests {
     use crate::query::{PhraseQuery, QueryParser};
     use crate::schema::{
         Document, IndexRecordOption, OwnedValue, Schema, TextFieldIndexing, TextOptions, Value,
-        STORED, STRING, TEXT,
+        DATE_TIME_PRECISION_INDEXED, STORED, STRING, TEXT,
     };
     use crate::store::{Compressor, StoreReader, StoreWriter};
     use crate::time::format_description::well_known::Rfc3339;
@@ -651,7 +651,8 @@ mod tests {
             set_fast_val(
                 DateTime::from_utc(
                     OffsetDateTime::parse("1985-04-12T23:20:50.52Z", &Rfc3339).unwrap(),
-                ),
+                )
+                .truncate(DATE_TIME_PRECISION_INDEXED),
                 term
             )
             .serialized_value_bytes()

diff --git a/src/query/mod.rs b/src/query/mod.rs
@@ -54,7 +54,7 @@ pub use self::phrase_prefix_query::PhrasePrefixQuery;
 pub use self::phrase_query::PhraseQuery;
 pub use self::query::{EnableScoring, Query, QueryClone};
 pub use self::query_parser::{QueryParser, QueryParserError};
-pub use self::range_query::{FastFieldRangeWeight, RangeQuery};
+pub use self::range_query::*;
 pub use self::regex_query::RegexQuery;
 pub use self::reqopt_scorer::RequiredOptionalScorer;
 pub use self::score_combiner::{

diff --git a/src/query/more_like_this/more_like_this.rs b/src/query/more_like_this/more_like_this.rs
@@ -241,7 +241,7 @@ impl MoreLikeThis {
                     let timestamp = value.as_datetime().ok_or_else(|| {
                         TantivyError::InvalidArgument("invalid value".to_string())
                     })?;
-                    let term = Term::from_field_date(field, timestamp);
+                    let term = Term::from_field_date_for_search(field, timestamp);
                     *term_frequencies.entry(term).or_insert(0) += 1;
                 }
             }

diff --git a/src/query/phrase_prefix_query/phrase_prefix_query.rs b/src/query/phrase_prefix_query/phrase_prefix_query.rs
@@ -2,7 +2,7 @@ use std::ops::Bound;
 
 use super::{prefix_end, PhrasePrefixWeight};
 use crate::query::bm25::Bm25Weight;
-use crate::query::{EnableScoring, Query, RangeQuery, Weight};
+use crate::query::{EnableScoring, InvertedIndexRangeWeight, Query, Weight};
 use crate::schema::{Field, IndexRecordOption, Term};
 
 const DEFAULT_MAX_EXPANSIONS: u32 = 50;
@@ -145,9 +145,15 @@ impl Query for PhrasePrefixQuery {
                     Bound::Unbounded
                 };
 
-            let mut range_query = RangeQuery::new(Bound::Included(self.prefix.1.clone()), end_term);
-            range_query.limit(self.max_expansions as u64);
-            range_query.weight(enable_scoring)
+            let lower_bound = Bound::Included(self.prefix.1.clone());
+            let upper_bound = end_term;
+
+            Ok(Box::new(InvertedIndexRangeWeight::new(
+                self.field,
+                &lower_bound,
+                &upper_bound,
+                Some(self.max_expansions as u64),
+            )))
         }
     }
 

diff --git a/src/query/query_parser/query_parser.rs b/src/query/query_parser/query_parser.rs
@@ -137,7 +137,7 @@ fn trim_ast(logical_ast: LogicalAst) -> Option<LogicalAst> {
 ///   so-called default fields (as set up in the constructor).
 ///
 ///   Assuming that the default fields are `body` and `title`, and the query parser is set with
-/// conjunction   as a default, our query will be interpreted as.
+///   conjunction as a default, our query will be interpreted as.
 ///   `(body:Barack OR title:Barack) AND (title:Obama OR body:Obama)`.
 ///   By default, all tokenized and indexed fields are default fields.
 ///
@@ -148,8 +148,7 @@ fn trim_ast(logical_ast: LogicalAst) -> Option<LogicalAst> {
 ///   `body:Barack OR (body:Barack OR text:Obama)` .
 ///
 /// * boolean operators `AND`, `OR`. `AND` takes precedence over `OR`, so that `a AND b OR c` is
-///   interpreted
-/// as `(a AND b) OR c`.
+///   interpreted as `(a AND b) OR c`.
 ///
 /// * In addition to the boolean operators, the `-`, `+` can help define. These operators are
 ///   sufficient to express all queries using boolean operators. For instance `x AND y OR z` can be
@@ -272,8 +271,7 @@ impl QueryParser {
 
     /// Creates a `QueryParser`, given
     ///  * an index
-    ///  * a set of default fields used to search if no field is specifically defined
-    ///   in the query.
+    ///  * a set of default fields used to search if no field is specifically defined in the query.
     pub fn for_index(index: &Index, default_fields: Vec<Field>) -> QueryParser {
         QueryParser::new(index.schema(), default_fields, index.tokenizers().clone())
     }
@@ -482,16 +480,33 @@ impl QueryParser {
                 });
                 if terms.len() != 1 {
                     return Err(QueryParserError::UnsupportedQuery(format!(
-                        "Range query boundary cannot have multiple tokens: {phrase:?}."
+                        "Range query boundary cannot have multiple tokens: {phrase:?} [{terms:?}]."
                     )));
                 }
                 Ok(terms.into_iter().next().unwrap())
             }
-            FieldType::JsonObject(_) => {
-                // Json range are not supported.
-                Err(QueryParserError::UnsupportedQuery(
-                    "Range query are not supported on json field.".to_string(),
-                ))
+            FieldType::JsonObject(ref json_options) => {
+                let get_term_with_path = || {
+                    Term::from_field_json_path(
+                        field,
+                        json_path,
+                        json_options.is_expand_dots_enabled(),
+                    )
+                };
+                if let Some(term) =
+                    // Try to convert the phrase to a fast value
+                    convert_to_fast_value_and_append_to_json_term(
+                        get_term_with_path(),
+                        phrase,
+                        false,
+                    )
+                {
+                    Ok(term)
+                } else {
+                    let mut term = get_term_with_path();
+                    term.append_type_and_str(phrase);
+                    Ok(term)
+                }
             }
             FieldType::Facet(_) => match Facet::from_text(phrase) {
                 Ok(facet) => Ok(Term::from_facet(field, &facet)),
@@ -553,7 +568,7 @@ impl QueryParser {
             }
             FieldType::Date(_) => {
                 let dt = OffsetDateTime::parse(phrase, &Rfc3339)?;
-                let dt_term = Term::from_field_date(field, DateTime::from_utc(dt));
+                let dt_term = Term::from_field_date_for_search(field, DateTime::from_utc(dt));
                 Ok(vec![LogicalLiteral::Term(dt_term)])
             }
             FieldType::Str(ref str_options) => {
@@ -685,8 +700,8 @@ impl QueryParser {
     ///
     /// The terms are identified by a triplet:
     /// - tantivy field
-    /// - field_path: tantivy has JSON fields. It is possible to target a member of a JSON
-    /// object by naturally extending the json field name with a "." separated field_path
+    /// - field_path: tantivy has JSON fields. It is possible to target a member of a JSON object by
+    ///   naturally extending the json field name with a "." separated field_path
     /// - field_phrase: the phrase that is being searched.
     ///
     /// The literal identifies the targeted field by a so-called *full field path*,
@@ -949,7 +964,8 @@ fn generate_literals_for_json_object(
         || Term::from_field_json_path(field, json_path, json_options.is_expand_dots_enabled());
 
     // Try to convert the phrase to a fast value
-    if let Some(term) = convert_to_fast_value_and_append_to_json_term(get_term_with_path(), phrase)
+    if let Some(term) =
+        convert_to_fast_value_and_append_to_json_term(get_term_with_path(), phrase, true)
     {
         logical_literals.push(LogicalLiteral::Term(term));
     }
@@ -1123,8 +1139,8 @@ mod test {
         let query = make_query_parser().parse_query("title:[A TO B]").unwrap();
         assert_eq!(
             format!("{query:?}"),
-            "RangeQuery { lower_bound: Included(Term(field=0, type=Str, \"a\")), upper_bound: \
-             Included(Term(field=0, type=Str, \"b\")), limit: None }"
+            "RangeQuery { bounds: BoundsRange { lower_bound: Included(Term(field=0, type=Str, \
+             \"a\")), upper_bound: Included(Term(field=0, type=Str, \"b\")) } }"
         );
     }