Skip to content

Commit

Permalink
support ff range queries on json fields (quickwit-oss#2456)
Browse files Browse the repository at this point in the history
* support ff range queries on json fields

* fix term date truncation

* use inverted index range query for phrase prefix queries

* rename to InvertedIndexRangeQuery

* fix column filter, add mixed column test
  • Loading branch information
PSeitz authored and philippemnoel committed Aug 31, 2024
1 parent be2f941 commit a3243d8
Show file tree
Hide file tree
Showing 15 changed files with 1,317 additions and 284 deletions.
130 changes: 130 additions & 0 deletions common/src/bounds.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
use std::io;
use std::ops::Bound;

#[derive(Clone, Debug)]
pub struct BoundsRange<T> {
pub lower_bound: Bound<T>,
pub upper_bound: Bound<T>,
}
impl<T> BoundsRange<T> {
pub fn new(lower_bound: Bound<T>, upper_bound: Bound<T>) -> Self {
BoundsRange {
lower_bound,
upper_bound,
}
}
pub fn is_unbounded(&self) -> bool {
matches!(self.lower_bound, Bound::Unbounded) && matches!(self.upper_bound, Bound::Unbounded)
}
pub fn map_bound<TTo>(&self, transform: impl Fn(&T) -> TTo) -> BoundsRange<TTo> {
BoundsRange {
lower_bound: map_bound(&self.lower_bound, &transform),
upper_bound: map_bound(&self.upper_bound, &transform),
}
}

pub fn map_bound_res<TTo, Err>(
&self,
transform: impl Fn(&T) -> Result<TTo, Err>,
) -> Result<BoundsRange<TTo>, Err> {
Ok(BoundsRange {
lower_bound: map_bound_res(&self.lower_bound, &transform)?,
upper_bound: map_bound_res(&self.upper_bound, &transform)?,
})
}

pub fn transform_inner<TTo>(
&self,
transform_lower: impl Fn(&T) -> TransformBound<TTo>,
transform_upper: impl Fn(&T) -> TransformBound<TTo>,
) -> BoundsRange<TTo> {
BoundsRange {
lower_bound: transform_bound_inner(&self.lower_bound, &transform_lower),
upper_bound: transform_bound_inner(&self.upper_bound, &transform_upper),
}
}

/// Returns the first set inner value
pub fn get_inner(&self) -> Option<&T> {
inner_bound(&self.lower_bound).or(inner_bound(&self.upper_bound))
}
}

pub enum TransformBound<T> {
/// Overwrite the bounds
NewBound(Bound<T>),
/// Use Existing bounds with new value
Existing(T),
}

/// Takes a bound and transforms the inner value into a new bound via a closure.
/// The bound variant may change by the value returned value from the closure.
pub fn transform_bound_inner_res<TFrom, TTo>(
bound: &Bound<TFrom>,
transform: impl Fn(&TFrom) -> io::Result<TransformBound<TTo>>,
) -> io::Result<Bound<TTo>> {
use self::Bound::*;
Ok(match bound {
Excluded(ref from_val) => match transform(from_val)? {
TransformBound::NewBound(new_val) => new_val,
TransformBound::Existing(new_val) => Excluded(new_val),
},
Included(ref from_val) => match transform(from_val)? {
TransformBound::NewBound(new_val) => new_val,
TransformBound::Existing(new_val) => Included(new_val),
},
Unbounded => Unbounded,
})
}

/// Takes a bound and transforms the inner value into a new bound via a closure.
/// The bound variant may change by the value returned value from the closure.
pub fn transform_bound_inner<TFrom, TTo>(
bound: &Bound<TFrom>,
transform: impl Fn(&TFrom) -> TransformBound<TTo>,
) -> Bound<TTo> {
use self::Bound::*;
match bound {
Excluded(ref from_val) => match transform(from_val) {
TransformBound::NewBound(new_val) => new_val,
TransformBound::Existing(new_val) => Excluded(new_val),
},
Included(ref from_val) => match transform(from_val) {
TransformBound::NewBound(new_val) => new_val,
TransformBound::Existing(new_val) => Included(new_val),
},
Unbounded => Unbounded,
}
}

/// Returns the inner value of a `Bound`
pub fn inner_bound<T>(val: &Bound<T>) -> Option<&T> {
match val {
Bound::Included(term) | Bound::Excluded(term) => Some(term),
Bound::Unbounded => None,
}
}

pub fn map_bound<TFrom, TTo>(
bound: &Bound<TFrom>,
transform: impl Fn(&TFrom) -> TTo,
) -> Bound<TTo> {
use self::Bound::*;
match bound {
Excluded(ref from_val) => Bound::Excluded(transform(from_val)),
Included(ref from_val) => Bound::Included(transform(from_val)),
Unbounded => Unbounded,
}
}

pub fn map_bound_res<TFrom, TTo, Err>(
bound: &Bound<TFrom>,
transform: impl Fn(&TFrom) -> Result<TTo, Err>,
) -> Result<Bound<TTo>, Err> {
use self::Bound::*;
Ok(match bound {
Excluded(ref from_val) => Excluded(transform(from_val)?),
Included(ref from_val) => Included(transform(from_val)?),
Unbounded => Unbounded,
})
}
1 change: 1 addition & 0 deletions common/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ use std::ops::Deref;
pub use byteorder::LittleEndian as Endianness;

mod bitset;
pub mod bounds;
mod byte_count;
mod datetime;
pub mod file_slice;
Expand Down
16 changes: 12 additions & 4 deletions src/core/json_utils.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ use rustc_hash::FxHashMap;

use crate::postings::{IndexingContext, IndexingPosition, PostingsWriter};
use crate::schema::document::{ReferenceValue, ReferenceValueLeaf, Value};
use crate::schema::Type;
use crate::schema::{Type, DATE_TIME_PRECISION_INDEXED};
use crate::time::format_description::well_known::Rfc3339;
use crate::time::{OffsetDateTime, UtcOffset};
use crate::tokenizer::TextAnalyzer;
Expand Down Expand Up @@ -189,6 +189,7 @@ pub(crate) fn index_json_value<'a, V: Value<'a>>(
ctx.path_to_unordered_id
.get_or_allocate_unordered_id(json_path_writer.as_str()),
);
let val = val.truncate(DATE_TIME_PRECISION_INDEXED);
term_buffer.append_type_and_fast_value(val);
postings_writer.subscribe(doc, 0u32, term_buffer, ctx);
}
Expand Down Expand Up @@ -239,7 +240,11 @@ pub(crate) fn index_json_value<'a, V: Value<'a>>(
/// Tries to infer a JSON type from a string and append it to the term.
///
/// The term must be json + JSON path.
pub fn convert_to_fast_value_and_append_to_json_term(mut term: Term, phrase: &str) -> Option<Term> {
pub fn convert_to_fast_value_and_append_to_json_term(
mut term: Term,
phrase: &str,
truncate_date_for_search: bool,
) -> Option<Term> {
assert_eq!(
term.value()
.as_json_value_bytes()
Expand All @@ -250,8 +255,11 @@ pub fn convert_to_fast_value_and_append_to_json_term(mut term: Term, phrase: &st
"JSON value bytes should be empty"
);
if let Ok(dt) = OffsetDateTime::parse(phrase, &Rfc3339) {
let dt_utc = dt.to_offset(UtcOffset::UTC);
term.append_type_and_fast_value(DateTime::from_utc(dt_utc));
let mut dt = DateTime::from_utc(dt.to_offset(UtcOffset::UTC));
if truncate_date_for_search {
dt = dt.truncate(DATE_TIME_PRECISION_INDEXED);
}
term.append_type_and_fast_value(dt);
return Some(term);
}
if let Ok(i64_val) = str::parse::<i64>(phrase) {
Expand Down
2 changes: 1 addition & 1 deletion src/indexer/merger.rs
Original file line number Diff line number Diff line change
Expand Up @@ -673,7 +673,7 @@ mod tests {
]
);
assert_eq!(
get_doc_ids(vec![Term::from_field_date(
get_doc_ids(vec![Term::from_field_date_for_search(
date_field,
DateTime::from_utc(curr_time)
)])?,
Expand Down
11 changes: 6 additions & 5 deletions src/indexer/segment_writer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -64,9 +64,9 @@ impl SegmentWriter {
///
/// The arguments are defined as follows
///
/// - memory_budget: most of the segment writer data (terms, and postings lists recorders)
/// is stored in a memory arena. This makes it possible for the user to define
/// the flushing behavior as a memory limit.
/// - memory_budget: most of the segment writer data (terms, and postings lists recorders) is
/// stored in a memory arena. This makes it possible for the user to define the flushing
/// behavior as a memory limit.
/// - segment: The segment being written
/// - schema
pub fn for_segment(memory_budget_in_bytes: usize, segment: Segment) -> crate::Result<Self> {
Expand Down Expand Up @@ -431,7 +431,7 @@ mod tests {
use crate::query::{PhraseQuery, QueryParser};
use crate::schema::{
Document, IndexRecordOption, OwnedValue, Schema, TextFieldIndexing, TextOptions, Value,
STORED, STRING, TEXT,
DATE_TIME_PRECISION_INDEXED, STORED, STRING, TEXT,
};
use crate::store::{Compressor, StoreReader, StoreWriter};
use crate::time::format_description::well_known::Rfc3339;
Expand Down Expand Up @@ -651,7 +651,8 @@ mod tests {
set_fast_val(
DateTime::from_utc(
OffsetDateTime::parse("1985-04-12T23:20:50.52Z", &Rfc3339).unwrap(),
),
)
.truncate(DATE_TIME_PRECISION_INDEXED),
term
)
.serialized_value_bytes()
Expand Down
2 changes: 1 addition & 1 deletion src/query/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ pub use self::phrase_prefix_query::PhrasePrefixQuery;
pub use self::phrase_query::PhraseQuery;
pub use self::query::{EnableScoring, Query, QueryClone};
pub use self::query_parser::{QueryParser, QueryParserError};
pub use self::range_query::{FastFieldRangeWeight, RangeQuery};
pub use self::range_query::*;
pub use self::regex_query::RegexQuery;
pub use self::reqopt_scorer::RequiredOptionalScorer;
pub use self::score_combiner::{
Expand Down
2 changes: 1 addition & 1 deletion src/query/more_like_this/more_like_this.rs
Original file line number Diff line number Diff line change
Expand Up @@ -241,7 +241,7 @@ impl MoreLikeThis {
let timestamp = value.as_datetime().ok_or_else(|| {
TantivyError::InvalidArgument("invalid value".to_string())
})?;
let term = Term::from_field_date(field, timestamp);
let term = Term::from_field_date_for_search(field, timestamp);
*term_frequencies.entry(term).or_insert(0) += 1;
}
}
Expand Down
14 changes: 10 additions & 4 deletions src/query/phrase_prefix_query/phrase_prefix_query.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ use std::ops::Bound;

use super::{prefix_end, PhrasePrefixWeight};
use crate::query::bm25::Bm25Weight;
use crate::query::{EnableScoring, Query, RangeQuery, Weight};
use crate::query::{EnableScoring, InvertedIndexRangeWeight, Query, Weight};
use crate::schema::{Field, IndexRecordOption, Term};

const DEFAULT_MAX_EXPANSIONS: u32 = 50;
Expand Down Expand Up @@ -145,9 +145,15 @@ impl Query for PhrasePrefixQuery {
Bound::Unbounded
};

let mut range_query = RangeQuery::new(Bound::Included(self.prefix.1.clone()), end_term);
range_query.limit(self.max_expansions as u64);
range_query.weight(enable_scoring)
let lower_bound = Bound::Included(self.prefix.1.clone());
let upper_bound = end_term;

Ok(Box::new(InvertedIndexRangeWeight::new(
self.field,
&lower_bound,
&upper_bound,
Some(self.max_expansions as u64),
)))
}
}

Expand Down
50 changes: 33 additions & 17 deletions src/query/query_parser/query_parser.rs
Original file line number Diff line number Diff line change
Expand Up @@ -137,7 +137,7 @@ fn trim_ast(logical_ast: LogicalAst) -> Option<LogicalAst> {
/// so-called default fields (as set up in the constructor).
///
/// Assuming that the default fields are `body` and `title`, and the query parser is set with
/// conjunction as a default, our query will be interpreted as.
/// conjunction as a default, our query will be interpreted as.
/// `(body:Barack OR title:Barack) AND (title:Obama OR body:Obama)`.
/// By default, all tokenized and indexed fields are default fields.
///
Expand All @@ -148,8 +148,7 @@ fn trim_ast(logical_ast: LogicalAst) -> Option<LogicalAst> {
/// `body:Barack OR (body:Barack OR text:Obama)` .
///
/// * boolean operators `AND`, `OR`. `AND` takes precedence over `OR`, so that `a AND b OR c` is
/// interpreted
/// as `(a AND b) OR c`.
/// interpreted as `(a AND b) OR c`.
///
/// * In addition to the boolean operators, the `-`, `+` can help define. These operators are
/// sufficient to express all queries using boolean operators. For instance `x AND y OR z` can be
Expand Down Expand Up @@ -272,8 +271,7 @@ impl QueryParser {

/// Creates a `QueryParser`, given
/// * an index
/// * a set of default fields used to search if no field is specifically defined
/// in the query.
/// * a set of default fields used to search if no field is specifically defined in the query.
pub fn for_index(index: &Index, default_fields: Vec<Field>) -> QueryParser {
QueryParser::new(index.schema(), default_fields, index.tokenizers().clone())
}
Expand Down Expand Up @@ -482,16 +480,33 @@ impl QueryParser {
});
if terms.len() != 1 {
return Err(QueryParserError::UnsupportedQuery(format!(
"Range query boundary cannot have multiple tokens: {phrase:?}."
"Range query boundary cannot have multiple tokens: {phrase:?} [{terms:?}]."
)));
}
Ok(terms.into_iter().next().unwrap())
}
FieldType::JsonObject(_) => {
// Json range are not supported.
Err(QueryParserError::UnsupportedQuery(
"Range query are not supported on json field.".to_string(),
))
FieldType::JsonObject(ref json_options) => {
let get_term_with_path = || {
Term::from_field_json_path(
field,
json_path,
json_options.is_expand_dots_enabled(),
)
};
if let Some(term) =
// Try to convert the phrase to a fast value
convert_to_fast_value_and_append_to_json_term(
get_term_with_path(),
phrase,
false,
)
{
Ok(term)
} else {
let mut term = get_term_with_path();
term.append_type_and_str(phrase);
Ok(term)
}
}
FieldType::Facet(_) => match Facet::from_text(phrase) {
Ok(facet) => Ok(Term::from_facet(field, &facet)),
Expand Down Expand Up @@ -553,7 +568,7 @@ impl QueryParser {
}
FieldType::Date(_) => {
let dt = OffsetDateTime::parse(phrase, &Rfc3339)?;
let dt_term = Term::from_field_date(field, DateTime::from_utc(dt));
let dt_term = Term::from_field_date_for_search(field, DateTime::from_utc(dt));
Ok(vec![LogicalLiteral::Term(dt_term)])
}
FieldType::Str(ref str_options) => {
Expand Down Expand Up @@ -685,8 +700,8 @@ impl QueryParser {
///
/// The terms are identified by a triplet:
/// - tantivy field
/// - field_path: tantivy has JSON fields. It is possible to target a member of a JSON
/// object by naturally extending the json field name with a "." separated field_path
/// - field_path: tantivy has JSON fields. It is possible to target a member of a JSON object by
/// naturally extending the json field name with a "." separated field_path
/// - field_phrase: the phrase that is being searched.
///
/// The literal identifies the targeted field by a so-called *full field path*,
Expand Down Expand Up @@ -949,7 +964,8 @@ fn generate_literals_for_json_object(
|| Term::from_field_json_path(field, json_path, json_options.is_expand_dots_enabled());

// Try to convert the phrase to a fast value
if let Some(term) = convert_to_fast_value_and_append_to_json_term(get_term_with_path(), phrase)
if let Some(term) =
convert_to_fast_value_and_append_to_json_term(get_term_with_path(), phrase, true)
{
logical_literals.push(LogicalLiteral::Term(term));
}
Expand Down Expand Up @@ -1123,8 +1139,8 @@ mod test {
let query = make_query_parser().parse_query("title:[A TO B]").unwrap();
assert_eq!(
format!("{query:?}"),
"RangeQuery { lower_bound: Included(Term(field=0, type=Str, \"a\")), upper_bound: \
Included(Term(field=0, type=Str, \"b\")), limit: None }"
"RangeQuery { bounds: BoundsRange { lower_bound: Included(Term(field=0, type=Str, \
\"a\")), upper_bound: Included(Term(field=0, type=Str, \"b\")) } }"
);
}

Expand Down
Loading

0 comments on commit a3243d8

Please sign in to comment.