diff --git a/quickwit/quickwit-search/src/extract_timestamp_range.rs b/quickwit/quickwit-search/src/extract_timestamp_range.rs new file mode 100644 index 00000000000..93f9e291ad4 --- /dev/null +++ b/quickwit/quickwit-search/src/extract_timestamp_range.rs @@ -0,0 +1,445 @@ +// Copyright 2021-Present Datadog, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::ops::Bound; + +use quickwit_query::query_ast::{ + BoolQuery, QueryAst, QueryAstTransformer, RangeQuery, TermQuery, TermSetQuery, +}; +use tantivy::DateTime; +use tracing::warn; + +const NANOS_IN_SEC: i64 = 1_000_000_000; + +/// Extract timestamp range in QueryAst. +/// +/// This can allow MatchAll queries (post timestamp range removal) to be optimized. +pub(crate) fn extract_start_end_timestamp_from_ast( + query_ast: QueryAst, + timestamp_field: &str, + start_timestamp: &mut Option, + end_timestamp: &mut Option, +) -> QueryAst { + let mut timestamp_range_extractor = ExtractTimestampRange { + timestamp_field, + start_timestamp: start_timestamp + .map(|t| DateTime::from_timestamp_nanos(t.saturating_mul(NANOS_IN_SEC))) + .map(Bound::Included) + .unwrap_or(Bound::Unbounded), + end_timestamp: end_timestamp + .map(|t| DateTime::from_timestamp_nanos(t.saturating_mul(NANOS_IN_SEC))) + .map(Bound::Excluded) + .unwrap_or(Bound::Unbounded), + }; + + let new_ast = timestamp_range_extractor + .transform(query_ast) + .expect("can't fail unwrapping Infallible") + .unwrap_or(QueryAst::MatchAll); + + *start_timestamp = match timestamp_range_extractor.start_timestamp { + Bound::Included(x) => Some(x.into_timestamp_secs()), + Bound::Excluded(x) => Some(x.into_timestamp_secs().saturating_add(1)), + Bound::Unbounded => None, + }; + + *end_timestamp = match timestamp_range_extractor.end_timestamp { + Bound::Included(x) => Some(x.into_timestamp_secs().saturating_add(1)), + Bound::Excluded(x) => { + let round_up = (x.into_timestamp_nanos() % NANOS_IN_SEC) != 0; + if round_up { + Some(x.into_timestamp_secs().saturating_add(1)) + } else { + Some(x.into_timestamp_secs()) + } + } + Bound::Unbounded => None, + }; + + new_ast +} + +/// Boundaries identified as being implied by the QueryAst. +/// +/// `start_timestamp` is to be interpreted as Inclusive (or Unbounded) +/// `end_timestamp` is to be interpreted as Exclusive (or Unbounded) +/// In other word, this is a `[start_timestamp..end_timestamp)` interval. +#[derive(Debug, Clone)] +pub(crate) struct ExtractTimestampRange<'a> { + pub(crate) timestamp_field: &'a str, + pub(crate) start_timestamp: Bound, + pub(crate) end_timestamp: Bound, +} + +impl ExtractTimestampRange<'_> { + fn update_start_timestamp( + &mut self, + lower_bound: &quickwit_query::JsonLiteral, + included: bool, + ) { + use quickwit_query::InterpretUserInput; + let Some(lower_bound) = DateTime::interpret_json(lower_bound) else { + // we shouldn't be able to get here, we would have errored much earlier + warn!("unparsable time bound in search: {lower_bound:?}"); + return; + }; + let bound = if included { + Bound::Included(lower_bound) + } else { + Bound::Excluded(lower_bound) + }; + + self.start_timestamp = max_bound(self.start_timestamp, bound); + } + + fn update_end_timestamp(&mut self, upper_bound: &quickwit_query::JsonLiteral, included: bool) { + use quickwit_query::InterpretUserInput; + let Some(upper_bound) = DateTime::interpret_json(upper_bound) else { + // we shouldn't be able to get here, we would have errored much earlier + warn!("unparsable time bound in search: {upper_bound:?}"); + return; + }; + let bound = if included { + Bound::Included(upper_bound) + } else { + Bound::Excluded(upper_bound) + }; + + self.end_timestamp = min_bound(self.end_timestamp, bound); + } +} + +impl QueryAstTransformer for ExtractTimestampRange<'_> { + type Err = std::convert::Infallible; + + fn transform_bool(&mut self, mut bool_query: BoolQuery) -> Result, Self::Err> { + // we only want to visit sub-queries which are strict (positive) requirements + bool_query.must = bool_query + .must + .into_iter() + .filter_map(|query_ast| self.transform(query_ast).transpose()) + .filter(|result| result != &Ok(QueryAst::MatchAll)) + .collect::, _>>()?; + bool_query.filter = bool_query + .filter + .into_iter() + .filter_map(|query_ast| self.transform(query_ast).transpose()) + .filter(|result| result != &Ok(QueryAst::MatchAll)) + .collect::, _>>()?; + + let ast = if bool_query == BoolQuery::default() { + QueryAst::MatchAll + } else { + QueryAst::Bool(bool_query) + }; + + Ok(Some(ast)) + } + + fn transform_range(&mut self, range_query: RangeQuery) -> Result, Self::Err> { + use std::ops::Bound; + + if range_query.field != self.timestamp_field { + return Ok(Some(range_query.into())); + } + + match range_query.lower_bound { + Bound::Included(lower_bound) => { + self.update_start_timestamp(&lower_bound, true); + } + Bound::Excluded(lower_bound) => { + self.update_start_timestamp(&lower_bound, false); + } + Bound::Unbounded => (), + }; + + match range_query.upper_bound { + Bound::Included(upper_bound) => { + self.update_end_timestamp(&upper_bound, true); + } + Bound::Excluded(upper_bound) => { + self.update_end_timestamp(&upper_bound, false); + } + Bound::Unbounded => (), + }; + + Ok(Some(QueryAst::MatchAll)) + } + + // if we visit a term, limit the range to DATE..=DATE + fn transform_term(&mut self, term_query: TermQuery) -> Result, Self::Err> { + if term_query.field != self.timestamp_field { + return Ok(Some(term_query.into())); + } + + // TODO when fixing #3323, this may need to be modified to support numbers too + let json_term = quickwit_query::JsonLiteral::String(term_query.value.clone()); + self.update_start_timestamp(&json_term, true); + self.update_end_timestamp(&json_term, true); + Ok(Some(QueryAst::MatchAll)) + } + + // if we visit a termset, limit the range to LOWEST..=HIGHEST + fn transform_term_set( + &mut self, + mut term_query: TermSetQuery, + ) -> Result, Self::Err> { + let Some(term_set) = term_query.terms_per_field.remove(self.timestamp_field) else { + return Ok(Some(term_query.into())); + }; + + // rfc3339 is lexicographically ordered if YEAR <= 9999, so we can use string + // ordering to get the start and end quickly. + if let Some(first) = term_set.first() { + let json_term = quickwit_query::JsonLiteral::String(first.clone()); + self.update_start_timestamp(&json_term, true); + } + if let Some(last) = term_set.last() { + let json_term = quickwit_query::JsonLiteral::String(last.clone()); + self.update_end_timestamp(&json_term, true); + } + + let ast = if term_query.terms_per_field.is_empty() { + QueryAst::MatchAll + } else { + term_query.into() + }; + + Ok(Some(ast)) + } +} + +// returns the max of left and right, that isn't unbounded. Useful for making +// the intersection of lower bound of ranges +fn max_bound(left: Bound, right: Bound) -> Bound { + use Bound::*; + match (left, right) { + (Unbounded, right) => right, + (left, Unbounded) => left, + (Included(left), Included(right)) => Included(left.max(right)), + (Excluded(left), Excluded(right)) => Excluded(left.max(right)), + (excluded_total @ Excluded(excluded), included_total @ Included(included)) => { + if included > excluded { + included_total + } else { + excluded_total + } + } + (included_total @ Included(included), excluded_total @ Excluded(excluded)) => { + if included > excluded { + included_total + } else { + excluded_total + } + } + } +} + +// returns the min of left and right, that isn't unbounded. Useful for making +// the intersection of upper bound of ranges +fn min_bound(left: Bound, right: Bound) -> Bound { + use Bound::*; + match (left, right) { + (Unbounded, right) => right, + (left, Unbounded) => left, + (Included(left), Included(right)) => Included(left.min(right)), + (Excluded(left), Excluded(right)) => Excluded(left.min(right)), + (excluded_total @ Excluded(excluded), included_total @ Included(included)) => { + if included < excluded { + included_total + } else { + excluded_total + } + } + (included_total @ Included(included), excluded_total @ Excluded(excluded)) => { + if included < excluded { + included_total + } else { + excluded_total + } + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_extract_timestamp_range_from_ast() { + use std::ops::Bound; + + use quickwit_query::JsonLiteral; + + let timestamp_field = "timestamp"; + + let simple_range: QueryAst = quickwit_query::query_ast::RangeQuery { + field: timestamp_field.to_string(), + lower_bound: Bound::Included(JsonLiteral::String("2021-04-13T22:45:41Z".to_owned())), + upper_bound: Bound::Excluded(JsonLiteral::String("2021-05-06T06:51:19Z".to_owned())), + } + .into(); + + // direct range + let mut start_timestamp = None; + let mut end_timestamp = None; + assert_eq!( + extract_start_end_timestamp_from_ast( + simple_range.clone(), + timestamp_field, + &mut start_timestamp, + &mut end_timestamp, + ), + QueryAst::MatchAll + ); + assert_eq!(start_timestamp, Some(1618353941)); + assert_eq!(end_timestamp, Some(1620283879)); + + // range inside a must bool query + let bool_query_must = quickwit_query::query_ast::BoolQuery { + must: vec![simple_range.clone()], + ..Default::default() + }; + start_timestamp = None; + end_timestamp = None; + assert_eq!( + extract_start_end_timestamp_from_ast( + bool_query_must.into(), + timestamp_field, + &mut start_timestamp, + &mut end_timestamp, + ), + QueryAst::MatchAll + ); + assert_eq!(start_timestamp, Some(1618353941)); + assert_eq!(end_timestamp, Some(1620283879)); + + // range inside a should bool query + let bool_query_should: QueryAst = quickwit_query::query_ast::BoolQuery { + should: vec![simple_range.clone()], + ..Default::default() + } + .into(); + start_timestamp = Some(123); + end_timestamp = None; + assert_eq!( + extract_start_end_timestamp_from_ast( + bool_query_should.clone(), + timestamp_field, + &mut start_timestamp, + &mut end_timestamp, + ), + bool_query_should + ); + assert_eq!(start_timestamp, Some(123)); + assert_eq!(end_timestamp, None); + + // start bound was already more restrictive + start_timestamp = Some(1618601297); + end_timestamp = Some(i64::MAX); + assert_eq!( + extract_start_end_timestamp_from_ast( + simple_range.clone(), + timestamp_field, + &mut start_timestamp, + &mut end_timestamp, + ), + QueryAst::MatchAll + ); + assert_eq!(start_timestamp, Some(1618601297)); + assert_eq!(end_timestamp, Some(1620283879)); + + // end bound was already more restrictive + start_timestamp = Some(1); + end_timestamp = Some(1618601297); + assert_eq!( + extract_start_end_timestamp_from_ast( + simple_range, + timestamp_field, + &mut start_timestamp, + &mut end_timestamp, + ), + QueryAst::MatchAll + ); + assert_eq!(start_timestamp, Some(1618353941)); + assert_eq!(end_timestamp, Some(1618601297)); + + // bounds are (start..end] instead of [start..end) + let unusual_bounds = quickwit_query::query_ast::RangeQuery { + field: timestamp_field.to_string(), + lower_bound: Bound::Excluded(JsonLiteral::String("2021-04-13T22:45:41Z".to_owned())), + upper_bound: Bound::Included(JsonLiteral::String("2021-05-06T06:51:19Z".to_owned())), + } + .into(); + start_timestamp = None; + end_timestamp = None; + assert_eq!( + extract_start_end_timestamp_from_ast( + unusual_bounds, + timestamp_field, + &mut start_timestamp, + &mut end_timestamp, + ), + QueryAst::MatchAll + ); + assert_eq!(start_timestamp, Some(1618353942)); + assert_eq!(end_timestamp, Some(1620283880)); + + let wrong_field: QueryAst = quickwit_query::query_ast::RangeQuery { + field: "other_field".to_string(), + lower_bound: Bound::Included(JsonLiteral::String("2021-04-13T22:45:41Z".to_owned())), + upper_bound: Bound::Excluded(JsonLiteral::String("2021-05-06T06:51:19Z".to_owned())), + } + .into(); + start_timestamp = None; + end_timestamp = None; + assert_eq!( + extract_start_end_timestamp_from_ast( + wrong_field.clone(), + timestamp_field, + &mut start_timestamp, + &mut end_timestamp, + ), + wrong_field + ); + assert_eq!(start_timestamp, None); + assert_eq!(end_timestamp, None); + + let high_precision = quickwit_query::query_ast::RangeQuery { + field: timestamp_field.to_string(), + lower_bound: Bound::Included(JsonLiteral::String( + "2021-04-13T22:45:41.001Z".to_owned(), + )), + upper_bound: Bound::Excluded(JsonLiteral::String( + "2021-05-06T06:51:19.001Z".to_owned(), + )), + } + .into(); + + // the upper bound should be rounded up as to includes documents from X.000 to X.001 + start_timestamp = None; + end_timestamp = None; + assert_eq!( + extract_start_end_timestamp_from_ast( + high_precision, + timestamp_field, + &mut start_timestamp, + &mut end_timestamp, + ), + QueryAst::MatchAll + ); + assert_eq!(start_timestamp, Some(1618353941)); + assert_eq!(end_timestamp, Some(1620283880)); + } +} diff --git a/quickwit/quickwit-search/src/leaf.rs b/quickwit/quickwit-search/src/leaf.rs index 5cb126b931d..7279e4defe2 100644 --- a/quickwit/quickwit-search/src/leaf.rs +++ b/quickwit/quickwit-search/src/leaf.rs @@ -29,7 +29,7 @@ use quickwit_proto::search::{ CountHits, LeafSearchRequest, LeafSearchResponse, PartialHit, ResourceStats, SearchRequest, SortOrder, SortValue, SplitIdAndFooterOffsets, SplitSearchError, }; -use quickwit_query::query_ast::{BoolQuery, QueryAst, QueryAstTransformer, RangeQuery, TermQuery}; +use quickwit_query::query_ast::{BoolQuery, QueryAst, QueryAstTransformer, RangeQuery}; use quickwit_query::tokenizers::TokenizerManager; use quickwit_storage::{ BundleStorage, ByteRangeCache, MemorySizedCache, OwnedBytes, SplitCache, Storage, @@ -45,6 +45,7 @@ use tokio::task::JoinError; use tracing::*; use crate::collector::{IncrementalCollector, make_collector_for_split, make_merge_collector}; +use crate::extract_timestamp_range::ExtractTimestampRange; use crate::metrics::SEARCH_METRICS; use crate::root::is_metadata_count_request_with_ast; use crate::search_permit_provider::{SearchPermit, compute_initial_memory_allocation}; @@ -639,58 +640,6 @@ pub fn map_bound(bound: Bound, f: impl FnOnce(T) -> U) -> Bound { } } -// returns the max of left and right, that isn't unbounded. Useful for making -// the intersection of lower bound of ranges -fn max_bound(left: Bound, right: Bound) -> Bound { - use Bound::*; - match (left, right) { - (Unbounded, right) => right, - (left, Unbounded) => left, - (Included(left), Included(right)) => Included(left.max(right)), - (Excluded(left), Excluded(right)) => Excluded(left.max(right)), - (excluded_total @ Excluded(excluded), included_total @ Included(included)) => { - if included > excluded { - included_total - } else { - excluded_total - } - } - (included_total @ Included(included), excluded_total @ Excluded(excluded)) => { - if included > excluded { - included_total - } else { - excluded_total - } - } - } -} - -// returns the min of left and right, that isn't unbounded. Useful for making -// the intersection of upper bound of ranges -fn min_bound(left: Bound, right: Bound) -> Bound { - use Bound::*; - match (left, right) { - (Unbounded, right) => right, - (left, Unbounded) => left, - (Included(left), Included(right)) => Included(left.min(right)), - (Excluded(left), Excluded(right)) => Excluded(left.min(right)), - (excluded_total @ Excluded(excluded), included_total @ Included(included)) => { - if included < excluded { - included_total - } else { - excluded_total - } - } - (included_total @ Included(included), excluded_total @ Excluded(excluded)) => { - if included < excluded { - included_total - } else { - excluded_total - } - } - } -} - /// remove timestamp range that would be present both in QueryAst and SearchRequest /// /// this can save us from doing double the work in some cases, and help with the partial request @@ -716,7 +665,7 @@ fn remove_redundant_timestamp_range( .map(Bound::Excluded) .unwrap_or(Bound::Unbounded); - let mut visitor = RemoveTimestampRange { + let mut visitor = ExtractTimestampRange { timestamp_field, start_timestamp, end_timestamp, @@ -810,106 +759,6 @@ fn remove_redundant_timestamp_range( search_request.end_timestamp = None; } -/// Remove all `must` and `filter timestamp ranges, and summarize them -#[derive(Debug, Clone)] -struct RemoveTimestampRange<'a> { - timestamp_field: &'a str, - start_timestamp: Bound, - end_timestamp: Bound, -} - -impl RemoveTimestampRange<'_> { - fn update_start_timestamp( - &mut self, - lower_bound: &quickwit_query::JsonLiteral, - included: bool, - ) { - use quickwit_query::InterpretUserInput; - let Some(lower_bound) = DateTime::interpret_json(lower_bound) else { - // we shouldn't be able to get here, we would have errored much earlier in root search - warn!("unparsable time bound in leaf search: {lower_bound:?}"); - return; - }; - let bound = if included { - Bound::Included(lower_bound) - } else { - Bound::Excluded(lower_bound) - }; - - self.start_timestamp = max_bound(self.start_timestamp, bound); - } - - fn update_end_timestamp(&mut self, upper_bound: &quickwit_query::JsonLiteral, included: bool) { - use quickwit_query::InterpretUserInput; - let Some(upper_bound) = DateTime::interpret_json(upper_bound) else { - // we shouldn't be able to get here, we would have errored much earlier in root search - warn!("unparsable time bound in leaf search: {upper_bound:?}"); - return; - }; - let bound = if included { - Bound::Included(upper_bound) - } else { - Bound::Excluded(upper_bound) - }; - - self.end_timestamp = min_bound(self.end_timestamp, bound); - } -} - -impl QueryAstTransformer for RemoveTimestampRange<'_> { - type Err = std::convert::Infallible; - - fn transform_bool(&mut self, mut bool_query: BoolQuery) -> Result, Self::Err> { - // we only want to visit sub-queries which are strict (positive) requirements - bool_query.must = bool_query - .must - .into_iter() - .filter_map(|query_ast| self.transform(query_ast).transpose()) - .collect::, _>>()?; - bool_query.filter = bool_query - .filter - .into_iter() - .filter_map(|query_ast| self.transform(query_ast).transpose()) - .collect::, _>>()?; - - Ok(Some(QueryAst::Bool(bool_query))) - } - - fn transform_range(&mut self, range_query: RangeQuery) -> Result, Self::Err> { - if range_query.field == self.timestamp_field { - match range_query.lower_bound { - Bound::Included(lower_bound) => { - self.update_start_timestamp(&lower_bound, true); - } - Bound::Excluded(lower_bound) => { - self.update_start_timestamp(&lower_bound, false); - } - Bound::Unbounded => (), - }; - - match range_query.upper_bound { - Bound::Included(upper_bound) => { - self.update_end_timestamp(&upper_bound, true); - } - Bound::Excluded(upper_bound) => { - self.update_end_timestamp(&upper_bound, false); - } - Bound::Unbounded => (), - }; - - Ok(Some(QueryAst::MatchAll)) - } else { - Ok(Some(range_query.into())) - } - } - - fn transform_term(&mut self, term_query: TermQuery) -> Result, Self::Err> { - // TODO we could remove query bounds, this point query surely is more precise, and it - // doesn't require loading a fastfield - Ok(Some(QueryAst::Term(term_query))) - } -} - pub(crate) fn rewrite_start_end_time_bounds( start_timestamp_opt: &mut Option, end_timestamp_opt: &mut Option, diff --git a/quickwit/quickwit-search/src/lib.rs b/quickwit/quickwit-search/src/lib.rs index babf2786fce..7a021fdb1a0 100644 --- a/quickwit/quickwit-search/src/lib.rs +++ b/quickwit/quickwit-search/src/lib.rs @@ -20,6 +20,7 @@ mod client; mod cluster_client; mod collector; +mod extract_timestamp_range; mod error; mod fetch_docs; mod filters; diff --git a/quickwit/quickwit-search/src/root.rs b/quickwit/quickwit-search/src/root.rs index aa27acd5d5d..eea2337753c 100644 --- a/quickwit/quickwit-search/src/root.rs +++ b/quickwit/quickwit-search/src/root.rs @@ -36,9 +36,7 @@ use quickwit_proto::search::{ SnippetRequest, SortDatetimeFormat, SortField, SortValue, SplitIdAndFooterOffsets, }; use quickwit_proto::types::{IndexUid, SplitId}; -use quickwit_query::query_ast::{ - BoolQuery, QueryAst, QueryAstVisitor, RangeQuery, TermQuery, TermSetQuery, -}; +use quickwit_query::query_ast::QueryAst; use serde::{Deserialize, Serialize}; use tantivy::TantivyError; use tantivy::aggregation::agg_result::AggregationResults; @@ -49,6 +47,7 @@ use tracing::{debug, info_span, instrument}; use crate::cluster_client::ClusterClient; use crate::collector::{QuickwitAggregations, make_merge_collector}; +use crate::extract_timestamp_range::extract_start_end_timestamp_from_ast; use crate::metrics::SEARCH_METRICS; use crate::scroll_context::{ScrollContext, ScrollKeyAndStartOffset}; use crate::search_job_placer::{Job, group_by, group_jobs_by_index_id}; @@ -1104,30 +1103,36 @@ pub fn check_all_index_metadata_found( async fn refine_and_list_matches( metastore: &mut MetastoreServiceClient, search_request: &mut SearchRequest, + request_metadata: &mut RequestMetadata, indexes_metadata: Vec, - query_ast_resolved: QueryAst, - sort_fields_is_datetime: HashMap, - timestamp_field_opt: Option, ) -> crate::Result> { let index_uids = indexes_metadata .iter() .map(|index_metadata| index_metadata.index_uid.clone()) .collect_vec(); - search_request.query_ast = serde_json::to_string(&query_ast_resolved)?; + + let RequestMetadata { + query_ast_resolved, + sort_fields_is_datetime, + timestamp_field_opt, + .. + } = request_metadata; // convert search_after datetime values from input datetime format to nanos. convert_search_after_datetime_values(search_request, &sort_fields_is_datetime)?; // update_search_after_datetime_in_nanos(&mut search_request)?; if let Some(timestamp_field) = ×tamp_field_opt { - refine_start_end_timestamp_from_ast( - &query_ast_resolved, + *query_ast_resolved = extract_start_end_timestamp_from_ast( + query_ast_resolved.clone(), timestamp_field, &mut search_request.start_timestamp, &mut search_request.end_timestamp, ); } - let tag_filter_ast = extract_tags_from_query(query_ast_resolved); + search_request.query_ast = serde_json::to_string(&query_ast_resolved)?; + + let tag_filter_ast = extract_tags_from_query(query_ast_resolved.clone()); // TODO if search after is set, we sort by timestamp and we don't want to count all results, // we can refine more here. Same if we sort by _shard_doc @@ -1182,14 +1187,12 @@ pub async fn root_search( return Ok(search_response); } - let request_metadata = validate_request_and_build_metadata(&indexes_metadata, &search_request)?; + let mut request_metadata = validate_request_and_build_metadata(&indexes_metadata, &search_request)?; let split_metadatas = refine_and_list_matches( &mut metastore, &mut search_request, + &mut request_metadata, indexes_metadata, - request_metadata.query_ast_resolved, - request_metadata.sort_fields_is_datetime, - request_metadata.timestamp_field_opt, ) .await?; @@ -1266,14 +1269,12 @@ pub async fn search_plan( ) .map_err(|err| SearchError::Internal(format!("failed to build doc mapper. cause: {err}")))?; - let request_metadata = validate_request_and_build_metadata(&indexes_metadata, &search_request)?; + let mut request_metadata = validate_request_and_build_metadata(&indexes_metadata, &search_request)?; let split_metadatas = refine_and_list_matches( &mut metastore, &mut search_request, + &mut request_metadata, indexes_metadata, - request_metadata.query_ast_resolved.clone(), - request_metadata.sort_fields_is_datetime, - request_metadata.timestamp_field_opt, ) .await?; @@ -1469,134 +1470,6 @@ fn convert_sort_datetime_value( Ok(()) } -pub(crate) fn refine_start_end_timestamp_from_ast( - query_ast: &QueryAst, - timestamp_field: &str, - start_timestamp: &mut Option, - end_timestamp: &mut Option, -) { - let mut timestamp_range_extractor = ExtractTimestampRange { - timestamp_field, - start_timestamp: *start_timestamp, - end_timestamp: *end_timestamp, - }; - timestamp_range_extractor - .visit(query_ast) - .expect("can't fail unwrapping Infallible"); - *start_timestamp = timestamp_range_extractor.start_timestamp; - *end_timestamp = timestamp_range_extractor.end_timestamp; -} - -/// Boundaries identified as being implied by the QueryAst. -/// -/// `start_timestamp` is to be interpreted as Inclusive (or Unbounded) -/// `end_timestamp` is to be interpreted as Exclusive (or Unbounded) -/// In other word, this is a `[start_timestamp..end_timestamp)` interval. -struct ExtractTimestampRange<'a> { - timestamp_field: &'a str, - start_timestamp: Option, - end_timestamp: Option, -} - -impl ExtractTimestampRange<'_> { - fn update_start_timestamp( - &mut self, - lower_bound: &quickwit_query::JsonLiteral, - included: bool, - ) { - use quickwit_query::InterpretUserInput; - let Some(lower_bound) = tantivy::DateTime::interpret_json(lower_bound) else { - return; - }; - let mut lower_bound = lower_bound.into_timestamp_secs(); - if !included { - // TODO saturating isn't exactly right, we should replace the RangeQuery with - // a match_none, but the visitor doesn't allow mutation. - lower_bound = lower_bound.saturating_add(1); - } - self.start_timestamp = Some( - self.start_timestamp - .map_or(lower_bound, |current| current.max(lower_bound)), - ); - } - - fn update_end_timestamp(&mut self, upper_bound: &quickwit_query::JsonLiteral, included: bool) { - use quickwit_query::InterpretUserInput; - let Some(upper_bound_timestamp) = tantivy::DateTime::interpret_json(upper_bound) else { - return; - }; - let mut upper_bound = upper_bound_timestamp.into_timestamp_secs(); - let round_up = (upper_bound_timestamp.into_timestamp_nanos() % 1_000_000_000) != 0; - if included || round_up { - // TODO saturating isn't exactly right, we should replace the RangeQuery with - // a match_none, but the visitor doesn't allow mutation. - upper_bound = upper_bound.saturating_add(1); - } - self.end_timestamp = Some( - self.end_timestamp - .map_or(upper_bound, |current| current.min(upper_bound)), - ); - } -} - -impl<'b> QueryAstVisitor<'b> for ExtractTimestampRange<'_> { - type Err = std::convert::Infallible; - - fn visit_bool(&mut self, bool_query: &'b BoolQuery) -> Result<(), Self::Err> { - // we only want to visit sub-queries which are strict (positive) requirements - for ast in bool_query.must.iter().chain(bool_query.filter.iter()) { - self.visit(ast)?; - } - Ok(()) - } - - fn visit_range(&mut self, range_query: &'b RangeQuery) -> Result<(), Self::Err> { - use std::ops::Bound; - - if range_query.field == self.timestamp_field { - match &range_query.lower_bound { - Bound::Included(lower_bound) => self.update_start_timestamp(lower_bound, true), - Bound::Excluded(lower_bound) => self.update_start_timestamp(lower_bound, false), - Bound::Unbounded => (), - } - match &range_query.upper_bound { - Bound::Included(upper_bound) => self.update_end_timestamp(upper_bound, true), - Bound::Excluded(upper_bound) => self.update_end_timestamp(upper_bound, false), - Bound::Unbounded => (), - } - } - Ok(()) - } - - // if we visit a term, limit the range to DATE..=DATE - fn visit_term(&mut self, term_query: &'b TermQuery) -> Result<(), Self::Err> { - if term_query.field == self.timestamp_field { - // TODO when fixing #3323, this may need to be modified to support numbers too - let json_term = quickwit_query::JsonLiteral::String(term_query.value.clone()); - self.update_start_timestamp(&json_term, true); - self.update_end_timestamp(&json_term, true); - } - Ok(()) - } - - // if we visit a termset, limit the range to LOWEST..=HIGHEST - fn visit_term_set(&mut self, term_query: &'b TermSetQuery) -> Result<(), Self::Err> { - if let Some(term_set) = term_query.terms_per_field.get(self.timestamp_field) { - // rfc3339 is lexicographically ordered if YEAR <= 9999, so we can use string - // ordering to get the start and end quickly. - if let Some(first) = term_set.first() { - let json_term = quickwit_query::JsonLiteral::String(first.clone()); - self.update_start_timestamp(&json_term, true); - } - if let Some(last) = term_set.last() { - let json_term = quickwit_query::JsonLiteral::String(last.clone()); - self.update_end_timestamp(&json_term, true); - } - } - Ok(()) - } -} - async fn assign_client_fetch_docs_jobs( partial_hits: &[PartialHit], split_metadatas: &[SplitMetadata], @@ -4117,118 +3990,6 @@ mod tests { Ok(()) } - #[test] - fn test_extract_timestamp_range_from_ast() { - use std::ops::Bound; - - use quickwit_query::JsonLiteral; - - let timestamp_field = "timestamp"; - - let simple_range = quickwit_query::query_ast::RangeQuery { - field: timestamp_field.to_string(), - lower_bound: Bound::Included(JsonLiteral::String("2021-04-13T22:45:41Z".to_owned())), - upper_bound: Bound::Excluded(JsonLiteral::String("2021-05-06T06:51:19Z".to_owned())), - } - .into(); - - // direct range - let mut timestamp_range_extractor = ExtractTimestampRange { - timestamp_field, - start_timestamp: None, - end_timestamp: None, - }; - timestamp_range_extractor.visit(&simple_range).unwrap(); - assert_eq!(timestamp_range_extractor.start_timestamp, Some(1618353941)); - assert_eq!(timestamp_range_extractor.end_timestamp, Some(1620283879)); - - // range inside a must bool query - let bool_query_must = quickwit_query::query_ast::BoolQuery { - must: vec![simple_range.clone()], - ..Default::default() - }; - timestamp_range_extractor.start_timestamp = None; - timestamp_range_extractor.end_timestamp = None; - timestamp_range_extractor - .visit(&bool_query_must.into()) - .unwrap(); - assert_eq!(timestamp_range_extractor.start_timestamp, Some(1618353941)); - assert_eq!(timestamp_range_extractor.end_timestamp, Some(1620283879)); - - // range inside a should bool query - let bool_query_should = quickwit_query::query_ast::BoolQuery { - should: vec![simple_range.clone()], - ..Default::default() - }; - timestamp_range_extractor.start_timestamp = Some(123); - timestamp_range_extractor.end_timestamp = None; - timestamp_range_extractor - .visit(&bool_query_should.into()) - .unwrap(); - assert_eq!(timestamp_range_extractor.start_timestamp, Some(123)); - assert_eq!(timestamp_range_extractor.end_timestamp, None); - - // start bound was already more restrictive - timestamp_range_extractor.start_timestamp = Some(1618601297); - timestamp_range_extractor.end_timestamp = Some(i64::MAX); - timestamp_range_extractor.visit(&simple_range).unwrap(); - assert_eq!(timestamp_range_extractor.start_timestamp, Some(1618601297)); - assert_eq!(timestamp_range_extractor.end_timestamp, Some(1620283879)); - - // end bound was already more restrictive - timestamp_range_extractor.start_timestamp = Some(1); - timestamp_range_extractor.end_timestamp = Some(1618601297); - timestamp_range_extractor.visit(&simple_range).unwrap(); - assert_eq!(timestamp_range_extractor.start_timestamp, Some(1618353941)); - assert_eq!(timestamp_range_extractor.end_timestamp, Some(1618601297)); - - // bounds are (start..end] instead of [start..end) - let unusual_bounds = quickwit_query::query_ast::RangeQuery { - field: timestamp_field.to_string(), - lower_bound: Bound::Excluded(JsonLiteral::String("2021-04-13T22:45:41Z".to_owned())), - upper_bound: Bound::Included(JsonLiteral::String("2021-05-06T06:51:19Z".to_owned())), - } - .into(); - timestamp_range_extractor.start_timestamp = None; - timestamp_range_extractor.end_timestamp = None; - timestamp_range_extractor.visit(&unusual_bounds).unwrap(); - assert_eq!(timestamp_range_extractor.start_timestamp, Some(1618353942)); - assert_eq!(timestamp_range_extractor.end_timestamp, Some(1620283880)); - - let wrong_field = quickwit_query::query_ast::RangeQuery { - field: "other_field".to_string(), - lower_bound: Bound::Included(JsonLiteral::String("2021-04-13T22:45:41Z".to_owned())), - upper_bound: Bound::Excluded(JsonLiteral::String("2021-05-06T06:51:19Z".to_owned())), - } - .into(); - timestamp_range_extractor.start_timestamp = None; - timestamp_range_extractor.end_timestamp = None; - timestamp_range_extractor.visit(&wrong_field).unwrap(); - assert_eq!(timestamp_range_extractor.start_timestamp, None); - assert_eq!(timestamp_range_extractor.end_timestamp, None); - - let high_precision = quickwit_query::query_ast::RangeQuery { - field: timestamp_field.to_string(), - lower_bound: Bound::Included(JsonLiteral::String( - "2021-04-13T22:45:41.001Z".to_owned(), - )), - upper_bound: Bound::Excluded(JsonLiteral::String( - "2021-05-06T06:51:19.001Z".to_owned(), - )), - } - .into(); - - // the upper bound should be rounded up as to includes documents from X.000 to X.001 - let mut timestamp_range_extractor = ExtractTimestampRange { - timestamp_field, - start_timestamp: None, - end_timestamp: None, - }; - timestamp_range_extractor.visit(&high_precision).unwrap(); - assert_eq!(timestamp_range_extractor.start_timestamp, Some(1618353941)); - assert_eq!(timestamp_range_extractor.end_timestamp, Some(1620283880)); - } - fn create_search_resp( index_uri: &str, hit_range: Range, diff --git a/quickwit/quickwit-search/src/search_stream/root.rs b/quickwit/quickwit-search/src/search_stream/root.rs index 5f76a9d29cd..bf2d5838b3e 100644 --- a/quickwit/quickwit-search/src/search_stream/root.rs +++ b/quickwit/quickwit-search/src/search_stream/root.rs @@ -27,7 +27,8 @@ use tokio_stream::StreamMap; use tracing::*; use crate::cluster_client::ClusterClient; -use crate::root::{SearchJob, refine_start_end_timestamp_from_ast}; +use crate::extract_timestamp_range::extract_start_end_timestamp_from_ast; +use crate::root::SearchJob; use crate::{SearchError, list_relevant_splits}; /// Perform a distributed search stream. @@ -55,12 +56,12 @@ pub async fn root_search_stream( let query_ast: QueryAst = serde_json::from_str(&search_stream_request.query_ast) .map_err(|err| SearchError::InvalidQuery(err.to_string()))?; - let query_ast_resolved = query_ast.parse_user_query(doc_mapper.default_search_fields())?; + let mut query_ast_resolved = query_ast.parse_user_query(doc_mapper.default_search_fields())?; let tags_filter_ast = extract_tags_from_query(query_ast_resolved.clone()); if let Some(timestamp_field) = doc_mapper.timestamp_field_name() { - refine_start_end_timestamp_from_ast( - &query_ast_resolved, + query_ast_resolved = extract_start_end_timestamp_from_ast( + query_ast_resolved, timestamp_field, &mut search_stream_request.start_timestamp, &mut search_stream_request.end_timestamp,