diff --git a/milli/src/lib.rs b/milli/src/lib.rs index 865195df5..a6601034b 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -100,6 +100,27 @@ pub fn absolute_from_relative_position(field_id: FieldId, relative: RelativePosi (field_id as u32) << 16 | (relative as u32) } +/// Compute the "bucketed" absolute position from the field id and relative position in the field. +/// +/// In a bucketed position, the accuracy of the relative position is reduced exponentially as it gets larger. +pub fn bucketed_absolute_from_relative_position( + field_id: FieldId, + relative: RelativePosition, +) -> Position { + // The first few relative positions are kept intact. + if relative < 16 { + absolute_from_relative_position(field_id, relative) + } else if relative < 24 { + // Relative positions between 16 and 24 all become equal to 24 + absolute_from_relative_position(field_id, 24) + } else { + // Then, groups of positions that have the same base-2 logarithm are reduced to + // the same relative position: the smallest power of 2 that is greater than them + let relative = (relative as f64).log2().ceil().exp2() as u16; + absolute_from_relative_position(field_id, relative) + } +} + /// Transform a raw obkv store into a JSON Object. 
/// Snapshot the bucketed absolute position for a range of field ids and relative positions.
#[test]
fn bucketed_position() {
    // Short alias: every assertion below reads as `bucketed(field id, relative position)`.
    let bucketed = bucketed_absolute_from_relative_position;

    // Field 0: relative positions below 16 come back unchanged.
    insta::assert_debug_snapshot!(bucketed(0, 0), @"0");
    insta::assert_debug_snapshot!(bucketed(0, 1), @"1");
    insta::assert_debug_snapshot!(bucketed(0, 2), @"2");
    insta::assert_debug_snapshot!(bucketed(0, 15), @"15");

    // Field 0: relative positions from 16 to 23 share the bucket 24.
    insta::assert_debug_snapshot!(bucketed(0, 16), @"24");
    insta::assert_debug_snapshot!(bucketed(0, 19), @"24");
    insta::assert_debug_snapshot!(bucketed(0, 20), @"24");
    insta::assert_debug_snapshot!(bucketed(0, 21), @"24");
    insta::assert_debug_snapshot!(bucketed(0, 22), @"24");
    insta::assert_debug_snapshot!(bucketed(0, 23), @"24");

    // Field 0: from 24 onwards the expected buckets are powers of two.
    insta::assert_debug_snapshot!(bucketed(0, 24), @"32");
    insta::assert_debug_snapshot!(bucketed(0, 25), @"32");
    insta::assert_debug_snapshot!(bucketed(0, 30), @"32");
    insta::assert_debug_snapshot!(bucketed(0, 40), @"64");
    insta::assert_debug_snapshot!(bucketed(0, 50), @"64");
    insta::assert_debug_snapshot!(bucketed(0, 60), @"64");
    insta::assert_debug_snapshot!(bucketed(0, 70), @"128");
    insta::assert_debug_snapshot!(bucketed(0, 80), @"128");
    insta::assert_debug_snapshot!(bucketed(0, 90), @"128");
    insta::assert_debug_snapshot!(bucketed(0, 100), @"128");
    insta::assert_debug_snapshot!(bucketed(0, 110), @"128");
    insta::assert_debug_snapshot!(bucketed(0, 120), @"128");
    insta::assert_debug_snapshot!(bucketed(0, 130), @"256");
    insta::assert_debug_snapshot!(bucketed(0, 1000), @"1024");
    insta::assert_debug_snapshot!(bucketed(0, 2000), @"2048");
    insta::assert_debug_snapshot!(bucketed(0, 4000), @"4096");
    insta::assert_debug_snapshot!(bucketed(0, 8000), @"8192");
    insta::assert_debug_snapshot!(bucketed(0, 9000), @"16384");
    insta::assert_debug_snapshot!(bucketed(0, 10_000), @"16384");

    // Field 0: the largest relative positions are expected to end up at 65535.
    insta::assert_debug_snapshot!(bucketed(0, 65_500), @"65535");
    insta::assert_debug_snapshot!(bucketed(0, 65_535), @"65535");

    // Field 1: same relative buckets, offset by the field id.
    insta::assert_debug_snapshot!(bucketed(1, 0), @"65536");
    insta::assert_debug_snapshot!(bucketed(1, 1), @"65537");
    insta::assert_debug_snapshot!(bucketed(1, 20), @"65560");
    insta::assert_debug_snapshot!(bucketed(1, 1000), @"66560");
    insta::assert_debug_snapshot!(bucketed(1, 65_535), @"131071");

    // Field 2: both ends of the relative range.
    insta::assert_debug_snapshot!(bucketed(2, 0), @"131072");
    insta::assert_debug_snapshot!(bucketed(2, 65_535), @"196607");

    // Largest field id: both ends of the relative range.
    insta::assert_debug_snapshot!(bucketed(65_535, 0), @"4294901760");
    insta::assert_debug_snapshot!(bucketed(65_535, 65_535), @"4294967295");
}
b389a5d1e..d5ca0b8ea 100644 --- a/milli/src/search/criteria/exactness.rs +++ b/milli/src/search/criteria/exactness.rs @@ -11,7 +11,9 @@ use crate::search::criteria::{ InitialCandidates, }; use crate::search::query_tree::{Operation, PrimitiveQueryPart}; -use crate::{absolute_from_relative_position, FieldId, Result}; +use crate::{ + absolute_from_relative_position, bucketed_absolute_from_relative_position, FieldId, Result, +}; pub struct Exactness<'t> { ctx: &'t dyn Context<'t>, @@ -285,30 +287,34 @@ fn attribute_start_with_docids( ) -> heed::Result> { let mut attribute_candidates_array = Vec::new(); // start from attribute first position - let mut pos = absolute_from_relative_position(attribute_id, 0); + let mut relative_pos = 0; for part in query { use ExactQueryPart::*; match part { Synonyms(synonyms) => { + let bucketed_position = + bucketed_absolute_from_relative_position(attribute_id, relative_pos); let mut synonyms_candidates = RoaringBitmap::new(); for word in synonyms { - let wc = ctx.word_position_docids(word, pos)?; + let wc = ctx.word_position_docids(word, bucketed_position)?; if let Some(word_candidates) = wc { synonyms_candidates |= word_candidates; } } attribute_candidates_array.push(synonyms_candidates); - pos += 1; + relative_pos += 1; } Phrase(phrase) => { for word in phrase { + let bucketed_position = + bucketed_absolute_from_relative_position(attribute_id, relative_pos); if let Some(word) = word { - let wc = ctx.word_position_docids(word, pos)?; + let wc = ctx.word_position_docids(word, bucketed_position)?; if let Some(word_candidates) = wc { attribute_candidates_array.push(word_candidates); } } - pos += 1; + relative_pos += 1; } } } diff --git a/milli/src/update/index_documents/extract/extract_word_position_docids.rs b/milli/src/update/index_documents/extract/extract_word_position_docids.rs index d95db4157..33e83dda9 100644 --- a/milli/src/update/index_documents/extract/extract_word_position_docids.rs +++ 
b/milli/src/update/index_documents/extract/extract_word_position_docids.rs @@ -7,7 +7,9 @@ use super::helpers::{ }; use crate::error::SerializationError; use crate::index::db_name::DOCID_WORD_POSITIONS; -use crate::{DocumentId, Result}; +use crate::{ + bucketed_absolute_from_relative_position, relative_from_absolute_position, DocumentId, Result, +}; /// Extracts the word positions and the documents ids where this word appear. /// @@ -37,9 +39,12 @@ pub fn extract_word_position_docids( let document_id = DocumentId::from_be_bytes(document_id_bytes); for position in read_u32_ne_bytes(value) { + let (field_id, relative) = relative_from_absolute_position(position); + let bucketed_position = bucketed_absolute_from_relative_position(field_id, relative); + key_buffer.clear(); key_buffer.extend_from_slice(word_bytes); - key_buffer.extend_from_slice(&position.to_be_bytes()); + key_buffer.extend_from_slice(&bucketed_position.to_be_bytes()); word_position_docids_sorter.insert(&key_buffer, document_id.to_ne_bytes())?; }