Skip to content
This repository was archived by the owner on Apr 4, 2023. It is now read-only.

Commit cf5c934

Browse files
committed
Add a fuzz test for index operations (clear, import, delete, settings)
It is very limited so far. It is meant to catch bugs with soft-deleted document ids.
1 parent e44ca84 commit cf5c934

File tree

4 files changed

+356
-3
lines changed

4 files changed

+356
-3
lines changed

.gitignore

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,5 +16,5 @@ milli/target/
1616
## ... unreviewed
1717
*.snap.new
1818

19-
# Fuzzcheck data for the facet indexing fuzz test
20-
milli/fuzz/update::facet::incremental::fuzz::fuzz/
19+
# Fuzzcheck data
20+
milli/fuzz/*

milli/src/lib.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
#![cfg_attr(all(test, fuzzing), feature(no_coverage))]
1+
#![cfg_attr(all(test, fuzzing), feature(no_coverage, once_cell))]
22
#[macro_use]
33
pub mod documents;
44

milli/src/update/fuzz.rs

Lines changed: 351 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,351 @@
1+
// Things tested by this fuzz test
2+
//
3+
// - A few different document identifiers only
4+
// - Simple setting updates (searchable and filterable attributes only)
5+
// - Document Deletion (given existing or nonexistent external document ids)
6+
// - Clear Documents
7+
// - Batched document imports
8+
// - Update/Replacement of existing documents
9+
// - Each operation with and without soft deletion
10+
// - Empty document imports
11+
// - No crash should ever happen
12+
13+
// A small sample of what isn't tested:
14+
//
15+
// - The correctness of the indexing operations
16+
// - Indexing mistakes that happen when many different documents are inserted
17+
// - Long batches of document imports
18+
// - Nested fields (not tested well anyway)
19+
// - Any search result
20+
// - Arbitrary document contents
21+
// (instead, the components of the documents are pre-written manually)
22+
// - Index creation / Deletion
23+
// - Autogenerated docids
24+
// - Indexing for geosearch
25+
// - Documents with too many field ids or too many words in a field id
26+
// - Anything related to the prefix databases
27+
// - Incorrect setting updates
28+
// - The logic that chooses between soft and hard deletion
29+
// (the choice is instead set manually for each operation)
30+
// - Different IndexerConfig parameters
31+
32+
// Efficiency tips:
33+
//
34+
// - Use a RAM disk (see https://stackoverflow.com/questions/46224103/create-apfs-ram-disk-on-macos-high-sierra)
35+
// - change the value of the TMPDIR environment variable to a folder in the RAM disk
36+
37+
// Quality:
38+
// - finds issue 2945 if either of the two fixes listed below is missing (within a few minutes)
39+
// - issue 2945: https://github.com/meilisearch/meilisearch/issues/2945
40+
// - fix 1: https://github.com/meilisearch/milli/pull/723
41+
// - fix 2: https://github.com/meilisearch/milli/pull/734
42+
// - but doesn't detect anything wrong if this fix is not included: https://github.com/meilisearch/milli/pull/690
43+
// - because it doesn't cause any crash, I think
44+
// - each fuzz test iteration is quite slow
45+
// - for this fuzz test in particular, it is good to let it run for a few hours, or even a day
46+
47+
use std::hash::Hash;
48+
use std::sync::LazyLock;
49+
50+
use fuzzcheck::mutators::integer_within_range::U8WithinRangeMutator;
51+
use fuzzcheck::mutators::option::OptionMutator;
52+
use fuzzcheck::mutators::unique::UniqueMutator;
53+
use fuzzcheck::mutators::vector::VecMutator;
54+
use fuzzcheck::DefaultMutator;
55+
use heed::{EnvOpenOptions, RwTxn};
56+
use serde::{Deserialize, Serialize};
57+
use tempfile::TempDir;
58+
59+
use super::{
60+
ClearDocuments, DeleteDocuments, IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings,
61+
};
62+
use crate::Index;
63+
64+
/// The list of document identifiers that we choose to test
65+
static DOCUMENT_IDENTIFIERS: LazyLock<Vec<serde_json::Value>> = LazyLock::new(|| {
66+
let mut ids = vec![];
67+
for i in 0..10 {
68+
ids.push(serde_json::json!(i));
69+
ids.push(serde_json::json!(format!("{i}")));
70+
}
71+
ids.push(serde_json::json!("complex-ID-1_2"));
72+
ids.push(serde_json::json!("1-2-3-4"));
73+
ids.push(serde_json::json!("invalidsupercalifragilisticexpialidocioussupercalifragilisticexpialidocioussupercalifragilisticexpialidocioussupercalifragilisticexpialidocioussupercalifragilisticexpialidocioussupercalifragilisticexpialidocioussupercalifragilisticexpialidocioussupercalifragilisticexpialidocious"));
74+
ids.push(serde_json::json!("invalid.id"));
75+
ids
76+
});
77+
78+
/// The list of field values that we choose to test
79+
static FIELD_VALUES: LazyLock<Vec<serde_json::Value>> = LazyLock::new(|| {
80+
let mut vals = vec![];
81+
for i in 0..10i32 {
82+
vals.push(serde_json::json!(i));
83+
vals.push(serde_json::json!((i as f64) / 3.4));
84+
vals.push(serde_json::json!(111.1_f32.powi(i)));
85+
vals.push(serde_json::json!(format!("{i}")));
86+
vals.push(serde_json::json!([i - 1, format!("{i}"), i + 1, format!("{}", i - 1), i - 2]));
87+
vals.push(serde_json::json!(format!("{}", "a".repeat(i as usize))));
88+
}
89+
vals.push(serde_json::json!({ "nested": ["value", { "nested": ["value", "value", "the quick brown fox jumps over the lazy dog, wow!"] }], "value": 0}));
90+
vals.push(serde_json::json!("the quick brown fox jumps over the lazy dog, wow!"));
91+
vals.push(serde_json::json!("the quick brown supercalifragilisticexpialidocioussupercalifragilisticexpialidocioussupercalifragilisticexpialidocioussupercalifragilisticexpialidocioussupercalifragilisticexpialidocioussupercalifragilisticexpialidocioussupercalifragilisticexpialidocioussupercalifragilisticexpialidocious fox jumps over the lazy dog"));
92+
vals.push(serde_json::json!({ "lat": 23.0, "lon": 22.1 }));
93+
vals.push(serde_json::json!({ "lat": 23.0, "lon": 22.1, "other": 10.0 }));
94+
vals.push(serde_json::json!({ "lat": -23.0, "lon": -22.1 }));
95+
vals.push(serde_json::json!({ "lat": 93.0, "lon": 22.1 }));
96+
vals.push(serde_json::json!({ "lat": 90.0, "lon": 221.1 }));
97+
vals
98+
});
99+
/// The fixed pool of field keys exercised by the fuzz test.
///
/// For each root key the pool holds the root itself followed by
/// `"<root>.<suffix>"` for every suffix, in declaration order.
static FIELD_KEYS: LazyLock<Vec<String>> = LazyLock::new(|| {
    let roots = ["identifier", "field1", "field2", "_geo"];
    let suffixes =
        ["nested", "value", "nested.value", "nested.value.nested", "_geo", "lat", "lon", "other"];

    let mut keys = Vec::with_capacity(roots.len() * (suffixes.len() + 1));
    for root in roots {
        keys.push(root.to_owned());
        keys.extend(suffixes.iter().map(|suffix| format!("{root}.{suffix}")));
    }
    keys
});
122+
fn document_identifier(i: u8) -> serde_json::Value {
123+
DOCUMENT_IDENTIFIERS[i as usize].clone()
124+
}
125+
fn field_key(i: u8) -> String {
126+
FIELD_KEYS[i as usize].clone()
127+
}
128+
fn field_value(i: u8) -> serde_json::Value {
129+
FIELD_VALUES[i as usize].clone()
130+
}
131+
fn document_identifier_index_mutator() -> U8WithinRangeMutator {
132+
U8WithinRangeMutator::new(..DOCUMENT_IDENTIFIERS.len() as u8)
133+
}
134+
fn field_key_index_mutator() -> U8WithinRangeMutator {
135+
U8WithinRangeMutator::new(..FIELD_KEYS.len() as u8)
136+
}
137+
fn field_value_index_mutator() -> U8WithinRangeMutator {
138+
U8WithinRangeMutator::new(..FIELD_VALUES.len() as u8)
139+
}
140+
141+
#[derive(Debug, Clone, Serialize, Deserialize, DefaultMutator, PartialEq, Eq, Hash)]
142+
enum Operation {
143+
SettingsUpdate(SettingsUpdate),
144+
DocumentImport(DocumentImport),
145+
DocumentDeletion(DocumentDeletion),
146+
Clear,
147+
}
148+
#[derive(Debug, Clone, Serialize, Deserialize, DefaultMutator, PartialEq, Eq, Hash)]
149+
enum Method {
150+
Update,
151+
Replace,
152+
}
153+
#[derive(Debug, Clone, Serialize, Deserialize, DefaultMutator, PartialEq, Eq, Hash)]
154+
struct DocumentImport {
155+
disable_soft_deletion: bool,
156+
method: Method,
157+
documents: DocumentImportBatch,
158+
}
159+
#[derive(Debug, Clone, Serialize, Deserialize, DefaultMutator, PartialEq, Eq, Hash)]
160+
struct SettingsUpdate {
161+
// Adding filterable fields slows down the fuzzer a lot
162+
// #[field_mutator(OptionMutator<Vec<u8>, VecMutator<u8, U8WithinRangeMutator>> = {
163+
// OptionMutator::new(VecMutator::new(field_key_index_mutator(), 0..=10))
164+
// })]
165+
// filterable_fields: Option<Vec<u8>>,
166+
#[field_mutator(OptionMutator<Vec<u8>, VecMutator<u8, U8WithinRangeMutator>> = {
167+
OptionMutator::new(VecMutator::new(field_key_index_mutator(), 0..=10))
168+
})]
169+
searchable_fields: Option<Vec<u8>>,
170+
}
171+
#[derive(Debug, Clone, Serialize, Deserialize, DefaultMutator, PartialEq, Eq, Hash)]
172+
struct DocumentDeletion {
173+
disable_soft_deletion: bool,
174+
#[field_mutator(VecMutator<u8, U8WithinRangeMutator> = {
175+
VecMutator::new(document_identifier_index_mutator(), 0..=10)
176+
})]
177+
external_document_ids: Vec<u8>,
178+
}
179+
#[derive(Debug, Clone, Serialize, Deserialize, DefaultMutator, PartialEq, Eq, Hash)]
180+
struct Document {
181+
#[field_mutator(U8WithinRangeMutator = { document_identifier_index_mutator() })]
182+
identifier: u8,
183+
#[field_mutator(OptionMutator<u8, U8WithinRangeMutator> = {
184+
OptionMutator::new(field_value_index_mutator())
185+
})]
186+
field1: Option<u8>,
187+
#[field_mutator(OptionMutator<u8, U8WithinRangeMutator> = {
188+
OptionMutator::new(field_value_index_mutator())
189+
})]
190+
field2: Option<u8>,
191+
}
192+
#[derive(Debug, Clone, Serialize, Deserialize, DefaultMutator, PartialEq, Eq, Hash)]
193+
struct DocumentImportBatch {
194+
#[field_mutator(VecMutator<Document, DocumentMutator> = {
195+
VecMutator::new(Document::default_mutator(), 0..=10)
196+
})]
197+
docs1: Vec<Document>,
198+
#[field_mutator(VecMutator<Document, DocumentMutator> = {
199+
VecMutator::new(Document::default_mutator(), 0..=5)
200+
})]
201+
docs2: Vec<Document>,
202+
}
203+
204+
fn apply_document_deletion<'i>(
205+
wtxn: &mut RwTxn<'i, '_>,
206+
index: &'i Index,
207+
deletion: &DocumentDeletion,
208+
) {
209+
let DocumentDeletion { disable_soft_deletion, external_document_ids } = deletion;
210+
let mut builder = DeleteDocuments::new(wtxn, index).unwrap();
211+
builder.disable_soft_deletion(*disable_soft_deletion);
212+
for id in external_document_ids {
213+
let id = document_identifier(*id);
214+
let id = match id {
215+
serde_json::Value::Number(n) => format!("{n}"),
216+
serde_json::Value::String(s) => s,
217+
_ => panic!(),
218+
};
219+
let _ = builder.delete_external_id(id.as_str());
220+
}
221+
builder.execute().unwrap();
222+
}
223+
224+
fn apply_document_import<'i>(wtxn: &mut RwTxn<'i, '_>, index: &'i Index, import: &DocumentImport) {
225+
let DocumentImport {
226+
disable_soft_deletion,
227+
method,
228+
documents: DocumentImportBatch { docs1, docs2 },
229+
} = import;
230+
let indexer_config = IndexerConfig::default();
231+
let mut builder = IndexDocuments::new(
232+
wtxn,
233+
index,
234+
&indexer_config,
235+
IndexDocumentsConfig {
236+
update_method: match method {
237+
Method::Update => super::IndexDocumentsMethod::UpdateDocuments,
238+
Method::Replace => super::IndexDocumentsMethod::ReplaceDocuments,
239+
},
240+
disable_soft_deletion: *disable_soft_deletion,
241+
autogenerate_docids: false,
242+
..IndexDocumentsConfig::default()
243+
},
244+
|_| {},
245+
|| false,
246+
)
247+
.unwrap();
248+
249+
let make_real_docs = |docs: &Vec<Document>| {
250+
docs.iter()
251+
.map(|doc| {
252+
let Document { identifier, field1, field2 } = doc;
253+
let mut object = crate::Object::new();
254+
let identifier = document_identifier(*identifier);
255+
object.insert("identifier".to_owned(), serde_json::json!(identifier));
256+
if let Some(field1) = field1 {
257+
let field1 = field_value(*field1);
258+
object.insert("field1".to_owned(), field1);
259+
}
260+
if let Some(field2) = field2 {
261+
let field2 = field_value(*field2);
262+
object.insert("field2".to_owned(), field2);
263+
}
264+
object
265+
})
266+
.collect::<Vec<_>>()
267+
};
268+
269+
let docs1 = make_real_docs(docs1);
270+
271+
let (new_builder, _user_error) = builder.add_documents(documents!(docs1)).unwrap();
272+
builder = new_builder;
273+
274+
let docs2 = make_real_docs(docs2);
275+
276+
let (new_builder, _user_error) = builder.add_documents(documents!(docs2)).unwrap();
277+
builder = new_builder;
278+
279+
let _ = builder.execute().unwrap();
280+
}
281+
282+
fn apply_settings_update<'i>(
283+
wtxn: &mut RwTxn<'i, '_>,
284+
index: &'i Index,
285+
settings: &SettingsUpdate,
286+
) {
287+
let SettingsUpdate { searchable_fields /* , filterable_fields */ } = settings;
288+
let indexer_config = IndexerConfig::default();
289+
let mut settings = Settings::new(wtxn, index, &indexer_config);
290+
// match filterable_fields {
291+
// Some(fields) => {
292+
// let fields = fields.iter().map(|f| field_key(*f)).collect();
293+
// settings.set_filterable_fields(fields);
294+
// }
295+
// None => settings.reset_filterable_fields(),
296+
// }
297+
match searchable_fields {
298+
Some(fields) => {
299+
let fields = fields.iter().map(|f| field_key(*f)).collect();
300+
settings.set_searchable_fields(fields);
301+
}
302+
None => settings.reset_searchable_fields(),
303+
}
304+
settings.execute(|_| {}, || false).unwrap();
305+
}
306+
307+
fn apply_operation<'i>(wtxn: &mut RwTxn<'i, '_>, index: &'i Index, operation: &Operation) {
308+
match operation {
309+
Operation::SettingsUpdate(settings) => apply_settings_update(wtxn, index, settings),
310+
Operation::DocumentImport(import) => apply_document_import(wtxn, index, import),
311+
Operation::DocumentDeletion(deletion) => apply_document_deletion(wtxn, index, deletion),
312+
Operation::Clear => {
313+
let builder = ClearDocuments::new(wtxn, index);
314+
let _result = builder.execute().unwrap();
315+
}
316+
}
317+
}
318+
319+
/// Fuzz-tests sequences of index operations (settings updates, document
/// imports, document deletions, clears) and fails if fuzzcheck reports a
/// failing input.
///
/// An index with `"identifier"` as primary key is created once. Each test
/// input is a list of up to 20 operations applied inside one write
/// transaction, which is aborted afterwards so every input starts from the
/// same index state.
#[test]
fn fuzz() {
    // Create the index under the system temporary directory instead of a
    // hard-coded macOS volume, so the test runs on any machine. To use a RAM
    // disk, point the TMPDIR environment variable at it.
    let tempdir = TempDir::new().unwrap();

    let mut options = EnvOpenOptions::new();
    options.map_size(4096 * 1000 * 1000);

    let index = {
        let index = Index::new(options, tempdir.path()).unwrap();
        let mut wtxn = index.write_txn().unwrap();
        let indexer_config = IndexerConfig::default();
        let mut settings = Settings::new(&mut wtxn, &index, &indexer_config);
        settings.set_primary_key("identifier".to_owned());
        settings.execute(|_| {}, || false).unwrap();
        wtxn.commit().unwrap();
        index
    };

    let result = fuzzcheck::fuzz_test(move |operations: &[Operation]| {
        let mut wtxn = index.write_txn().unwrap();
        for operation in operations {
            apply_operation(&mut wtxn, &index, operation);
        }
        // Discard every change made by this input.
        wtxn.abort().unwrap();
    })
    // We use a bloom filter (through UniqueMutator) to prevent the same test input from being tested too many times
    .mutator(UniqueMutator::new(VecMutator::new(Operation::default_mutator(), 0..=20), |x| x))
    .serde_serializer()
    .default_sensor_and_pool()
    .arguments_from_cargo_fuzzcheck()
    .launch();
    assert!(!result.found_test_failure);
}

milli/src/update/mod.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,8 @@ mod available_documents_ids;
1818
mod clear_documents;
1919
mod delete_documents;
2020
pub(crate) mod facet;
21+
#[cfg(all(fuzzing, test))]
22+
mod fuzz;
2123
mod index_documents;
2224
mod indexer_config;
2325
mod prefix_word_pairs;

0 commit comments

Comments
 (0)