Skip to content

Commit 3147e9e

Browse files
committed
Increase fuzz testing of streaming group by / low cardinality columns
1 parent 818ce3f commit 3147e9e

File tree

3 files changed

+104
-49
lines changed

3 files changed

+104
-49
lines changed

datafusion/core/tests/fuzz_cases/aggregate_fuzz.rs

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -65,10 +65,6 @@ use crate::fuzz_cases::aggregation_fuzzer::{
6565
//
6666
// TODO: test other aggregate functions
6767
// - AVG (unstable given the wide range of inputs)
68-
//
69-
// TODO: specific test for ordering (ensure all group by columns are ordered)
70-
// Currently the data is sorted by random columns, so there are almost no
71-
// repeated runs. To improve coverage we should also sort by lower cardinality columns
7268
#[tokio::test(flavor = "multi_thread")]
7369
async fn test_min() {
7470
let data_gen_config = baseline_config();
@@ -172,15 +168,21 @@ fn baseline_config() -> DatasetGeneratorConfig {
172168
// TODO add support for utf8view in data generator
173169
// ColumnDescr::new("utf8view", DataType::Utf8View),
174170
// todo binary
171+
// low cardinality columns
172+
ColumnDescr::new("u8_low", DataType::UInt8).with_max_num_distinct(10),
173+
ColumnDescr::new("utf8_low", DataType::Utf8).with_max_num_distinct(10),
175174
];
176175

176+
let min_num_rows = 512;
177+
let max_num_rows = 1024;
178+
177179
DatasetGeneratorConfig {
178180
columns,
179-
rows_num_range: (512, 1024),
181+
rows_num_range: (min_num_rows, max_num_rows),
180182
sort_keys_set: vec![
181183
// low cardinality to try and get many repeated runs
182-
vec![String::from("u8")],
183-
vec![String::from("utf8"), String::from("u8")],
184+
vec![String::from("u8_low")],
185+
vec![String::from("utf8_low"), String::from("u8_low")],
184186
],
185187
}
186188
}

datafusion/core/tests/fuzz_cases/aggregation_fuzzer/data_generator.rs

Lines changed: 64 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -170,11 +170,16 @@ impl Dataset {
170170

171171
#[derive(Debug, Clone)]
172172
pub struct ColumnDescr {
173-
// Column name
173+
/// Column name
174174
name: String,
175175

176-
// Data type of this column
176+
/// Data type of this column
177177
column_type: DataType,
178+
179+
/// The maximum number of distinct values in this column.
180+
///
181+
/// See [`ColumnDescr::with_max_num_distinct`] for more information
182+
max_num_distinct: Option<usize>,
178183
}
179184

180185
impl ColumnDescr {
@@ -183,8 +188,18 @@ impl ColumnDescr {
183188
Self {
184189
name: name.to_string(),
185190
column_type,
191+
max_num_distinct: None,
186192
}
187193
}
194+
195+
/// set the maximum number of distinct values in this column
196+
///
197+
/// If `None`, the number of distinct values is randomly selected between 1
198+
/// and the number of rows.
199+
pub fn with_max_num_distinct(mut self, num_distinct: usize) -> Self {
200+
self.max_num_distinct = Some(num_distinct);
201+
self
202+
}
188203
}
189204

190205
/// Record batch generator
@@ -199,20 +214,15 @@ struct RecordBatchGenerator {
199214
}
200215

201216
macro_rules! generate_string_array {
202-
($SELF:ident, $NUM_ROWS:ident, $BATCH_GEN_RNG:ident, $ARRAY_GEN_RNG:ident, $OFFSET_TYPE:ty) => {{
217+
($SELF:ident, $NUM_ROWS:ident, $MAX_NUM_DISTINCT:expr, $BATCH_GEN_RNG:ident, $ARRAY_GEN_RNG:ident, $OFFSET_TYPE:ty) => {{
203218
let null_pct_idx = $BATCH_GEN_RNG.gen_range(0..$SELF.candidate_null_pcts.len());
204219
let null_pct = $SELF.candidate_null_pcts[null_pct_idx];
205220
let max_len = $BATCH_GEN_RNG.gen_range(1..50);
206-
let num_distinct_strings = if $NUM_ROWS > 1 {
207-
$BATCH_GEN_RNG.gen_range(1..$NUM_ROWS)
208-
} else {
209-
$NUM_ROWS
210-
};
211221

212222
let mut generator = StringArrayGenerator {
213223
max_len,
214224
num_strings: $NUM_ROWS,
215-
num_distinct_strings,
225+
num_distinct_strings: $MAX_NUM_DISTINCT,
216226
null_pct,
217227
rng: $ARRAY_GEN_RNG,
218228
};
@@ -222,19 +232,14 @@ macro_rules! generate_string_array {
222232
}
223233

224234
macro_rules! generate_primitive_array {
225-
($SELF:ident, $NUM_ROWS:ident, $BATCH_GEN_RNG:ident, $ARRAY_GEN_RNG:ident, $DATA_TYPE:ident) => {
235+
($SELF:ident, $NUM_ROWS:ident, $MAX_NUM_DISTINCT:expr, $BATCH_GEN_RNG:ident, $ARRAY_GEN_RNG:ident, $DATA_TYPE:ident) => {
226236
paste::paste! {{
227237
let null_pct_idx = $BATCH_GEN_RNG.gen_range(0..$SELF.candidate_null_pcts.len());
228238
let null_pct = $SELF.candidate_null_pcts[null_pct_idx];
229-
let num_distinct_primitives = if $NUM_ROWS > 1 {
230-
$BATCH_GEN_RNG.gen_range(1..$NUM_ROWS)
231-
} else {
232-
$NUM_ROWS
233-
};
234239

235240
let mut generator = PrimitiveArrayGenerator {
236241
num_primitives: $NUM_ROWS,
237-
num_distinct_primitives,
242+
num_distinct_primitives: $MAX_NUM_DISTINCT,
238243
null_pct,
239244
rng: $ARRAY_GEN_RNG,
240245
};
@@ -264,7 +269,7 @@ impl RecordBatchGenerator {
264269
let mut arrays = Vec::with_capacity(self.columns.len());
265270
for col in self.columns.iter() {
266271
let array = self.generate_array_of_type(
267-
col.column_type.clone(),
272+
col,
268273
num_rows,
269274
&mut rng,
270275
array_gen_rng.clone(),
@@ -285,16 +290,28 @@ impl RecordBatchGenerator {
285290

286291
fn generate_array_of_type(
287292
&self,
288-
data_type: DataType,
293+
col: &ColumnDescr,
289294
num_rows: usize,
290295
batch_gen_rng: &mut ThreadRng,
291296
array_gen_rng: StdRng,
292297
) -> ArrayRef {
293-
match data_type {
298+
let num_distinct = if num_rows > 1 {
299+
batch_gen_rng.gen_range(1..num_rows)
300+
} else {
301+
num_rows
302+
};
303+
// cap to at most the num_distinct values
304+
let max_num_distinct = col
305+
.max_num_distinct
306+
.map(|max| num_distinct.min(max))
307+
.unwrap_or(num_distinct);
308+
309+
match col.column_type {
294310
DataType::Int8 => {
295311
generate_primitive_array!(
296312
self,
297313
num_rows,
314+
max_num_distinct,
298315
batch_gen_rng,
299316
array_gen_rng,
300317
i8
@@ -304,6 +321,7 @@ impl RecordBatchGenerator {
304321
generate_primitive_array!(
305322
self,
306323
num_rows,
324+
max_num_distinct,
307325
batch_gen_rng,
308326
array_gen_rng,
309327
i16
@@ -313,6 +331,7 @@ impl RecordBatchGenerator {
313331
generate_primitive_array!(
314332
self,
315333
num_rows,
334+
max_num_distinct,
316335
batch_gen_rng,
317336
array_gen_rng,
318337
i32
@@ -322,6 +341,7 @@ impl RecordBatchGenerator {
322341
generate_primitive_array!(
323342
self,
324343
num_rows,
344+
max_num_distinct,
325345
batch_gen_rng,
326346
array_gen_rng,
327347
i64
@@ -331,6 +351,7 @@ impl RecordBatchGenerator {
331351
generate_primitive_array!(
332352
self,
333353
num_rows,
354+
max_num_distinct,
334355
batch_gen_rng,
335356
array_gen_rng,
336357
u8
@@ -340,6 +361,7 @@ impl RecordBatchGenerator {
340361
generate_primitive_array!(
341362
self,
342363
num_rows,
364+
max_num_distinct,
343365
batch_gen_rng,
344366
array_gen_rng,
345367
u16
@@ -349,6 +371,7 @@ impl RecordBatchGenerator {
349371
generate_primitive_array!(
350372
self,
351373
num_rows,
374+
max_num_distinct,
352375
batch_gen_rng,
353376
array_gen_rng,
354377
u32
@@ -358,6 +381,7 @@ impl RecordBatchGenerator {
358381
generate_primitive_array!(
359382
self,
360383
num_rows,
384+
max_num_distinct,
361385
batch_gen_rng,
362386
array_gen_rng,
363387
u64
@@ -367,6 +391,7 @@ impl RecordBatchGenerator {
367391
generate_primitive_array!(
368392
self,
369393
num_rows,
394+
max_num_distinct,
370395
batch_gen_rng,
371396
array_gen_rng,
372397
f32
@@ -376,19 +401,34 @@ impl RecordBatchGenerator {
376401
generate_primitive_array!(
377402
self,
378403
num_rows,
404+
max_num_distinct,
379405
batch_gen_rng,
380406
array_gen_rng,
381407
f64
382408
)
383409
}
384410
DataType::Utf8 => {
385-
generate_string_array!(self, num_rows, batch_gen_rng, array_gen_rng, i32)
411+
generate_string_array!(
412+
self,
413+
num_rows,
414+
max_num_distinct,
415+
batch_gen_rng,
416+
array_gen_rng,
417+
i32
418+
)
386419
}
387420
DataType::LargeUtf8 => {
388-
generate_string_array!(self, num_rows, batch_gen_rng, array_gen_rng, i64)
421+
generate_string_array!(
422+
self,
423+
num_rows,
424+
max_num_distinct,
425+
batch_gen_rng,
426+
array_gen_rng,
427+
i64
428+
)
389429
}
390430
_ => {
391-
panic!("Unsupported data generator type: {data_type}")
431+
panic!("Unsupported data generator type: {}", col.column_type)
392432
}
393433
}
394434
}
@@ -413,14 +453,8 @@ mod test {
413453
// - Their rows num should be same and between [16, 32]
414454
let config = DatasetGeneratorConfig {
415455
columns: vec![
416-
ColumnDescr {
417-
name: "a".to_string(),
418-
column_type: DataType::Utf8,
419-
},
420-
ColumnDescr {
421-
name: "b".to_string(),
422-
column_type: DataType::UInt32,
423-
},
456+
ColumnDescr::new("a", DataType::Utf8),
457+
ColumnDescr::new("b", DataType::UInt32),
424458
],
425459
rows_num_range: (16, 32),
426460
sort_keys_set: vec![vec!["b".to_string()]],

datafusion/core/tests/fuzz_cases/aggregation_fuzzer/fuzzer.rs

Lines changed: 31 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -63,17 +63,35 @@ impl AggregationFuzzerBuilder {
6363
}
6464

6565
/// Adds random SQL queries to the fuzzer along with the table name
66-
pub fn add_query_builder(mut self, query_builder: QueryBuilder) -> Self {
67-
const NUM_QUERIES: usize = 10;
66+
///
67+
/// Adds
68+
/// - 3 random queries
69+
/// - 3 random queries for each group by selected from the sort keys
70+
/// - 1 random query with no grouping
71+
pub fn add_query_builder(mut self, mut query_builder: QueryBuilder) -> Self {
72+
const NUM_QUERIES: usize = 3;
6873
for _ in 0..NUM_QUERIES {
69-
self = self.add_sql(&query_builder.generate_query());
74+
let sql = query_builder.generate_query();
75+
self.candidate_sqls.push(Arc::from(sql));
7076
}
71-
self.table_name(query_builder.table_name())
72-
}
73-
74-
fn add_sql(mut self, sql: &str) -> Self {
77+
// also add several queries limited to grouping on the group by columns only, if any
78+
// So if the data is sorted on `a,b` only group by `a,b` or`a` or `b`
79+
if let Some(data_gen_config) = &self.data_gen_config {
80+
for sort_keys in &data_gen_config.sort_keys_set {
81+
let group_by_columns = sort_keys.iter().map(|s| s.as_str());
82+
query_builder = query_builder.with_group_by_columns(group_by_columns);
83+
for _ in 0..NUM_QUERIES {
84+
let sql = query_builder.generate_query();
85+
self.candidate_sqls.push(Arc::from(sql));
86+
}
87+
}
88+
}
89+
// also add a query with no grouping
90+
query_builder = query_builder.with_group_by_columns(vec![]);
91+
let sql = query_builder.generate_query();
7592
self.candidate_sqls.push(Arc::from(sql));
76-
self
93+
94+
self.table_name(query_builder.table_name())
7795
}
7896

7997
pub fn table_name(mut self, table_name: &str) -> Self {
@@ -359,7 +377,7 @@ fn format_batches_with_limit(batches: &[RecordBatch]) -> impl std::fmt::Display
359377
/// ```sql
360378
/// SELECT AGG(..) FROM table_name GROUP BY <group_by_columns>
361379
///```
362-
#[derive(Debug, Default)]
380+
#[derive(Debug, Default, Clone)]
363381
pub struct QueryBuilder {
364382
/// The name of the table to query
365383
table_name: String,
@@ -417,8 +435,7 @@ impl QueryBuilder {
417435
mut self,
418436
group_by: impl IntoIterator<Item = &'a str>,
419437
) -> Self {
420-
let group_by = group_by.into_iter().map(String::from);
421-
self.group_by_columns.extend(group_by);
438+
self.group_by_columns = group_by.into_iter().map(String::from).collect();
422439
self
423440
}
424441

@@ -497,7 +514,9 @@ impl QueryBuilder {
497514

498515
let mut already_used = HashSet::new();
499516
let mut group_by = vec![];
500-
while group_by.len() < num_group_by {
517+
while group_by.len() < num_group_by
518+
&& already_used.len() != self.group_by_columns.len()
519+
{
501520
let idx = rng.gen_range(0..self.group_by_columns.len());
502521
if already_used.insert(idx) {
503522
group_by.push(self.group_by_columns[idx].clone());

0 commit comments

Comments
 (0)