Skip to content

Commit 7d34ccc

Browse files
authored
Increase fuzz testing of streaming group by / low cardinality columns (#12990)
1 parent 2d7892b commit 7d34ccc

File tree

3 files changed

+113
-56
lines changed

3 files changed

+113
-56
lines changed

datafusion/core/tests/fuzz_cases/aggregate_fuzz.rs

Lines changed: 13 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -65,10 +65,6 @@ use crate::fuzz_cases::aggregation_fuzzer::{
6565
//
6666
// TODO: test other aggregate functions
6767
// - AVG (unstable given the wide range of inputs)
68-
//
69-
// TODO: specific test for ordering (ensure all group by columns are ordered)
70-
// Currently the data is sorted by random columns, so there are almost no
71-
// repeated runs. To improve coverage we should also sort by lower cardinality columns
7268
#[tokio::test(flavor = "multi_thread")]
7369
async fn test_min() {
7470
let data_gen_config = baseline_config();
@@ -79,7 +75,7 @@ async fn test_min() {
7975
.with_aggregate_function("min")
8076
// min works on all column types
8177
.with_aggregate_arguments(data_gen_config.all_columns())
82-
.with_group_by_columns(data_gen_config.all_columns());
78+
.set_group_by_columns(data_gen_config.all_columns());
8379

8480
AggregationFuzzerBuilder::from(data_gen_config)
8581
.add_query_builder(query_builder)
@@ -98,7 +94,7 @@ async fn test_max() {
9894
.with_aggregate_function("max")
9995
// max works on all column types
10096
.with_aggregate_arguments(data_gen_config.all_columns())
101-
.with_group_by_columns(data_gen_config.all_columns());
97+
.set_group_by_columns(data_gen_config.all_columns());
10298

10399
AggregationFuzzerBuilder::from(data_gen_config)
104100
.add_query_builder(query_builder)
@@ -118,7 +114,7 @@ async fn test_sum() {
118114
.with_distinct_aggregate_function("sum")
119115
// sum only works on numeric columns
120116
.with_aggregate_arguments(data_gen_config.numeric_columns())
121-
.with_group_by_columns(data_gen_config.all_columns());
117+
.set_group_by_columns(data_gen_config.all_columns());
122118

123119
AggregationFuzzerBuilder::from(data_gen_config)
124120
.add_query_builder(query_builder)
@@ -138,7 +134,7 @@ async fn test_count() {
138134
.with_distinct_aggregate_function("count")
139135
// count works for all arguments
140136
.with_aggregate_arguments(data_gen_config.all_columns())
141-
.with_group_by_columns(data_gen_config.all_columns());
137+
.set_group_by_columns(data_gen_config.all_columns());
142138

143139
AggregationFuzzerBuilder::from(data_gen_config)
144140
.add_query_builder(query_builder)
@@ -174,15 +170,21 @@ fn baseline_config() -> DatasetGeneratorConfig {
174170
// TODO add support for utf8view in data generator
175171
// ColumnDescr::new("utf8view", DataType::Utf8View),
176172
// todo binary
173+
// low cardinality columns
174+
ColumnDescr::new("u8_low", DataType::UInt8).with_max_num_distinct(10),
175+
ColumnDescr::new("utf8_low", DataType::Utf8).with_max_num_distinct(10),
177176
];
178177

178+
let min_num_rows = 512;
179+
let max_num_rows = 1024;
180+
179181
DatasetGeneratorConfig {
180182
columns,
181-
rows_num_range: (512, 1024),
183+
rows_num_range: (min_num_rows, max_num_rows),
182184
sort_keys_set: vec![
183185
// low cardinality to try and get many repeated runs
184-
vec![String::from("u8")],
185-
vec![String::from("utf8"), String::from("u8")],
186+
vec![String::from("u8_low")],
187+
vec![String::from("utf8_low"), String::from("u8_low")],
186188
],
187189
}
188190
}

datafusion/core/tests/fuzz_cases/aggregation_fuzzer/data_generator.rs

Lines changed: 66 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -174,11 +174,16 @@ impl Dataset {
174174

175175
#[derive(Debug, Clone)]
176176
pub struct ColumnDescr {
177-
// Column name
177+
/// Column name
178178
name: String,
179179

180-
// Data type of this column
180+
/// Data type of this column
181181
column_type: DataType,
182+
183+
/// The maximum number of distinct values in this column.
184+
///
185+
/// See [`ColumnDescr::with_max_num_distinct`] for more information
186+
max_num_distinct: Option<usize>,
182187
}
183188

184189
impl ColumnDescr {
@@ -187,8 +192,18 @@ impl ColumnDescr {
187192
Self {
188193
name: name.to_string(),
189194
column_type,
195+
max_num_distinct: None,
190196
}
191197
}
198+
199+
/// Set the maximum number of distinct values in this column.
200+
///
201+
/// If not set, the number of distinct values is randomly selected between 1
202+
/// and the number of rows; if set, that random choice is capped at `num_distinct`.
203+
pub fn with_max_num_distinct(mut self, num_distinct: usize) -> Self {
204+
self.max_num_distinct = Some(num_distinct);
205+
self
206+
}
192207
}
193208

194209
/// Record batch generator
@@ -203,20 +218,15 @@ struct RecordBatchGenerator {
203218
}
204219

205220
macro_rules! generate_string_array {
206-
($SELF:ident, $NUM_ROWS:ident, $BATCH_GEN_RNG:ident, $ARRAY_GEN_RNG:ident, $OFFSET_TYPE:ty) => {{
221+
($SELF:ident, $NUM_ROWS:ident, $MAX_NUM_DISTINCT:expr, $BATCH_GEN_RNG:ident, $ARRAY_GEN_RNG:ident, $OFFSET_TYPE:ty) => {{
207222
let null_pct_idx = $BATCH_GEN_RNG.gen_range(0..$SELF.candidate_null_pcts.len());
208223
let null_pct = $SELF.candidate_null_pcts[null_pct_idx];
209224
let max_len = $BATCH_GEN_RNG.gen_range(1..50);
210-
let num_distinct_strings = if $NUM_ROWS > 1 {
211-
$BATCH_GEN_RNG.gen_range(1..$NUM_ROWS)
212-
} else {
213-
$NUM_ROWS
214-
};
215225

216226
let mut generator = StringArrayGenerator {
217227
max_len,
218228
num_strings: $NUM_ROWS,
219-
num_distinct_strings,
229+
num_distinct_strings: $MAX_NUM_DISTINCT,
220230
null_pct,
221231
rng: $ARRAY_GEN_RNG,
222232
};
@@ -226,19 +236,14 @@ macro_rules! generate_string_array {
226236
}
227237

228238
macro_rules! generate_primitive_array {
229-
($SELF:ident, $NUM_ROWS:ident, $BATCH_GEN_RNG:ident, $ARRAY_GEN_RNG:ident, $ARROW_TYPE:ident) => {
239+
($SELF:ident, $NUM_ROWS:ident, $MAX_NUM_DISTINCT:expr, $BATCH_GEN_RNG:ident, $ARRAY_GEN_RNG:ident, $ARROW_TYPE:ident) => {
230240
paste::paste! {{
231241
let null_pct_idx = $BATCH_GEN_RNG.gen_range(0..$SELF.candidate_null_pcts.len());
232242
let null_pct = $SELF.candidate_null_pcts[null_pct_idx];
233-
let num_distinct_primitives = if $NUM_ROWS > 1 {
234-
$BATCH_GEN_RNG.gen_range(1..$NUM_ROWS)
235-
} else {
236-
$NUM_ROWS
237-
};
238243

239244
let mut generator = PrimitiveArrayGenerator {
240245
num_primitives: $NUM_ROWS,
241-
num_distinct_primitives,
246+
num_distinct_primitives: $MAX_NUM_DISTINCT,
242247
null_pct,
243248
rng: $ARRAY_GEN_RNG,
244249
};
@@ -268,7 +273,7 @@ impl RecordBatchGenerator {
268273
let mut arrays = Vec::with_capacity(self.columns.len());
269274
for col in self.columns.iter() {
270275
let array = self.generate_array_of_type(
271-
col.column_type.clone(),
276+
col,
272277
num_rows,
273278
&mut rng,
274279
array_gen_rng.clone(),
@@ -289,16 +294,28 @@ impl RecordBatchGenerator {
289294

290295
fn generate_array_of_type(
291296
&self,
292-
data_type: DataType,
297+
col: &ColumnDescr,
293298
num_rows: usize,
294299
batch_gen_rng: &mut ThreadRng,
295300
array_gen_rng: StdRng,
296301
) -> ArrayRef {
297-
match data_type {
302+
let num_distinct = if num_rows > 1 {
303+
batch_gen_rng.gen_range(1..num_rows)
304+
} else {
305+
num_rows
306+
};
307+
// cap to at most the num_distinct values
308+
let max_num_distinct = col
309+
.max_num_distinct
310+
.map(|max| num_distinct.min(max))
311+
.unwrap_or(num_distinct);
312+
313+
match col.column_type {
298314
DataType::Int8 => {
299315
generate_primitive_array!(
300316
self,
301317
num_rows,
318+
max_num_distinct,
302319
batch_gen_rng,
303320
array_gen_rng,
304321
Int8Type
@@ -308,6 +325,7 @@ impl RecordBatchGenerator {
308325
generate_primitive_array!(
309326
self,
310327
num_rows,
328+
max_num_distinct,
311329
batch_gen_rng,
312330
array_gen_rng,
313331
Int16Type
@@ -317,6 +335,7 @@ impl RecordBatchGenerator {
317335
generate_primitive_array!(
318336
self,
319337
num_rows,
338+
max_num_distinct,
320339
batch_gen_rng,
321340
array_gen_rng,
322341
Int32Type
@@ -326,6 +345,7 @@ impl RecordBatchGenerator {
326345
generate_primitive_array!(
327346
self,
328347
num_rows,
348+
max_num_distinct,
329349
batch_gen_rng,
330350
array_gen_rng,
331351
Int64Type
@@ -335,6 +355,7 @@ impl RecordBatchGenerator {
335355
generate_primitive_array!(
336356
self,
337357
num_rows,
358+
max_num_distinct,
338359
batch_gen_rng,
339360
array_gen_rng,
340361
UInt8Type
@@ -344,6 +365,7 @@ impl RecordBatchGenerator {
344365
generate_primitive_array!(
345366
self,
346367
num_rows,
368+
max_num_distinct,
347369
batch_gen_rng,
348370
array_gen_rng,
349371
UInt16Type
@@ -353,6 +375,7 @@ impl RecordBatchGenerator {
353375
generate_primitive_array!(
354376
self,
355377
num_rows,
378+
max_num_distinct,
356379
batch_gen_rng,
357380
array_gen_rng,
358381
UInt32Type
@@ -362,6 +385,7 @@ impl RecordBatchGenerator {
362385
generate_primitive_array!(
363386
self,
364387
num_rows,
388+
max_num_distinct,
365389
batch_gen_rng,
366390
array_gen_rng,
367391
UInt64Type
@@ -371,6 +395,7 @@ impl RecordBatchGenerator {
371395
generate_primitive_array!(
372396
self,
373397
num_rows,
398+
max_num_distinct,
374399
batch_gen_rng,
375400
array_gen_rng,
376401
Float32Type
@@ -380,6 +405,7 @@ impl RecordBatchGenerator {
380405
generate_primitive_array!(
381406
self,
382407
num_rows,
408+
max_num_distinct,
383409
batch_gen_rng,
384410
array_gen_rng,
385411
Float64Type
@@ -389,6 +415,7 @@ impl RecordBatchGenerator {
389415
generate_primitive_array!(
390416
self,
391417
num_rows,
418+
max_num_distinct,
392419
batch_gen_rng,
393420
array_gen_rng,
394421
Date32Type
@@ -398,19 +425,34 @@ impl RecordBatchGenerator {
398425
generate_primitive_array!(
399426
self,
400427
num_rows,
428+
max_num_distinct,
401429
batch_gen_rng,
402430
array_gen_rng,
403431
Date64Type
404432
)
405433
}
406434
DataType::Utf8 => {
407-
generate_string_array!(self, num_rows, batch_gen_rng, array_gen_rng, i32)
435+
generate_string_array!(
436+
self,
437+
num_rows,
438+
max_num_distinct,
439+
batch_gen_rng,
440+
array_gen_rng,
441+
i32
442+
)
408443
}
409444
DataType::LargeUtf8 => {
410-
generate_string_array!(self, num_rows, batch_gen_rng, array_gen_rng, i64)
445+
generate_string_array!(
446+
self,
447+
num_rows,
448+
max_num_distinct,
449+
batch_gen_rng,
450+
array_gen_rng,
451+
i64
452+
)
411453
}
412454
_ => {
413-
panic!("Unsupported data generator type: {data_type}")
455+
panic!("Unsupported data generator type: {}", col.column_type)
414456
}
415457
}
416458
}
@@ -435,14 +477,8 @@ mod test {
435477
// - Their row counts should be the same and fall within [16, 32]
436478
let config = DatasetGeneratorConfig {
437479
columns: vec![
438-
ColumnDescr {
439-
name: "a".to_string(),
440-
column_type: DataType::Utf8,
441-
},
442-
ColumnDescr {
443-
name: "b".to_string(),
444-
column_type: DataType::UInt32,
445-
},
480+
ColumnDescr::new("a", DataType::Utf8),
481+
ColumnDescr::new("b", DataType::UInt32),
446482
],
447483
rows_num_range: (16, 32),
448484
sort_keys_set: vec![vec!["b".to_string()]],

0 commit comments

Comments
 (0)