diff --git a/datafusion/core/tests/fuzz_cases/aggregate_fuzz.rs b/datafusion/core/tests/fuzz_cases/aggregate_fuzz.rs index 28901b14b5b7..d715635c5951 100644 --- a/datafusion/core/tests/fuzz_cases/aggregate_fuzz.rs +++ b/datafusion/core/tests/fuzz_cases/aggregate_fuzz.rs @@ -65,10 +65,6 @@ use crate::fuzz_cases::aggregation_fuzzer::{ // // TODO: test other aggregate functions // - AVG (unstable given the wide range of inputs) -// -// TODO: specific test for ordering (ensure all group by columns are ordered) -// Currently the data is sorted by random columns, so there are almost no -// repeated runs. To improve coverage we should also sort by lower cardinality columns #[tokio::test(flavor = "multi_thread")] async fn test_min() { let data_gen_config = baseline_config(); @@ -79,7 +75,7 @@ async fn test_min() { .with_aggregate_function("min") // min works on all column types .with_aggregate_arguments(data_gen_config.all_columns()) - .with_group_by_columns(data_gen_config.all_columns()); + .set_group_by_columns(data_gen_config.all_columns()); AggregationFuzzerBuilder::from(data_gen_config) .add_query_builder(query_builder) @@ -98,7 +94,7 @@ async fn test_max() { .with_aggregate_function("max") // max works on all column types .with_aggregate_arguments(data_gen_config.all_columns()) - .with_group_by_columns(data_gen_config.all_columns()); + .set_group_by_columns(data_gen_config.all_columns()); AggregationFuzzerBuilder::from(data_gen_config) .add_query_builder(query_builder) @@ -118,7 +114,7 @@ async fn test_sum() { .with_distinct_aggregate_function("sum") // sum only works on numeric columns .with_aggregate_arguments(data_gen_config.numeric_columns()) - .with_group_by_columns(data_gen_config.all_columns()); + .set_group_by_columns(data_gen_config.all_columns()); AggregationFuzzerBuilder::from(data_gen_config) .add_query_builder(query_builder) @@ -138,7 +134,7 @@ async fn test_count() { .with_distinct_aggregate_function("count") // count work for all arguments .with_aggregate_arguments(data_gen_config.all_columns()) - .with_group_by_columns(data_gen_config.all_columns()); + .set_group_by_columns(data_gen_config.all_columns()); AggregationFuzzerBuilder::from(data_gen_config) .add_query_builder(query_builder) @@ -174,15 +170,21 @@ fn baseline_config() -> DatasetGeneratorConfig { // TODO add support for utf8view in data generator // ColumnDescr::new("utf8view", DataType::Utf8View), // todo binary + // low cardinality columns + ColumnDescr::new("u8_low", DataType::UInt8).with_max_num_distinct(10), + ColumnDescr::new("utf8_low", DataType::Utf8).with_max_num_distinct(10), ]; + let min_num_rows = 512; + let max_num_rows = 1024; + DatasetGeneratorConfig { columns, - rows_num_range: (512, 1024), + rows_num_range: (min_num_rows, max_num_rows), sort_keys_set: vec![ // low cardinality to try and get many repeated runs - vec![String::from("u8")], - vec![String::from("utf8"), String::from("u8")], + vec![String::from("u8_low")], + vec![String::from("utf8_low"), String::from("u8_low")], ], } } diff --git a/datafusion/core/tests/fuzz_cases/aggregation_fuzzer/data_generator.rs b/datafusion/core/tests/fuzz_cases/aggregation_fuzzer/data_generator.rs index ef9b5a7f355a..4fa1b7aa263d 100644 --- a/datafusion/core/tests/fuzz_cases/aggregation_fuzzer/data_generator.rs +++ b/datafusion/core/tests/fuzz_cases/aggregation_fuzzer/data_generator.rs @@ -174,11 +174,16 @@ impl Dataset { #[derive(Debug, Clone)] pub struct ColumnDescr { - // Column name + /// Column name name: String, - // Data type of this column + /// Data type of this column column_type: DataType, + + /// The maximum number of distinct values in this column. + /// + /// See [`ColumnDescr::with_max_num_distinct`] for more information + max_num_distinct: Option, } impl ColumnDescr { @@ -187,8 +192,18 @@ impl ColumnDescr { Self { name: name.to_string(), column_type, + max_num_distinct: None, } } + + /// set the maximum number of distinct values in this column + /// + /// If `None`, the number of distinct values is randomly selected between 1 + /// and the number of rows. + pub fn with_max_num_distinct(mut self, num_distinct: usize) -> Self { + self.max_num_distinct = Some(num_distinct); + self + } } /// Record batch generator @@ -203,20 +218,15 @@ struct RecordBatchGenerator { } macro_rules! generate_string_array { - ($SELF:ident, $NUM_ROWS:ident, $BATCH_GEN_RNG:ident, $ARRAY_GEN_RNG:ident, $OFFSET_TYPE:ty) => {{ + ($SELF:ident, $NUM_ROWS:ident, $MAX_NUM_DISTINCT:expr, $BATCH_GEN_RNG:ident, $ARRAY_GEN_RNG:ident, $OFFSET_TYPE:ty) => {{ let null_pct_idx = $BATCH_GEN_RNG.gen_range(0..$SELF.candidate_null_pcts.len()); let null_pct = $SELF.candidate_null_pcts[null_pct_idx]; let max_len = $BATCH_GEN_RNG.gen_range(1..50); - let num_distinct_strings = if $NUM_ROWS > 1 { - $BATCH_GEN_RNG.gen_range(1..$NUM_ROWS) - } else { - $NUM_ROWS - }; let mut generator = StringArrayGenerator { max_len, num_strings: $NUM_ROWS, - num_distinct_strings, + num_distinct_strings: $MAX_NUM_DISTINCT, null_pct, rng: $ARRAY_GEN_RNG, }; @@ -226,19 +236,14 @@ macro_rules! generate_string_array { } macro_rules! generate_primitive_array { - ($SELF:ident, $NUM_ROWS:ident, $BATCH_GEN_RNG:ident, $ARRAY_GEN_RNG:ident, $ARROW_TYPE:ident) => { + ($SELF:ident, $NUM_ROWS:ident, $MAX_NUM_DISTINCT:expr, $BATCH_GEN_RNG:ident, $ARRAY_GEN_RNG:ident, $ARROW_TYPE:ident) => { paste::paste! {{ let null_pct_idx = $BATCH_GEN_RNG.gen_range(0..$SELF.candidate_null_pcts.len()); let null_pct = $SELF.candidate_null_pcts[null_pct_idx]; - let num_distinct_primitives = if $NUM_ROWS > 1 { - $BATCH_GEN_RNG.gen_range(1..$NUM_ROWS) - } else { - $NUM_ROWS - }; let mut generator = PrimitiveArrayGenerator { num_primitives: $NUM_ROWS, - num_distinct_primitives, + num_distinct_primitives: $MAX_NUM_DISTINCT, null_pct, rng: $ARRAY_GEN_RNG, }; @@ -268,7 +273,7 @@ impl RecordBatchGenerator { let mut arrays = Vec::with_capacity(self.columns.len()); for col in self.columns.iter() { let array = self.generate_array_of_type( - col.column_type.clone(), + col, num_rows, &mut rng, array_gen_rng.clone(), @@ -289,16 +294,28 @@ impl RecordBatchGenerator { fn generate_array_of_type( &self, - data_type: DataType, + col: &ColumnDescr, num_rows: usize, batch_gen_rng: &mut ThreadRng, array_gen_rng: StdRng, ) -> ArrayRef { - match data_type { + let num_distinct = if num_rows > 1 { + batch_gen_rng.gen_range(1..num_rows) + } else { + num_rows + }; + // cap to at most the num_distinct values + let max_num_distinct = col + .max_num_distinct + .map(|max| num_distinct.min(max)) + .unwrap_or(num_distinct); + + match col.column_type { DataType::Int8 => { generate_primitive_array!( self, num_rows, + max_num_distinct, batch_gen_rng, array_gen_rng, Int8Type @@ -308,6 +325,7 @@ impl RecordBatchGenerator { generate_primitive_array!( self, num_rows, + max_num_distinct, batch_gen_rng, array_gen_rng, Int16Type @@ -317,6 +335,7 @@ impl RecordBatchGenerator { generate_primitive_array!( self, num_rows, + max_num_distinct, batch_gen_rng, array_gen_rng, Int32Type @@ -326,6 +345,7 @@ impl RecordBatchGenerator { generate_primitive_array!( self, num_rows, + max_num_distinct, batch_gen_rng, array_gen_rng, Int64Type @@ -335,6 +355,7 @@ impl RecordBatchGenerator { generate_primitive_array!( self, num_rows, + max_num_distinct, batch_gen_rng, array_gen_rng, UInt8Type @@ -344,6 +365,7 @@ impl RecordBatchGenerator { generate_primitive_array!( self, num_rows, + max_num_distinct, batch_gen_rng, array_gen_rng, UInt16Type @@ -353,6 +375,7 @@ impl RecordBatchGenerator { generate_primitive_array!( self, num_rows, + max_num_distinct, batch_gen_rng, array_gen_rng, UInt32Type @@ -362,6 +385,7 @@ impl RecordBatchGenerator { generate_primitive_array!( self, num_rows, + max_num_distinct, batch_gen_rng, array_gen_rng, UInt64Type @@ -371,6 +395,7 @@ impl RecordBatchGenerator { generate_primitive_array!( self, num_rows, + max_num_distinct, batch_gen_rng, array_gen_rng, Float32Type @@ -380,6 +405,7 @@ impl RecordBatchGenerator { generate_primitive_array!( self, num_rows, + max_num_distinct, batch_gen_rng, array_gen_rng, Float64Type @@ -389,6 +415,7 @@ impl RecordBatchGenerator { generate_primitive_array!( self, num_rows, + max_num_distinct, batch_gen_rng, array_gen_rng, Date32Type @@ -398,19 +425,34 @@ impl RecordBatchGenerator { generate_primitive_array!( self, num_rows, + max_num_distinct, batch_gen_rng, array_gen_rng, Date64Type ) } DataType::Utf8 => { - generate_string_array!(self, num_rows, batch_gen_rng, array_gen_rng, i32) + generate_string_array!( + self, + num_rows, + max_num_distinct, + batch_gen_rng, + array_gen_rng, + i32 + ) } DataType::LargeUtf8 => { - generate_string_array!(self, num_rows, batch_gen_rng, array_gen_rng, i64) + generate_string_array!( + self, + num_rows, + max_num_distinct, + batch_gen_rng, + array_gen_rng, + i64 + ) } _ => { - panic!("Unsupported data generator type: {data_type}") + panic!("Unsupported data generator type: {}", col.column_type) } } } @@ -435,14 +477,8 @@ mod test { // - Their rows num should be same and between [16, 32] let config = DatasetGeneratorConfig { columns: vec![ - ColumnDescr { - name: "a".to_string(), - column_type: DataType::Utf8, - }, - ColumnDescr { - name: "b".to_string(), - column_type: DataType::UInt32, - }, + ColumnDescr::new("a", DataType::Utf8), + ColumnDescr::new("b", DataType::UInt32), ], rows_num_range: (16, 32), sort_keys_set: vec![vec!["b".to_string()]], diff --git a/datafusion/core/tests/fuzz_cases/aggregation_fuzzer/fuzzer.rs b/datafusion/core/tests/fuzz_cases/aggregation_fuzzer/fuzzer.rs index 0704bafa0318..d021e73f35b2 100644 --- a/datafusion/core/tests/fuzz_cases/aggregation_fuzzer/fuzzer.rs +++ b/datafusion/core/tests/fuzz_cases/aggregation_fuzzer/fuzzer.rs @@ -63,17 +63,35 @@ impl AggregationFuzzerBuilder { } /// Adds random SQL queries to the fuzzer along with the table name - pub fn add_query_builder(mut self, query_builder: QueryBuilder) -> Self { - const NUM_QUERIES: usize = 10; + /// + /// Adds + /// - 3 random queries + /// - 3 random queries for each group by selected from the sort keys + /// - 1 random query with no grouping + pub fn add_query_builder(mut self, mut query_builder: QueryBuilder) -> Self { + const NUM_QUERIES: usize = 3; for _ in 0..NUM_QUERIES { - self = self.add_sql(&query_builder.generate_query()); + let sql = query_builder.generate_query(); + self.candidate_sqls.push(Arc::from(sql)); } - self.table_name(query_builder.table_name()) - } - - fn add_sql(mut self, sql: &str) -> Self { + // also add several queries limited to grouping on the group by columns only, if any + // So if the data is sorted on `a,b` only group by `a,b` or`a` or `b` + if let Some(data_gen_config) = &self.data_gen_config { + for sort_keys in &data_gen_config.sort_keys_set { + let group_by_columns = sort_keys.iter().map(|s| s.as_str()); + query_builder = query_builder.set_group_by_columns(group_by_columns); + for _ in 0..NUM_QUERIES { + let sql = query_builder.generate_query(); + self.candidate_sqls.push(Arc::from(sql)); + } + } + } + // also add a query with no grouping + query_builder = query_builder.set_group_by_columns(vec![]); + let sql = query_builder.generate_query(); self.candidate_sqls.push(Arc::from(sql)); - self + + self.table_name(query_builder.table_name()) } pub fn table_name(mut self, table_name: &str) -> Self { @@ -359,7 +377,7 @@ fn format_batches_with_limit(batches: &[RecordBatch]) -> impl std::fmt::Display /// ```sql /// SELECT AGG(..) FROM table_name GROUP BY ///``` -#[derive(Debug, Default)] +#[derive(Debug, Default, Clone)] pub struct QueryBuilder { /// The name of the table to query table_name: String, @@ -412,17 +430,16 @@ impl QueryBuilder { self } - /// Add a column to be used in the group bys - pub fn with_group_by_columns<'a>( + /// Set the columns to be used in the group bys clauses + pub fn set_group_by_columns<'a>( mut self, group_by: impl IntoIterator, ) -> Self { - let group_by = group_by.into_iter().map(String::from); - self.group_by_columns.extend(group_by); + self.group_by_columns = group_by.into_iter().map(String::from).collect(); self } - /// Add a column to be used as an argument in the aggregate functions + /// Add one or more columns to be used as an argument in the aggregate functions pub fn with_aggregate_arguments<'a>( mut self, arguments: impl IntoIterator, @@ -497,7 +514,9 @@ impl QueryBuilder { let mut already_used = HashSet::new(); let mut group_by = vec![]; - while group_by.len() < num_group_by { + while group_by.len() < num_group_by + && already_used.len() != self.group_by_columns.len() + { let idx = rng.gen_range(0..self.group_by_columns.len()); if already_used.insert(idx) { group_by.push(self.group_by_columns[idx].clone());