Skip to content

Commit 3353c06

Browse files
authored
Add Aggregation fuzzer framework (#12667)
* impl primitive arrays generator. * sort out the test record batch generating codes. * draft for `DataSetsGenerator`. * tmp * improve the data generator, and start to impl the session context generator. * impl context generator. * tmp * define the `AggregationFuzzer`. * add ut for data generator. * improve comments for `SessionContextGenerator`. * define `GeneratedSessionContextBuilder` to reduce repeated codes. * extract the check equality logic for reusing. * add ut for `SessionContextGenerator`. * tmp * finish the main logic of `AggregationFuzzer`. * try to rewrite some test using the fuzzer. * fix header. * expose table name through `AggregationFuzzerBuilder`. * throw err to aggr fuzzer, and expect them then. * switch to Arc<str> to slightly improve performance. * throw more errors to fuzzer. * print task informantion before panic. * improve comments. * support printing generated session context params in error reporting. * add todo. * add some new fuzz case based on `AggregationFuzzer`. * fix lint. * print more information in error report. * fix clippy. * improve comment of `SessionContextGenerator`. * just use fixed `data_gen_rounds` and `ctx_gen_rounds` currently, because we will hardly set them. * improve comments for rounds constants. * small improvements. * select sql from some candidates ranther than fixed one. * make `data_gen_rounds` able to set again, and add more tests. * add no group cases. * add fuzz test for basic string aggr. * make `data_gen_rounds` smaller. * add comments. * fix typo. * fix comment.
1 parent 3d347c9 commit 3353c06

File tree

12 files changed

+1646
-63
lines changed

12 files changed

+1646
-63
lines changed

datafusion/core/tests/fuzz_cases/aggregate_fuzz.rs

Lines changed: 302 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,307 @@ use rand::rngs::StdRng;
4444
use rand::{Rng, SeedableRng};
4545
use tokio::task::JoinSet;
4646

47+
use crate::fuzz_cases::aggregation_fuzzer::{
48+
AggregationFuzzerBuilder, ColumnDescr, DatasetGeneratorConfig,
49+
};
50+
51+
// ========================================================================
52+
// The new aggregation fuzz tests based on [`AggregationFuzzer`]
53+
// ========================================================================
54+
55+
// TODO: write more test cases to cover more `group by` clauses and aggregate functions
56+
// TODO: maybe we can use a macro to simplify creating these cases
57+
58+
/// Fuzz test for `basic prim aggr(sum/sum distinct/max/min/count/avg)` + `no group by`
59+
#[tokio::test(flavor = "multi_thread")]
60+
async fn test_basic_prim_aggr_no_group() {
61+
let builder = AggregationFuzzerBuilder::default();
62+
63+
// Define data generator config
64+
let columns = vec![ColumnDescr::new("a", DataType::Int32)];
65+
66+
let data_gen_config = DatasetGeneratorConfig {
67+
columns,
68+
rows_num_range: (512, 1024),
69+
sort_keys_set: Vec::new(),
70+
};
71+
72+
// Build fuzzer
73+
let fuzzer = builder
74+
.data_gen_config(data_gen_config)
75+
.data_gen_rounds(16)
76+
.add_sql("SELECT sum(a) FROM fuzz_table")
77+
.add_sql("SELECT sum(distinct a) FROM fuzz_table")
78+
.add_sql("SELECT max(a) FROM fuzz_table")
79+
.add_sql("SELECT min(a) FROM fuzz_table")
80+
.add_sql("SELECT count(a) FROM fuzz_table")
81+
.add_sql("SELECT count(distinct a) FROM fuzz_table")
82+
.add_sql("SELECT avg(a) FROM fuzz_table")
83+
.table_name("fuzz_table")
84+
.build();
85+
86+
fuzzer.run().await;
87+
}
88+
89+
/// Fuzz test for `basic prim aggr(sum/sum distinct/max/min/count/avg)` + `group by single int64`
90+
#[tokio::test(flavor = "multi_thread")]
91+
async fn test_basic_prim_aggr_group_by_single_int64() {
92+
let builder = AggregationFuzzerBuilder::default();
93+
94+
// Define data generator config
95+
let columns = vec![
96+
ColumnDescr::new("a", DataType::Int32),
97+
ColumnDescr::new("b", DataType::Int64),
98+
ColumnDescr::new("c", DataType::Int64),
99+
];
100+
let sort_keys_set = vec![
101+
vec!["b".to_string()],
102+
vec!["c".to_string(), "b".to_string()],
103+
];
104+
let data_gen_config = DatasetGeneratorConfig {
105+
columns,
106+
rows_num_range: (512, 1024),
107+
sort_keys_set,
108+
};
109+
110+
// Build fuzzer
111+
let fuzzer = builder
112+
.data_gen_config(data_gen_config)
113+
.data_gen_rounds(16)
114+
.add_sql("SELECT b, sum(a) FROM fuzz_table GROUP BY b")
115+
.add_sql("SELECT b, sum(distinct a) FROM fuzz_table GROUP BY b")
116+
.add_sql("SELECT b, max(a) FROM fuzz_table GROUP BY b")
117+
.add_sql("SELECT b, min(a) FROM fuzz_table GROUP BY b")
118+
.add_sql("SELECT b, count(a) FROM fuzz_table GROUP BY b")
119+
.add_sql("SELECT b, count(distinct a) FROM fuzz_table GROUP BY b")
120+
.add_sql("SELECT b, avg(a) FROM fuzz_table GROUP BY b")
121+
.table_name("fuzz_table")
122+
.build();
123+
124+
fuzzer.run().await;
125+
}
126+
127+
/// Fuzz test for `basic prim aggr(sum/sum distinct/max/min/count/avg)` + `group by single string`
128+
#[tokio::test(flavor = "multi_thread")]
129+
async fn test_basic_prim_aggr_group_by_single_string() {
130+
let builder = AggregationFuzzerBuilder::default();
131+
132+
// Define data generator config
133+
let columns = vec![
134+
ColumnDescr::new("a", DataType::Int32),
135+
ColumnDescr::new("b", DataType::Utf8),
136+
ColumnDescr::new("c", DataType::Int64),
137+
];
138+
let sort_keys_set = vec![
139+
vec!["b".to_string()],
140+
vec!["c".to_string(), "b".to_string()],
141+
];
142+
let data_gen_config = DatasetGeneratorConfig {
143+
columns,
144+
rows_num_range: (512, 1024),
145+
sort_keys_set,
146+
};
147+
148+
// Build fuzzer
149+
let fuzzer = builder
150+
.data_gen_config(data_gen_config)
151+
.data_gen_rounds(16)
152+
.add_sql("SELECT b, sum(a) FROM fuzz_table GROUP BY b")
153+
.add_sql("SELECT b, sum(distinct a) FROM fuzz_table GROUP BY b")
154+
.add_sql("SELECT b, max(a) FROM fuzz_table GROUP BY b")
155+
.add_sql("SELECT b, min(a) FROM fuzz_table GROUP BY b")
156+
.add_sql("SELECT b, count(a) FROM fuzz_table GROUP BY b")
157+
.add_sql("SELECT b, count(distinct a) FROM fuzz_table GROUP BY b")
158+
.add_sql("SELECT b, avg(a) FROM fuzz_table GROUP BY b")
159+
.table_name("fuzz_table")
160+
.build();
161+
162+
fuzzer.run().await;
163+
}
164+
165+
/// Fuzz test for `basic prim aggr(sum/sum distinct/max/min/count/avg)` + `group by string + int64`
166+
#[tokio::test(flavor = "multi_thread")]
167+
async fn test_basic_prim_aggr_group_by_mixed_string_int64() {
168+
let builder = AggregationFuzzerBuilder::default();
169+
170+
// Define data generator config
171+
let columns = vec![
172+
ColumnDescr::new("a", DataType::Int32),
173+
ColumnDescr::new("b", DataType::Utf8),
174+
ColumnDescr::new("c", DataType::Int64),
175+
ColumnDescr::new("d", DataType::Int32),
176+
];
177+
let sort_keys_set = vec![
178+
vec!["b".to_string(), "c".to_string()],
179+
vec!["d".to_string(), "b".to_string(), "c".to_string()],
180+
];
181+
let data_gen_config = DatasetGeneratorConfig {
182+
columns,
183+
rows_num_range: (512, 1024),
184+
sort_keys_set,
185+
};
186+
187+
// Build fuzzer
188+
let fuzzer = builder
189+
.data_gen_config(data_gen_config)
190+
.data_gen_rounds(16)
191+
.add_sql("SELECT b, c, sum(a) FROM fuzz_table GROUP BY b, c")
192+
.add_sql("SELECT b, c, sum(distinct a) FROM fuzz_table GROUP BY b,c")
193+
.add_sql("SELECT b, c, max(a) FROM fuzz_table GROUP BY b, c")
194+
.add_sql("SELECT b, c, min(a) FROM fuzz_table GROUP BY b, c")
195+
.add_sql("SELECT b, c, count(a) FROM fuzz_table GROUP BY b, c")
196+
.add_sql("SELECT b, c, count(distinct a) FROM fuzz_table GROUP BY b, c")
197+
.add_sql("SELECT b, c, avg(a) FROM fuzz_table GROUP BY b, c")
198+
.table_name("fuzz_table")
199+
.build();
200+
201+
fuzzer.run().await;
202+
}
203+
204+
/// Fuzz test for `basic string aggr(count/count distinct/min/max)` + `no group by`
205+
#[tokio::test(flavor = "multi_thread")]
206+
async fn test_basic_string_aggr_no_group() {
207+
let builder = AggregationFuzzerBuilder::default();
208+
209+
// Define data generator config
210+
let columns = vec![ColumnDescr::new("a", DataType::Utf8)];
211+
212+
let data_gen_config = DatasetGeneratorConfig {
213+
columns,
214+
rows_num_range: (512, 1024),
215+
sort_keys_set: Vec::new(),
216+
};
217+
218+
// Build fuzzer
219+
let fuzzer = builder
220+
.data_gen_config(data_gen_config)
221+
.data_gen_rounds(8)
222+
.add_sql("SELECT max(a) FROM fuzz_table")
223+
.add_sql("SELECT min(a) FROM fuzz_table")
224+
.add_sql("SELECT count(a) FROM fuzz_table")
225+
.add_sql("SELECT count(distinct a) FROM fuzz_table")
226+
.table_name("fuzz_table")
227+
.build();
228+
229+
fuzzer.run().await;
230+
}
231+
232+
/// Fuzz test for `basic string aggr(count/count distinct/min/max)` + `group by single int64`
233+
#[tokio::test(flavor = "multi_thread")]
234+
async fn test_basic_string_aggr_group_by_single_int64() {
235+
let builder = AggregationFuzzerBuilder::default();
236+
237+
// Define data generator config
238+
let columns = vec![
239+
ColumnDescr::new("a", DataType::Utf8),
240+
ColumnDescr::new("b", DataType::Int64),
241+
ColumnDescr::new("c", DataType::Int64),
242+
];
243+
let sort_keys_set = vec![
244+
vec!["b".to_string()],
245+
vec!["c".to_string(), "b".to_string()],
246+
];
247+
let data_gen_config = DatasetGeneratorConfig {
248+
columns,
249+
rows_num_range: (512, 1024),
250+
sort_keys_set,
251+
};
252+
253+
// Build fuzzer
254+
let fuzzer = builder
255+
.data_gen_config(data_gen_config)
256+
.data_gen_rounds(8)
257+
// FIXME: Encounter error in min/max
258+
// ArrowError(InvalidArgumentError("number of columns(1) must match number of fields(2) in schema"))
259+
// .add_sql("SELECT b, max(a) FROM fuzz_table GROUP BY b")
260+
// .add_sql("SELECT b, min(a) FROM fuzz_table GROUP BY b")
261+
.add_sql("SELECT b, count(a) FROM fuzz_table GROUP BY b")
262+
.add_sql("SELECT b, count(distinct a) FROM fuzz_table GROUP BY b")
263+
.table_name("fuzz_table")
264+
.build();
265+
266+
fuzzer.run().await;
267+
}
268+
269+
/// Fuzz test for `basic string aggr(count/count distinct/min/max)` + `group by single string`
270+
#[tokio::test(flavor = "multi_thread")]
271+
async fn test_basic_string_aggr_group_by_single_string() {
272+
let builder = AggregationFuzzerBuilder::default();
273+
274+
// Define data generator config
275+
let columns = vec![
276+
ColumnDescr::new("a", DataType::Utf8),
277+
ColumnDescr::new("b", DataType::Utf8),
278+
ColumnDescr::new("c", DataType::Int64),
279+
];
280+
let sort_keys_set = vec![
281+
vec!["b".to_string()],
282+
vec!["c".to_string(), "b".to_string()],
283+
];
284+
let data_gen_config = DatasetGeneratorConfig {
285+
columns,
286+
rows_num_range: (512, 1024),
287+
sort_keys_set,
288+
};
289+
290+
// Build fuzzer
291+
let fuzzer = builder
292+
.data_gen_config(data_gen_config)
293+
.data_gen_rounds(16)
294+
// FIXME: Encounter error in min/max
295+
// ArrowError(InvalidArgumentError("number of columns(1) must match number of fields(2) in schema"))
296+
// .add_sql("SELECT b, max(a) FROM fuzz_table GROUP BY b")
297+
// .add_sql("SELECT b, min(a) FROM fuzz_table GROUP BY b")
298+
.add_sql("SELECT b, count(a) FROM fuzz_table GROUP BY b")
299+
.add_sql("SELECT b, count(distinct a) FROM fuzz_table GROUP BY b")
300+
.table_name("fuzz_table")
301+
.build();
302+
303+
fuzzer.run().await;
304+
}
305+
306+
/// Fuzz test for `basic string aggr(count/count distinct/min/max)` + `group by string + int64`
307+
#[tokio::test(flavor = "multi_thread")]
308+
async fn test_basic_string_aggr_group_by_mixed_string_int64() {
309+
let builder = AggregationFuzzerBuilder::default();
310+
311+
// Define data generator config
312+
let columns = vec![
313+
ColumnDescr::new("a", DataType::Utf8),
314+
ColumnDescr::new("b", DataType::Utf8),
315+
ColumnDescr::new("c", DataType::Int64),
316+
ColumnDescr::new("d", DataType::Int32),
317+
];
318+
let sort_keys_set = vec![
319+
vec!["b".to_string(), "c".to_string()],
320+
vec!["d".to_string(), "b".to_string(), "c".to_string()],
321+
];
322+
let data_gen_config = DatasetGeneratorConfig {
323+
columns,
324+
rows_num_range: (512, 1024),
325+
sort_keys_set,
326+
};
327+
328+
// Build fuzzer
329+
let fuzzer = builder
330+
.data_gen_config(data_gen_config)
331+
.data_gen_rounds(16)
332+
// FIXME: Encounter error in min/max
333+
// ArrowError(InvalidArgumentError("number of columns(1) must match number of fields(2) in schema"))
334+
// .add_sql("SELECT b, c, max(a) FROM fuzz_table GROUP BY b, c")
335+
// .add_sql("SELECT b, c, min(a) FROM fuzz_table GROUP BY b, c")
336+
.add_sql("SELECT b, c, count(a) FROM fuzz_table GROUP BY b, c")
337+
.add_sql("SELECT b, c, count(distinct a) FROM fuzz_table GROUP BY b, c")
338+
.table_name("fuzz_table")
339+
.build();
340+
341+
fuzzer.run().await;
342+
}
343+
344+
// ========================================================================
345+
// The old aggregation fuzz tests
346+
// ========================================================================
347+
/// Tracks if this stream is generating input or output
47348
/// Tests that streaming aggregate and batch (non streaming) aggregate produce
48349
/// same results
49350
#[tokio::test(flavor = "multi_thread")]
@@ -311,6 +612,7 @@ async fn group_by_string_test(
311612
let actual = extract_result_counts(results);
312613
assert_eq!(expected, actual);
313614
}
615+
314616
async fn verify_ordered_aggregate(frame: &DataFrame, expected_sort: bool) {
315617
struct Visitor {
316618
expected_sort: bool,

0 commit comments

Comments
 (0)