Skip to content

Increase fuzz testing of streaming group by / low cardinality columns #12990

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Oct 30, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 13 additions & 11 deletions datafusion/core/tests/fuzz_cases/aggregate_fuzz.rs
Original file line number Diff line number Diff line change
Expand Up @@ -65,10 +65,6 @@ use crate::fuzz_cases::aggregation_fuzzer::{
//
// TODO: test other aggregate functions
// - AVG (unstable given the wide range of inputs)
//
// TODO: specific test for ordering (ensure all group by columns are ordered)
// Currently the data is sorted by random columns, so there are almost no
// repeated runs. To improve coverage we should also sort by lower cardinality columns
#[tokio::test(flavor = "multi_thread")]
async fn test_min() {
let data_gen_config = baseline_config();
Expand All @@ -79,7 +75,7 @@ async fn test_min() {
.with_aggregate_function("min")
// min works on all column types
.with_aggregate_arguments(data_gen_config.all_columns())
.with_group_by_columns(data_gen_config.all_columns());
.set_group_by_columns(data_gen_config.all_columns());

AggregationFuzzerBuilder::from(data_gen_config)
.add_query_builder(query_builder)
Expand All @@ -98,7 +94,7 @@ async fn test_max() {
.with_aggregate_function("max")
// max works on all column types
.with_aggregate_arguments(data_gen_config.all_columns())
.with_group_by_columns(data_gen_config.all_columns());
.set_group_by_columns(data_gen_config.all_columns());

AggregationFuzzerBuilder::from(data_gen_config)
.add_query_builder(query_builder)
Expand All @@ -118,7 +114,7 @@ async fn test_sum() {
.with_distinct_aggregate_function("sum")
// sum only works on numeric columns
.with_aggregate_arguments(data_gen_config.numeric_columns())
.with_group_by_columns(data_gen_config.all_columns());
.set_group_by_columns(data_gen_config.all_columns());

AggregationFuzzerBuilder::from(data_gen_config)
.add_query_builder(query_builder)
Expand All @@ -138,7 +134,7 @@ async fn test_count() {
.with_distinct_aggregate_function("count")
// count work for all arguments
.with_aggregate_arguments(data_gen_config.all_columns())
.with_group_by_columns(data_gen_config.all_columns());
.set_group_by_columns(data_gen_config.all_columns());

AggregationFuzzerBuilder::from(data_gen_config)
.add_query_builder(query_builder)
Expand Down Expand Up @@ -174,15 +170,21 @@ fn baseline_config() -> DatasetGeneratorConfig {
// TODO add support for utf8view in data generator
// ColumnDescr::new("utf8view", DataType::Utf8View),
// todo binary
// low cardinality columns
ColumnDescr::new("u8_low", DataType::UInt8).with_max_num_distinct(10),
ColumnDescr::new("utf8_low", DataType::Utf8).with_max_num_distinct(10),
];

let min_num_rows = 512;
let max_num_rows = 1024;

DatasetGeneratorConfig {
columns,
rows_num_range: (512, 1024),
rows_num_range: (min_num_rows, max_num_rows),
sort_keys_set: vec![
// low cardinality to try and get many repeated runs
vec![String::from("u8")],
vec![String::from("utf8"), String::from("u8")],
vec![String::from("u8_low")],
vec![String::from("utf8_low"), String::from("u8_low")],
],
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -174,11 +174,16 @@ impl Dataset {

#[derive(Debug, Clone)]
pub struct ColumnDescr {
// Column name
/// Column name
name: String,

// Data type of this column
/// Data type of this column
column_type: DataType,

/// The maximum number of distinct values in this column.
///
/// See [`ColumnDescr::with_max_num_distinct`] for more information
max_num_distinct: Option<usize>,
}

impl ColumnDescr {
Expand All @@ -187,8 +192,18 @@ impl ColumnDescr {
Self {
name: name.to_string(),
column_type,
max_num_distinct: None,
}
}

/// set the maximum number of distinct values in this column
///
/// If `None`, the number of distinct values is randomly selected between 1
/// and the number of rows.
pub fn with_max_num_distinct(mut self, num_distinct: usize) -> Self {
self.max_num_distinct = Some(num_distinct);
self
}
}

/// Record batch generator
Expand All @@ -203,20 +218,15 @@ struct RecordBatchGenerator {
}

macro_rules! generate_string_array {
($SELF:ident, $NUM_ROWS:ident, $BATCH_GEN_RNG:ident, $ARRAY_GEN_RNG:ident, $OFFSET_TYPE:ty) => {{
($SELF:ident, $NUM_ROWS:ident, $MAX_NUM_DISTINCT:expr, $BATCH_GEN_RNG:ident, $ARRAY_GEN_RNG:ident, $OFFSET_TYPE:ty) => {{
let null_pct_idx = $BATCH_GEN_RNG.gen_range(0..$SELF.candidate_null_pcts.len());
let null_pct = $SELF.candidate_null_pcts[null_pct_idx];
let max_len = $BATCH_GEN_RNG.gen_range(1..50);
let num_distinct_strings = if $NUM_ROWS > 1 {
$BATCH_GEN_RNG.gen_range(1..$NUM_ROWS)
} else {
$NUM_ROWS
};

let mut generator = StringArrayGenerator {
max_len,
num_strings: $NUM_ROWS,
num_distinct_strings,
num_distinct_strings: $MAX_NUM_DISTINCT,
null_pct,
rng: $ARRAY_GEN_RNG,
};
Expand All @@ -226,19 +236,14 @@ macro_rules! generate_string_array {
}

macro_rules! generate_primitive_array {
($SELF:ident, $NUM_ROWS:ident, $BATCH_GEN_RNG:ident, $ARRAY_GEN_RNG:ident, $ARROW_TYPE:ident) => {
($SELF:ident, $NUM_ROWS:ident, $MAX_NUM_DISTINCT:expr, $BATCH_GEN_RNG:ident, $ARRAY_GEN_RNG:ident, $ARROW_TYPE:ident) => {
paste::paste! {{
let null_pct_idx = $BATCH_GEN_RNG.gen_range(0..$SELF.candidate_null_pcts.len());
let null_pct = $SELF.candidate_null_pcts[null_pct_idx];
let num_distinct_primitives = if $NUM_ROWS > 1 {
$BATCH_GEN_RNG.gen_range(1..$NUM_ROWS)
} else {
$NUM_ROWS
};

let mut generator = PrimitiveArrayGenerator {
num_primitives: $NUM_ROWS,
num_distinct_primitives,
num_distinct_primitives: $MAX_NUM_DISTINCT,
null_pct,
rng: $ARRAY_GEN_RNG,
};
Expand Down Expand Up @@ -268,7 +273,7 @@ impl RecordBatchGenerator {
let mut arrays = Vec::with_capacity(self.columns.len());
for col in self.columns.iter() {
let array = self.generate_array_of_type(
col.column_type.clone(),
col,
num_rows,
&mut rng,
array_gen_rng.clone(),
Expand All @@ -289,16 +294,28 @@ impl RecordBatchGenerator {

fn generate_array_of_type(
&self,
data_type: DataType,
col: &ColumnDescr,
num_rows: usize,
batch_gen_rng: &mut ThreadRng,
array_gen_rng: StdRng,
) -> ArrayRef {
match data_type {
let num_distinct = if num_rows > 1 {
batch_gen_rng.gen_range(1..num_rows)
} else {
num_rows
};
// cap to at most the num_distinct values
let max_num_distinct = col
.max_num_distinct
.map(|max| num_distinct.min(max))
.unwrap_or(num_distinct);

match col.column_type {
DataType::Int8 => {
generate_primitive_array!(
self,
num_rows,
max_num_distinct,
batch_gen_rng,
array_gen_rng,
Int8Type
Expand All @@ -308,6 +325,7 @@ impl RecordBatchGenerator {
generate_primitive_array!(
self,
num_rows,
max_num_distinct,
batch_gen_rng,
array_gen_rng,
Int16Type
Expand All @@ -317,6 +335,7 @@ impl RecordBatchGenerator {
generate_primitive_array!(
self,
num_rows,
max_num_distinct,
batch_gen_rng,
array_gen_rng,
Int32Type
Expand All @@ -326,6 +345,7 @@ impl RecordBatchGenerator {
generate_primitive_array!(
self,
num_rows,
max_num_distinct,
batch_gen_rng,
array_gen_rng,
Int64Type
Expand All @@ -335,6 +355,7 @@ impl RecordBatchGenerator {
generate_primitive_array!(
self,
num_rows,
max_num_distinct,
batch_gen_rng,
array_gen_rng,
UInt8Type
Expand All @@ -344,6 +365,7 @@ impl RecordBatchGenerator {
generate_primitive_array!(
self,
num_rows,
max_num_distinct,
batch_gen_rng,
array_gen_rng,
UInt16Type
Expand All @@ -353,6 +375,7 @@ impl RecordBatchGenerator {
generate_primitive_array!(
self,
num_rows,
max_num_distinct,
batch_gen_rng,
array_gen_rng,
UInt32Type
Expand All @@ -362,6 +385,7 @@ impl RecordBatchGenerator {
generate_primitive_array!(
self,
num_rows,
max_num_distinct,
batch_gen_rng,
array_gen_rng,
UInt64Type
Expand All @@ -371,6 +395,7 @@ impl RecordBatchGenerator {
generate_primitive_array!(
self,
num_rows,
max_num_distinct,
batch_gen_rng,
array_gen_rng,
Float32Type
Expand All @@ -380,6 +405,7 @@ impl RecordBatchGenerator {
generate_primitive_array!(
self,
num_rows,
max_num_distinct,
batch_gen_rng,
array_gen_rng,
Float64Type
Expand All @@ -389,6 +415,7 @@ impl RecordBatchGenerator {
generate_primitive_array!(
self,
num_rows,
max_num_distinct,
batch_gen_rng,
array_gen_rng,
Date32Type
Expand All @@ -398,19 +425,34 @@ impl RecordBatchGenerator {
generate_primitive_array!(
self,
num_rows,
max_num_distinct,
batch_gen_rng,
array_gen_rng,
Date64Type
)
}
DataType::Utf8 => {
generate_string_array!(self, num_rows, batch_gen_rng, array_gen_rng, i32)
generate_string_array!(
self,
num_rows,
max_num_distinct,
batch_gen_rng,
array_gen_rng,
i32
)
}
DataType::LargeUtf8 => {
generate_string_array!(self, num_rows, batch_gen_rng, array_gen_rng, i64)
generate_string_array!(
self,
num_rows,
max_num_distinct,
batch_gen_rng,
array_gen_rng,
i64
)
}
_ => {
panic!("Unsupported data generator type: {data_type}")
panic!("Unsupported data generator type: {}", col.column_type)
}
}
}
Expand All @@ -435,14 +477,8 @@ mod test {
// - Their rows num should be same and between [16, 32]
let config = DatasetGeneratorConfig {
columns: vec![
ColumnDescr {
name: "a".to_string(),
column_type: DataType::Utf8,
},
ColumnDescr {
name: "b".to_string(),
column_type: DataType::UInt32,
},
ColumnDescr::new("a", DataType::Utf8),
ColumnDescr::new("b", DataType::UInt32),
],
rows_num_range: (16, 32),
sort_keys_set: vec![vec!["b".to_string()]],
Expand Down
Loading