Skip to content

Commit d8090fe

Browse files
committed
add tests
1 parent f7315b0 commit d8090fe

File tree

2 files changed

+184
-7
lines changed

2 files changed

+184
-7
lines changed

datafusion/core/src/datasource/listing/table.rs

Lines changed: 4 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1252,13 +1252,10 @@ async fn get_files_with_limit(
12521252
}
12531253
}
12541254
}
1255-
let mut inexact_stats = false;
1256-
if all_files.next().await.is_some() {
1257-
// If we still have files in the stream, it means that the limit kicked
1258-
// in, and the statistic could have been different had we processed the
1259-
// files in a different order.
1260-
inexact_stats = true;
1261-
}
1255+
// If we still have files in the stream, it means that the limit kicked
1256+
// in, and the statistic could have been different had we processed the
1257+
// files in a different order.
1258+
let inexact_stats = all_files.next().await.is_some();
12621259
Ok((file_group, inexact_stats))
12631260
}
12641261

datafusion/core/src/datasource/statistics.rs

Lines changed: 180 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -354,3 +354,183 @@ fn set_min_if_lesser(
354354
_ => {}
355355
}
356356
}
357+
358+
#[cfg(test)]
359+
mod tests {
360+
use super::*;
361+
use crate::arrow::datatypes::{DataType, Field, Schema};
362+
use datafusion_common::ScalarValue;
363+
use std::sync::Arc;
364+
365+
// Checks compute_summary_statistics on two inputs whose statistics are all
// `Precision::Exact`. The assertions below expect row counts, byte sizes,
// null counts and sums to be added together, the max to be the larger of the
// two values, the min to be the smaller, and every result to remain `Exact`.
#[test]
366+
fn test_compute_summary_statistics_basic() {
367+
// Create a schema with two columns
368+
let schema = Arc::new(Schema::new(vec![
369+
Field::new("col1", DataType::Int32, false),
370+
Field::new("col2", DataType::Int32, false),
371+
]));
372+
373+
// Create items with statistics
// First input: 10 rows, 100 bytes, one ColumnStatistics entry per schema column.
374+
let stats1 = Statistics {
375+
num_rows: Precision::Exact(10),
376+
total_byte_size: Precision::Exact(100),
377+
column_statistics: vec![
378+
ColumnStatistics {
379+
null_count: Precision::Exact(1),
380+
max_value: Precision::Exact(ScalarValue::Int32(Some(100))),
381+
min_value: Precision::Exact(ScalarValue::Int32(Some(1))),
382+
sum_value: Precision::Exact(ScalarValue::Int32(Some(500))),
383+
distinct_count: Precision::Absent,
384+
},
385+
ColumnStatistics {
386+
null_count: Precision::Exact(2),
387+
max_value: Precision::Exact(ScalarValue::Int32(Some(200))),
388+
min_value: Precision::Exact(ScalarValue::Int32(Some(10))),
389+
sum_value: Precision::Exact(ScalarValue::Int32(Some(1000))),
390+
distinct_count: Precision::Absent,
391+
},
392+
],
393+
};
394+
395+
// Second input: 15 rows, 150 bytes. For col1 it carries both the larger max
// (120) and the smaller min (-10); for col2 it carries the smaller min (5)
// but the smaller max (180), so the expected col2 extremes come from
// different inputs.
let stats2 = Statistics {
396+
num_rows: Precision::Exact(15),
397+
total_byte_size: Precision::Exact(150),
398+
column_statistics: vec![
399+
ColumnStatistics {
400+
null_count: Precision::Exact(2),
401+
max_value: Precision::Exact(ScalarValue::Int32(Some(120))),
402+
min_value: Precision::Exact(ScalarValue::Int32(Some(-10))),
403+
sum_value: Precision::Exact(ScalarValue::Int32(Some(600))),
404+
distinct_count: Precision::Absent,
405+
},
406+
ColumnStatistics {
407+
null_count: Precision::Exact(3),
408+
max_value: Precision::Exact(ScalarValue::Int32(Some(180))),
409+
min_value: Precision::Exact(ScalarValue::Int32(Some(5))),
410+
sum_value: Precision::Exact(ScalarValue::Int32(Some(1200))),
411+
distinct_count: Precision::Absent,
412+
},
413+
],
414+
};
415+
416+
let items = vec![Arc::new(stats1), Arc::new(stats2)];
417+
418+
// Call compute_summary_statistics
// The closure hands back each item's own Statistics, wrapped in Some.
419+
let summary_stats =
420+
compute_summary_statistics(items, &schema, |item| Some(item.as_ref()));
421+
422+
// Verify the results
423+
assert_eq!(summary_stats.num_rows, Precision::Exact(25)); // 10 + 15
424+
assert_eq!(summary_stats.total_byte_size, Precision::Exact(250)); // 100 + 150
425+
426+
// Verify column statistics
// col1: max and min are both expected to come from the second input.
427+
let col1_stats = &summary_stats.column_statistics[0];
428+
assert_eq!(col1_stats.null_count, Precision::Exact(3)); // 1 + 2
429+
assert_eq!(
430+
col1_stats.max_value,
431+
Precision::Exact(ScalarValue::Int32(Some(120)))
432+
);
433+
assert_eq!(
434+
col1_stats.min_value,
435+
Precision::Exact(ScalarValue::Int32(Some(-10)))
436+
);
437+
assert_eq!(
438+
col1_stats.sum_value,
439+
Precision::Exact(ScalarValue::Int32(Some(1100)))
440+
); // 500 + 600
441+
442+
// col2: max is expected from the first input (200), min from the second (5).
let col2_stats = &summary_stats.column_statistics[1];
443+
assert_eq!(col2_stats.null_count, Precision::Exact(5)); // 2 + 3
444+
assert_eq!(
445+
col2_stats.max_value,
446+
Precision::Exact(ScalarValue::Int32(Some(200)))
447+
);
448+
assert_eq!(
449+
col2_stats.min_value,
450+
Precision::Exact(ScalarValue::Int32(Some(5)))
451+
);
452+
assert_eq!(
453+
col2_stats.sum_value,
454+
Precision::Exact(ScalarValue::Int32(Some(2200)))
455+
); // 1000 + 1200
456+
}
457+
458+
// Checks compute_summary_statistics on two inputs that mix `Exact`,
// `Inexact` and `Absent` precision. For every field where one input is
// `Exact` and the other `Inexact`, the assertions below expect an `Inexact`
// result; for sum_value, where the second input is `Absent`, they expect
// `Absent`.
#[test]
459+
fn test_compute_summary_statistics_mixed_precision() {
460+
// Create a schema with one column
461+
let schema = Arc::new(Schema::new(vec![Field::new(
462+
"col1",
463+
DataType::Int32,
464+
false,
465+
)]));
466+
467+
// Create items with different precision levels
// First input: total_byte_size and min_value are Inexact, the rest Exact.
468+
let stats1 = Statistics {
469+
num_rows: Precision::Exact(10),
470+
total_byte_size: Precision::Inexact(100),
471+
column_statistics: vec![ColumnStatistics {
472+
null_count: Precision::Exact(1),
473+
max_value: Precision::Exact(ScalarValue::Int32(Some(100))),
474+
min_value: Precision::Inexact(ScalarValue::Int32(Some(1))),
475+
sum_value: Precision::Exact(ScalarValue::Int32(Some(500))),
476+
distinct_count: Precision::Absent,
477+
}],
478+
};
479+
480+
// Second input: the precision of each field is the opposite of the first
// input's (Exact where it was Inexact and vice versa), and sum_value is
// Absent.
let stats2 = Statistics {
481+
num_rows: Precision::Inexact(15),
482+
total_byte_size: Precision::Exact(150),
483+
column_statistics: vec![ColumnStatistics {
484+
null_count: Precision::Inexact(2),
485+
max_value: Precision::Inexact(ScalarValue::Int32(Some(120))),
486+
min_value: Precision::Exact(ScalarValue::Int32(Some(-10))),
487+
sum_value: Precision::Absent,
488+
distinct_count: Precision::Absent,
489+
}],
490+
};
491+
492+
let items = vec![Arc::new(stats1), Arc::new(stats2)];
493+
494+
// The closure hands back each item's own Statistics, wrapped in Some.
let summary_stats =
495+
compute_summary_statistics(items, &schema, |item| Some(item.as_ref()));
496+
497+
// The values are still expected to be combined (10 + 15, 100 + 150), but
// reported as Inexact.
assert_eq!(summary_stats.num_rows, Precision::Inexact(25));
498+
assert_eq!(summary_stats.total_byte_size, Precision::Inexact(250));
499+
500+
let col_stats = &summary_stats.column_statistics[0];
501+
assert_eq!(col_stats.null_count, Precision::Inexact(3));
502+
assert_eq!(
503+
col_stats.max_value,
504+
Precision::Inexact(ScalarValue::Int32(Some(120)))
505+
);
506+
assert_eq!(
507+
col_stats.min_value,
508+
Precision::Inexact(ScalarValue::Int32(Some(-10)))
509+
);
510+
// One input has an Exact sum and the other has none; the expected result is Absent.
assert!(matches!(col_stats.sum_value, Precision::Absent));
511+
}
512+
513+
// Checks compute_summary_statistics on an empty input collection. The
// assertions below expect num_rows and total_byte_size to be `Absent`, and
// column_statistics to hold exactly one entry (the schema built here has one
// field) whose null_count is `Absent`.
#[test]
514+
fn test_compute_summary_statistics_empty() {
515+
let schema = Arc::new(Schema::new(vec![Field::new(
516+
"col1",
517+
DataType::Int32,
518+
false,
519+
)]));
520+
521+
// Empty collection
522+
let items: Vec<Arc<Statistics>> = vec![];
523+
524+
// With no items, the closure is not expected to be invoked at all.
let summary_stats =
525+
compute_summary_statistics(items, &schema, |item| Some(item.as_ref()));
526+
527+
// Verify default values for empty collection
528+
assert_eq!(summary_stats.num_rows, Precision::Absent);
529+
assert_eq!(summary_stats.total_byte_size, Precision::Absent);
530+
assert_eq!(summary_stats.column_statistics.len(), 1);
531+
assert_eq!(
532+
summary_stats.column_statistics[0].null_count,
533+
Precision::Absent
534+
);
535+
}
536+
}

0 commit comments

Comments
 (0)