Skip to content

Commit ce22d2e

Browse files
committed
feat: Add support for Int8 and Int16 data types in data page statistics
1 parent d175163 commit ce22d2e

File tree

2 files changed

+33
-21
lines changed

2 files changed

+33
-21
lines changed

datafusion/core/src/datasource/physical_plan/parquet/statistics.rs

+30
Original file line numberDiff line numberDiff line change
@@ -572,13 +572,38 @@ macro_rules! make_data_page_stats_iterator {
572572
};
573573
}
574574

575+
make_data_page_stats_iterator!(MinInt32DataPageStatsIterator, min, Index::INT32, i32);
576+
make_data_page_stats_iterator!(MaxInt32DataPageStatsIterator, max, Index::INT32, i32);
575577
make_data_page_stats_iterator!(MinInt64DataPageStatsIterator, min, Index::INT64, i64);
576578
make_data_page_stats_iterator!(MaxInt64DataPageStatsIterator, max, Index::INT64, i64);
577579

578580
macro_rules! get_data_page_statistics {
579581
($stat_type_prefix: ident, $data_type: ident, $iterator: ident) => {
580582
paste! {
581583
match $data_type {
584+
Some(DataType::Int8) => Ok(Arc::new(
585+
Int8Array::from_iter(
586+
[<$stat_type_prefix Int32DataPageStatsIterator>]::new($iterator)
587+
.map(|x| {
588+
x.into_iter().filter_map(|x| {
589+
x.and_then(|x| i8::try_from(x).ok())
590+
})
591+
})
592+
.flatten()
593+
)
594+
)),
595+
Some(DataType::Int16) => Ok(Arc::new(
596+
Int16Array::from_iter(
597+
[<$stat_type_prefix Int32DataPageStatsIterator>]::new($iterator)
598+
.map(|x| {
599+
x.into_iter().filter_map(|x| {
600+
x.and_then(|x| i16::try_from(x).ok())
601+
})
602+
})
603+
.flatten()
604+
)
605+
)),
606+
Some(DataType::Int32) => Ok(Arc::new(Int32Array::from_iter([<$stat_type_prefix Int32DataPageStatsIterator>]::new($iterator).flatten()))),
582607
Some(DataType::Int64) => Ok(Arc::new(Int64Array::from_iter([<$stat_type_prefix Int64DataPageStatsIterator>]::new($iterator).flatten()))),
583608
_ => unimplemented!()
584609
}
@@ -666,6 +691,11 @@ where
666691
{
667692
let iter = iterator.flat_map(|(len, index)| match index {
668693
Index::NONE => vec![None; len],
694+
Index::INT32(native_index) => native_index
695+
.indexes
696+
.iter()
697+
.map(|x| x.null_count.map(|x| x as u64))
698+
.collect::<Vec<_>>(),
669699
Index::INT64(native_index) => native_index
670700
.indexes
671701
.iter()

datafusion/core/tests/parquet/arrow_statistics.rs

+3-21
Original file line numberDiff line numberDiff line change
@@ -550,16 +550,11 @@ async fn test_int_32() {
550550
// row counts are [5, 5, 5, 5]
551551
expected_row_counts: UInt64Array::from(vec![5, 5, 5, 5]),
552552
column_name: "i32",
553-
check: Check::RowGroup,
553+
check: Check::Both,
554554
}
555555
.run();
556556
}
557557

558-
// BUG: ignore this test for now
559-
// https://github.com/apache/datafusion/issues/10585
560-
// Note that the file has 4 columns named "i8", "i16", "i32", "i64".
561-
// - The tests on column i32 and i64 passed.
562-
// - The tests on column i8 and i16 failed.
563558
#[tokio::test]
564559
async fn test_int_16() {
565560
// This creates a parquet files of 4 columns named "i8", "i16", "i32", "i64"
@@ -573,16 +568,6 @@ async fn test_int_16() {
573568
Test {
574569
reader: &reader,
575570
// mins are [-5, -4, 0, 5]
576-
// BUG: not sure why this returns same data but in Int32Array type even though I debugged and the columns name is "i16" an its data is Int16
577-
// My debugging tells me the bug is either at:
578-
// 1. The new code to get "iter". See the code in this PR with
579-
// // Get an iterator over the column statistics
580-
// let iter = row_groups
581-
// .iter()
582-
// .map(|x| x.column(parquet_idx).statistics());
583-
// OR
584-
// 2. in the function (and/or its marco) `pub(crate) fn min_statistics<'a, I: Iterator<Item = Option<&'a ParquetStatistics>>>` here
585-
// https://github.com/apache/datafusion/blob/ea023e2d4878240eece870cf4b346c7a0667aeed/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs#L179
586571
expected_min: Arc::new(Int16Array::from(vec![-5, -4, 0, 5])), // panic here because the actual data is Int32Array
587572
// maxes are [-1, 0, 4, 9]
588573
expected_max: Arc::new(Int16Array::from(vec![-1, 0, 4, 9])),
@@ -591,13 +576,11 @@ async fn test_int_16() {
591576
// row counts are [5, 5, 5, 5]
592577
expected_row_counts: UInt64Array::from(vec![5, 5, 5, 5]),
593578
column_name: "i16",
594-
check: Check::RowGroup,
579+
check: Check::Both,
595580
}
596581
.run();
597582
}
598583

599-
// BUG (same as above): ignore this test for now
600-
// https://github.com/apache/datafusion/issues/10585
601584
#[tokio::test]
602585
async fn test_int_8() {
603586
// This creates a parquet files of 4 columns named "i8", "i16", "i32", "i64"
@@ -611,7 +594,6 @@ async fn test_int_8() {
611594
Test {
612595
reader: &reader,
613596
// mins are [-5, -4, 0, 5]
614-
// BUG: not sure why this returns same data but in Int32Array even though I debugged and the columns name is "i8" an its data is Int8
615597
expected_min: Arc::new(Int8Array::from(vec![-5, -4, 0, 5])), // panic here because the actual data is Int32Array
616598
// maxes are [-1, 0, 4, 9]
617599
expected_max: Arc::new(Int8Array::from(vec![-1, 0, 4, 9])),
@@ -620,7 +602,7 @@ async fn test_int_8() {
620602
// row counts are [5, 5, 5, 5]
621603
expected_row_counts: UInt64Array::from(vec![5, 5, 5, 5]),
622604
column_name: "i8",
623-
check: Check::RowGroup,
605+
check: Check::Both,
624606
}
625607
.run();
626608
}

0 commit comments

Comments
 (0)