Skip to content

Commit 7d9a556

Browse files
committed
feat: Add support for Int8 and Int16 data types in data page statistics
1 parent 87aea14 commit 7d9a556

File tree

2 files changed

+75
-57
lines changed

2 files changed

+75
-57
lines changed

datafusion/core/src/datasource/physical_plan/parquet/statistics.rs

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -572,13 +572,38 @@ macro_rules! make_data_page_stats_iterator {
572572
};
573573
}
574574

575+
make_data_page_stats_iterator!(MinInt32DataPageStatsIterator, min, Index::INT32, i32);
576+
make_data_page_stats_iterator!(MaxInt32DataPageStatsIterator, max, Index::INT32, i32);
575577
make_data_page_stats_iterator!(MinInt64DataPageStatsIterator, min, Index::INT64, i64);
576578
make_data_page_stats_iterator!(MaxInt64DataPageStatsIterator, max, Index::INT64, i64);
577579

578580
macro_rules! get_data_page_statistics {
579581
($stat_type_prefix: ident, $data_type: ident, $iterator: ident) => {
580582
paste! {
581583
match $data_type {
584+
Some(DataType::Int8) => Ok(Arc::new(
585+
Int8Array::from_iter(
586+
[<$stat_type_prefix Int32DataPageStatsIterator>]::new($iterator)
587+
.map(|x| {
588+
x.into_iter().filter_map(|x| {
589+
x.and_then(|x| i8::try_from(x).ok())
590+
})
591+
})
592+
.flatten()
593+
)
594+
)),
595+
Some(DataType::Int16) => Ok(Arc::new(
596+
Int16Array::from_iter(
597+
[<$stat_type_prefix Int32DataPageStatsIterator>]::new($iterator)
598+
.map(|x| {
599+
x.into_iter().filter_map(|x| {
600+
x.and_then(|x| i16::try_from(x).ok())
601+
})
602+
})
603+
.flatten()
604+
)
605+
)),
606+
Some(DataType::Int32) => Ok(Arc::new(Int32Array::from_iter([<$stat_type_prefix Int32DataPageStatsIterator>]::new($iterator).flatten()))),
582607
Some(DataType::Int64) => Ok(Arc::new(Int64Array::from_iter([<$stat_type_prefix Int64DataPageStatsIterator>]::new($iterator).flatten()))),
583608
_ => unimplemented!()
584609
}
@@ -666,6 +691,11 @@ where
666691
{
667692
let iter = iterator.flat_map(|(len, index)| match index {
668693
Index::NONE => vec![None; len],
694+
Index::INT32(native_index) => native_index
695+
.indexes
696+
.iter()
697+
.map(|x| x.null_count.map(|x| x as u64))
698+
.collect::<Vec<_>>(),
669699
Index::INT64(native_index) => native_index
670700
.indexes
671701
.iter()

datafusion/core/tests/parquet/arrow_statistics.rs

Lines changed: 45 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -510,27 +510,24 @@ async fn test_int_32() {
510510
.build()
511511
.await;
512512

513-
Test {
514-
reader: &reader,
515-
// mins are [-5, -4, 0, 5]
516-
expected_min: Arc::new(Int32Array::from(vec![-5, -4, 0, 5])),
517-
// maxes are [-1, 0, 4, 9]
518-
expected_max: Arc::new(Int32Array::from(vec![-1, 0, 4, 9])),
519-
// nulls are [0, 0, 0, 0]
520-
expected_null_counts: UInt64Array::from(vec![0, 0, 0, 0]),
521-
// row counts are [5, 5, 5, 5]
522-
expected_row_counts: UInt64Array::from(vec![5, 5, 5, 5]),
523-
column_name: "i32",
524-
test_data_page_statistics: false,
513+
for test_data_page_statistics in [true, false] {
514+
Test {
515+
reader: &reader,
516+
// mins are [-5, -4, 0, 5]
517+
expected_min: Arc::new(Int32Array::from(vec![-5, -4, 0, 5])),
518+
// maxes are [-1, 0, 4, 9]
519+
expected_max: Arc::new(Int32Array::from(vec![-1, 0, 4, 9])),
520+
// nulls are [0, 0, 0, 0]
521+
expected_null_counts: UInt64Array::from(vec![0, 0, 0, 0]),
522+
// row counts are [5, 5, 5, 5]
523+
expected_row_counts: UInt64Array::from(vec![5, 5, 5, 5]),
524+
column_name: "i32",
525+
test_data_page_statistics,
526+
}
527+
.run();
525528
}
526-
.run();
527529
}
528530

529-
// BUG: ignore this test for now
530-
// https://github.com/apache/datafusion/issues/10585
531-
// Note that the file has 4 columns named "i8", "i16", "i32", "i64".
532-
// - The tests on column i32 and i64 passed.
533-
// - The tests on column i8 and i16 failed.
534531
#[tokio::test]
535532
async fn test_int_16() {
536533
// This creates a parquet files of 4 columns named "i8", "i16", "i32", "i64"
@@ -541,34 +538,24 @@ async fn test_int_16() {
541538
.build()
542539
.await;
543540

544-
Test {
545-
reader: &reader,
546-
// mins are [-5, -4, 0, 5]
547-
// BUG: not sure why this returns same data but in Int32Array type even though I debugged and the columns name is "i16" an its data is Int16
548-
// My debugging tells me the bug is either at:
549-
// 1. The new code to get "iter". See the code in this PR with
550-
// // Get an iterator over the column statistics
551-
// let iter = row_groups
552-
// .iter()
553-
// .map(|x| x.column(parquet_idx).statistics());
554-
// OR
555-
// 2. in the function (and/or its marco) `pub(crate) fn min_statistics<'a, I: Iterator<Item = Option<&'a ParquetStatistics>>>` here
556-
// https://github.com/apache/datafusion/blob/ea023e2d4878240eece870cf4b346c7a0667aeed/datafusion/core/src/datasource/physical_plan/parquet/statistics.rs#L179
557-
expected_min: Arc::new(Int16Array::from(vec![-5, -4, 0, 5])), // panic here because the actual data is Int32Array
558-
// maxes are [-1, 0, 4, 9]
559-
expected_max: Arc::new(Int16Array::from(vec![-1, 0, 4, 9])),
560-
// nulls are [0, 0, 0, 0]
561-
expected_null_counts: UInt64Array::from(vec![0, 0, 0, 0]),
562-
// row counts are [5, 5, 5, 5]
563-
expected_row_counts: UInt64Array::from(vec![5, 5, 5, 5]),
564-
column_name: "i16",
565-
test_data_page_statistics: false,
541+
for test_data_page_statistics in [true, false] {
542+
Test {
543+
reader: &reader,
544+
// mins are [-5, -4, 0, 5]
545+
expected_min: Arc::new(Int16Array::from(vec![-5, -4, 0, 5])), // panic here because the actual data is Int32Array
546+
// maxes are [-1, 0, 4, 9]
547+
expected_max: Arc::new(Int16Array::from(vec![-1, 0, 4, 9])),
548+
// nulls are [0, 0, 0, 0]
549+
expected_null_counts: UInt64Array::from(vec![0, 0, 0, 0]),
550+
// row counts are [5, 5, 5, 5]
551+
expected_row_counts: UInt64Array::from(vec![5, 5, 5, 5]),
552+
column_name: "i16",
553+
test_data_page_statistics,
554+
}
555+
.run();
566556
}
567-
.run();
568557
}
569558

570-
// BUG (same as above): ignore this test for now
571-
// https://github.com/apache/datafusion/issues/10585
572559
#[tokio::test]
573560
async fn test_int_8() {
574561
// This creates a parquet files of 4 columns named "i8", "i16", "i32", "i64"
@@ -579,21 +566,22 @@ async fn test_int_8() {
579566
.build()
580567
.await;
581568

582-
Test {
583-
reader: &reader,
584-
// mins are [-5, -4, 0, 5]
585-
// BUG: not sure why this returns same data but in Int32Array even though I debugged and the columns name is "i8" an its data is Int8
586-
expected_min: Arc::new(Int8Array::from(vec![-5, -4, 0, 5])), // panic here because the actual data is Int32Array
587-
// maxes are [-1, 0, 4, 9]
588-
expected_max: Arc::new(Int8Array::from(vec![-1, 0, 4, 9])),
589-
// nulls are [0, 0, 0, 0]
590-
expected_null_counts: UInt64Array::from(vec![0, 0, 0, 0]),
591-
// row counts are [5, 5, 5, 5]
592-
expected_row_counts: UInt64Array::from(vec![5, 5, 5, 5]),
593-
column_name: "i8",
594-
test_data_page_statistics: false,
569+
for test_data_page_statistics in [true, false] {
570+
Test {
571+
reader: &reader,
572+
// mins are [-5, -4, 0, 5]
573+
expected_min: Arc::new(Int8Array::from(vec![-5, -4, 0, 5])),
574+
// maxes are [-1, 0, 4, 9]
575+
expected_max: Arc::new(Int8Array::from(vec![-1, 0, 4, 9])),
576+
// nulls are [0, 0, 0, 0]
577+
expected_null_counts: UInt64Array::from(vec![0, 0, 0, 0]),
578+
// row counts are [5, 5, 5, 5]
579+
expected_row_counts: UInt64Array::from(vec![5, 5, 5, 5]),
580+
column_name: "i8",
581+
test_data_page_statistics,
582+
}
583+
.run();
595584
}
596-
.run();
597585
}
598586

599587
// timestamp

0 commit comments

Comments
 (0)