Skip to content

Commit 6f330c9

Browse files
efredine (Eric Fredine) and Eric Fredine authored
Fix data page statistics when all rows are null in a data page (#11295)
* Adds tests for data page statistics when all values on the page are null. Fixes most of the failing tests for iterators not handling this situation correctly.
* Fix handling of data page statistics for FixedBinaryArray using a builder.
* Fix data page all nulls stats test for Dictionary DataType.
* Fixes handling of None statistics for Decimal128 and Decimal256.
* Consolidate make_data_page_stats_iterator uses.
* Fix linting error.
* Remove unnecessary collect.

---------

Co-authored-by: Eric Fredine <[email protected]>
1 parent 229c139 commit 6f330c9

File tree

2 files changed: +184 −102 lines changed

datafusion/core/src/datasource/physical_plan/parquet/statistics.rs

+68 −60
Original file line number | Diff line number | Diff line change
@@ -19,6 +19,7 @@
1919
2020
// TODO: potentially move this to arrow-rs: https://github.com/apache/arrow-rs/issues/4328
2121

22+
use arrow::array::builder::FixedSizeBinaryBuilder;
2223
use arrow::datatypes::i256;
2324
use arrow::{array::ArrayRef, datatypes::DataType};
2425
use arrow_array::{
@@ -600,6 +601,31 @@ make_data_page_stats_iterator!(
600601
Index::DOUBLE,
601602
f64
602603
);
604+
make_data_page_stats_iterator!(
605+
MinByteArrayDataPageStatsIterator,
606+
|x: &PageIndex<ByteArray>| { x.min.clone() },
607+
Index::BYTE_ARRAY,
608+
ByteArray
609+
);
610+
make_data_page_stats_iterator!(
611+
MaxByteArrayDataPageStatsIterator,
612+
|x: &PageIndex<ByteArray>| { x.max.clone() },
613+
Index::BYTE_ARRAY,
614+
ByteArray
615+
);
616+
make_data_page_stats_iterator!(
617+
MaxFixedLenByteArrayDataPageStatsIterator,
618+
|x: &PageIndex<FixedLenByteArray>| { x.max.clone() },
619+
Index::FIXED_LEN_BYTE_ARRAY,
620+
FixedLenByteArray
621+
);
622+
623+
make_data_page_stats_iterator!(
624+
MinFixedLenByteArrayDataPageStatsIterator,
625+
|x: &PageIndex<FixedLenByteArray>| { x.min.clone() },
626+
Index::FIXED_LEN_BYTE_ARRAY,
627+
FixedLenByteArray
628+
);
603629

604630
macro_rules! get_decimal_page_stats_iterator {
605631
($iterator_type: ident, $func: ident, $stat_value_type: ident, $convert_func: ident) => {
@@ -634,9 +660,7 @@ macro_rules! get_decimal_page_stats_iterator {
634660
.indexes
635661
.iter()
636662
.map(|x| {
637-
Some($stat_value_type::from(
638-
x.$func.unwrap_or_default(),
639-
))
663+
x.$func.and_then(|x| Some($stat_value_type::from(x)))
640664
})
641665
.collect::<Vec<_>>(),
642666
),
@@ -645,9 +669,7 @@ macro_rules! get_decimal_page_stats_iterator {
645669
.indexes
646670
.iter()
647671
.map(|x| {
648-
Some($stat_value_type::from(
649-
x.$func.unwrap_or_default(),
650-
))
672+
x.$func.and_then(|x| Some($stat_value_type::from(x)))
651673
})
652674
.collect::<Vec<_>>(),
653675
),
@@ -656,9 +678,9 @@ macro_rules! get_decimal_page_stats_iterator {
656678
.indexes
657679
.iter()
658680
.map(|x| {
659-
Some($convert_func(
660-
x.clone().$func.unwrap_or_default().data(),
661-
))
681+
x.clone()
682+
.$func
683+
.and_then(|x| Some($convert_func(x.data())))
662684
})
663685
.collect::<Vec<_>>(),
664686
),
@@ -667,9 +689,9 @@ macro_rules! get_decimal_page_stats_iterator {
667689
.indexes
668690
.iter()
669691
.map(|x| {
670-
Some($convert_func(
671-
x.clone().$func.unwrap_or_default().data(),
672-
))
692+
x.clone()
693+
.$func
694+
.and_then(|x| Some($convert_func(x.data())))
673695
})
674696
.collect::<Vec<_>>(),
675697
),
@@ -713,32 +735,6 @@ get_decimal_page_stats_iterator!(
713735
i256,
714736
from_bytes_to_i256
715737
);
716-
make_data_page_stats_iterator!(
717-
MinByteArrayDataPageStatsIterator,
718-
|x: &PageIndex<ByteArray>| { x.min.clone() },
719-
Index::BYTE_ARRAY,
720-
ByteArray
721-
);
722-
make_data_page_stats_iterator!(
723-
MaxByteArrayDataPageStatsIterator,
724-
|x: &PageIndex<ByteArray>| { x.max.clone() },
725-
Index::BYTE_ARRAY,
726-
ByteArray
727-
);
728-
729-
make_data_page_stats_iterator!(
730-
MaxFixedLenByteArrayDataPageStatsIterator,
731-
|x: &PageIndex<FixedLenByteArray>| { x.max.clone() },
732-
Index::FIXED_LEN_BYTE_ARRAY,
733-
FixedLenByteArray
734-
);
735-
736-
make_data_page_stats_iterator!(
737-
MinFixedLenByteArrayDataPageStatsIterator,
738-
|x: &PageIndex<FixedLenByteArray>| { x.min.clone() },
739-
Index::FIXED_LEN_BYTE_ARRAY,
740-
FixedLenByteArray
741-
);
742738

743739
macro_rules! get_data_page_statistics {
744740
($stat_type_prefix: ident, $data_type: ident, $iterator: ident) => {
@@ -757,7 +753,7 @@ macro_rules! get_data_page_statistics {
757753
UInt8Array::from_iter(
758754
[<$stat_type_prefix Int32DataPageStatsIterator>]::new($iterator)
759755
.map(|x| {
760-
x.into_iter().filter_map(|x| {
756+
x.into_iter().map(|x| {
761757
x.and_then(|x| u8::try_from(x).ok())
762758
})
763759
})
@@ -768,7 +764,7 @@ macro_rules! get_data_page_statistics {
768764
UInt16Array::from_iter(
769765
[<$stat_type_prefix Int32DataPageStatsIterator>]::new($iterator)
770766
.map(|x| {
771-
x.into_iter().filter_map(|x| {
767+
x.into_iter().map(|x| {
772768
x.and_then(|x| u16::try_from(x).ok())
773769
})
774770
})
@@ -779,7 +775,7 @@ macro_rules! get_data_page_statistics {
779775
UInt32Array::from_iter(
780776
[<$stat_type_prefix Int32DataPageStatsIterator>]::new($iterator)
781777
.map(|x| {
782-
x.into_iter().filter_map(|x| {
778+
x.into_iter().map(|x| {
783779
x.and_then(|x| Some(x as u32))
784780
})
785781
})
@@ -789,7 +785,7 @@ macro_rules! get_data_page_statistics {
789785
UInt64Array::from_iter(
790786
[<$stat_type_prefix Int64DataPageStatsIterator>]::new($iterator)
791787
.map(|x| {
792-
x.into_iter().filter_map(|x| {
788+
x.into_iter().map(|x| {
793789
x.and_then(|x| Some(x as u64))
794790
})
795791
})
@@ -799,7 +795,7 @@ macro_rules! get_data_page_statistics {
799795
Int8Array::from_iter(
800796
[<$stat_type_prefix Int32DataPageStatsIterator>]::new($iterator)
801797
.map(|x| {
802-
x.into_iter().filter_map(|x| {
798+
x.into_iter().map(|x| {
803799
x.and_then(|x| i8::try_from(x).ok())
804800
})
805801
})
@@ -810,7 +806,7 @@ macro_rules! get_data_page_statistics {
810806
Int16Array::from_iter(
811807
[<$stat_type_prefix Int32DataPageStatsIterator>]::new($iterator)
812808
.map(|x| {
813-
x.into_iter().filter_map(|x| {
809+
x.into_iter().map(|x| {
814810
x.and_then(|x| i16::try_from(x).ok())
815811
})
816812
})
@@ -823,8 +819,8 @@ macro_rules! get_data_page_statistics {
823819
Float16Array::from_iter(
824820
[<$stat_type_prefix Float16DataPageStatsIterator>]::new($iterator)
825821
.map(|x| {
826-
x.into_iter().filter_map(|x| {
827-
x.and_then(|x| Some(from_bytes_to_f16(x.data())))
822+
x.into_iter().map(|x| {
823+
x.and_then(|x| from_bytes_to_f16(x.data()))
828824
})
829825
})
830826
.flatten()
@@ -836,7 +832,7 @@ macro_rules! get_data_page_statistics {
836832
Some(DataType::LargeBinary) => Ok(Arc::new(LargeBinaryArray::from_iter([<$stat_type_prefix ByteArrayDataPageStatsIterator>]::new($iterator).flatten()))),
837833
Some(DataType::Utf8) => Ok(Arc::new(StringArray::from(
838834
[<$stat_type_prefix ByteArrayDataPageStatsIterator>]::new($iterator).map(|x| {
839-
x.into_iter().filter_map(|x| {
835+
x.into_iter().map(|x| {
840836
x.and_then(|x| {
841837
let res = std::str::from_utf8(x.data()).map(|s| s.to_string()).ok();
842838
if res.is_none() {
@@ -849,7 +845,7 @@ macro_rules! get_data_page_statistics {
849845
))),
850846
Some(DataType::LargeUtf8) => Ok(Arc::new(LargeStringArray::from(
851847
[<$stat_type_prefix ByteArrayDataPageStatsIterator>]::new($iterator).map(|x| {
852-
x.into_iter().filter_map(|x| {
848+
x.into_iter().map(|x| {
853849
x.and_then(|x| {
854850
let res = std::str::from_utf8(x.data()).map(|s| s.to_string()).ok();
855851
if res.is_none() {
@@ -878,10 +874,10 @@ macro_rules! get_data_page_statistics {
878874
Date64Array::from([<$stat_type_prefix Int32DataPageStatsIterator>]::new($iterator)
879875
.map(|x| {
880876
x.into_iter()
881-
.filter_map(|x| {
877+
.map(|x| {
882878
x.and_then(|x| i64::try_from(x).ok())
879+
.map(|x| x * 24 * 60 * 60 * 1000)
883880
})
884-
.map(|x| x * 24 * 60 * 60 * 1000)
885881
}).flatten().collect::<Vec<_>>()
886882
)
887883
)
@@ -919,16 +915,28 @@ macro_rules! get_data_page_statistics {
919915
})
920916
},
921917
Some(DataType::FixedSizeBinary(size)) => {
922-
Ok(Arc::new(
923-
FixedSizeBinaryArray::try_from_iter(
924-
[<$stat_type_prefix FixedLenByteArrayDataPageStatsIterator>]::new($iterator)
925-
.flat_map(|x| x.into_iter())
926-
.filter_map(|x| x)
927-
).unwrap_or_else(|e| {
928-
log::debug!("FixedSizeBinary statistics is invalid: {}", e);
929-
FixedSizeBinaryArray::new(*size, vec![].into(), None)
930-
})
931-
))
918+
let mut builder = FixedSizeBinaryBuilder::new(*size);
919+
let iterator = [<$stat_type_prefix FixedLenByteArrayDataPageStatsIterator>]::new($iterator);
920+
for x in iterator {
921+
for x in x.into_iter() {
922+
let Some(x) = x else {
923+
builder.append_null(); // no statistics value
924+
continue;
925+
};
926+
927+
if x.len() == *size as usize {
928+
let _ = builder.append_value(x.data());
929+
} else {
930+
log::debug!(
931+
"FixedSizeBinary({}) statistics is a binary of size {}, ignoring it.",
932+
size,
933+
x.len(),
934+
);
935+
builder.append_null();
936+
}
937+
}
938+
}
939+
Ok(Arc::new(builder.finish()))
932940
},
933941
_ => unimplemented!()
934942
}

0 commit comments

Comments
 (0)