Skip to content

Commit 1cb0057

Browse files
Fix: StatisticsConverter counts for missing columns (#10946)
* feat: add run_with_schema + add test_case
* fix: null_counts
* fix: row_counts
* refactor: change return type of data_page_row_counts
* refactor: shorten row_group_indices
1 parent e1cfb48 commit 1cb0057

File tree

3 files changed

+167
-108
lines changed

3 files changed

+167
-108
lines changed

datafusion/core/src/datasource/physical_plan/parquet/row_groups.rs

+1
Original file line numberDiff line numberDiff line change
@@ -381,6 +381,7 @@ impl<'a> PruningStatistics for RowGroupPruningStatistics<'a> {
381381
self.statistics_converter(column)
382382
.and_then(|c| c.row_group_null_counts(self.metadata_iter()))
383383
.ok()
384+
.map(|counts| Arc::new(counts) as ArrayRef)
384385
}
385386

386387
fn row_counts(&self, _column: &Column) -> Option<ArrayRef> {

datafusion/core/src/datasource/physical_plan/parquet/statistics.rs

+21-22
Original file line numberDiff line numberDiff line change
@@ -661,7 +661,7 @@ where
661661
/// of parquet page [`Index`]'es to an [`ArrayRef`]
662662
///
663663
/// The returned Array is an [`UInt64Array`]
664-
pub(crate) fn null_counts_page_statistics<'a, I>(iterator: I) -> Result<ArrayRef>
664+
pub(crate) fn null_counts_page_statistics<'a, I>(iterator: I) -> Result<UInt64Array>
665665
where
666666
I: Iterator<Item = (usize, &'a Index)>,
667667
{
@@ -680,7 +680,7 @@ where
680680
_ => unimplemented!(),
681681
});
682682

683-
Ok(Arc::new(UInt64Array::from_iter(iter)))
683+
Ok(UInt64Array::from_iter(iter))
684684
}
685685

686686
/// Extracts Parquet statistics as Arrow arrays
@@ -874,21 +874,22 @@ impl<'a> StatisticsConverter<'a> {
874874
/// Extract the null counts from row group statistics in [`RowGroupMetaData`]
875875
///
876876
/// See docs on [`Self::row_group_mins`] for details
877-
pub fn row_group_null_counts<I>(&self, metadatas: I) -> Result<ArrayRef>
877+
pub fn row_group_null_counts<I>(&self, metadatas: I) -> Result<UInt64Array>
878878
where
879879
I: IntoIterator<Item = &'a RowGroupMetaData>,
880880
{
881-
let data_type = self.arrow_field.data_type();
882-
883881
let Some(parquet_index) = self.parquet_index else {
884-
return Ok(self.make_null_array(data_type, metadatas));
882+
let num_row_groups = metadatas.into_iter().count();
883+
return Ok(UInt64Array::from_iter(
884+
std::iter::repeat(None).take(num_row_groups),
885+
));
885886
};
886887

887888
let null_counts = metadatas
888889
.into_iter()
889890
.map(|x| x.column(parquet_index).statistics())
890891
.map(|s| s.map(|s| s.null_count()));
891-
Ok(Arc::new(UInt64Array::from_iter(null_counts)))
892+
Ok(UInt64Array::from_iter(null_counts))
892893
}
893894

894895
/// Extract the minimum values from Data Page statistics.
@@ -1007,14 +1008,15 @@ impl<'a> StatisticsConverter<'a> {
10071008
column_page_index: &ParquetColumnIndex,
10081009
column_offset_index: &ParquetOffsetIndex,
10091010
row_group_indices: I,
1010-
) -> Result<ArrayRef>
1011+
) -> Result<UInt64Array>
10111012
where
10121013
I: IntoIterator<Item = &'a usize>,
10131014
{
1014-
let data_type = self.arrow_field.data_type();
1015-
10161015
let Some(parquet_index) = self.parquet_index else {
1017-
return Ok(self.make_null_array(data_type, row_group_indices));
1016+
let num_row_groups = row_group_indices.into_iter().count();
1017+
return Ok(UInt64Array::from_iter(
1018+
std::iter::repeat(None).take(num_row_groups),
1019+
));
10181020
};
10191021

10201022
let iter = row_group_indices.into_iter().map(|rg_index| {
@@ -1047,21 +1049,19 @@ impl<'a> StatisticsConverter<'a> {
10471049
pub fn data_page_row_counts<I>(
10481050
&self,
10491051
column_offset_index: &ParquetOffsetIndex,
1050-
row_group_metadatas: &[RowGroupMetaData],
1052+
row_group_metadatas: &'a [RowGroupMetaData],
10511053
row_group_indices: I,
1052-
) -> Result<ArrayRef>
1054+
) -> Result<Option<UInt64Array>>
10531055
where
10541056
I: IntoIterator<Item = &'a usize>,
10551057
{
1056-
let data_type = self.arrow_field.data_type();
1057-
10581058
let Some(parquet_index) = self.parquet_index else {
1059-
return Ok(self.make_null_array(data_type, row_group_indices));
1059+
// no matching column found in parquet_index;
1060+
// thus we cannot extract page_locations in order to determine
1061+
// the row count on a per DataPage basis.
1062+
return Ok(None);
10601063
};
10611064

1062-
// `offset_index[row_group_number][column_number][page_number]` holds
1063-
// the [`PageLocation`] corresponding to page `page_number` of column
1064-
// `column_number`of row group `row_group_number`.
10651065
let mut row_count_total = Vec::new();
10661066
for rg_idx in row_group_indices {
10671067
let page_locations = &column_offset_index[*rg_idx][parquet_index];
@@ -1070,9 +1070,8 @@ impl<'a> StatisticsConverter<'a> {
10701070
Some(loc[1].first_row_index as u64 - loc[0].first_row_index as u64)
10711071
});
10721072

1073-
let num_rows_in_row_group = &row_group_metadatas[*rg_idx].num_rows();
1074-
10751073
// append the last page row count
1074+
let num_rows_in_row_group = &row_group_metadatas[*rg_idx].num_rows();
10761075
let row_count_per_page = row_count_per_page
10771076
.chain(std::iter::once(Some(
10781077
*num_rows_in_row_group as u64
@@ -1083,7 +1082,7 @@ impl<'a> StatisticsConverter<'a> {
10831082
row_count_total.extend(row_count_per_page);
10841083
}
10851084

1086-
Ok(Arc::new(UInt64Array::from_iter(row_count_total)))
1085+
Ok(Some(UInt64Array::from_iter(row_count_total)))
10871086
}
10881087

10891088
/// Returns a null array of data_type with one element per row group

0 commit comments

Comments
 (0)