@@ -661,7 +661,7 @@ where
661
661
/// of parquet page [`Index`]'es to a [`UInt64Array`]
662
662
///
663
663
/// The returned Array is a [`UInt64Array`]
664
- pub ( crate ) fn null_counts_page_statistics < ' a , I > ( iterator : I ) -> Result < ArrayRef >
664
+ pub ( crate ) fn null_counts_page_statistics < ' a , I > ( iterator : I ) -> Result < UInt64Array >
665
665
where
666
666
I : Iterator < Item = ( usize , & ' a Index ) > ,
667
667
{
@@ -680,7 +680,7 @@ where
680
680
_ => unimplemented ! ( ) ,
681
681
} ) ;
682
682
683
- Ok ( Arc :: new ( UInt64Array :: from_iter ( iter) ) )
683
+ Ok ( UInt64Array :: from_iter ( iter) )
684
684
}
685
685
686
686
/// Extracts Parquet statistics as Arrow arrays
@@ -874,21 +874,22 @@ impl<'a> StatisticsConverter<'a> {
874
874
/// Extract the null counts from row group statistics in [`RowGroupMetaData`]
875
875
///
876
876
/// See docs on [`Self::row_group_mins`] for details
877
- pub fn row_group_null_counts < I > ( & self , metadatas : I ) -> Result < ArrayRef >
877
+ pub fn row_group_null_counts < I > ( & self , metadatas : I ) -> Result < UInt64Array >
878
878
where
879
879
I : IntoIterator < Item = & ' a RowGroupMetaData > ,
880
880
{
881
- let data_type = self . arrow_field . data_type ( ) ;
882
-
883
881
let Some ( parquet_index) = self . parquet_index else {
884
- return Ok ( self . make_null_array ( data_type, metadatas) ) ;
882
+ let num_row_groups = metadatas. into_iter ( ) . count ( ) ;
883
+ return Ok ( UInt64Array :: from_iter (
884
+ std:: iter:: repeat ( None ) . take ( num_row_groups) ,
885
+ ) ) ;
885
886
} ;
886
887
887
888
let null_counts = metadatas
888
889
. into_iter ( )
889
890
. map ( |x| x. column ( parquet_index) . statistics ( ) )
890
891
. map ( |s| s. map ( |s| s. null_count ( ) ) ) ;
891
- Ok ( Arc :: new ( UInt64Array :: from_iter ( null_counts) ) )
892
+ Ok ( UInt64Array :: from_iter ( null_counts) )
892
893
}
893
894
894
895
/// Extract the minimum values from Data Page statistics.
@@ -1007,14 +1008,15 @@ impl<'a> StatisticsConverter<'a> {
1007
1008
column_page_index : & ParquetColumnIndex ,
1008
1009
column_offset_index : & ParquetOffsetIndex ,
1009
1010
row_group_indices : I ,
1010
- ) -> Result < ArrayRef >
1011
+ ) -> Result < UInt64Array >
1011
1012
where
1012
1013
I : IntoIterator < Item = & ' a usize > ,
1013
1014
{
1014
- let data_type = self . arrow_field . data_type ( ) ;
1015
-
1016
1015
let Some ( parquet_index) = self . parquet_index else {
1017
- return Ok ( self . make_null_array ( data_type, row_group_indices) ) ;
1016
+ let num_row_groups = row_group_indices. into_iter ( ) . count ( ) ;
1017
+ return Ok ( UInt64Array :: from_iter (
1018
+ std:: iter:: repeat ( None ) . take ( num_row_groups) ,
1019
+ ) ) ;
1018
1020
} ;
1019
1021
1020
1022
let iter = row_group_indices. into_iter ( ) . map ( |rg_index| {
@@ -1047,21 +1049,19 @@ impl<'a> StatisticsConverter<'a> {
1047
1049
pub fn data_page_row_counts < I > (
1048
1050
& self ,
1049
1051
column_offset_index : & ParquetOffsetIndex ,
1050
- row_group_metadatas : & [ RowGroupMetaData ] ,
1052
+ row_group_metadatas : & ' a [ RowGroupMetaData ] ,
1051
1053
row_group_indices : I ,
1052
- ) -> Result < ArrayRef >
1054
+ ) -> Result < Option < UInt64Array > >
1053
1055
where
1054
1056
I : IntoIterator < Item = & ' a usize > ,
1055
1057
{
1056
- let data_type = self . arrow_field . data_type ( ) ;
1057
-
1058
1058
let Some ( parquet_index) = self . parquet_index else {
1059
- return Ok ( self . make_null_array ( data_type, row_group_indices) ) ;
1059
+ // no matching column found in parquet_index;
1060
+ // thus we cannot extract page_locations in order to determine
1061
+ // the row count on a per DataPage basis.
1062
+ return Ok ( None ) ;
1060
1063
} ;
1061
1064
1062
- // `offset_index[row_group_number][column_number][page_number]` holds
1063
- // the [`PageLocation`] corresponding to page `page_number` of column
1064
- // `column_number`of row group `row_group_number`.
1065
1065
let mut row_count_total = Vec :: new ( ) ;
1066
1066
for rg_idx in row_group_indices {
1067
1067
let page_locations = & column_offset_index[ * rg_idx] [ parquet_index] ;
@@ -1070,9 +1070,8 @@ impl<'a> StatisticsConverter<'a> {
1070
1070
Some ( loc[ 1 ] . first_row_index as u64 - loc[ 0 ] . first_row_index as u64 )
1071
1071
} ) ;
1072
1072
1073
- let num_rows_in_row_group = & row_group_metadatas[ * rg_idx] . num_rows ( ) ;
1074
-
1075
1073
// append the last page row count
1074
+ let num_rows_in_row_group = & row_group_metadatas[ * rg_idx] . num_rows ( ) ;
1076
1075
let row_count_per_page = row_count_per_page
1077
1076
. chain ( std:: iter:: once ( Some (
1078
1077
* num_rows_in_row_group as u64
@@ -1083,7 +1082,7 @@ impl<'a> StatisticsConverter<'a> {
1083
1082
row_count_total. extend ( row_count_per_page) ;
1084
1083
}
1085
1084
1086
- Ok ( Arc :: new ( UInt64Array :: from_iter ( row_count_total) ) )
1085
+ Ok ( Some ( UInt64Array :: from_iter ( row_count_total) ) )
1087
1086
}
1088
1087
1089
1088
/// Returns a null array of data_type with one element per row group
0 commit comments