 use arrow::{array::ArrayRef, datatypes::Schema};
 use arrow_array::BooleanArray;
-use arrow_schema::FieldRef;
-use datafusion_common::{Column, ScalarValue};
+use datafusion_common::{Column, Result, ScalarValue};
 use parquet::basic::Type;
 use parquet::data_type::Decimal;
-use parquet::file::metadata::ColumnChunkMetaData;
 use parquet::schema::types::SchemaDescriptor;
 use parquet::{
     arrow::{async_reader::AsyncFileReader, ParquetRecordBatchStreamBuilder},
     bloom_filter::Sbbf,
     file::metadata::RowGroupMetaData,
 };
 use std::collections::{HashMap, HashSet};
+use std::sync::Arc;
 
 use crate::datasource::listing::FileRange;
-use crate::datasource::physical_plan::parquet::statistics::{
-    max_statistics, min_statistics, parquet_column,
-};
+use crate::datasource::physical_plan::parquet::statistics::parquet_column;
 use crate::physical_optimizer::pruning::{PruningPredicate, PruningStatistics};
 
-use super::{ParquetAccessPlan, ParquetFileMetrics};
+use super::{ParquetAccessPlan, ParquetFileMetrics, StatisticsConverter};
 
 /// Reduces the [`ParquetAccessPlan`] based on row group level metadata.
 ///
@@ -113,32 +110,37 @@ impl RowGroupAccessPlanFilter {
         metrics: &ParquetFileMetrics,
     ) {
         assert_eq!(groups.len(), self.access_plan.len());
-        for (idx, metadata) in groups.iter().enumerate() {
-            if !self.access_plan.should_scan(idx) {
-                continue;
-            }
-            let pruning_stats = RowGroupPruningStatistics {
-                parquet_schema,
-                row_group_metadata: metadata,
-                arrow_schema,
-            };
-            match predicate.prune(&pruning_stats) {
-                Ok(values) => {
-                    // NB: false means don't scan row group
-                    if !values[0] {
+        // Indexes of row groups still to scan
+        let row_group_indexes = self.access_plan.row_group_indexes();
+        let row_group_metadatas = row_group_indexes
+            .iter()
+            .map(|&i| &groups[i])
+            .collect::<Vec<_>>();
+
+        let pruning_stats = RowGroupPruningStatistics {
+            parquet_schema,
+            row_group_metadatas,
+            arrow_schema,
+        };
+
+        // try to prune the row groups in a single call
+        match predicate.prune(&pruning_stats) {
+            Ok(values) => {
+                // values[i] is false means the predicate could not be true for row group i
+                for (idx, &value) in row_group_indexes.iter().zip(values.iter()) {
+                    if !value {
+                        self.access_plan.skip(*idx);
                         metrics.row_groups_pruned_statistics.add(1);
-                        self.access_plan.skip(idx);
-                        continue;
+                    } else {
+                        metrics.row_groups_matched_statistics.add(1);
                     }
                 }
-                // stats filter array could not be built
-                // don't prune this row group
-                Err(e) => {
-                    log::debug!("Error evaluating row group predicate values {e}");
-                    metrics.predicate_evaluation_errors.add(1);
-                }
             }
-            metrics.row_groups_matched_statistics.add(1);
+            // stats filter array could not be built, so we can't prune
+            Err(e) => {
+                log::debug!("Error evaluating row group predicate values {e}");
+                metrics.predicate_evaluation_errors.add(1);
+            }
         }
     }
 
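The structural change in this hunk: instead of building a one-row-group `RowGroupPruningStatistics` and calling `predicate.prune` once per row group, the filter now builds a single statistics object over every still-to-scan row group and calls `prune` once, receiving one boolean per row group. Below is a minimal, self-contained toy of that one-call-over-many-containers shape; `GroupStats` and `prune_eq` are invented for illustration and are not DataFusion types.

```rust
/// Hypothetical per-row-group statistics for a single i64 column.
struct GroupStats {
    min: i64,
    max: i64,
}

/// One call evaluates every group: the returned Vec has one entry per
/// row group, and `false` means `col = target` can never match there,
/// mirroring the single `predicate.prune(&pruning_stats)` call above.
fn prune_eq(groups: &[GroupStats], target: i64) -> Vec<bool> {
    groups
        .iter()
        .map(|g| g.min <= target && target <= g.max)
        .collect()
}

fn main() {
    let groups = [
        GroupStats { min: 0, max: 9 },
        GroupStats { min: 10, max: 19 },
        GroupStats { min: 20, max: 29 },
    ];
    // Only the middle group can contain rows where col = 12.
    assert_eq!(prune_eq(&groups, 12), vec![false, true, false]);
}
```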
@@ -337,49 +339,55 @@ impl PruningStatistics for BloomFilterStatistics {
     }
 }
 
-/// Wraps [`RowGroupMetaData`] in a way that implements [`PruningStatistics`]
-///
-/// Note: This should be implemented for an array of [`RowGroupMetaData`] instead
-/// of per row-group
+/// Wraps a slice of [`RowGroupMetaData`] in a way that implements [`PruningStatistics`]
 struct RowGroupPruningStatistics<'a> {
     parquet_schema: &'a SchemaDescriptor,
-    row_group_metadata: &'a RowGroupMetaData,
+    row_group_metadatas: Vec<&'a RowGroupMetaData>,
     arrow_schema: &'a Schema,
 }
 
 impl<'a> RowGroupPruningStatistics<'a> {
-    /// Lookups up the parquet column by name
-    fn column(&self, name: &str) -> Option<(&ColumnChunkMetaData, &FieldRef)> {
-        let (idx, field) = parquet_column(self.parquet_schema, self.arrow_schema, name)?;
-        Some((self.row_group_metadata.column(idx), field))
+    /// Return an iterator over the row group metadata
+    fn metadata_iter(&'a self) -> impl Iterator<Item = &'a RowGroupMetaData> + 'a {
+        self.row_group_metadatas.iter().copied()
+    }
+
+    fn statistics_converter<'b>(
+        &'a self,
+        column: &'b Column,
+    ) -> Result<StatisticsConverter<'a>> {
+        StatisticsConverter::try_new(&column.name, self.arrow_schema, self.parquet_schema)
     }
 }
 
 impl<'a> PruningStatistics for RowGroupPruningStatistics<'a> {
     fn min_values(&self, column: &Column) -> Option<ArrayRef> {
-        let (column, field) = self.column(&column.name)?;
-        min_statistics(field.data_type(), std::iter::once(column.statistics())).ok()
+        self.statistics_converter(column)
+            .and_then(|c| c.row_group_mins(self.metadata_iter()))
+            .ok()
     }
 
     fn max_values(&self, column: &Column) -> Option<ArrayRef> {
-        let (column, field) = self.column(&column.name)?;
-        max_statistics(field.data_type(), std::iter::once(column.statistics())).ok()
+        self.statistics_converter(column)
+            .and_then(|c| c.row_group_maxes(self.metadata_iter()))
+            .ok()
     }
 
     fn num_containers(&self) -> usize {
-        1
+        self.row_group_metadatas.len()
     }
 
     fn null_counts(&self, column: &Column) -> Option<ArrayRef> {
-        let (c, _) = self.column(&column.name)?;
-        let scalar = ScalarValue::UInt64(Some(c.statistics()?.null_count()));
-        scalar.to_array().ok()
+        self.statistics_converter(column)
+            .and_then(|c| c.row_group_null_counts(self.metadata_iter()))
+            .ok()
     }
 
-    fn row_counts(&self, column: &Column) -> Option<ArrayRef> {
-        let (c, _) = self.column(&column.name)?;
-        let scalar = ScalarValue::UInt64(Some(c.num_values() as u64));
-        scalar.to_array().ok()
+    fn row_counts(&self, _column: &Column) -> Option<ArrayRef> {
+        // row counts are the same for all columns in a row group
+        StatisticsConverter::row_group_row_counts(self.metadata_iter())
+            .ok()
+            .map(|counts| Arc::new(counts) as ArrayRef)
     }
 
     fn contained(
@@ -406,6 +414,7 @@ mod tests {
     use parquet::arrow::async_reader::ParquetObjectReader;
     use parquet::basic::LogicalType;
     use parquet::data_type::{ByteArray, FixedLenByteArray};
+    use parquet::file::metadata::ColumnChunkMetaData;
     use parquet::{
         basic::Type as PhysicalType, file::statistics::Statistics as ParquetStatistics,
         schema::types::SchemaDescPtr,