21
21
/// `arrow-rs/parquet/tests/arrow_reader/statistics.rs`.
22
22
use crate :: arrow:: buffer:: bit_util:: sign_extend_be;
23
23
use crate :: arrow:: parquet_column;
24
+ use crate :: basic:: Type as PhysicalType ;
24
25
use crate :: data_type:: { ByteArray , FixedLenByteArray } ;
25
26
use crate :: errors:: { ParquetError , Result } ;
26
27
use crate :: file:: metadata:: { ParquetColumnIndex , ParquetOffsetIndex , RowGroupMetaData } ;
@@ -318,7 +319,7 @@ make_decimal_stats_iterator!(
318
319
/// data_type: The data type of the statistics (e.g. `DataType::Int32`)
319
320
/// iterator: The iterator of [`ParquetStatistics`] to extract the statistics from.
320
321
macro_rules! get_statistics {
321
- ( $stat_type_prefix: ident, $data_type: ident, $iterator: ident) => {
322
+ ( $stat_type_prefix: ident, $data_type: ident, $iterator: ident, $physical_type : ident ) => {
322
323
paste! {
323
324
match $data_type {
324
325
DataType :: Boolean => Ok ( Arc :: new( BooleanArray :: from_iter(
@@ -370,9 +371,11 @@ macro_rules! get_statistics {
370
371
DataType :: Date32 => Ok ( Arc :: new( Date32Array :: from_iter(
371
372
[ <$stat_type_prefix Int32StatsIterator >] :: new( $iterator) . map( |x| x. copied( ) ) ,
372
373
) ) ) ,
373
- DataType :: Date64 => Ok ( Arc :: new( Date64Array :: from_iter(
374
- [ <$stat_type_prefix Int64StatsIterator >] :: new( $iterator) . map( |x| x. copied( ) ) ,
375
- ) ) ) ,
374
+ DataType :: Date64 if $physical_type == Some ( PhysicalType :: INT32 ) => Ok ( Arc :: new( Date64Array :: from_iter(
375
+ [ <$stat_type_prefix Int32StatsIterator >] :: new( $iterator)
376
+ . map( |x| x. map( |x| i64 :: from( * x) * 24 * 60 * 60 * 1000 ) ) ) ) ) ,
377
+ DataType :: Date64 if $physical_type == Some ( PhysicalType :: INT64 ) => Ok ( Arc :: new( Date64Array :: from_iter(
378
+ [ <$stat_type_prefix Int64StatsIterator >] :: new( $iterator) . map( |x| x. copied( ) ) , ) ) ) ,
376
379
DataType :: Timestamp ( unit, timezone) =>{
377
380
let iter = [ <$stat_type_prefix Int64StatsIterator >] :: new( $iterator) . map( |x| x. copied( ) ) ;
378
381
Ok ( match unit {
@@ -486,7 +489,7 @@ macro_rules! get_statistics {
486
489
Ok ( Arc :: new( arr) )
487
490
} ,
488
491
DataType :: Dictionary ( _, value_type) => {
489
- [ <$stat_type_prefix: lower _ statistics>] ( value_type, $iterator)
492
+ [ <$stat_type_prefix: lower _ statistics>] ( value_type, $iterator, $physical_type )
490
493
} ,
491
494
DataType :: Utf8View => {
492
495
let iterator = [ <$stat_type_prefix ByteArrayStatsIterator >] :: new( $iterator) ;
@@ -523,6 +526,7 @@ macro_rules! get_statistics {
523
526
DataType :: Map ( _, _) |
524
527
DataType :: Duration ( _) |
525
528
DataType :: Interval ( _) |
529
+ DataType :: Date64 | // required to cover $physical_type match guard
526
530
DataType :: Null |
527
531
DataType :: List ( _) |
528
532
DataType :: ListView ( _) |
@@ -1054,8 +1058,9 @@ macro_rules! get_data_page_statistics {
1054
1058
fn min_statistics < ' a , I : Iterator < Item = Option < & ' a ParquetStatistics > > > (
1055
1059
data_type : & DataType ,
1056
1060
iterator : I ,
1061
+ physical_type : Option < PhysicalType > ,
1057
1062
) -> Result < ArrayRef > {
1058
- get_statistics ! ( Min , data_type, iterator)
1063
+ get_statistics ! ( Min , data_type, iterator, physical_type )
1059
1064
}
1060
1065
1061
1066
/// Extracts the max statistics from an iterator of [`ParquetStatistics`] to an [`ArrayRef`]
@@ -1064,8 +1069,9 @@ fn min_statistics<'a, I: Iterator<Item = Option<&'a ParquetStatistics>>>(
1064
1069
fn max_statistics < ' a , I : Iterator < Item = Option < & ' a ParquetStatistics > > > (
1065
1070
data_type : & DataType ,
1066
1071
iterator : I ,
1072
+ physical_type : Option < PhysicalType > ,
1067
1073
) -> Result < ArrayRef > {
1068
- get_statistics ! ( Max , data_type, iterator)
1074
+ get_statistics ! ( Max , data_type, iterator, physical_type )
1069
1075
}
1070
1076
1071
1077
/// Extracts the min statistics from an iterator
@@ -1164,6 +1170,8 @@ pub struct StatisticsConverter<'a> {
1164
1170
arrow_field : & ' a Field ,
1165
1171
/// treat missing null_counts as 0 nulls
1166
1172
missing_null_counts_as_zero : bool ,
1173
+ /// The physical type of the matched column in the Parquet schema
1174
+ physical_type : Option < PhysicalType > ,
1167
1175
}
1168
1176
1169
1177
impl < ' a > StatisticsConverter < ' a > {
@@ -1291,6 +1299,7 @@ impl<'a> StatisticsConverter<'a> {
1291
1299
parquet_column_index : parquet_index,
1292
1300
arrow_field,
1293
1301
missing_null_counts_as_zero : true ,
1302
+ physical_type : parquet_index. map ( |idx| parquet_schema. column ( idx) . physical_type ( ) ) ,
1294
1303
} )
1295
1304
}
1296
1305
@@ -1333,7 +1342,7 @@ impl<'a> StatisticsConverter<'a> {
1333
1342
/// // get the minimum value for the column "foo" in the parquet file
1334
1343
/// let min_values: ArrayRef = converter
1335
1344
/// .row_group_mins(metadata.row_groups().iter())
1336
- /// .unwrap();
1345
+ /// .unwrap();
1337
1346
/// // if "foo" is a Float64 value, the returned array will contain Float64 values
1338
1347
/// assert_eq!(min_values, Arc::new(Float64Array::from(vec![Some(1.0), Some(2.0)])) as _);
1339
1348
/// ```
@@ -1350,7 +1359,7 @@ impl<'a> StatisticsConverter<'a> {
1350
1359
let iter = metadatas
1351
1360
. into_iter ( )
1352
1361
. map ( |x| x. column ( parquet_index) . statistics ( ) ) ;
1353
- min_statistics ( data_type, iter)
1362
+ min_statistics ( data_type, iter, self . physical_type )
1354
1363
}
1355
1364
1356
1365
/// Extract the maximum values from row group statistics in [`RowGroupMetaData`]
@@ -1369,7 +1378,7 @@ impl<'a> StatisticsConverter<'a> {
1369
1378
let iter = metadatas
1370
1379
. into_iter ( )
1371
1380
. map ( |x| x. column ( parquet_index) . statistics ( ) ) ;
1372
- max_statistics ( data_type, iter)
1381
+ max_statistics ( data_type, iter, self . physical_type )
1373
1382
}
1374
1383
1375
1384
/// Extract the null counts from row group statistics in [`RowGroupMetaData`]
0 commit comments