Skip to content

Commit 169ba01

Browse files
committed
chore: add physical_type to StatisticsConverter to account for coerce_types
1 parent e0fb77c commit 169ba01

File tree

1 file changed

+19
-10
lines changed

1 file changed

+19
-10
lines changed

parquet/src/arrow/arrow_reader/statistics.rs

Lines changed: 19 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -21,6 +21,7 @@
2121
/// `arrow-rs/parquet/tests/arrow_reader/statistics.rs`.
2222
use crate::arrow::buffer::bit_util::sign_extend_be;
2323
use crate::arrow::parquet_column;
24+
use crate::basic::Type as PhysicalType;
2425
use crate::data_type::{ByteArray, FixedLenByteArray};
2526
use crate::errors::{ParquetError, Result};
2627
use crate::file::metadata::{ParquetColumnIndex, ParquetOffsetIndex, RowGroupMetaData};
@@ -318,7 +319,7 @@ make_decimal_stats_iterator!(
318319
/// data_type: The data type of the statistics (e.g. `DataType::Int32`)
319320
/// iterator: The iterator of [`ParquetStatistics`] to extract the statistics from.
320321
macro_rules! get_statistics {
321-
($stat_type_prefix: ident, $data_type: ident, $iterator: ident) => {
322+
($stat_type_prefix: ident, $data_type: ident, $iterator: ident, $physical_type: ident) => {
322323
paste! {
323324
match $data_type {
324325
DataType::Boolean => Ok(Arc::new(BooleanArray::from_iter(
@@ -370,9 +371,11 @@ macro_rules! get_statistics {
370371
DataType::Date32 => Ok(Arc::new(Date32Array::from_iter(
371372
[<$stat_type_prefix Int32StatsIterator>]::new($iterator).map(|x| x.copied()),
372373
))),
373-
DataType::Date64 => Ok(Arc::new(Date64Array::from_iter(
374-
[<$stat_type_prefix Int64StatsIterator>]::new($iterator).map(|x| x.copied()),
375-
))),
374+
DataType::Date64 if $physical_type == Some(PhysicalType::INT32) => Ok(Arc::new(Date64Array::from_iter(
375+
[<$stat_type_prefix Int32StatsIterator>]::new($iterator)
376+
.map(|x| x.map(|x| i64::from(*x) * 24 * 60 * 60 * 1000))))),
377+
DataType::Date64 if $physical_type == Some(PhysicalType::INT64) => Ok(Arc::new(Date64Array::from_iter(
378+
[<$stat_type_prefix Int64StatsIterator>]::new($iterator).map(|x| x.copied()),))),
376379
DataType::Timestamp(unit, timezone) =>{
377380
let iter = [<$stat_type_prefix Int64StatsIterator>]::new($iterator).map(|x| x.copied());
378381
Ok(match unit {
@@ -486,7 +489,7 @@ macro_rules! get_statistics {
486489
Ok(Arc::new(arr))
487490
},
488491
DataType::Dictionary(_, value_type) => {
489-
[<$stat_type_prefix:lower _ statistics>](value_type, $iterator)
492+
[<$stat_type_prefix:lower _ statistics>](value_type, $iterator, $physical_type)
490493
},
491494
DataType::Utf8View => {
492495
let iterator = [<$stat_type_prefix ByteArrayStatsIterator>]::new($iterator);
@@ -523,6 +526,7 @@ macro_rules! get_statistics {
523526
DataType::Map(_,_) |
524527
DataType::Duration(_) |
525528
DataType::Interval(_) |
529+
DataType::Date64 | // required to cover $physical_type match guard
526530
DataType::Null |
527531
DataType::List(_) |
528532
DataType::ListView(_) |
@@ -1054,8 +1058,9 @@ macro_rules! get_data_page_statistics {
10541058
fn min_statistics<'a, I: Iterator<Item = Option<&'a ParquetStatistics>>>(
10551059
data_type: &DataType,
10561060
iterator: I,
1061+
physical_type: Option<PhysicalType>,
10571062
) -> Result<ArrayRef> {
1058-
get_statistics!(Min, data_type, iterator)
1063+
get_statistics!(Min, data_type, iterator, physical_type)
10591064
}
10601065

10611066
/// Extracts the max statistics from an iterator of [`ParquetStatistics`] to an [`ArrayRef`]
@@ -1064,8 +1069,9 @@ fn min_statistics<'a, I: Iterator<Item = Option<&'a ParquetStatistics>>>(
10641069
fn max_statistics<'a, I: Iterator<Item = Option<&'a ParquetStatistics>>>(
10651070
data_type: &DataType,
10661071
iterator: I,
1072+
physical_type: Option<PhysicalType>,
10671073
) -> Result<ArrayRef> {
1068-
get_statistics!(Max, data_type, iterator)
1074+
get_statistics!(Max, data_type, iterator, physical_type)
10691075
}
10701076

10711077
/// Extracts the min statistics from an iterator
@@ -1164,6 +1170,8 @@ pub struct StatisticsConverter<'a> {
11641170
arrow_field: &'a Field,
11651171
/// treat missing null_counts as 0 nulls
11661172
missing_null_counts_as_zero: bool,
1173+
/// The physical type of the matched column in the Parquet schema
1174+
physical_type: Option<PhysicalType>,
11671175
}
11681176

11691177
impl<'a> StatisticsConverter<'a> {
@@ -1291,6 +1299,7 @@ impl<'a> StatisticsConverter<'a> {
12911299
parquet_column_index: parquet_index,
12921300
arrow_field,
12931301
missing_null_counts_as_zero: true,
1302+
physical_type: parquet_index.map(|idx| parquet_schema.column(idx).physical_type()),
12941303
})
12951304
}
12961305

@@ -1333,7 +1342,7 @@ impl<'a> StatisticsConverter<'a> {
13331342
/// // get the minimum value for the column "foo" in the parquet file
13341343
/// let min_values: ArrayRef = converter
13351344
/// .row_group_mins(metadata.row_groups().iter())
1336-
/// .unwrap();
1345+
/// .unwrap();
13371346
/// // if "foo" is a Float64 value, the returned array will contain Float64 values
13381347
/// assert_eq!(min_values, Arc::new(Float64Array::from(vec![Some(1.0), Some(2.0)])) as _);
13391348
/// ```
@@ -1350,7 +1359,7 @@ impl<'a> StatisticsConverter<'a> {
13501359
let iter = metadatas
13511360
.into_iter()
13521361
.map(|x| x.column(parquet_index).statistics());
1353-
min_statistics(data_type, iter)
1362+
min_statistics(data_type, iter, self.physical_type)
13541363
}
13551364

13561365
/// Extract the maximum values from row group statistics in [`RowGroupMetaData`]
@@ -1369,7 +1378,7 @@ impl<'a> StatisticsConverter<'a> {
13691378
let iter = metadatas
13701379
.into_iter()
13711380
.map(|x| x.column(parquet_index).statistics());
1372-
max_statistics(data_type, iter)
1381+
max_statistics(data_type, iter, self.physical_type)
13731382
}
13741383

13751384
/// Extract the null counts from row group statistics in [`RowGroupMetaData`]

0 commit comments

Comments
 (0)