Skip to content

Commit 06bbe12

Browse files
alambtustvold
andauthored
Extract parquet statistics to its own module, add tests (apache#8294)
* Extract parquet statistics to its own module, add tests * Update datafusion/core/src/datasource/physical_plan/parquet/statistics.rs Co-authored-by: Raphael Taylor-Davies <[email protected]> * rename enum * Improve API * Add test for reading struct array statistics * Add test for column after statistics * improve tests * simplify * clippy * Update datafusion/core/src/datasource/physical_plan/parquet/statistics.rs * Update datafusion/core/src/datasource/physical_plan/parquet/statistics.rs * Add test showing incorrect statistics * Rework statistics * Fix clippy * Update documentation and make it clear the statistics are not publically accessable * Add link to upstream arrow ticket --------- Co-authored-by: Raphael Taylor-Davies <[email protected]> Co-authored-by: Raphael Taylor-Davies <[email protected]>
1 parent 167b5b7 commit 06bbe12

File tree

4 files changed

+951
-166
lines changed

4 files changed

+951
-166
lines changed

datafusion/core/src/datasource/physical_plan/parquet/mod.rs

Lines changed: 2 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,7 @@ mod metrics;
6666
pub mod page_filter;
6767
mod row_filter;
6868
mod row_groups;
69+
mod statistics;
6970

7071
pub use metrics::ParquetFileMetrics;
7172

@@ -506,6 +507,7 @@ impl FileOpener for ParquetOpener {
506507
let file_metadata = builder.metadata().clone();
507508
let predicate = pruning_predicate.as_ref().map(|p| p.as_ref());
508509
let mut row_groups = row_groups::prune_row_groups_by_statistics(
510+
builder.parquet_schema(),
509511
file_metadata.row_groups(),
510512
file_range,
511513
predicate,
@@ -718,28 +720,6 @@ pub async fn plan_to_parquet(
718720
Ok(())
719721
}
720722

721-
// Copy from the arrow-rs
722-
// https://github.com/apache/arrow-rs/blob/733b7e7fd1e8c43a404c3ce40ecf741d493c21b4/parquet/src/arrow/buffer/bit_util.rs#L55
723-
// Convert the byte slice to fixed length byte array with the length of 16
724-
fn sign_extend_be(b: &[u8]) -> [u8; 16] {
725-
assert!(b.len() <= 16, "Array too large, expected less than 16");
726-
let is_negative = (b[0] & 128u8) == 128u8;
727-
let mut result = if is_negative { [255u8; 16] } else { [0u8; 16] };
728-
for (d, s) in result.iter_mut().skip(16 - b.len()).zip(b) {
729-
*d = *s;
730-
}
731-
result
732-
}
733-
734-
// Convert the bytes array to i128.
735-
// The endian of the input bytes array must be big-endian.
736-
pub(crate) fn from_bytes_to_i128(b: &[u8]) -> i128 {
737-
// The bytes array are from parquet file and must be the big-endian.
738-
// The endian is defined by parquet format, and the reference document
739-
// https://github.com/apache/parquet-format/blob/54e53e5d7794d383529dd30746378f19a12afd58/src/main/thrift/parquet.thrift#L66
740-
i128::from_be_bytes(sign_extend_be(b))
741-
}
742-
743723
// Convert parquet column schema to arrow data type, and just consider the
744724
// decimal data type.
745725
pub(crate) fn parquet_to_arrow_decimal_type(

datafusion/core/src/datasource/physical_plan/parquet/page_filter.rs

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -39,9 +39,8 @@ use parquet::{
3939
};
4040
use std::sync::Arc;
4141

42-
use crate::datasource::physical_plan::parquet::{
43-
from_bytes_to_i128, parquet_to_arrow_decimal_type,
44-
};
42+
use crate::datasource::physical_plan::parquet::parquet_to_arrow_decimal_type;
43+
use crate::datasource::physical_plan::parquet::statistics::from_bytes_to_i128;
4544
use crate::physical_optimizer::pruning::{PruningPredicate, PruningStatistics};
4645

4746
use super::metrics::ParquetFileMetrics;

0 commit comments

Comments
 (0)