Skip to content

Extract parquet statistics from timestamps with timezones #10758

Closed
@alamb

Description

@alamb

Is your feature request related to a problem or challenge?

Part of #10453, where we are filling out support for extracting statistics for all data types from parquet files

At the moment, even if statistics are extracted for a different type (like Int32) the PruningPredicate will attempt to cast these values to the correct type:

// cast statistics array to required data type (e.g. parquet
// provides timestamp statistics as "Int64")
let array = arrow::compute::cast(&array, data_type)?;

However, in order to be efficient and ensure the cast kernel doesn't add anything incorrectly, we should be extracting the parquet statistics as the correct Array type directly. It turns out we do not do this yet for several types and those types do not have good (or any) test coverage. We almost missed this in #10711 in @xinlifoobar

Thus, we need to add support and tests for other types

Describe the solution you'd like

  1. Add a new test to arrow_stastics.rs (run this with cargo test --test parquet_exec) with the relevant type
  2. Potentially add a new case to the match here
    macro_rules! get_statistic {
    ($column_statistics:expr, $func:ident, $bytes_func:ident, $target_arrow_type:expr) => {{
    if !$column_statistics.has_min_max_set() {
    return None;
    }
    match $column_statistics {
    ParquetStatistics::Boolean(s) => Some(ScalarValue::Boolean(Some(*s.$func()))),
    ParquetStatistics::Int32(s) => {
    match $target_arrow_type {
    // int32 to decimal with the precision and scale
    Some(DataType::Decimal128(precision, scale)) => {
    Some(ScalarValue::Decimal128(
    Some(*s.$func() as i128),
    *precision,
    *scale,
    ))
    }
    Some(DataType::Int8) => {
    Some(ScalarValue::Int8(Some((*s.$func()).try_into().unwrap())))
    }
    Some(DataType::Int16) => {
    Some(ScalarValue::Int16(Some((*s.$func()).try_into().unwrap())))
    }
    Some(DataType::UInt8) => {
    Some(ScalarValue::UInt8(Some((*s.$func()).try_into().unwrap())))
    }
    Some(DataType::UInt16) => {
    Some(ScalarValue::UInt16(Some((*s.$func()).try_into().unwrap())))
    }
    Some(DataType::UInt32) => {
    Some(ScalarValue::UInt32(Some((*s.$func()) as u32)))
    }
    Some(DataType::Date32) => {
    Some(ScalarValue::Date32(Some(*s.$func())))
    }
    Some(DataType::Date64) => {
    Some(ScalarValue::Date64(Some(i64::from(*s.$func()) * 24 * 60 * 60 * 1000)))
    }
    _ => Some(ScalarValue::Int32(Some(*s.$func()))),
    }
    }
    ParquetStatistics::Int64(s) => {
    match $target_arrow_type {
    // int64 to decimal with the precision and scale
    Some(DataType::Decimal128(precision, scale)) => {
    Some(ScalarValue::Decimal128(
    Some(*s.$func() as i128),
    *precision,
    *scale,
    ))
    }
    Some(DataType::UInt64) => {
    Some(ScalarValue::UInt64(Some((*s.$func()) as u64)))
    }
    _ => Some(ScalarValue::Int64(Some(*s.$func()))),
    }
    }
    // 96 bit ints not supported
    ParquetStatistics::Int96(_) => None,
    ParquetStatistics::Float(s) => Some(ScalarValue::Float32(Some(*s.$func()))),
    ParquetStatistics::Double(s) => Some(ScalarValue::Float64(Some(*s.$func()))),
    ParquetStatistics::ByteArray(s) => {
    match $target_arrow_type {
    // decimal data type
    Some(DataType::Decimal128(precision, scale)) => {
    Some(ScalarValue::Decimal128(
    Some(from_bytes_to_i128(s.$bytes_func())),
    *precision,
    *scale,
    ))
    }
    Some(DataType::Binary) => {
    Some(ScalarValue::Binary(Some(s.$bytes_func().to_vec())))
    }
    _ => {
    let s = std::str::from_utf8(s.$bytes_func())
    .map(|s| s.to_string())
    .ok();
    if s.is_none() {
    log::debug!(
    "Utf8 statistics is a non-UTF8 value, ignoring it."
    );
    }
    Some(ScalarValue::Utf8(s))
    }
    }
    }
    // type not fully supported yet
    ParquetStatistics::FixedLenByteArray(s) => {
    match $target_arrow_type {
    // just support specific logical data types, there are others each
    // with their own ordering
    Some(DataType::Decimal128(precision, scale)) => {
    Some(ScalarValue::Decimal128(
    Some(from_bytes_to_i128(s.$bytes_func())),
    *precision,
    *scale,
    ))
    }
    Some(DataType::FixedSizeBinary(size)) => {
    let value = s.$bytes_func().to_vec();
    let value = if value.len().try_into() == Ok(*size) {
    Some(value)
    } else {
    log::debug!(
    "FixedSizeBinary({}) statistics is a binary of size {}, ignoring it.",
    size,
    value.len(),
    );
    None
    };
    Some(ScalarValue::FixedSizeBinary(
    *size,
    value,
    ))
    }
    _ => None,
    }
    }
    }
    }};
    }
    to get the correct types

Here are some example PRs:

  1. Minor: Add tests for extracting dictionary parquet statistics #10729
  2. Extract Date32 parquet statistics as Date32Array rather than Int32Array #10593

Describe alternatives you've considered

No response

Additional context

No response

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions