|
19 | 19 |
|
20 | 20 | // TODO: potentially move this to arrow-rs: https://github.com/apache/arrow-rs/issues/4328
|
21 | 21 |
|
22 |
| -use arrow::array::StringBuilder; |
| 22 | +use arrow::array::{FixedSizeBinaryBuilder, LargeStringBuilder, StringBuilder}; |
23 | 23 | use arrow::datatypes::i256;
|
24 | 24 | use arrow::{array::ArrayRef, datatypes::DataType};
|
25 | 25 | use arrow_array::{
|
26 | 26 | new_empty_array, new_null_array, BinaryArray, BooleanArray, Date32Array, Date64Array,
|
27 | 27 | Decimal128Array, Decimal256Array, FixedSizeBinaryArray, Float16Array, Float32Array,
|
28 | 28 | Float64Array, Int16Array, Int32Array, Int64Array, Int8Array, LargeBinaryArray,
|
29 |
| - LargeStringArray, StringArray, Time32MillisecondArray, Time32SecondArray, |
30 |
| - Time64MicrosecondArray, Time64NanosecondArray, TimestampMicrosecondArray, |
31 |
| - TimestampMillisecondArray, TimestampNanosecondArray, TimestampSecondArray, |
32 |
| - UInt16Array, UInt32Array, UInt64Array, UInt8Array, |
| 29 | + LargeStringArray, Time32MillisecondArray, Time32SecondArray, Time64MicrosecondArray, |
| 30 | + Time64NanosecondArray, TimestampMicrosecondArray, TimestampMillisecondArray, |
| 31 | + TimestampNanosecondArray, TimestampSecondArray, UInt16Array, UInt32Array, |
| 32 | + UInt64Array, UInt8Array, |
33 | 33 | };
|
34 | 34 | use arrow_schema::{Field, FieldRef, Schema, TimeUnit};
|
35 | 35 | use datafusion_common::{internal_datafusion_err, internal_err, plan_err, Result};
|
@@ -398,46 +398,67 @@ macro_rules! get_statistics {
|
398 | 398 | DataType::LargeBinary => Ok(Arc::new(LargeBinaryArray::from_iter(
|
399 | 399 | [<$stat_type_prefix ByteArrayStatsIterator>]::new($iterator).map(|x| x.map(|x|x.to_vec())),
|
400 | 400 | ))),
|
401 |
| - DataType::Utf8 => Ok(Arc::new(StringArray::from_iter( |
402 |
| - [<$stat_type_prefix ByteArrayStatsIterator>]::new($iterator).map(|x| { |
403 |
| - x.and_then(|x| { |
404 |
| - let res = std::str::from_utf8(x).map(|s| s.to_string()).ok(); |
405 |
| - if res.is_none() { |
406 |
| - log::debug!("Utf8 statistics is a non-UTF8 value, ignoring it."); |
407 |
| - } |
408 |
| - res |
409 |
| - }) |
410 |
| - }), |
411 |
| - ))), |
| 401 | + DataType::Utf8 => { |
| 402 | + let iterator = [<$stat_type_prefix ByteArrayStatsIterator>]::new($iterator); |
| 403 | + let mut builder = StringBuilder::new(); |
| 404 | + for x in iterator { |
| 405 | + let Some(x) = x else { |
| 406 | + builder.append_null(); // no statistics value |
| 407 | + continue; |
| 408 | + }; |
| 409 | + |
| 410 | + let Ok(x) = std::str::from_utf8(x) else { |
| 411 | + log::debug!("Utf8 statistics is a non-UTF8 value, ignoring it."); |
| 412 | + builder.append_null(); |
| 413 | + continue; |
| 414 | + }; |
| 415 | + |
| 416 | + builder.append_value(x); |
| 417 | + } |
| 418 | + Ok(Arc::new(builder.finish())) |
| 419 | + }, |
412 | 420 | DataType::LargeUtf8 => {
|
413 |
| - Ok(Arc::new(LargeStringArray::from_iter( |
414 |
| - [<$stat_type_prefix ByteArrayStatsIterator>]::new($iterator).map(|x| { |
415 |
| - x.and_then(|x| { |
416 |
| - let res = std::str::from_utf8(x).map(|s| s.to_string()).ok(); |
417 |
| - if res.is_none() { |
418 |
| - log::debug!("LargeUtf8 statistics is a non-UTF8 value, ignoring it."); |
419 |
| - } |
420 |
| - res |
421 |
| - }) |
422 |
| - }), |
423 |
| - ))) |
| 421 | + let iterator = [<$stat_type_prefix ByteArrayStatsIterator>]::new($iterator); |
| 422 | + let mut builder = LargeStringBuilder::new(); |
| 423 | + for x in iterator { |
| 424 | + let Some(x) = x else { |
| 425 | + builder.append_null(); // no statistics value |
| 426 | + continue; |
| 427 | + }; |
| 428 | + |
| 429 | + let Ok(x) = std::str::from_utf8(x) else { |
| 430 | + log::debug!("Utf8 statistics is a non-UTF8 value, ignoring it."); |
| 431 | + builder.append_null(); |
| 432 | + continue; |
| 433 | + }; |
| 434 | + |
| 435 | + builder.append_value(x); |
| 436 | + } |
| 437 | + Ok(Arc::new(builder.finish())) |
| 438 | + } |
| 439 | + DataType::FixedSizeBinary(size) => { |
| 440 | + let iterator = MaxFixedLenByteArrayStatsIterator::new($iterator); |
| 441 | + let mut builder = FixedSizeBinaryBuilder::new(*size); |
| 442 | + for x in iterator { |
| 443 | + let Some(x) = x else { |
| 444 | + builder.append_null(); // no statistics value |
| 445 | + continue; |
| 446 | + }; |
| 447 | + |
| 448 | + if x.len().try_into() != Ok(*size){ |
| 449 | + log::debug!( |
| 450 | + "FixedSizeBinary({}) statistics is a binary of size {}, ignoring it.", |
| 451 | + size, |
| 452 | + x.len(), |
| 453 | + ); |
| 454 | + builder.append_null(); // no statistics value |
| 455 | + continue; |
| 456 | + } |
| 457 | + |
| 458 | + let _ = builder.append_value(x); |
| 459 | + } |
| 460 | + Ok(Arc::new(builder.finish())) |
424 | 461 | }
|
425 |
| - DataType::FixedSizeBinary(size) => Ok(Arc::new(FixedSizeBinaryArray::from( |
426 |
| - [<$stat_type_prefix FixedLenByteArrayStatsIterator>]::new($iterator).map(|x| { |
427 |
| - x.and_then(|x| { |
428 |
| - if x.len().try_into() == Ok(*size) { |
429 |
| - Some(x) |
430 |
| - } else { |
431 |
| - log::debug!( |
432 |
| - "FixedSizeBinary({}) statistics is a binary of size {}, ignoring it.", |
433 |
| - size, |
434 |
| - x.len(), |
435 |
| - ); |
436 |
| - None |
437 |
| - } |
438 |
| - }) |
439 |
| - }).collect::<Vec<_>>(), |
440 |
| - ))), |
441 | 462 | DataType::Decimal128(precision, scale) => {
|
442 | 463 | let arr = Decimal128Array::from_iter(
|
443 | 464 | [<$stat_type_prefix Decimal128StatsIterator>]::new($iterator)
|
@@ -999,6 +1020,153 @@ fn max_statistics<'a, I: Iterator<Item = Option<&'a ParquetStatistics>>>(
|
999 | 1020 | get_statistics!(Max, data_type, iterator)
|
1000 | 1021 | }
|
1001 | 1022 |
|
| 1023 | +// fn max_statistics<'a, I: Iterator<Item = Option<&'a ParquetStatistics>>>( |
| 1024 | +// data_type: &DataType, |
| 1025 | +// iterator: I, |
| 1026 | +// ) -> Result<ArrayRef> { |
| 1027 | +// match data_type { |
| 1028 | +// DataType::Boolean => Ok(Arc::new(BooleanArray::from_iter(MaxBooleanStatsIterator::new(iterator).map(|x|x.copied()),))), |
| 1029 | +// DataType::Int8 => Ok(Arc::new(Int8Array::from_iter(MaxInt32StatsIterator::new(iterator).map(|x|{ |
| 1030 | +// x.and_then(|x|i8::try_from(*x).ok()) |
| 1031 | +// }),))), |
| 1032 | +// DataType::Int16 => Ok(Arc::new(Int16Array::from_iter(MaxInt32StatsIterator::new(iterator).map(|x|{ |
| 1033 | +// x.and_then(|x|i16::try_from(*x).ok()) |
| 1034 | +// }),))), |
| 1035 | +// DataType::Int32 => Ok(Arc::new(Int32Array::from_iter(MaxInt32StatsIterator::new(iterator).map(|x|x.copied()),))), |
| 1036 | +// DataType::Int64 => Ok(Arc::new(Int64Array::from_iter(MaxInt64StatsIterator::new(iterator).map(|x|x.copied()),))), |
| 1037 | +// DataType::UInt8 => Ok(Arc::new(UInt8Array::from_iter(MaxInt32StatsIterator::new(iterator).map(|x|{ |
| 1038 | +// x.and_then(|x|u8::try_from(*x).ok()) |
| 1039 | +// }),))), |
| 1040 | +// DataType::UInt16 => Ok(Arc::new(UInt16Array::from_iter(MaxInt32StatsIterator::new(iterator).map(|x|{ |
| 1041 | +// x.and_then(|x|u16::try_from(*x).ok()) |
| 1042 | +// }),))), |
| 1043 | +// DataType::UInt32 => Ok(Arc::new(UInt32Array::from_iter(MaxInt32StatsIterator::new(iterator).map(|x|x.map(|x| *x as u32)),))), |
| 1044 | +// DataType::UInt64 => Ok(Arc::new(UInt64Array::from_iter(MaxInt64StatsIterator::new(iterator).map(|x|x.map(|x| *x as u64)),))), |
| 1045 | +// DataType::Float16 => Ok(Arc::new(Float16Array::from_iter(MaxFixedLenByteArrayStatsIterator::new(iterator).map(|x|x.and_then(|x|{ |
| 1046 | +// from_bytes_to_f16(x) |
| 1047 | +// })),))), |
| 1048 | +// DataType::Float32 => Ok(Arc::new(Float32Array::from_iter(MaxFloatStatsIterator::new(iterator).map(|x|x.copied()),))), |
| 1049 | +// DataType::Float64 => Ok(Arc::new(Float64Array::from_iter(MaxDoubleStatsIterator::new(iterator).map(|x|x.copied()),))), |
| 1050 | +// DataType::Date32 => Ok(Arc::new(Date32Array::from_iter(MaxInt32StatsIterator::new(iterator).map(|x|x.copied()),))), |
| 1051 | +// DataType::Date64 => Ok(Arc::new(Date64Array::from_iter(MaxInt32StatsIterator::new(iterator).map(|x|x.map(|x|i64::from(*x)*24*60*60*1000)),))), |
| 1052 | +// DataType::Timestamp(unit,timezone) => { |
| 1053 | +// let iter = MaxInt64StatsIterator::new(iterator).map(|x|x.copied()); |
| 1054 | +// Ok(match unit { |
| 1055 | +// TimeUnit::Second => Arc::new(TimestampSecondArray::from_iter(iter).with_timezone_opt(timezone.clone())), |
| 1056 | +// TimeUnit::Millisecond => Arc::new(TimestampMillisecondArray::from_iter(iter).with_timezone_opt(timezone.clone())), |
| 1057 | +// TimeUnit::Microsecond => Arc::new(TimestampMicrosecondArray::from_iter(iter).with_timezone_opt(timezone.clone())), |
| 1058 | +// TimeUnit::Nanosecond => Arc::new(TimestampNanosecondArray::from_iter(iter).with_timezone_opt(timezone.clone())), |
| 1059 | +// }) |
| 1060 | +// }, |
| 1061 | +// DataType::Time32(unit) => { |
| 1062 | +// Ok(match unit { |
| 1063 | +// TimeUnit::Second => Arc::new(Time32SecondArray::from_iter(MaxInt32StatsIterator::new(iterator).map(|x|x.copied()),)), |
| 1064 | +// TimeUnit::Millisecond => Arc::new(Time32MillisecondArray::from_iter(MaxInt32StatsIterator::new(iterator).map(|x|x.copied()),)), |
| 1065 | +// _ => { |
| 1066 | +// let len = iterator.count(); |
| 1067 | +// new_null_array(data_type,len) |
| 1068 | +// } |
| 1069 | +// }) |
| 1070 | +// }, |
| 1071 | +// DataType::Time64(unit) => { |
| 1072 | +// Ok(match unit { |
| 1073 | +// TimeUnit::Microsecond => Arc::new(Time64MicrosecondArray::from_iter(MaxInt64StatsIterator::new(iterator).map(|x|x.copied()),)), |
| 1074 | +// TimeUnit::Nanosecond => Arc::new(Time64NanosecondArray::from_iter(MaxInt64StatsIterator::new(iterator).map(|x|x.copied()),)), |
| 1075 | +// _ => { |
| 1076 | +// let len = iterator.count(); |
| 1077 | +// new_null_array(data_type,len) |
| 1078 | +// } |
| 1079 | +// }) |
| 1080 | +// }, |
| 1081 | +// DataType::Binary => Ok(Arc::new(BinaryArray::from_iter(MaxByteArrayStatsIterator::new(iterator)))), |
| 1082 | +// DataType::LargeBinary => Ok(Arc::new(LargeBinaryArray::from_iter(MaxByteArrayStatsIterator::new(iterator)))), |
| 1083 | +// DataType::Utf8 => { |
| 1084 | +// let iterator = MaxByteArrayStatsIterator::new(iterator); |
| 1085 | +// let mut builder = StringBuilder::new(); |
| 1086 | +// for x in iterator { |
| 1087 | +// let Some(x) = x else { |
| 1088 | +// builder.append_null(); // no statistics value |
| 1089 | +// continue; |
| 1090 | +// }; |
| 1091 | + |
| 1092 | +// let Ok(x) = std::str::from_utf8(x) else { |
| 1093 | +// log::debug!("Utf8 statistics is a non-UTF8 value, ignoring it."); |
| 1094 | +// builder.append_null(); |
| 1095 | +// continue; |
| 1096 | +// }; |
| 1097 | + |
| 1098 | +// builder.append_value(x); |
| 1099 | +// } |
| 1100 | +// Ok(Arc::new(builder.finish())) |
| 1101 | +// }, |
| 1102 | +// DataType::LargeUtf8 => { |
| 1103 | +// Ok(Arc::new(LargeStringArray::from_iter(MaxByteArrayStatsIterator::new(iterator).map(|x|{ |
| 1104 | +// x.and_then(|x|{ |
| 1105 | +// let res = std::str::from_utf8(x).map(|s|s.to_string()).ok(); |
| 1106 | +// if res.is_none() { |
| 1107 | +// log::debug!("LargeUtf8 statistics is a non-UTF8 value, ignoring it."); |
| 1108 | +// } |
| 1109 | +// res |
| 1110 | +// }) |
| 1111 | +// }),))) |
| 1112 | +// } |
| 1113 | +// DataType::FixedSizeBinary(size) => { |
| 1114 | +// let iterator = MaxFixedLenByteArrayStatsIterator::new(iterator); |
| 1115 | +// let mut builder = FixedSizeBinaryBuilder::new(size); |
| 1116 | +// for x in iterator { |
| 1117 | +// let Some(x) = x else { |
| 1118 | +// builder.append_null(); // no statistics value |
| 1119 | +// continue; |
| 1120 | +// }; |
| 1121 | + |
| 1122 | +// if x.len().try_into() != Ok(*size){ |
| 1123 | +// log::debug!( |
| 1124 | +// "FixedSizeBinary({}) statistics is a binary of size {}, ignoring it.", |
| 1125 | +// size, |
| 1126 | +// x.len(), |
| 1127 | +// ); |
| 1128 | +// builder.append_null(); // no statistics value |
| 1129 | +// continue; |
| 1130 | +// } |
| 1131 | + |
| 1132 | +// builder.append_value(x); |
| 1133 | +// } |
| 1134 | +// Ok(Arc::new(builder.finish())) |
| 1135 | +// } |
| 1136 | + |
| 1137 | +// // Ok(Arc::new(FixedSizeBinaryArray::from(MaxFixedLenByteArrayStatsIterator::new(iterator).map(|x|{ |
| 1138 | +// // x.and_then(|x|{ |
| 1139 | +// // if x.len().try_into()==Ok(*size){ |
| 1140 | +// // Some(x) |
| 1141 | +// // }else { |
| 1142 | +// log::debug!( |
| 1143 | +// "FixedSizeBinary({}) statistics is a binary of size {}, ignoring it.", |
| 1144 | +// size, |
| 1145 | +// x.len(), |
| 1146 | +// ); |
| 1147 | +// // None |
| 1148 | +// // } |
| 1149 | +// // }) |
| 1150 | +// // }).collect::<Vec<_>>(),))), |
| 1151 | +// DataType::Decimal128(precision,scale) => { |
| 1152 | +// let arr = Decimal128Array::from_iter(MaxDecimal128StatsIterator::new(iterator)).with_precision_and_scale(*precision, *scale)?; |
| 1153 | +// Ok(Arc::new(arr)) |
| 1154 | +// }, |
| 1155 | +// DataType::Decimal256(precision,scale) => { |
| 1156 | +// let arr = Decimal256Array::from_iter(MaxDecimal256StatsIterator::new(iterator)).with_precision_and_scale(*precision, *scale)?; |
| 1157 | +// Ok(Arc::new(arr)) |
| 1158 | +// }, |
| 1159 | +// DataType::Dictionary(_,value_type) => { |
| 1160 | +// max_statistics(value_type,iterator) |
| 1161 | +// } |
| 1162 | +// DataType::Map(_,_)|DataType::Duration(_)|DataType::Interval(_)|DataType::Null|DataType::BinaryView|DataType::Utf8View|DataType::List(_)|DataType::ListView(_)|DataType::FixedSizeList(_,_)|DataType::LargeList(_)|DataType::LargeListView(_)|DataType::Struct(_)|DataType::Union(_,_)|DataType::RunEndEncoded(_,_) => { |
| 1163 | +// let len = iterator.count(); |
| 1164 | +// Ok(new_null_array(data_type,len)) |
| 1165 | +// } |
| 1166 | +// } |
| 1167 | +// } |
| 1168 | + |
| 1169 | + |
1002 | 1170 | /// Extracts the min statistics from an iterator
|
1003 | 1171 | /// of parquet page [`Index`]'es to an [`ArrayRef`]
|
1004 | 1172 | pub(crate) fn min_page_statistics<'a, I>(
|
|
0 commit comments