From b0e3f7d0ee6f86ea03aa478440265374bbffb65e Mon Sep 17 00:00:00 2001 From: Adam Gutglick Date: Fri, 6 Dec 2024 10:48:51 -0500 Subject: [PATCH 1/3] Add StatisticsConverter::row_group_uncompressed_size --- parquet/src/arrow/arrow_reader/statistics.rs | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/parquet/src/arrow/arrow_reader/statistics.rs b/parquet/src/arrow/arrow_reader/statistics.rs index 8a7511be2afe..dc64de6616a5 100644 --- a/parquet/src/arrow/arrow_reader/statistics.rs +++ b/parquet/src/arrow/arrow_reader/statistics.rs @@ -1414,6 +1414,24 @@ impl<'a> StatisticsConverter<'a> { Ok(UInt64Array::from_iter(null_counts)) } + /// Extract the uncompressed sizes from row group statistics in [`RowGroupMetaData`] + pub fn row_group_uncompressed_size(&self, metadatas: I) -> Result + where + I: IntoIterator, + { + let Some(parquet_index) = self.parquet_column_index else { + let num_row_groups = metadatas.into_iter().count(); + return Ok(UInt64Array::from_iter( + std::iter::repeat(None).take(num_row_groups), + )); + }; + + let uncompressed_sizes = metadatas + .into_iter() + .map(|x| x.column(parquet_index).uncompressed_size() as u64); + Ok(UInt64Array::from_iter_values(uncompressed_sizes)) + } + /// Extract the minimum values from Data Page statistics.
/// /// In Parquet files, in addition to the Column Chunk level statistics From 0c0c40e0d42082ab915622850a2d3968e53e35ef Mon Sep 17 00:00:00 2001 From: Adam Gutglick Date: Fri, 6 Dec 2024 14:51:16 -0500 Subject: [PATCH 2/3] Add expected uncompressed sizes to statistics tests --- parquet/tests/arrow_reader/statistics.rs | 86 ++++++++++++++++++++++++ 1 file changed, 86 insertions(+) diff --git a/parquet/tests/arrow_reader/statistics.rs b/parquet/tests/arrow_reader/statistics.rs index 0eb0fc2b277f..0037cb74e549 100644 --- a/parquet/tests/arrow_reader/statistics.rs +++ b/parquet/tests/arrow_reader/statistics.rs @@ -212,6 +212,7 @@ struct Test<'a> { expected_max: ArrayRef, expected_null_counts: UInt64Array, expected_row_counts: Option, + expected_uncompressed_size: UInt64Array, /// Which column to extract statistics from column_name: &'static str, /// What statistics should be checked? @@ -245,6 +246,7 @@ impl Test<'_> { expected_max, expected_null_counts, expected_row_counts, + expected_uncompressed_size, column_name, check, } = self; @@ -328,6 +330,13 @@ impl Test<'_> { "{column_name}: Mismatch with expected row counts. \ Actual: {row_counts:?}. Expected: {expected_row_counts:?}" ); + + let uncompressed_size = converter.row_group_uncompressed_size(row_groups).unwrap(); + assert_eq!( + &uncompressed_size, &expected_uncompressed_size, + "{column_name}: Mismatch with expected uncompressed size. \ + Actual: {uncompressed_size:?}.
Expected: {expected_uncompressed_size:?}" + ); } } @@ -377,6 +386,7 @@ async fn test_one_row_group_without_null() { expected_null_counts: UInt64Array::from(vec![0]), // 3 rows expected_row_counts: Some(UInt64Array::from(vec![3])), + expected_uncompressed_size: UInt64Array::from(vec![91]), column_name: "i64", check: Check::Both, } @@ -404,6 +414,7 @@ async fn test_one_row_group_with_null_and_negative() { expected_null_counts: UInt64Array::from(vec![2]), // 8 rows expected_row_counts: Some(UInt64Array::from(vec![8])), + expected_uncompressed_size: UInt64Array::from(vec![116]), column_name: "i64", check: Check::Both, } @@ -431,6 +442,7 @@ async fn test_two_row_group_with_null() { expected_null_counts: UInt64Array::from(vec![0, 2]), // row counts are [10, 5] expected_row_counts: Some(UInt64Array::from(vec![10, 5])), + expected_uncompressed_size: UInt64Array::from(vec![155, 91]), column_name: "i64", check: Check::Both, } @@ -458,6 +470,7 @@ async fn test_two_row_groups_with_all_nulls_in_one() { expected_null_counts: UInt64Array::from(vec![1, 3]), // row counts are [5, 3] expected_row_counts: Some(UInt64Array::from(vec![5, 3])), + expected_uncompressed_size: UInt64Array::from(vec![99, 38]), column_name: "i64", check: Check::Both, } @@ -489,6 +502,7 @@ async fn test_multiple_data_pages_nulls_and_negatives() { expected_max: Arc::new(Int64Array::from(vec![Some(2), Some(6), Some(9), None])), expected_null_counts: UInt64Array::from(vec![0, 0, 1, 2]), expected_row_counts: Some(UInt64Array::from(vec![4, 4, 4, 2])), + expected_uncompressed_size: UInt64Array::from(vec![32, 32, 32, 16]), column_name: "i64", check: Check::DataPage, } @@ -551,6 +565,7 @@ async fn test_data_page_stats_with_all_null_page() { expected_max: new_null_array(expected_data_type, 1), expected_null_counts: UInt64Array::from(vec![4]), expected_row_counts: Some(UInt64Array::from(vec![4])), + expected_uncompressed_size: UInt64Array::from(vec![4 * data_type.size() as u64]), column_name: "col", check: 
Check::DataPage, } @@ -585,6 +600,7 @@ async fn test_int_64() { expected_null_counts: UInt64Array::from(vec![0, 0, 0, 0]), // row counts are [5, 5, 5, 5] expected_row_counts: Some(UInt64Array::from(vec![5, 5, 5, 5])), + expected_uncompressed_size: UInt64Array::from(vec![108, 108, 108, 108]), column_name: "i64", check: Check::Both, } @@ -611,6 +627,7 @@ async fn test_int_32() { expected_null_counts: UInt64Array::from(vec![0, 0, 0, 0]), // row counts are [5, 5, 5, 5] expected_row_counts: Some(UInt64Array::from(vec![5, 5, 5, 5])), + expected_uncompressed_size: UInt64Array::from(vec![80, 80, 80, 80]), column_name: "i32", check: Check::Both, } @@ -637,6 +654,7 @@ async fn test_int_16() { expected_null_counts: UInt64Array::from(vec![0, 0, 0, 0]), // row counts are [5, 5, 5, 5] expected_row_counts: Some(UInt64Array::from(vec![5, 5, 5, 5])), + expected_uncompressed_size: UInt64Array::from(vec![80, 80, 80, 80]), column_name: "i16", check: Check::Both, } @@ -663,6 +681,7 @@ async fn test_int_8() { expected_null_counts: UInt64Array::from(vec![0, 0, 0, 0]), // row counts are [5, 5, 5, 5] expected_row_counts: Some(UInt64Array::from(vec![5, 5, 5, 5])), + expected_uncompressed_size: UInt64Array::from(vec![80, 80, 80, 80]), column_name: "i8", check: Check::Both, } @@ -699,6 +718,7 @@ async fn test_float_16() { expected_null_counts: UInt64Array::from(vec![0, 0, 0, 0]), // row counts are [5, 5, 5, 5] expected_row_counts: Some(UInt64Array::from(vec![5, 5, 5, 5])), + expected_uncompressed_size: UInt64Array::from(vec![47, 47, 47, 47]), column_name: "f", check: Check::Both, } @@ -725,6 +745,7 @@ async fn test_float_32() { expected_null_counts: UInt64Array::from(vec![0, 0, 0, 0]), // row counts are [5, 5, 5, 5] expected_row_counts: Some(UInt64Array::from(vec![5, 5, 5, 5])), + expected_uncompressed_size: UInt64Array::from(vec![80, 80, 80, 80]), column_name: "f", check: Check::Both, } @@ -751,6 +772,7 @@ async fn test_float_64() { expected_null_counts: UInt64Array::from(vec![0, 0, 0, 0]), 
// row counts are [5, 5, 5, 5] expected_row_counts: Some(UInt64Array::from(vec![5, 5, 5, 5])), + expected_uncompressed_size: UInt64Array::from(vec![108, 108, 108, 108]), column_name: "f", check: Check::Both, } @@ -801,6 +823,7 @@ async fn test_timestamp() { expected_null_counts: UInt64Array::from(vec![1, 1, 1, 1]), // row counts are [5, 5, 5, 5] expected_row_counts: Some(UInt64Array::from(vec![5, 5, 5, 5])), + expected_uncompressed_size: UInt64Array::from(vec![99, 99, 99, 99]), column_name: "nanos", check: Check::Both, } @@ -830,6 +853,7 @@ async fn test_timestamp() { expected_null_counts: UInt64Array::from(vec![1, 1, 1, 1]), // row counts are [5, 5, 5, 5] expected_row_counts: Some(UInt64Array::from(vec![5, 5, 5, 5])), + expected_uncompressed_size: UInt64Array::from(vec![99, 99, 99, 99]), column_name: "nanos_timezoned", check: Check::Both, } @@ -852,6 +876,7 @@ async fn test_timestamp() { ])), expected_null_counts: UInt64Array::from(vec![1, 1, 1, 1]), expected_row_counts: Some(UInt64Array::from(vec![5, 5, 5, 5])), + expected_uncompressed_size: UInt64Array::from(vec![99, 99, 99, 99]), column_name: "micros", check: Check::Both, } @@ -881,6 +906,7 @@ async fn test_timestamp() { expected_null_counts: UInt64Array::from(vec![1, 1, 1, 1]), // row counts are [5, 5, 5, 5] expected_row_counts: Some(UInt64Array::from(vec![5, 5, 5, 5])), + expected_uncompressed_size: UInt64Array::from(vec![99, 99, 99, 99]), column_name: "micros_timezoned", check: Check::Both, } @@ -903,6 +929,7 @@ async fn test_timestamp() { ])), expected_null_counts: UInt64Array::from(vec![1, 1, 1, 1]), expected_row_counts: Some(UInt64Array::from(vec![5, 5, 5, 5])), + expected_uncompressed_size: UInt64Array::from(vec![99, 99, 99, 99]), column_name: "millis", check: Check::Both, } @@ -932,6 +959,7 @@ async fn test_timestamp() { expected_null_counts: UInt64Array::from(vec![1, 1, 1, 1]), // row counts are [5, 5, 5, 5] expected_row_counts: Some(UInt64Array::from(vec![5, 5, 5, 5])), + expected_uncompressed_size: 
UInt64Array::from(vec![99, 99, 99, 99]), column_name: "millis_timezoned", check: Check::Both, } @@ -954,6 +982,7 @@ async fn test_timestamp() { ])), expected_null_counts: UInt64Array::from(vec![1, 1, 1, 1]), expected_row_counts: Some(UInt64Array::from(vec![5, 5, 5, 5])), + expected_uncompressed_size: UInt64Array::from(vec![99, 99, 99, 99]), column_name: "seconds", check: Check::Both, } @@ -983,6 +1012,7 @@ async fn test_timestamp() { expected_null_counts: UInt64Array::from(vec![1, 1, 1, 1]), // row counts are [5, 5, 5, 5] expected_row_counts: Some(UInt64Array::from(vec![5, 5, 5, 5])), + expected_uncompressed_size: UInt64Array::from(vec![99, 99, 99, 99]), column_name: "seconds_timezoned", check: Check::Both, } @@ -1029,6 +1059,7 @@ async fn test_timestamp_diff_rg_sizes() { expected_null_counts: UInt64Array::from(vec![1, 2, 1]), // row counts are [8, 8, 4] expected_row_counts: Some(UInt64Array::from(vec![8, 8, 4])), + expected_uncompressed_size: UInt64Array::from(vec![124, 116, 91]), column_name: "nanos", check: Check::Both, } @@ -1056,6 +1087,7 @@ async fn test_timestamp_diff_rg_sizes() { expected_null_counts: UInt64Array::from(vec![1, 2, 1]), // row counts are [8, 8, 4] expected_row_counts: Some(UInt64Array::from(vec![8, 8, 4])), + expected_uncompressed_size: UInt64Array::from(vec![124, 116, 91]), column_name: "nanos_timezoned", check: Check::Both, } @@ -1076,6 +1108,7 @@ async fn test_timestamp_diff_rg_sizes() { ])), expected_null_counts: UInt64Array::from(vec![1, 2, 1]), expected_row_counts: Some(UInt64Array::from(vec![8, 8, 4])), + expected_uncompressed_size: UInt64Array::from(vec![124, 116, 91]), column_name: "micros", check: Check::Both, } @@ -1103,6 +1136,7 @@ async fn test_timestamp_diff_rg_sizes() { expected_null_counts: UInt64Array::from(vec![1, 2, 1]), // row counts are [8, 8, 4] expected_row_counts: Some(UInt64Array::from(vec![8, 8, 4])), + expected_uncompressed_size: UInt64Array::from(vec![124, 116, 91]), column_name: "micros_timezoned", check: 
Check::Both, } @@ -1123,6 +1157,7 @@ async fn test_timestamp_diff_rg_sizes() { ])), expected_null_counts: UInt64Array::from(vec![1, 2, 1]), expected_row_counts: Some(UInt64Array::from(vec![8, 8, 4])), + expected_uncompressed_size: UInt64Array::from(vec![124, 116, 91]), column_name: "millis", check: Check::Both, } @@ -1150,6 +1185,7 @@ async fn test_timestamp_diff_rg_sizes() { expected_null_counts: UInt64Array::from(vec![1, 2, 1]), // row counts are [8, 8, 4] expected_row_counts: Some(UInt64Array::from(vec![8, 8, 4])), + expected_uncompressed_size: UInt64Array::from(vec![124, 116, 91]), column_name: "millis_timezoned", check: Check::Both, } @@ -1170,6 +1206,7 @@ async fn test_timestamp_diff_rg_sizes() { ])), expected_null_counts: UInt64Array::from(vec![1, 2, 1]), expected_row_counts: Some(UInt64Array::from(vec![8, 8, 4])), + expected_uncompressed_size: UInt64Array::from(vec![124, 116, 91]), column_name: "seconds", check: Check::Both, } @@ -1197,6 +1234,7 @@ async fn test_timestamp_diff_rg_sizes() { expected_null_counts: UInt64Array::from(vec![1, 2, 1]), // row counts are [8, 8, 4] expected_row_counts: Some(UInt64Array::from(vec![8, 8, 4])), + expected_uncompressed_size: UInt64Array::from(vec![124, 116, 91]), column_name: "seconds_timezoned", check: Check::Both, } @@ -1235,6 +1273,7 @@ async fn test_dates_32_diff_rg_sizes() { expected_null_counts: UInt64Array::from(vec![2, 2]), // row counts are [13, 7] expected_row_counts: Some(UInt64Array::from(vec![13, 7])), + expected_uncompressed_size: UInt64Array::from(vec![110, 80]), column_name: "date32", check: Check::Both, } @@ -1258,6 +1297,7 @@ async fn test_time32_second_diff_rg_sizes() { expected_max: Arc::new(Time32SecondArray::from(vec![18509, 18513, 18517, 18521])), expected_null_counts: UInt64Array::from(vec![0, 0, 0, 0]), // Assuming 1 null per row group for simplicity expected_row_counts: Some(UInt64Array::from(vec![4, 4, 4, 4])), + expected_uncompressed_size: UInt64Array::from(vec![75, 75, 75, 75]), column_name: 
"second", check: Check::Both, } @@ -1285,6 +1325,7 @@ async fn test_time32_millisecond_diff_rg_sizes() { ])), expected_null_counts: UInt64Array::from(vec![0, 0, 0, 0]), // Assuming 1 null per row group for simplicity expected_row_counts: Some(UInt64Array::from(vec![4, 4, 4, 4])), + expected_uncompressed_size: UInt64Array::from(vec![75, 75, 75, 75]), column_name: "millisecond", check: Check::Both, } @@ -1318,6 +1359,7 @@ async fn test_time64_microsecond_diff_rg_sizes() { ])), expected_null_counts: UInt64Array::from(vec![0, 0, 0, 0]), // Assuming 1 null per row group for simplicity expected_row_counts: Some(UInt64Array::from(vec![4, 4, 4, 4])), + expected_uncompressed_size: UInt64Array::from(vec![99, 99, 99, 99]), column_name: "microsecond", check: Check::Both, } @@ -1351,6 +1393,7 @@ async fn test_time64_nanosecond_diff_rg_sizes() { ])), expected_null_counts: UInt64Array::from(vec![0, 0, 0, 0]), // Assuming 1 null per row group for simplicity expected_row_counts: Some(UInt64Array::from(vec![4, 4, 4, 4])), + expected_uncompressed_size: UInt64Array::from(vec![99, 99, 99, 99]), column_name: "nanosecond", check: Check::Both, } @@ -1378,6 +1421,7 @@ async fn test_dates_64_diff_rg_sizes() { ])), expected_null_counts: UInt64Array::from(vec![2, 2]), expected_row_counts: Some(UInt64Array::from(vec![13, 7])), + expected_uncompressed_size: UInt64Array::from(vec![110, 80]), column_name: "date64", check: Check::Both, } @@ -1406,6 +1450,7 @@ async fn test_uint() { expected_max: Arc::new(UInt8Array::from(vec![3, 4, 6, 250, 254])), expected_null_counts: UInt64Array::from(vec![0, 0, 0, 0, 0]), expected_row_counts: Some(UInt64Array::from(vec![4, 4, 4, 4, 4])), + expected_uncompressed_size: UInt64Array::from(vec![75, 75, 71, 75, 75]), column_name: "u8", check: Check::Both, } @@ -1417,6 +1462,7 @@ async fn test_uint() { expected_max: Arc::new(UInt16Array::from(vec![3, 4, 6, 250, 254])), expected_null_counts: UInt64Array::from(vec![0, 0, 0, 0, 0]), expected_row_counts: 
Some(UInt64Array::from(vec![4, 4, 4, 4, 4])), + expected_uncompressed_size: UInt64Array::from(vec![75, 75, 71, 75, 75]), column_name: "u16", check: Check::Both, } @@ -1428,6 +1474,7 @@ async fn test_uint() { expected_max: Arc::new(UInt32Array::from(vec![3, 4, 6, 250, 254])), expected_null_counts: UInt64Array::from(vec![0, 0, 0, 0, 0]), expected_row_counts: Some(UInt64Array::from(vec![4, 4, 4, 4, 4])), + expected_uncompressed_size: UInt64Array::from(vec![75, 75, 71, 75, 75]), column_name: "u32", check: Check::Both, } @@ -1439,6 +1486,7 @@ async fn test_uint() { expected_max: Arc::new(UInt64Array::from(vec![3, 4, 6, 250, 254])), expected_null_counts: UInt64Array::from(vec![0, 0, 0, 0, 0]), expected_row_counts: Some(UInt64Array::from(vec![4, 4, 4, 4, 4])), + expected_uncompressed_size: UInt64Array::from(vec![99, 99, 91, 99, 99]), column_name: "u64", check: Check::Both, } @@ -1462,6 +1510,7 @@ async fn test_int32_range() { expected_max: Arc::new(Int32Array::from(vec![300000])), expected_null_counts: UInt64Array::from(vec![0]), expected_row_counts: Some(UInt64Array::from(vec![4])), + expected_uncompressed_size: UInt64Array::from(vec![75]), column_name: "i", check: Check::Both, } @@ -1485,6 +1534,7 @@ async fn test_uint32_range() { expected_max: Arc::new(UInt32Array::from(vec![300000])), expected_null_counts: UInt64Array::from(vec![0]), expected_row_counts: Some(UInt64Array::from(vec![4])), + expected_uncompressed_size: UInt64Array::from(vec![75]), column_name: "u", check: Check::Both, } @@ -1507,6 +1557,7 @@ async fn test_numeric_limits_unsigned() { expected_max: Arc::new(UInt8Array::from(vec![100, u8::MAX])), expected_null_counts: UInt64Array::from(vec![0, 0]), expected_row_counts: Some(UInt64Array::from(vec![5, 2])), + expected_uncompressed_size: UInt64Array::from(vec![65, 60]), column_name: "u8", check: Check::Both, } @@ -1518,6 +1569,7 @@ async fn test_numeric_limits_unsigned() { expected_max: Arc::new(UInt16Array::from(vec![100, u16::MAX])), expected_null_counts: 
UInt64Array::from(vec![0, 0]), expected_row_counts: Some(UInt64Array::from(vec![5, 2])), + expected_uncompressed_size: UInt64Array::from(vec![65, 60]), column_name: "u16", check: Check::Both, } @@ -1529,6 +1581,7 @@ async fn test_numeric_limits_unsigned() { expected_max: Arc::new(UInt32Array::from(vec![100, u32::MAX])), expected_null_counts: UInt64Array::from(vec![0, 0]), expected_row_counts: Some(UInt64Array::from(vec![5, 2])), + expected_uncompressed_size: UInt64Array::from(vec![65, 60]), column_name: "u32", check: Check::Both, } @@ -1540,6 +1593,7 @@ async fn test_numeric_limits_unsigned() { expected_max: Arc::new(UInt64Array::from(vec![100, u64::MAX])), expected_null_counts: UInt64Array::from(vec![0, 0]), expected_row_counts: Some(UInt64Array::from(vec![5, 2])), + expected_uncompressed_size: UInt64Array::from(vec![85, 76]), column_name: "u64", check: Check::Both, } @@ -1562,6 +1616,7 @@ async fn test_numeric_limits_signed() { expected_max: Arc::new(Int8Array::from(vec![100, i8::MAX])), expected_null_counts: UInt64Array::from(vec![0, 0]), expected_row_counts: Some(UInt64Array::from(vec![5, 2])), + expected_uncompressed_size: UInt64Array::from(vec![74, 60]), column_name: "i8", check: Check::Both, } @@ -1573,6 +1628,7 @@ async fn test_numeric_limits_signed() { expected_max: Arc::new(Int16Array::from(vec![100, i16::MAX])), expected_null_counts: UInt64Array::from(vec![0, 0]), expected_row_counts: Some(UInt64Array::from(vec![5, 2])), + expected_uncompressed_size: UInt64Array::from(vec![74, 60]), column_name: "i16", check: Check::Both, } @@ -1584,6 +1640,7 @@ async fn test_numeric_limits_signed() { expected_max: Arc::new(Int32Array::from(vec![100, i32::MAX])), expected_null_counts: UInt64Array::from(vec![0, 0]), expected_row_counts: Some(UInt64Array::from(vec![5, 2])), + expected_uncompressed_size: UInt64Array::from(vec![74, 60]), column_name: "i32", check: Check::Both, } @@ -1595,6 +1652,7 @@ async fn test_numeric_limits_signed() { expected_max: 
Arc::new(Int64Array::from(vec![100, i64::MAX])), expected_null_counts: UInt64Array::from(vec![0, 0]), expected_row_counts: Some(UInt64Array::from(vec![5, 2])), + expected_uncompressed_size: UInt64Array::from(vec![102, 76]), column_name: "i64", check: Check::Both, } @@ -1617,6 +1675,7 @@ async fn test_numeric_limits_float() { expected_max: Arc::new(Float32Array::from(vec![100.0, f32::MAX])), expected_null_counts: UInt64Array::from(vec![0, 0]), expected_row_counts: Some(UInt64Array::from(vec![5, 2])), + expected_uncompressed_size: UInt64Array::from(vec![74, 60]), column_name: "f32", check: Check::Both, } @@ -1628,6 +1687,7 @@ async fn test_numeric_limits_float() { expected_max: Arc::new(Float64Array::from(vec![100.0, f64::MAX])), expected_null_counts: UInt64Array::from(vec![0, 0]), expected_row_counts: Some(UInt64Array::from(vec![5, 2])), + expected_uncompressed_size: UInt64Array::from(vec![102, 76]), column_name: "f64", check: Check::Both, } @@ -1639,6 +1699,7 @@ async fn test_numeric_limits_float() { expected_max: Arc::new(Float32Array::from(vec![100.0, -100.0])), expected_null_counts: UInt64Array::from(vec![0, 0]), expected_row_counts: Some(UInt64Array::from(vec![5, 2])), + expected_uncompressed_size: UInt64Array::from(vec![74, 60]), column_name: "f32_nan", check: Check::Both, } @@ -1650,6 +1711,7 @@ async fn test_numeric_limits_float() { expected_max: Arc::new(Float64Array::from(vec![100.0, -100.0])), expected_null_counts: UInt64Array::from(vec![0, 0]), expected_row_counts: Some(UInt64Array::from(vec![5, 2])), + expected_uncompressed_size: UInt64Array::from(vec![102, 76]), column_name: "f64_nan", check: Check::Both, } @@ -1673,6 +1735,7 @@ async fn test_float64() { expected_max: Arc::new(Float64Array::from(vec![-1.0, 0.0, 4.0, 9.0])), expected_null_counts: UInt64Array::from(vec![0, 0, 0, 0]), expected_row_counts: Some(UInt64Array::from(vec![5, 5, 5, 5])), + expected_uncompressed_size: UInt64Array::from(vec![108, 108, 108, 108]), column_name: "f", check: 
Check::Both, } @@ -1706,6 +1769,7 @@ async fn test_float16() { )), expected_null_counts: UInt64Array::from(vec![0, 0, 0, 0]), expected_row_counts: Some(UInt64Array::from(vec![5, 5, 5, 5])), + expected_uncompressed_size: UInt64Array::from(vec![47, 47, 47, 47]), column_name: "f", check: Check::Both, } @@ -1737,6 +1801,7 @@ async fn test_decimal() { ), expected_null_counts: UInt64Array::from(vec![0, 0, 0]), expected_row_counts: Some(UInt64Array::from(vec![5, 5, 5])), + expected_uncompressed_size: UInt64Array::from(vec![80, 80, 75]), column_name: "decimal_col", check: Check::Both, } @@ -1767,6 +1832,7 @@ async fn test_decimal_256() { ), expected_null_counts: UInt64Array::from(vec![0, 0, 0]), expected_row_counts: Some(UInt64Array::from(vec![5, 5, 5])), + expected_uncompressed_size: UInt64Array::from(vec![80, 80, 75]), column_name: "decimal256_col", check: Check::Both, } @@ -1787,6 +1853,7 @@ async fn test_dictionary() { expected_max: Arc::new(StringArray::from(vec!["def", "fffff"])), expected_null_counts: UInt64Array::from(vec![1, 0]), expected_row_counts: Some(UInt64Array::from(vec![5, 2])), + expected_uncompressed_size: UInt64Array::from(vec![70, 74]), column_name: "string_dict_i8", check: Check::Both, } @@ -1798,6 +1865,7 @@ async fn test_dictionary() { expected_max: Arc::new(StringArray::from(vec!["def", "fffff"])), expected_null_counts: UInt64Array::from(vec![1, 0]), expected_row_counts: Some(UInt64Array::from(vec![5, 2])), + expected_uncompressed_size: UInt64Array::from(vec![70, 74]), column_name: "string_dict_i32", check: Check::Both, } @@ -1809,6 +1877,7 @@ async fn test_dictionary() { expected_max: Arc::new(Int64Array::from(vec![0, 100])), expected_null_counts: UInt64Array::from(vec![1, 0]), expected_row_counts: Some(UInt64Array::from(vec![5, 2])), + expected_uncompressed_size: UInt64Array::from(vec![82, 82]), column_name: "int_dict_i8", check: Check::Both, } @@ -1847,6 +1916,7 @@ async fn test_byte() { ])), expected_null_counts: UInt64Array::from(vec![0, 0, 
0]), expected_row_counts: Some(UInt64Array::from(vec![5, 5, 5])), + expected_uncompressed_size: UInt64Array::from(vec![92, 68, 89]), column_name: "name", check: Check::Both, } @@ -1867,6 +1937,7 @@ async fn test_byte() { ])), expected_null_counts: UInt64Array::from(vec![0, 0, 0]), expected_row_counts: Some(UInt64Array::from(vec![5, 5, 5])), + expected_uncompressed_size: UInt64Array::from(vec![164, 157, 159]), column_name: "service_string", check: Check::Both, } @@ -1886,6 +1957,7 @@ async fn test_byte() { expected_max: Arc::new(BinaryArray::from(expected_service_binary_max_values)), expected_null_counts: UInt64Array::from(vec![0, 0, 0]), expected_row_counts: Some(UInt64Array::from(vec![5, 5, 5])), + expected_uncompressed_size: UInt64Array::from(vec![164, 157, 159]), column_name: "service_binary", check: Check::Both, } @@ -1903,6 +1975,7 @@ async fn test_byte() { expected_max: Arc::new(FixedSizeBinaryArray::try_from_iter(max_input.into_iter()).unwrap()), expected_null_counts: UInt64Array::from(vec![0, 0, 0]), expected_row_counts: Some(UInt64Array::from(vec![5, 5, 5])), + expected_uncompressed_size: UInt64Array::from(vec![54, 54, 54]), column_name: "service_fixedsize", check: Check::Both, } @@ -1924,6 +1997,7 @@ async fn test_byte() { )), expected_null_counts: UInt64Array::from(vec![0, 0, 0]), expected_row_counts: Some(UInt64Array::from(vec![5, 5, 5])), + expected_uncompressed_size: UInt64Array::from(vec![164, 157, 159]), column_name: "service_large_binary", check: Check::Both, } @@ -1957,6 +2031,7 @@ async fn test_period_in_column_names() { ])), expected_null_counts: UInt64Array::from(vec![0, 0, 0]), expected_row_counts: Some(UInt64Array::from(vec![5, 5, 5])), + expected_uncompressed_size: UInt64Array::from(vec![110, 110, 110]), column_name: "name", check: Check::Both, } @@ -1969,6 +2044,8 @@ async fn test_period_in_column_names() { expected_max: Arc::new(StringArray::from(vec!["frontend", "frontend", "backend"])), expected_null_counts: UInt64Array::from(vec![0, 0, 
0]), expected_row_counts: Some(UInt64Array::from(vec![5, 5, 5])), + expected_uncompressed_size: UInt64Array::from(vec![77, 88, 74]), + column_name: "service.name", check: Check::Both, } @@ -1993,6 +2070,7 @@ async fn test_boolean() { expected_max: Arc::new(BooleanArray::from(vec![true, false])), expected_null_counts: UInt64Array::from(vec![1, 0]), expected_row_counts: Some(UInt64Array::from(vec![5, 5])), + expected_uncompressed_size: UInt64Array::from(vec![36, 36]), column_name: "bool", check: Check::Both, } @@ -2020,6 +2098,7 @@ async fn test_struct() { expected_max: Arc::new(struct_array(vec![(Some(2), Some(8.5), Some(14.0))])), expected_null_counts: UInt64Array::from(vec![0]), expected_row_counts: Some(UInt64Array::from(vec![3])), + expected_uncompressed_size: UInt64Array::from(vec![20, 20]), column_name: "struct", check: Check::RowGroup, } @@ -2043,6 +2122,7 @@ async fn test_utf8() { expected_max: Arc::new(StringArray::from(vec!["d", "i"])), expected_null_counts: UInt64Array::from(vec![1, 0]), expected_row_counts: Some(UInt64Array::from(vec![5, 5])), + expected_uncompressed_size: UInt64Array::from(vec![73, 79]), column_name: "utf8", check: Check::Both, } @@ -2055,6 +2135,7 @@ async fn test_utf8() { expected_max: Arc::new(LargeStringArray::from(vec!["d", "i"])), expected_null_counts: UInt64Array::from(vec![1, 0]), expected_row_counts: Some(UInt64Array::from(vec![5, 5])), + expected_uncompressed_size: UInt64Array::from(vec![73, 79]), column_name: "large_utf8", check: Check::Both, } @@ -2082,6 +2163,7 @@ async fn test_utf8_view() { ])), expected_null_counts: UInt64Array::from(vec![1, 3, 0]), expected_row_counts: Some(UInt64Array::from(vec![5, 5, 5])), + expected_uncompressed_size: UInt64Array::from(vec![73, 88, 172]), column_name: "utf8_view", check: Check::Both, } @@ -2109,6 +2191,7 @@ async fn test_binary_view() { expected_max: Arc::new(BinaryViewArray::from(expected_max)), expected_null_counts: UInt64Array::from(vec![1, 3, 0]), expected_row_counts: 
Some(UInt64Array::from(vec![5, 5, 5])), + expected_uncompressed_size: UInt64Array::from(vec![73, 88, 172]), column_name: "binary_view", check: Check::Both, } @@ -2135,6 +2218,7 @@ async fn test_missing_statistics() { expected_max: Arc::new(Int64Array::from(vec![None])), expected_null_counts: UInt64Array::from(vec![None]), expected_row_counts: Some(UInt64Array::from(vec![3])), // still has row count statistics + expected_uncompressed_size: UInt64Array::from(vec![65]), // still has a size column_name: "i64", check: Check::Both, } @@ -2216,6 +2300,7 @@ async fn test_column_not_found() { expected_max: Arc::new(Int64Array::from(vec![18564, 21865])), expected_null_counts: UInt64Array::from(vec![2, 2]), expected_row_counts: Some(UInt64Array::from(vec![13, 7])), + expected_uncompressed_size: UInt64Array::from(vec![50, 50]), column_name: "not_a_column", check: Check::Both, } @@ -2251,6 +2336,7 @@ async fn test_column_non_existent() { expected_null_counts: UInt64Array::from(vec![None, None, None, None]), // row counts are [5, 5, 5, 5] expected_row_counts: None, + expected_uncompressed_size: UInt64Array::from(vec![None, None, None, None]), column_name: "i_do_not_exist", check: Check::Both, } From 98bea446b6f3fe8a0e47102a05c4edf4ead8e9c5 Mon Sep 17 00:00:00 2001 From: Adam Gutglick Date: Fri, 6 Dec 2024 16:19:07 -0500 Subject: [PATCH 3/3] pull upstream and fix test --- parquet/tests/arrow_reader/statistics.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parquet/tests/arrow_reader/statistics.rs b/parquet/tests/arrow_reader/statistics.rs index 0037cb74e549..2a4b7d03d993 100644 --- a/parquet/tests/arrow_reader/statistics.rs +++ b/parquet/tests/arrow_reader/statistics.rs @@ -1421,7 +1421,7 @@ async fn test_dates_64_diff_rg_sizes() { ])), expected_null_counts: UInt64Array::from(vec![2, 2]), expected_row_counts: Some(UInt64Array::from(vec![13, 7])), - expected_uncompressed_size: UInt64Array::from(vec![110, 80]), + expected_uncompressed_size: 
UInt64Array::from(vec![164, 108]), column_name: "date64", check: Check::Both, }