@@ -29,7 +29,6 @@ use datafusion_common::stats::Precision;
 use datafusion_common::ScalarValue;
 
 use futures::{Stream, StreamExt};
-use itertools::izip;
 
 /// Get all files as well as the file level summary statistics (no statistic for partition columns).
 /// If the optional `limit` is provided, includes only sufficient files. Needed to read up to
@@ -50,9 +49,10 @@ pub async fn get_statistics_with_limit(
     // - zero for summations, and
     // - neutral element for extreme points.
     let size = file_schema.fields().len();
-    let mut null_counts: Vec<Precision<usize>> = vec![Precision::Absent; size];
-    let mut max_values: Vec<Precision<ScalarValue>> = vec![Precision::Absent; size];
-    let mut min_values: Vec<Precision<ScalarValue>> = vec![Precision::Absent; size];
+    let mut col_stats_set = vec![ColumnStatistics::default(); size];
+    // let mut null_counts: Vec<Precision<usize>> = vec![Precision::Absent; size];
+    // let mut max_values: Vec<Precision<ScalarValue>> = vec![Precision::Absent; size];
+    // let mut min_values: Vec<Precision<ScalarValue>> = vec![Precision::Absent; size];
     let mut num_rows = Precision::<usize>::Absent;
     let mut total_byte_size = Precision::<usize>::Absent;
 
@@ -69,9 +69,9 @@ pub async fn get_statistics_with_limit(
         for (index, file_column) in
             file_stats.column_statistics.clone().into_iter().enumerate()
         {
-            null_counts[index] = file_column.null_count;
-            max_values[index] = file_column.max_value;
-            min_values[index] = file_column.min_value;
+            col_stats_set[index].null_count = file_column.null_count;
+            col_stats_set[index].max_value = file_column.max_value;
+            col_stats_set[index].min_value = file_column.min_value;
         }
 
         // If the number of rows exceeds the limit, we can stop processing
@@ -99,22 +99,22 @@ pub async fn get_statistics_with_limit(
                 total_byte_size =
                     add_row_stats(file_stats.total_byte_size.clone(), total_byte_size);
 
-                for (file_col_stats, null_count, max_value, min_value) in izip!(
-                    file_stats.column_statistics.iter(),
-                    null_counts.iter_mut(),
-                    max_values.iter_mut(),
-                    min_values.iter_mut(),
-                ) {
+                for (file_col_stats, col_stats) in file_stats
+                    .column_statistics
+                    .iter()
+                    .zip(col_stats_set.iter_mut())
+                {
                     let ColumnStatistics {
                         null_count: file_nc,
                         max_value: file_max,
                         min_value: file_min,
                         distinct_count: _,
                     } = file_col_stats;
 
-                    *null_count = add_row_stats(file_nc.clone(), null_count.clone());
-                    set_max_if_greater(file_max, max_value);
-                    set_min_if_lesser(file_min, min_value)
+                    col_stats.null_count =
+                        add_row_stats(file_nc.clone(), col_stats.null_count.clone());
+                    set_max_if_greater(file_max, &mut col_stats.max_value);
+                    set_min_if_lesser(file_min, &mut col_stats.min_value)
                 }
 
                 // If the number of rows exceeds the limit, we can stop processing
@@ -133,7 +133,7 @@ pub async fn get_statistics_with_limit(
     let mut statistics = Statistics {
         num_rows,
         total_byte_size,
-        column_statistics: get_col_stats_vec(null_counts, max_values, min_values),
+        column_statistics: col_stats_set,
     };
     if all_files.next().await.is_some() {
         // If we still have files in the stream, it means that the limit kicked
@@ -176,21 +176,6 @@ fn add_row_stats(
     }
 }
 
-pub(crate) fn get_col_stats_vec(
-    null_counts: Vec<Precision<usize>>,
-    max_values: Vec<Precision<ScalarValue>>,
-    min_values: Vec<Precision<ScalarValue>>,
-) -> Vec<ColumnStatistics> {
-    izip!(null_counts, max_values, min_values)
-        .map(|(null_count, max_value, min_value)| ColumnStatistics {
-            null_count,
-            max_value,
-            min_value,
-            distinct_count: Precision::Absent,
-        })
-        .collect()
-}
-
 pub(crate) fn get_col_stats(
     schema: &Schema,
     null_counts: Vec<Precision<usize>>,
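For readers following the refactor, here is a minimal, self-contained sketch of the pattern the diff moves to: per-file column statistics are folded into a single accumulator `Vec` of per-column structs via `zip`, instead of three parallel vectors (`null_counts`, `max_values`, `min_values`) merged with `izip!`. The `ColStats` struct and `merge_file_stats` helper are simplified stand-ins invented for illustration (plain `Option` fields rather than DataFusion's `Precision`/`ScalarValue`), so this shows the shape of the change, not DataFusion's actual API.

```rust
// Stand-in for DataFusion's `ColumnStatistics`; fields are simplified to
// `Option` values instead of `Precision<usize>` / `Precision<ScalarValue>`.
#[derive(Clone, Default, Debug)]
struct ColStats {
    null_count: Option<u64>,
    max_value: Option<i64>,
    min_value: Option<i64>,
}

/// Fold one file's per-column statistics into the running accumulator,
/// mirroring the `zip(col_stats_set.iter_mut())` loop in the diff.
fn merge_file_stats(acc: &mut [ColStats], file_cols: &[ColStats]) {
    for (file_col, col) in file_cols.iter().zip(acc.iter_mut()) {
        // Null counts add up across files (cf. `add_row_stats`).
        col.null_count = match (file_col.null_count, col.null_count) {
            (Some(a), Some(b)) => Some(a + b),
            (x, None) | (None, x) => x,
        };
        // Keep the largest max and smallest min seen so far
        // (cf. `set_max_if_greater` / `set_min_if_lesser`).
        col.max_value = match (file_col.max_value, col.max_value) {
            (Some(a), Some(b)) => Some(a.max(b)),
            (x, None) | (None, x) => x,
        };
        col.min_value = match (file_col.min_value, col.min_value) {
            (Some(a), Some(b)) => Some(a.min(b)),
            (x, None) | (None, x) => x,
        };
    }
}

fn main() {
    // One accumulator entry per schema column, like
    // `vec![ColumnStatistics::default(); size]` in the diff.
    let mut col_stats_set = vec![ColStats::default(); 2];

    let file_a = vec![
        ColStats { null_count: Some(1), max_value: Some(10), min_value: Some(3) },
        ColStats { null_count: Some(0), max_value: Some(7), min_value: Some(7) },
    ];
    let file_b = vec![
        ColStats { null_count: Some(2), max_value: Some(4), min_value: Some(-1) },
        ColStats { null_count: None, max_value: Some(9), min_value: Some(5) },
    ];

    merge_file_stats(&mut col_stats_set, &file_a);
    merge_file_stats(&mut col_stats_set, &file_b);

    // Column 0 ends with null_count=3, max=10, min=-1; column 1 with null_count=0, max=9, min=5.
    println!("{col_stats_set:?}");
}
```

Keeping the three statistics together in one struct per column is what lets the final `Statistics` be built directly from `col_stats_set`, which is why the diff can also delete the `get_col_stats_vec` helper and the `izip!` import.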