Skip to content

Commit d636155

Browse files
committed
directly create the col stats set.
1 parent 9205343 commit d636155

File tree

1 file changed

+17
-32
lines changed

1 file changed

+17
-32
lines changed

datafusion/core/src/datasource/statistics.rs

Lines changed: 17 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,6 @@ use datafusion_common::stats::Precision;
2929
use datafusion_common::ScalarValue;
3030

3131
use futures::{Stream, StreamExt};
32-
use itertools::izip;
3332

3433
/// Get all files as well as the file level summary statistics (no statistic for partition columns).
3534
/// If the optional `limit` is provided, includes only sufficient files. Needed to read up to
@@ -50,9 +49,10 @@ pub async fn get_statistics_with_limit(
5049
// - zero for summations, and
5150
// - neutral element for extreme points.
5251
let size = file_schema.fields().len();
53-
let mut null_counts: Vec<Precision<usize>> = vec![Precision::Absent; size];
54-
let mut max_values: Vec<Precision<ScalarValue>> = vec![Precision::Absent; size];
55-
let mut min_values: Vec<Precision<ScalarValue>> = vec![Precision::Absent; size];
52+
let mut col_stats_set = vec![ColumnStatistics::default(); size];
53+
// let mut null_counts: Vec<Precision<usize>> = vec![Precision::Absent; size];
54+
// let mut max_values: Vec<Precision<ScalarValue>> = vec![Precision::Absent; size];
55+
// let mut min_values: Vec<Precision<ScalarValue>> = vec![Precision::Absent; size];
5656
let mut num_rows = Precision::<usize>::Absent;
5757
let mut total_byte_size = Precision::<usize>::Absent;
5858

@@ -69,9 +69,9 @@ pub async fn get_statistics_with_limit(
6969
for (index, file_column) in
7070
file_stats.column_statistics.clone().into_iter().enumerate()
7171
{
72-
null_counts[index] = file_column.null_count;
73-
max_values[index] = file_column.max_value;
74-
min_values[index] = file_column.min_value;
72+
col_stats_set[index].null_count = file_column.null_count;
73+
col_stats_set[index].max_value = file_column.max_value;
74+
col_stats_set[index].min_value = file_column.min_value;
7575
}
7676

7777
// If the number of rows exceeds the limit, we can stop processing
@@ -99,22 +99,22 @@ pub async fn get_statistics_with_limit(
9999
total_byte_size =
100100
add_row_stats(file_stats.total_byte_size.clone(), total_byte_size);
101101

102-
for (file_col_stats, null_count, max_value, min_value) in izip!(
103-
file_stats.column_statistics.iter(),
104-
null_counts.iter_mut(),
105-
max_values.iter_mut(),
106-
min_values.iter_mut(),
107-
) {
102+
for (file_col_stats, col_stats) in file_stats
103+
.column_statistics
104+
.iter()
105+
.zip(col_stats_set.iter_mut())
106+
{
108107
let ColumnStatistics {
109108
null_count: file_nc,
110109
max_value: file_max,
111110
min_value: file_min,
112111
distinct_count: _,
113112
} = file_col_stats;
114113

115-
*null_count = add_row_stats(file_nc.clone(), null_count.clone());
116-
set_max_if_greater(file_max, max_value);
117-
set_min_if_lesser(file_min, min_value)
114+
col_stats.null_count =
115+
add_row_stats(file_nc.clone(), col_stats.null_count.clone());
116+
set_max_if_greater(file_max, &mut col_stats.max_value);
117+
set_min_if_lesser(file_min, &mut col_stats.min_value)
118118
}
119119

120120
// If the number of rows exceeds the limit, we can stop processing
@@ -133,7 +133,7 @@ pub async fn get_statistics_with_limit(
133133
let mut statistics = Statistics {
134134
num_rows,
135135
total_byte_size,
136-
column_statistics: get_col_stats_vec(null_counts, max_values, min_values),
136+
column_statistics: col_stats_set,
137137
};
138138
if all_files.next().await.is_some() {
139139
// If we still have files in the stream, it means that the limit kicked
@@ -176,21 +176,6 @@ fn add_row_stats(
176176
}
177177
}
178178

179-
pub(crate) fn get_col_stats_vec(
180-
null_counts: Vec<Precision<usize>>,
181-
max_values: Vec<Precision<ScalarValue>>,
182-
min_values: Vec<Precision<ScalarValue>>,
183-
) -> Vec<ColumnStatistics> {
184-
izip!(null_counts, max_values, min_values)
185-
.map(|(null_count, max_value, min_value)| ColumnStatistics {
186-
null_count,
187-
max_value,
188-
min_value,
189-
distinct_count: Precision::Absent,
190-
})
191-
.collect()
192-
}
193-
194179
pub(crate) fn get_col_stats(
195180
schema: &Schema,
196181
null_counts: Vec<Precision<usize>>,

0 commit comments

Comments
 (0)