Skip to content

Commit f3eb12a

Browse files
committed
enhance: Remove redundant statistics from FileScanConfig
Signed-off-by: Alan Tang <[email protected]>

chore: fix some fmt errors

Signed-off-by: Alan Tang <[email protected]>
1 parent 52d750f commit f3eb12a

File tree

4 files changed

+31
-30
lines changed

4 files changed

+31
-30
lines changed

datafusion/core/tests/physical_optimizer/test_utils.rs

+4-1
Original file line numberDiff line numberDiff line change
@@ -136,7 +136,10 @@ pub(crate) fn parquet_exec_with_stats() -> Arc<DataSourceExec> {
136136
)
137137
.with_file(PartitionedFile::new("x".to_string(), 10000))
138138
.with_statistics(statistics);
139-
assert_eq!(config.statistics.num_rows, Precision::Inexact(10));
139+
assert_eq!(
140+
config.file_source.statistics().unwrap().num_rows,
141+
Precision::Inexact(10)
142+
);
140143

141144
config.build()
142145
}

datafusion/datasource/src/file_scan_config.rs

+11-13
Original file line numberDiff line numberDiff line change
@@ -148,9 +148,6 @@ pub struct FileScanConfig {
148148
pub file_groups: Vec<Vec<PartitionedFile>>,
149149
/// Table constraints
150150
pub constraints: Constraints,
151-
/// Estimated overall statistics of the files, taking `filters` into account.
152-
/// Defaults to [`Statistics::new_unknown`].
153-
pub statistics: Statistics,
154151
/// Columns on which to project the data. Indexes that are higher than the
155152
/// number of columns of `file_schema` refer to `table_partition_cols`.
156153
pub projection: Option<Vec<usize>>,
@@ -302,13 +299,13 @@ impl FileScanConfig {
302299
file_source: Arc<dyn FileSource>,
303300
) -> Self {
304301
let statistics = Statistics::new_unknown(&file_schema);
302+
file_source.with_statistics(statistics);
305303

306304
let mut config = Self {
307305
object_store_url,
308306
file_schema,
309307
file_groups: vec![],
310308
constraints: Constraints::empty(),
311-
statistics,
312309
projection: None,
313310
limit: None,
314311
table_partition_cols: vec![],
@@ -324,7 +321,8 @@ impl FileScanConfig {
324321

325322
/// Set the file source
326323
pub fn with_source(mut self, file_source: Arc<dyn FileSource>) -> Self {
327-
self.file_source = file_source.with_statistics(self.statistics.clone());
324+
let statistics = Statistics::new_unknown(&self.file_schema);
325+
self.file_source = file_source.with_statistics(statistics);
328326
self
329327
}
330328

@@ -336,8 +334,7 @@ impl FileScanConfig {
336334

337335
/// Set the statistics of the files
338336
pub fn with_statistics(mut self, statistics: Statistics) -> Self {
339-
self.statistics = statistics.clone();
340-
self.file_source = self.file_source.with_statistics(statistics);
337+
self.file_source = self.file_source.with_statistics(statistics.clone());
341338
self
342339
}
343340

@@ -351,10 +348,7 @@ impl FileScanConfig {
351348
}
352349

353350
fn projected_stats(&self) -> Statistics {
354-
let statistics = self
355-
.file_source
356-
.statistics()
357-
.unwrap_or(self.statistics.clone());
351+
let statistics = self.file_source.statistics().unwrap();
358352

359353
let table_cols_stats = self
360354
.projection_indices()
@@ -487,7 +481,7 @@ impl FileScanConfig {
487481
return (
488482
Arc::clone(&self.file_schema),
489483
self.constraints.clone(),
490-
self.statistics.clone(),
484+
self.file_source.statistics().unwrap().clone(),
491485
self.output_ordering.clone(),
492486
);
493487
}
@@ -630,7 +624,11 @@ impl Debug for FileScanConfig {
630624
write!(f, "FileScanConfig {{")?;
631625
write!(f, "object_store_url={:?}, ", self.object_store_url)?;
632626

633-
write!(f, "statistics={:?}, ", self.statistics)?;
627+
write!(
628+
f,
629+
"statistics={:?}, ",
630+
self.file_source.statistics().unwrap()
631+
)?;
634632

635633
DisplayAs::fmt_as(self, DisplayFormatType::Verbose, f)?;
636634
write!(f, "}}")

datafusion/proto/src/physical_plan/to_proto.rs

+1-1
Original file line numberDiff line numberDiff line change
@@ -507,7 +507,7 @@ pub fn serialize_file_scan_config(
507507

508508
Ok(protobuf::FileScanExecConf {
509509
file_groups,
510-
statistics: Some((&conf.statistics).into()),
510+
statistics: Some((&conf.file_source.statistics().unwrap()).into()),
511511
limit: conf.limit.map(|l| protobuf::ScanLimit { limit: l as u32 }),
512512
projection: conf
513513
.projection

datafusion/proto/tests/cases/roundtrip_physical_plan.rs

+15-15
Original file line numberDiff line numberDiff line change
@@ -742,6 +742,14 @@ fn roundtrip_parquet_exec_with_pruning_predicate() -> Result<()> {
742742
ParquetSource::new(options).with_predicate(Arc::clone(&file_schema), predicate),
743743
);
744744

745+
source.with_statistics(Statistics {
746+
num_rows: Precision::Inexact(100),
747+
total_byte_size: Precision::Inexact(1024),
748+
column_statistics: Statistics::unknown_column(&Arc::new(Schema::new(vec![
749+
Field::new("col", DataType::Utf8, false),
750+
]))),
751+
});
752+
745753
let scan_config = FileScanConfig {
746754
object_store_url: ObjectStoreUrl::local_filesystem(),
747755
file_schema,
@@ -750,13 +758,6 @@ fn roundtrip_parquet_exec_with_pruning_predicate() -> Result<()> {
750758
1024,
751759
)]],
752760
constraints: Constraints::empty(),
753-
statistics: Statistics {
754-
num_rows: Precision::Inexact(100),
755-
total_byte_size: Precision::Inexact(1024),
756-
column_statistics: Statistics::unknown_column(&Arc::new(Schema::new(vec![
757-
Field::new("col", DataType::Utf8, false),
758-
]))),
759-
},
760761
projection: None,
761762
limit: None,
762763
table_partition_cols: vec![],
@@ -805,6 +806,13 @@ fn roundtrip_parquet_exec_with_custom_predicate_expr() -> Result<()> {
805806
ParquetSource::default()
806807
.with_predicate(Arc::clone(&file_schema), custom_predicate_expr),
807808
);
809+
source.with_statistics(Statistics {
810+
num_rows: Precision::Inexact(100),
811+
total_byte_size: Precision::Inexact(1024),
812+
column_statistics: Statistics::unknown_column(&Arc::new(Schema::new(vec![
813+
Field::new("col", DataType::Utf8, false),
814+
]))),
815+
});
808816

809817
let scan_config = FileScanConfig {
810818
object_store_url: ObjectStoreUrl::local_filesystem(),
@@ -814,13 +822,6 @@ fn roundtrip_parquet_exec_with_custom_predicate_expr() -> Result<()> {
814822
1024,
815823
)]],
816824
constraints: Constraints::empty(),
817-
statistics: Statistics {
818-
num_rows: Precision::Inexact(100),
819-
total_byte_size: Precision::Inexact(1024),
820-
column_statistics: Statistics::unknown_column(&Arc::new(Schema::new(vec![
821-
Field::new("col", DataType::Utf8, false),
822-
]))),
823-
},
824825
projection: None,
825826
limit: None,
826827
table_partition_cols: vec![],
@@ -1616,7 +1617,6 @@ async fn roundtrip_projection_source() -> Result<()> {
16161617
1024,
16171618
)]],
16181619
constraints: Constraints::empty(),
1619-
statistics,
16201620
file_schema: schema.clone(),
16211621
projection: Some(vec![0, 1, 2]),
16221622
limit: None,

0 commit comments

Comments
 (0)