Skip to content

Commit bdfb9ac

Browse files
committed
enhance: Remove redundant statistics from FileScanConfig
Signed-off-by: Alan Tang <[email protected]>
1 parent 57a1221 commit bdfb9ac

File tree

3 files changed

+31
-31
lines changed

3 files changed

+31
-31
lines changed

datafusion/datasource/src/file_scan_config.rs

+10-13
Original file line numberDiff line numberDiff line change
@@ -148,9 +148,6 @@ pub struct FileScanConfig {
148148
pub file_groups: Vec<Vec<PartitionedFile>>,
149149
/// Table constraints
150150
pub constraints: Constraints,
151-
/// Estimated overall statistics of the files, taking `filters` into account.
152-
/// Defaults to [`Statistics::new_unknown`].
153-
pub statistics: Statistics,
154151
/// Columns on which to project the data. Indexes that are higher than the
155152
/// number of columns of `file_schema` refer to `table_partition_cols`.
156153
pub projection: Option<Vec<usize>>,
@@ -319,7 +316,6 @@ impl FileScanConfig {
319316
file_schema,
320317
file_groups: vec![],
321318
constraints: Constraints::empty(),
322-
statistics,
323319
projection: None,
324320
limit: None,
325321
table_partition_cols: vec![],
@@ -328,14 +324,15 @@ impl FileScanConfig {
328324
new_lines_in_values: false,
329325
file_source: Arc::clone(&file_source),
330326
};
327+
config = config.with_statistics(statistics);
331328

332-
config = config.with_source(Arc::clone(&file_source));
333329
config
334330
}
335331

336332
/// Set the file source
337333
pub fn with_source(mut self, file_source: Arc<dyn FileSource>) -> Self {
338-
self.file_source = file_source.with_statistics(self.statistics.clone());
334+
self.file_source =
335+
file_source.with_statistics(Statistics::new_unknown(&self.file_schema));
339336
self
340337
}
341338

@@ -347,7 +344,6 @@ impl FileScanConfig {
347344

348345
/// Set the statistics of the files
349346
pub fn with_statistics(mut self, statistics: Statistics) -> Self {
350-
self.statistics = statistics.clone();
351347
self.file_source = self.file_source.with_statistics(statistics);
352348
self
353349
}
@@ -362,10 +358,7 @@ impl FileScanConfig {
362358
}
363359

364360
fn projected_stats(&self) -> Statistics {
365-
let statistics = self
366-
.file_source
367-
.statistics()
368-
.unwrap_or(self.statistics.clone());
361+
let statistics = self.file_source.statistics().unwrap();
369362

370363
let table_cols_stats = self
371364
.projection_indices()
@@ -498,7 +491,7 @@ impl FileScanConfig {
498491
return (
499492
Arc::clone(&self.file_schema),
500493
self.constraints.clone(),
501-
self.statistics.clone(),
494+
self.file_source.statistics().unwrap().clone(),
502495
self.output_ordering.clone(),
503496
);
504497
}
@@ -641,7 +634,11 @@ impl Debug for FileScanConfig {
641634
write!(f, "FileScanConfig {{")?;
642635
write!(f, "object_store_url={:?}, ", self.object_store_url)?;
643636

644-
write!(f, "statistics={:?}, ", self.statistics)?;
637+
write!(
638+
f,
639+
"statistics={:?}, ",
640+
self.file_source.statistics().unwrap()
641+
)?;
645642

646643
DisplayAs::fmt_as(self, DisplayFormatType::Verbose, f)?;
647644
write!(f, "}}")

datafusion/proto/src/physical_plan/to_proto.rs

+1-1
Original file line numberDiff line numberDiff line change
@@ -507,7 +507,7 @@ pub fn serialize_file_scan_config(
507507

508508
Ok(protobuf::FileScanExecConf {
509509
file_groups,
510-
statistics: Some((&conf.statistics).into()),
510+
statistics: Some((&conf.file_source.statistics().unwrap()).into()),
511511
limit: conf.limit.map(|l| protobuf::ScanLimit { limit: l as u32 }),
512512
projection: conf
513513
.projection

datafusion/proto/tests/cases/roundtrip_physical_plan.rs

+20-17
Original file line numberDiff line numberDiff line change
@@ -741,22 +741,22 @@ fn roundtrip_parquet_exec_with_pruning_predicate() -> Result<()> {
741741
let source = Arc::new(
742742
ParquetSource::new(options).with_predicate(Arc::clone(&file_schema), predicate),
743743
);
744+
let statistics = Statistics {
745+
num_rows: Precision::Inexact(100),
746+
total_byte_size: Precision::Inexact(1024),
747+
column_statistics: Statistics::unknown_column(&Arc::new(Schema::new(vec![
748+
Field::new("col", DataType::Utf8, false),
749+
]))),
750+
};
744751

745-
let scan_config = FileScanConfig {
752+
let mut scan_config = FileScanConfig {
746753
object_store_url: ObjectStoreUrl::local_filesystem(),
747754
file_schema,
748755
file_groups: vec![vec![PartitionedFile::new(
749756
"/path/to/file.parquet".to_string(),
750757
1024,
751758
)]],
752759
constraints: Constraints::empty(),
753-
statistics: Statistics {
754-
num_rows: Precision::Inexact(100),
755-
total_byte_size: Precision::Inexact(1024),
756-
column_statistics: Statistics::unknown_column(&Arc::new(Schema::new(vec![
757-
Field::new("col", DataType::Utf8, false),
758-
]))),
759-
},
760760
projection: None,
761761
limit: None,
762762
table_partition_cols: vec![],
@@ -765,6 +765,7 @@ fn roundtrip_parquet_exec_with_pruning_predicate() -> Result<()> {
765765
new_lines_in_values: false,
766766
file_source: source,
767767
};
768+
scan_config = scan_config.with_statistics(statistics);
768769

769770
roundtrip_test(scan_config.build())
770771
}
@@ -806,21 +807,23 @@ fn roundtrip_parquet_exec_with_custom_predicate_expr() -> Result<()> {
806807
.with_predicate(Arc::clone(&file_schema), custom_predicate_expr),
807808
);
808809

809-
let scan_config = FileScanConfig {
810+
let statistics = Statistics {
811+
num_rows: Precision::Inexact(100),
812+
total_byte_size: Precision::Inexact(1024),
813+
column_statistics: Statistics::unknown_column(&Arc::new(Schema::new(vec![
814+
Field::new("col", DataType::Utf8, false),
815+
]))),
816+
};
817+
818+
let mut scan_config = FileScanConfig {
810819
object_store_url: ObjectStoreUrl::local_filesystem(),
811820
file_schema,
812821
file_groups: vec![vec![PartitionedFile::new(
813822
"/path/to/file.parquet".to_string(),
814823
1024,
815824
)]],
816825
constraints: Constraints::empty(),
817-
statistics: Statistics {
818-
num_rows: Precision::Inexact(100),
819-
total_byte_size: Precision::Inexact(1024),
820-
column_statistics: Statistics::unknown_column(&Arc::new(Schema::new(vec![
821-
Field::new("col", DataType::Utf8, false),
822-
]))),
823-
},
826+
824827
projection: None,
825828
limit: None,
826829
table_partition_cols: vec![],
@@ -829,6 +832,7 @@ fn roundtrip_parquet_exec_with_custom_predicate_expr() -> Result<()> {
829832
new_lines_in_values: false,
830833
file_source: source,
831834
};
835+
scan_config = scan_config.with_statistics(statistics);
832836

833837
#[derive(Debug, Clone, Eq)]
834838
struct CustomPredicateExpr {
@@ -1616,7 +1620,6 @@ async fn roundtrip_projection_source() -> Result<()> {
16161620
1024,
16171621
)]],
16181622
constraints: Constraints::empty(),
1619-
statistics,
16201623
file_schema: schema.clone(),
16211624
projection: Some(vec![0, 1, 2]),
16221625
limit: None,

0 commit comments

Comments
 (0)