From b7ae6ed06044ab926f593cced1acdd62e20ea280 Mon Sep 17 00:00:00 2001 From: blaginin Date: Fri, 21 Mar 2025 16:39:36 +0000 Subject: [PATCH 01/14] WIP: Add `FileScanConfigBuilder` and switch some cases --- .../examples/advanced_parquet_index.rs | 16 +- datafusion-examples/examples/parquet_index.rs | 9 +- .../core/src/datasource/file_format/arrow.rs | 10 +- .../core/src/datasource/file_format/mod.rs | 8 +- .../core/src/datasource/listing/table.rs | 7 +- .../core/src/datasource/physical_plan/avro.rs | 16 +- .../core/src/datasource/physical_plan/json.rs | 8 +- .../core/src/datasource/physical_plan/mod.rs | 1 + datafusion/datasource-csv/src/file_format.rs | 15 +- datafusion/datasource/src/file_scan_config.rs | 374 +++++++++++++++++- datafusion/datasource/src/file_stream.rs | 7 +- .../proto/src/physical_plan/from_proto.rs | 9 +- .../tests/cases/roundtrip_physical_plan.rs | 71 ++-- 13 files changed, 478 insertions(+), 73 deletions(-) diff --git a/datafusion-examples/examples/advanced_parquet_index.rs b/datafusion-examples/examples/advanced_parquet_index.rs index d6cf61c61d73..90e9e989ce0e 100644 --- a/datafusion-examples/examples/advanced_parquet_index.rs +++ b/datafusion-examples/examples/advanced_parquet_index.rs @@ -30,7 +30,8 @@ use datafusion::common::{ use datafusion::datasource::listing::PartitionedFile; use datafusion::datasource::physical_plan::parquet::ParquetAccessPlan; use datafusion::datasource::physical_plan::{ - FileMeta, FileScanConfig, ParquetFileReaderFactory, ParquetSource, + FileMeta, FileScanConfigBuilder, ParquetFileReaderFactory, + ParquetSource, }; use datafusion::datasource::TableProvider; use datafusion::execution::object_store::ObjectStoreUrl; @@ -55,6 +56,7 @@ use arrow::array::{ArrayRef, Int32Array, RecordBatch, StringArray}; use arrow::datatypes::SchemaRef; use async_trait::async_trait; use bytes::Bytes; +use datafusion::datasource::memory::DataSourceExec; use futures::future::BoxFuture; use futures::FutureExt; use object_store::ObjectStore; @@ -498,13 +500,15 @@ impl TableProvider for IndexTableProvider { // provide the factory to create parquet reader without re-reading metadata .with_parquet_file_reader_factory(Arc::new(reader_factory)), ); - let file_scan_config = FileScanConfig::new(object_store_url, schema, file_source) - .with_limit(limit) - .with_projection(projection.cloned()) - .with_file(partitioned_file); + let file_scan_config = + FileScanConfigBuilder::new(object_store_url, schema, file_source) + .with_limit(limit) + .with_projection(projection.cloned()) + .with_file(partitioned_file) + .build(); // Finally, put it all together into a DataSourceExec - Ok(file_scan_config.build()) + Ok(Arc::new(DataSourceExec::new(Arc::new(file_scan_config)))) } /// Tell DataFusion to push filters down to the scan method diff --git a/datafusion-examples/examples/parquet_index.rs b/datafusion-examples/examples/parquet_index.rs index 3851dca2a775..f35f4edb0e26 100644 --- a/datafusion-examples/examples/parquet_index.rs +++ b/datafusion-examples/examples/parquet_index.rs @@ -27,7 +27,9 @@ use datafusion::common::{ internal_datafusion_err, DFSchema, DataFusionError, Result, ScalarValue, }; use datafusion::datasource::listing::PartitionedFile; -use datafusion::datasource::physical_plan::{FileScanConfig, ParquetSource}; +use datafusion::datasource::physical_plan::{ + FileScanConfigBuilder, ParquetSource, +}; use datafusion::datasource::TableProvider; use datafusion::execution::object_store::ObjectStoreUrl; use datafusion::logical_expr::{ @@ -244,9 +246,10 @@ impl TableProvider 
for IndexTableProvider { let source = Arc::new(ParquetSource::default().with_predicate(self.schema(), predicate)); let mut file_scan_config = - FileScanConfig::new(object_store_url, self.schema(), source) + FileScanConfigBuilder::new(object_store_url, self.schema(), source) .with_projection(projection.cloned()) - .with_limit(limit); + .with_limit(limit) + .build(); // Transform to the format needed to pass to DataSourceExec // Create one file group per file (default to scanning them all in parallel) diff --git a/datafusion/core/src/datasource/file_format/arrow.rs b/datafusion/core/src/datasource/file_format/arrow.rs index 3172e5692559..7e731847c49d 100644 --- a/datafusion/core/src/datasource/file_format/arrow.rs +++ b/datafusion/core/src/datasource/file_format/arrow.rs @@ -49,7 +49,7 @@ use datafusion_common::{ use datafusion_common_runtime::{JoinSet, SpawnedTask}; use datafusion_datasource::display::FileGroupDisplay; use datafusion_datasource::file::FileSource; -use datafusion_datasource::file_scan_config::FileScanConfig; +use datafusion_datasource::file_scan_config::{FileScanConfig, FileScanConfigBuilder}; use datafusion_execution::{SendableRecordBatchStream, TaskContext}; use datafusion_expr::dml::InsertOp; use datafusion_physical_expr::PhysicalExpr; @@ -58,6 +58,7 @@ use datafusion_physical_plan::insert::{DataSink, DataSinkExec}; use async_trait::async_trait; use bytes::Bytes; +use datafusion_datasource::source::DataSourceExec; use futures::stream::BoxStream; use futures::StreamExt; use object_store::{GetResultPayload, ObjectMeta, ObjectStore}; @@ -173,7 +174,12 @@ impl FileFormat for ArrowFormat { conf: FileScanConfig, _filters: Option<&Arc>, ) -> Result> { - Ok(conf.with_source(Arc::new(ArrowSource::default())).build()) + let source = Arc::new(ArrowSource::default()); + let config = FileScanConfigBuilder::from(conf) + .with_source(source) + .build(); + + Ok(Arc::new(DataSourceExec::new(Arc::new(config)))) } async fn create_writer_physical_plan( diff --git a/datafusion/core/src/datasource/file_format/mod.rs b/datafusion/core/src/datasource/file_format/mod.rs index df74e5d060e6..f82d7f23411e 100644 --- a/datafusion/core/src/datasource/file_format/mod.rs +++ b/datafusion/core/src/datasource/file_format/mod.rs @@ -40,8 +40,9 @@ pub(crate) mod test_util { use datafusion_catalog::Session; use datafusion_common::Result; + use datafusion_datasource::file_scan_config::FileScanConfigBuilder; use datafusion_datasource::{ - file_format::FileFormat, file_scan_config::FileScanConfig, PartitionedFile, + file_format::FileFormat, PartitionedFile, }; use datafusion_execution::object_store::ObjectStoreUrl; @@ -78,7 +79,7 @@ pub(crate) mod test_util { let exec = format .create_physical_plan( state, - FileScanConfig::new( + FileScanConfigBuilder::new( ObjectStoreUrl::local_filesystem(), file_schema, format.file_source(), @@ -86,7 +87,8 @@ pub(crate) mod test_util { .with_file_groups(file_groups) .with_statistics(statistics) .with_projection(projection) - .with_limit(limit), + .with_limit(limit) + .build(), None, ) .await?; diff --git a/datafusion/core/src/datasource/listing/table.rs b/datafusion/core/src/datasource/listing/table.rs index 21b35bac2174..20956f59dcc5 100644 --- a/datafusion/core/src/datasource/listing/table.rs +++ b/datafusion/core/src/datasource/listing/table.rs @@ -34,7 +34,7 @@ use crate::datasource::{ use crate::execution::context::SessionState; use datafusion_catalog::TableProvider; use datafusion_common::{config_err, DataFusionError, Result}; -use 
datafusion_datasource::file_scan_config::FileScanConfig; +use datafusion_datasource::file_scan_config::{FileScanConfig, FileScanConfigBuilder}; use datafusion_expr::dml::InsertOp; use datafusion_expr::{utils::conjunction, Expr, TableProviderFilterPushDown}; use datafusion_expr::{SortExpr, TableType}; @@ -941,7 +941,7 @@ impl TableProvider for ListingTable { .format .create_physical_plan( session_state, - FileScanConfig::new( + FileScanConfigBuilder::new( object_store_url, Arc::clone(&self.file_schema), self.options.format.file_source(), @@ -952,7 +952,8 @@ impl TableProvider for ListingTable { .with_projection(projection.cloned()) .with_limit(limit) .with_output_ordering(output_ordering) - .with_table_partition_cols(table_partition_cols), + .with_table_partition_cols(table_partition_cols) + .build(), filters.as_ref(), ) .await diff --git a/datafusion/core/src/datasource/physical_plan/avro.rs b/datafusion/core/src/datasource/physical_plan/avro.rs index 9fa2b3bc1482..85ec0b2d7661 100644 --- a/datafusion/core/src/datasource/physical_plan/avro.rs +++ b/datafusion/core/src/datasource/physical_plan/avro.rs @@ -32,7 +32,9 @@ mod tests { use arrow::datatypes::{DataType, Field, SchemaBuilder}; use datafusion_common::{assert_batches_eq, test_util, Result, ScalarValue}; use datafusion_datasource::file_format::FileFormat; - use datafusion_datasource::file_scan_config::FileScanConfig; + use datafusion_datasource::file_scan_config::{ + FileScanConfig, FileScanConfigBuilder, + }; use datafusion_datasource::PartitionedFile; use datafusion_datasource_avro::source::AvroSource; use datafusion_datasource_avro::AvroFormat; @@ -79,10 +81,14 @@ mod tests { .await?; let source = Arc::new(AvroSource::new()); - let conf = - FileScanConfig::new(ObjectStoreUrl::local_filesystem(), file_schema, source) - .with_file(meta.into()) - .with_projection(Some(vec![0, 1, 2])); + let conf = FileScanConfigBuilder::new( + ObjectStoreUrl::local_filesystem(), + file_schema, + source, + ) + .with_file(meta.into()) + .with_projection(Some(vec![0, 1, 2])) + .build(); let source_exec = conf.build(); assert_eq!( diff --git a/datafusion/core/src/datasource/physical_plan/json.rs b/datafusion/core/src/datasource/physical_plan/json.rs index 9bab75fc88c3..f8a0dd711b19 100644 --- a/datafusion/core/src/datasource/physical_plan/json.rs +++ b/datafusion/core/src/datasource/physical_plan/json.rs @@ -49,6 +49,7 @@ mod tests { use arrow::array::Array; use arrow::datatypes::SchemaRef; use arrow::datatypes::{Field, SchemaBuilder}; + use datafusion_datasource::file_scan_config::FileScanConfigBuilder; use object_store::chunked::ChunkedStore; use object_store::local::LocalFileSystem; use object_store::ObjectStore; @@ -329,7 +330,7 @@ mod tests { async fn nd_json_exec_file_mixed_order_projection( file_compression_type: FileCompressionType, ) -> Result<()> { - use datafusion_datasource::file_scan_config::FileScanConfig; + use futures::StreamExt; let session_ctx = SessionContext::new(); @@ -340,10 +341,11 @@ mod tests { prepare_store(&state, file_compression_type.to_owned(), tmp_dir.path()).await; let source = Arc::new(JsonSource::new()); - let conf = FileScanConfig::new(object_store_url, file_schema, source) + let conf = FileScanConfigBuilder::new(object_store_url, file_schema, source) .with_file_groups(file_groups) .with_projection(Some(vec![3, 0, 2])) - .with_file_compression_type(file_compression_type.to_owned()); + .with_file_compression_type(file_compression_type.to_owned()) + .build(); let exec = conf.build(); let inferred_schema = exec.schema(); 
        assert_eq!(inferred_schema.fields().len(), 3);
 
diff --git a/datafusion/core/src/datasource/physical_plan/mod.rs b/datafusion/core/src/datasource/physical_plan/mod.rs
index cae04e5ee6b8..eaa4552c3679 100644
--- a/datafusion/core/src/datasource/physical_plan/mod.rs
+++ b/datafusion/core/src/datasource/physical_plan/mod.rs
@@ -57,6 +57,7 @@ pub use datafusion_datasource::file_groups::FileGroupPartitioner;
 pub use datafusion_datasource::file_meta::FileMeta;
 pub use datafusion_datasource::file_scan_config::{
     wrap_partition_type_in_dict, wrap_partition_value_in_dict, FileScanConfig,
+    FileScanConfigBuilder,
 };
 pub use datafusion_datasource::file_sink_config::*;
 
diff --git a/datafusion/datasource-csv/src/file_format.rs b/datafusion/datasource-csv/src/file_format.rs
index 522cb12db0c7..4d3c3f565304 100644
--- a/datafusion/datasource-csv/src/file_format.rs
+++ b/datafusion/datasource-csv/src/file_format.rs
@@ -41,7 +41,7 @@ use datafusion_datasource::file_compression_type::FileCompressionType;
 use datafusion_datasource::file_format::{
     FileFormat, FileFormatFactory, DEFAULT_SCHEMA_INFER_MAX_RECORD,
 };
-use datafusion_datasource::file_scan_config::FileScanConfig;
+use datafusion_datasource::file_scan_config::{FileScanConfig, FileScanConfigBuilder};
 use datafusion_datasource::file_sink_config::{FileSink, FileSinkConfig};
 use datafusion_datasource::write::demux::DemuxedStreamReceiver;
 use datafusion_datasource::write::orchestration::spawn_writer_tasks_and_join;
@@ -406,10 +406,9 @@ impl FileFormat for CsvFormat {
     async fn create_physical_plan(
         &self,
         state: &dyn Session,
-        mut conf: FileScanConfig,
+        conf: FileScanConfig,
         _filters: Option<&Arc<dyn PhysicalExpr>>,
     ) -> Result<Arc<dyn ExecutionPlan>> {
-        conf.file_compression_type = self.options.compression.into();
         // Consult configuration options for default values
         let has_header = self
             .options
@@ -419,7 +418,10 @@
             .options
             .newlines_in_values
             .unwrap_or(state.config_options().catalog.newlines_in_values);
-        conf.new_lines_in_values = newlines_in_values;
+
+        let conf_builder = FileScanConfigBuilder::from(conf)
+            .with_file_compression_type(self.options.compression.into())
+            .with_newlines_in_values(newlines_in_values);
 
         let source = Arc::new(
             CsvSource::new(has_header, self.options.delimiter, self.options.quote)
@@ -427,7 +429,10 @@
             .with_terminator(self.options.terminator)
             .with_comment(self.options.comment),
         );
-        Ok(conf.with_source(source).build())
+
+        let config = conf_builder.with_source(source).build();
+
+        Ok(config.build())
     }
 
     async fn create_writer_physical_plan(
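The CSV change above is the migration pattern this series applies at every call site: stop mutating `FileScanConfig` fields in place, route every setting through `FileScanConfigBuilder`, and wrap the built config in a `DataSourceExec`. A minimal sketch of the before/after shape — the `url`, `schema`, and `source` bindings are illustrative placeholders, not code from this patch:

    // Before: mutate public fields, then use FileScanConfig::build()
    let mut conf = FileScanConfig::new(url.clone(), schema.clone(), source.clone());
    conf.file_compression_type = FileCompressionType::GZIP;
    let plan: Arc<dyn ExecutionPlan> = conf.build();

    // After: declare everything on the builder, then construct the
    // DataSourceExec node explicitly
    let config = FileScanConfigBuilder::new(url, schema, source)
        .with_file_compression_type(FileCompressionType::GZIP)
        .build();
    let plan: Arc<dyn ExecutionPlan> = Arc::new(DataSourceExec::new(Arc::new(config)));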
diff --git a/datafusion/datasource/src/file_scan_config.rs b/datafusion/datasource/src/file_scan_config.rs
index 91b5f0157739..036449e65ddd 100644
--- a/datafusion/datasource/src/file_scan_config.rs
+++ b/datafusion/datasource/src/file_scan_config.rs
@@ -169,6 +169,216 @@ pub struct FileScanConfig {
     pub file_source: Arc<dyn FileSource>,
 }
 
+#[derive(Clone)]
+pub struct FileScanConfigBuilder {
+    object_store_url: ObjectStoreUrl,
+    file_schema: SchemaRef,
+    file_source: Arc<dyn FileSource>,
+
+    limit: Option<usize>,
+    projection: Option<Vec<usize>>,
+    table_partition_cols: Vec<Field>,
+    constraints: Option<Constraints>,
+
+    file_groups: Vec<Vec<PartitionedFile>>,
+
+    /// Estimated overall statistics of the files, taking `filters` into account.
+    /// Defaults to [`Statistics::new_unknown`].
+    statistics: Option<Statistics>,
+
+    /// All equivalent lexicographical orderings that describe the schema.
+    output_ordering: Vec<LexOrdering>,
+
+    file_compression_type: Option<FileCompressionType>,
+
+    new_lines_in_values: Option<bool>,
+}
+
+impl FileScanConfigBuilder {
+    /// Create a new [`FileScanConfigBuilder`] with default settings for scanning files.
+    ///
+    /// # Parameters:
+    /// * `object_store_url`: See [`FileScanConfig::object_store_url`]
+    /// * `file_schema`: See [`FileScanConfig::file_schema`]
+    /// * `file_source`: See [`FileScanConfig::file_source`]
+    pub fn new(
+        object_store_url: ObjectStoreUrl,
+        file_schema: SchemaRef,
+        file_source: Arc<dyn FileSource>,
+    ) -> Self {
+        Self {
+            object_store_url,
+            file_schema,
+            file_source,
+            file_groups: vec![],
+            statistics: None,
+            output_ordering: vec![],
+            file_compression_type: None,
+            new_lines_in_values: None,
+            limit: None,
+            projection: None,
+            table_partition_cols: vec![],
+            constraints: None,
+        }
+    }
+
+    /// Set the maximum number of records to read from this plan. If `None`,
+    /// all records after filtering are returned.
+    pub fn with_limit(mut self, limit: Option<usize>) -> Self {
+        self.limit = limit;
+        self
+    }
+
+    /// Set the file source
+    pub fn with_source(mut self, file_source: Arc<dyn FileSource>) -> Self {
+        self.file_source = file_source;
+        self
+    }
+
+    /// Set the columns on which to project the data. Indexes that are higher than the
+    /// number of columns of `file_schema` refer to `table_partition_cols`.
+    pub fn with_projection(mut self, projection: Option<Vec<usize>>) -> Self {
+        self.projection = projection;
+        self
+    }
+
+    /// Set the partitioning columns
+    pub fn with_table_partition_cols(mut self, table_partition_cols: Vec<Field>) -> Self {
+        self.table_partition_cols = table_partition_cols;
+        self
+    }
+
+    /// Set the table constraints
+    pub fn with_constraints(mut self, constraints: Constraints) -> Self {
+        self.constraints = Some(constraints);
+        self
+    }
+
+    /// Set the estimated overall statistics of the files, taking `filters` into account.
+    /// Defaults to [`Statistics::new_unknown`].
+    pub fn with_statistics(mut self, statistics: Statistics) -> Self {
+        self.statistics = Some(statistics);
+        self
+    }
+
+    /// Set the list of files to be processed, grouped into partitions.
+    ///
+    /// Each file must have a schema of `file_schema` or a subset. If
+    /// a particular file has a subset, the missing columns are
+    /// padded with NULLs.
+    ///
+    /// DataFusion may attempt to read each partition of files
+    /// concurrently, however files *within* a partition will be read
+    /// sequentially, one after the next.
+    pub fn with_file_groups(mut self, file_groups: Vec<Vec<PartitionedFile>>) -> Self {
+        self.file_groups = file_groups;
+        self
+    }
+
+    /// Add a new file group
+    ///
+    /// See [`Self::with_file_groups`] for more information
+    pub fn with_file_group(mut self, file_group: Vec<PartitionedFile>) -> Self {
+        self.file_groups.push(file_group);
+        self
+    }
+
+    /// Add a file as a single group
+    ///
+    /// See [`Self::with_file_groups`] for more information.
+    pub fn with_file(self, file: PartitionedFile) -> Self {
+        self.with_file_group(vec![file])
+    }
+
+    /// Set the output ordering of the files
+    pub fn with_output_ordering(mut self, output_ordering: Vec<LexOrdering>) -> Self {
+        self.output_ordering = output_ordering;
+        self
+    }
+
+    /// Set the file compression type
+    pub fn with_file_compression_type(
+        mut self,
+        file_compression_type: FileCompressionType,
+    ) -> Self {
+        self.file_compression_type = Some(file_compression_type);
+        self
+    }
+
+    /// Set whether new lines in values are supported for CSVOptions
+    ///
+    /// Parsing newlines in quoted values may be affected by execution behaviour such as
+    /// parallel file scanning. Setting this to `true` ensures that newlines in values are
+    /// parsed successfully, which may reduce performance.
+    pub fn with_newlines_in_values(mut self, new_lines_in_values: bool) -> Self {
+        self.new_lines_in_values = Some(new_lines_in_values);
+        self
+    }
+
+    /// Build the final [`FileScanConfig`] with all the configured settings.
+    ///
+    /// This method takes ownership of the builder and returns the constructed `FileScanConfig`.
+    /// Any unset optional fields will use their default values.
+    pub fn build(self) -> FileScanConfig {
+        let Self {
+            object_store_url,
+            file_schema,
+            file_source,
+            limit,
+            projection,
+            table_partition_cols,
+            constraints,
+            file_groups,
+            statistics,
+            output_ordering,
+            file_compression_type,
+            new_lines_in_values,
+        } = self;
+
+        let constraints = constraints.unwrap_or_default();
+        let statistics =
+            statistics.unwrap_or_else(|| Statistics::new_unknown(&file_schema));
+
+        let file_source = file_source.with_statistics(statistics.clone());
+        let file_compression_type =
+            file_compression_type.unwrap_or(FileCompressionType::UNCOMPRESSED);
+        let new_lines_in_values = new_lines_in_values.unwrap_or(false);
+
+        FileScanConfig {
+            object_store_url,
+            file_schema,
+            file_source,
+            limit,
+            projection,
+            table_partition_cols,
+            constraints,
+            file_groups,
+            statistics,
+            output_ordering,
+            file_compression_type,
+            new_lines_in_values,
+        }
+    }
+}
+
+impl From<FileScanConfig> for FileScanConfigBuilder {
+    fn from(config: FileScanConfig) -> Self {
+        Self {
+            object_store_url: config.object_store_url,
+            file_schema: config.file_schema,
+            file_source: config.file_source,
+            file_groups: config.file_groups,
+            statistics: Some(config.statistics),
+            output_ordering: config.output_ordering,
+            file_compression_type: Some(config.file_compression_type),
+            new_lines_in_values: Some(config.new_lines_in_values),
+            limit: config.limit,
+            projection: config.projection,
+            table_partition_cols: config.table_partition_cols,
+            constraints: Some(config.constraints),
+        }
+    }
+}
+
 impl DataSource for FileScanConfig {
     fn open(
         &self,
@@ -298,11 +508,13 @@
                 .clone()
                 .unwrap_or((0..self.file_schema.fields().len()).collect()),
             );
-            file_scan
-                // Assign projected statistics to source
-                .with_projection(Some(new_projections))
-                .with_source(source)
-                .build() as _
+            Arc::new(DataSourceExec::new(Arc::new(
+                FileScanConfigBuilder::from(file_scan)
+                    // Assign projected statistics to source
+                    .with_projection(Some(new_projections))
+                    .with_source(source)
+                    .build(),
+            ))) as _
         }))
     }
 }
@@ -345,18 +557,21 @@
     }
 
     /// Set the file source
+    #[deprecated(since = "47.0.0", note = "use FileScanConfigBuilder instead")]
     pub fn with_source(mut self, file_source: Arc<dyn FileSource>) -> Self {
         self.file_source = file_source.with_statistics(self.statistics.clone());
         self
     }
 
     /// Set the table constraints of the files
+    #[deprecated(since = "47.0.0", note = "use FileScanConfigBuilder instead")]
     pub fn with_constraints(mut self, constraints: Constraints) -> Self {
         self.constraints = constraints;
         self
     }
 
     /// Set the statistics of the files
+    #[deprecated(since = "47.0.0", note = "use FileScanConfigBuilder instead")]
     pub fn with_statistics(mut self, statistics: Statistics) -> Self {
         self.statistics = statistics.clone();
         self.file_source = self.file_source.with_statistics(statistics);
         self
     }
 
@@ -478,6 +693,7 @@
     }
 
     /// Set the file compression type
+    #[deprecated(since = "47.0.0", note = "use FileScanConfigBuilder instead")]
     pub fn with_file_compression_type(
         mut self,
        file_compression_type: FileCompressionType,
@@ -487,6 +703,7 @@
     }
 
     /// Set the new_lines_in_values property
+    #[deprecated(since = "47.0.0", note = "use FileScanConfigBuilder instead")]
     pub fn with_newlines_in_values(mut self, new_lines_in_values: bool) -> Self {
         self.new_lines_in_values = new_lines_in_values;
         self
@@ -630,6 +847,7 @@ impl FileScanConfig {
     // TODO: This function should be moved into DataSourceExec once FileScanConfig moved out of datafusion/core
     /// Returns a new [`DataSourceExec`] to scan the files specified by this config
+    #[deprecated(since = "47.0.0", note = "use DataSourceExec::new instead")]
     pub fn build(self) -> Arc<DataSourceExec> {
         Arc::new(DataSourceExec::new(Arc::new(self)))
     }
@@ -1736,7 +1954,7 @@ mod tests {
         statistics: Statistics,
         table_partition_cols: Vec<Field>,
     ) -> FileScanConfig {
-        FileScanConfig::new(
+        FileScanConfigBuilder::new(
             ObjectStoreUrl::parse("test:///").unwrap(),
             file_schema,
             Arc::new(MockSource::default()),
@@ -1744,6 +1962,7 @@
         .with_projection(projection)
         .with_statistics(statistics)
         .with_table_partition_cols(table_partition_cols)
+        .build()
     }
 
     /// Convert partition columns from Vec<(String, DataType)> to Vec<Field>
@@ -1776,4 +1995,147 @@
         )
         .unwrap()
     }
+
+    #[test]
+    fn test_file_scan_config_builder() {
+        let file_schema = aggr_test_schema();
+        let object_store_url = ObjectStoreUrl::parse("test:///").unwrap();
+        let file_source: Arc<dyn FileSource> = Arc::new(MockSource::default());
+
+        // Create a builder with required parameters
+        let builder = FileScanConfigBuilder::new(
+            object_store_url.clone(),
+            Arc::clone(&file_schema),
+            Arc::clone(&file_source),
+        );
+
+        // Build with various configurations
+        let config = builder
+            .with_limit(Some(1000))
+            .with_projection(Some(vec![0, 1]))
+            .with_table_partition_cols(vec![Field::new(
+                "date",
+                wrap_partition_type_in_dict(DataType::Utf8),
+                false,
+            )])
+            .with_constraints(Constraints::empty())
+            .with_statistics(Statistics::new_unknown(&file_schema))
+            .with_file_group(vec![PartitionedFile::new("test.parquet", 1234)])
+            .with_output_ordering(vec![LexOrdering::default()])
+            .with_file_compression_type(FileCompressionType::UNCOMPRESSED)
+            .with_newlines_in_values(true)
+            .build();
+
+        // Verify the built config has all the expected values
+        assert_eq!(config.object_store_url, object_store_url);
+        assert_eq!(config.file_schema, file_schema);
+        assert_eq!(config.limit, Some(1000));
+        assert_eq!(config.projection, Some(vec![0, 1]));
+        assert_eq!(config.table_partition_cols.len(), 1);
+        assert_eq!(config.table_partition_cols[0].name(), "date");
+        assert_eq!(config.file_groups.len(), 1);
+        assert_eq!(config.file_groups[0].len(), 1);
+        assert_eq!(
+            config.file_groups[0][0].object_meta.location.as_ref(),
+            "test.parquet"
+        );
+        assert_eq!(
+            config.file_compression_type,
+            FileCompressionType::UNCOMPRESSED
+        );
+        assert!(config.new_lines_in_values);
+        assert_eq!(config.output_ordering.len(), 1);
+    }
+
+    #[test]
+    fn test_file_scan_config_builder_defaults() {
+        let file_schema = aggr_test_schema();
+        let object_store_url = ObjectStoreUrl::parse("test:///").unwrap();
+        let file_source: Arc<dyn FileSource> = Arc::new(MockSource::default());
+
+        // Create a builder with only required parameters and build without any additional configurations
+        let config = FileScanConfigBuilder::new(
+            object_store_url.clone(),
+            Arc::clone(&file_schema),
+            Arc::clone(&file_source),
+        )
+        .build();
+
+        // Verify default values
+        assert_eq!(config.object_store_url, object_store_url);
+        assert_eq!(config.file_schema, file_schema);
+        assert_eq!(config.limit, None);
+        assert_eq!(config.projection, None);
+        assert!(config.table_partition_cols.is_empty());
+        assert!(config.file_groups.is_empty());
+        assert_eq!(
+            config.file_compression_type,
+            FileCompressionType::UNCOMPRESSED
+        );
+        assert!(!config.new_lines_in_values);
+        assert!(config.output_ordering.is_empty());
+        assert!(config.constraints.is_empty());
+
+        // Verify statistics are set to unknown
+        assert_eq!(config.statistics.num_rows, Precision::Absent);
+        assert_eq!(config.statistics.total_byte_size, Precision::Absent);
+        assert_eq!(
+            config.statistics.column_statistics.len(),
+            file_schema.fields().len()
+        );
+        for stat in config.statistics.column_statistics {
+            assert_eq!(stat.distinct_count, Precision::Absent);
+            assert_eq!(stat.min_value, Precision::Absent);
+            assert_eq!(stat.max_value, Precision::Absent);
+            assert_eq!(stat.null_count, Precision::Absent);
+        }
+    }
+
+    #[test]
+    fn test_file_scan_config_builder_new_from() {
+        let schema = aggr_test_schema();
+        let object_store_url = ObjectStoreUrl::parse("test:///").unwrap();
+        let file_source: Arc<dyn FileSource> = Arc::new(MockSource::default());
+        let partition_cols = vec![Field::new(
+            "date",
+            wrap_partition_type_in_dict(DataType::Utf8),
+            false,
+        )];
+        let file = PartitionedFile::new("test_file.parquet", 100);
+
+        // Create a config with non-default values
+        let original_config = FileScanConfigBuilder::new(
+            object_store_url.clone(),
+            Arc::clone(&schema),
+            Arc::clone(&file_source),
+        )
+        .with_projection(Some(vec![0, 2]))
+        .with_limit(Some(10))
+        .with_table_partition_cols(partition_cols.clone())
+        .with_file(file.clone())
+        .with_constraints(Constraints::default())
+        .with_newlines_in_values(true)
+        .build();
+
+        // Create a new builder from the config
+        let new_builder = FileScanConfigBuilder::from(original_config);
+
+        // Build a new config from this builder
+        let new_config = new_builder.build();
+
+        // Verify properties match
+        assert_eq!(new_config.object_store_url, object_store_url);
+        assert_eq!(new_config.file_schema, schema);
+        assert_eq!(new_config.projection, Some(vec![0, 2]));
+        assert_eq!(new_config.limit, Some(10));
+        assert_eq!(new_config.table_partition_cols, partition_cols);
+        assert_eq!(new_config.file_groups.len(), 1);
+        assert_eq!(new_config.file_groups[0].len(), 1);
+        assert_eq!(
+            new_config.file_groups[0][0].object_meta.location.as_ref(),
+            "test_file.parquet"
+        );
+        assert_eq!(new_config.constraints, Constraints::default());
+        assert!(new_config.new_lines_in_values);
+    }
 }
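The `From<FileScanConfig>` impl exercised by `test_file_scan_config_builder_new_from` above is what makes incremental rewrites like the `DataSource::open` change possible: an already-built config can be lifted back into a builder, adjusted, and rebuilt. A short hedged sketch — `existing` stands for whatever `FileScanConfig` the caller already holds, and the overridden values are arbitrary:

    // Round-trip an existing config through the builder to tweak it;
    // fields that are not overridden carry over unchanged.
    let adjusted: FileScanConfig = FileScanConfigBuilder::from(existing)
        .with_limit(Some(100))            // override the fetch limit
        .with_projection(Some(vec![0]))   // narrow the projection
        .build();
    let exec = Arc::new(DataSourceExec::new(Arc::new(adjusted)));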
diff --git a/datafusion/datasource/src/file_stream.rs b/datafusion/datasource/src/file_stream.rs
index 7d17d230fc01..198b6965cf10 100644
--- a/datafusion/datasource/src/file_stream.rs
+++ b/datafusion/datasource/src/file_stream.rs
@@ -522,7 +522,7 @@ impl FileStreamMetrics {
 
 #[cfg(test)]
 mod tests {
-    use crate::file_scan_config::FileScanConfig;
+    use crate::file_scan_config::FileScanConfigBuilder;
     use crate::tests::make_partition;
     use crate::PartitionedFile;
     use arrow::error::ArrowError;
@@ -656,13 +656,14 @@
 
             let on_error = self.on_error;
 
-            let config = FileScanConfig::new(
+            let config = FileScanConfigBuilder::new(
                 ObjectStoreUrl::parse("test:///").unwrap(),
                 file_schema,
                 Arc::new(MockSource::default()),
             )
             .with_file_group(file_group)
-            .with_limit(self.limit);
+            .with_limit(self.limit)
+            .build();
             let metrics_set = ExecutionPlanMetricsSet::new();
             let file_stream =
                 FileStream::new(&config, 0, Arc::new(self.opener), &metrics_set)
diff --git a/datafusion/proto/src/physical_plan/from_proto.rs
b/datafusion/proto/src/physical_plan/from_proto.rs index 0bf9fdb63d59..4b4ef1a5715b 100644 --- a/datafusion/proto/src/physical_plan/from_proto.rs +++ b/datafusion/proto/src/physical_plan/from_proto.rs @@ -32,7 +32,9 @@ use datafusion::datasource::file_format::json::JsonSink; use datafusion::datasource::file_format::parquet::ParquetSink; use datafusion::datasource::listing::{FileRange, ListingTableUrl, PartitionedFile}; use datafusion::datasource::object_store::ObjectStoreUrl; -use datafusion::datasource::physical_plan::{FileScanConfig, FileSinkConfig, FileSource}; +use datafusion::datasource::physical_plan::{ + FileScanConfig, FileScanConfigBuilder, FileSinkConfig, FileSource, +}; use datafusion::execution::FunctionRegistry; use datafusion::logical_expr::WindowFunctionDefinition; use datafusion::physical_expr::{LexOrdering, PhysicalSortExpr, ScalarFunctionExpr}; @@ -537,14 +539,15 @@ pub fn parse_protobuf_file_scan_config( output_ordering.push(sort_expr); } - let config = FileScanConfig::new(object_store_url, file_schema, file_source) + let config = FileScanConfigBuilder::new(object_store_url, file_schema, file_source) .with_file_groups(file_groups) .with_constraints(constraints) .with_statistics(statistics) .with_projection(projection) .with_limit(proto.limit.as_ref().map(|sl| sl.limit as usize)) .with_table_partition_cols(table_partition_cols) - .with_output_ordering(output_ordering); + .with_output_ordering(output_ordering) + .build(); Ok(config) } diff --git a/datafusion/proto/tests/cases/roundtrip_physical_plan.rs b/datafusion/proto/tests/cases/roundtrip_physical_plan.rs index aeae39c4d039..023980151a6d 100644 --- a/datafusion/proto/tests/cases/roundtrip_physical_plan.rs +++ b/datafusion/proto/tests/cases/roundtrip_physical_plan.rs @@ -47,8 +47,9 @@ use datafusion::datasource::listing::{ListingTableUrl, PartitionedFile}; use datafusion::datasource::object_store::ObjectStoreUrl; use datafusion::datasource::physical_plan::{ wrap_partition_type_in_dict, wrap_partition_value_in_dict, FileScanConfig, - FileSinkConfig, FileSource, ParquetSource, + FileScanConfigBuilder, FileSinkConfig, FileSource, ParquetSource, }; +use datafusion::datasource::source::DataSourceExec; use datafusion::execution::FunctionRegistry; use datafusion::functions_aggregate::sum::sum_udaf; use datafusion::functions_window::nth_value::nth_value_udwf; @@ -741,21 +742,24 @@ fn roundtrip_parquet_exec_with_pruning_predicate() -> Result<()> { ParquetSource::new(options).with_predicate(Arc::clone(&file_schema), predicate), ); - let scan_config = - FileScanConfig::new(ObjectStoreUrl::local_filesystem(), file_schema, file_source) - .with_file_groups(vec![vec![PartitionedFile::new( - "/path/to/file.parquet".to_string(), - 1024, - )]]) - .with_statistics(Statistics { - num_rows: Precision::Inexact(100), - total_byte_size: Precision::Inexact(1024), - column_statistics: Statistics::unknown_column(&Arc::new(Schema::new( - vec![Field::new("col", DataType::Utf8, false)], - ))), - }); - - roundtrip_test(scan_config.build()) + let scan_config = FileScanConfigBuilder::new( + ObjectStoreUrl::local_filesystem(), + file_schema, + file_source, + ) + .with_file_groups(vec![vec![PartitionedFile::new( + "/path/to/file.parquet".to_string(), + 1024, + )]]) + .with_statistics(Statistics { + num_rows: Precision::Inexact(100), + total_byte_size: Precision::Inexact(1024), + column_statistics: Statistics::unknown_column(&Arc::new(Schema::new(vec![ + Field::new("col", DataType::Utf8, false), + ]))), + }).build(); + + 
roundtrip_test(Arc::new(DataSourceExec::new(Arc::new(scan_config)))) } #[tokio::test] @@ -795,19 +799,23 @@ fn roundtrip_parquet_exec_with_custom_predicate_expr() -> Result<()> { .with_predicate(Arc::clone(&file_schema), custom_predicate_expr), ); - let scan_config = - FileScanConfig::new(ObjectStoreUrl::local_filesystem(), file_schema, file_source) - .with_file_groups(vec![vec![PartitionedFile::new( - "/path/to/file.parquet".to_string(), - 1024, - )]]) - .with_statistics(Statistics { - num_rows: Precision::Inexact(100), - total_byte_size: Precision::Inexact(1024), - column_statistics: Statistics::unknown_column(&Arc::new(Schema::new( - vec![Field::new("col", DataType::Utf8, false)], - ))), - }); + let scan_config = FileScanConfigBuilder::new( + ObjectStoreUrl::local_filesystem(), + file_schema, + file_source, + ) + .with_file_groups(vec![vec![PartitionedFile::new( + "/path/to/file.parquet".to_string(), + 1024, + )]]) + .with_statistics(Statistics { + num_rows: Precision::Inexact(100), + total_byte_size: Precision::Inexact(1024), + column_statistics: Statistics::unknown_column(&Arc::new(Schema::new(vec![ + Field::new("col", DataType::Utf8, false), + ]))), + }) + .build(); #[derive(Debug, Clone, Eq)] struct CustomPredicateExpr { @@ -1604,7 +1612,7 @@ async fn roundtrip_projection_source() -> Result<()> { let statistics = Statistics::new_unknown(&schema); let file_source = ParquetSource::default().with_statistics(statistics.clone()); - let scan_config = FileScanConfig::new( + let scan_config = FileScanConfigBuilder::new( ObjectStoreUrl::local_filesystem(), schema.clone(), file_source, @@ -1614,7 +1622,8 @@ async fn roundtrip_projection_source() -> Result<()> { 1024, )]]) .with_statistics(statistics) - .with_projection(Some(vec![0, 1, 2])); + .with_projection(Some(vec![0, 1, 2])) + .build(); let filter = Arc::new( FilterExec::try_new( From 93002ea65fdb5eceef71f797c677c3fe185cf8b2 Mon Sep 17 00:00:00 2001 From: blaginin Date: Fri, 21 Mar 2025 17:05:23 +0000 Subject: [PATCH 02/14] Fmt --- datafusion/core/src/datasource/file_format/mod.rs | 4 +--- datafusion/core/src/datasource/physical_plan/json.rs | 1 - 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/datafusion/core/src/datasource/file_format/mod.rs b/datafusion/core/src/datasource/file_format/mod.rs index f82d7f23411e..b1e52e28b29b 100644 --- a/datafusion/core/src/datasource/file_format/mod.rs +++ b/datafusion/core/src/datasource/file_format/mod.rs @@ -41,9 +41,7 @@ pub(crate) mod test_util { use datafusion_catalog::Session; use datafusion_common::Result; use datafusion_datasource::file_scan_config::FileScanConfigBuilder; - use datafusion_datasource::{ - file_format::FileFormat, PartitionedFile, - }; + use datafusion_datasource::{file_format::FileFormat, PartitionedFile}; use datafusion_execution::object_store::ObjectStoreUrl; use crate::test::object_store::local_unpartitioned_file; diff --git a/datafusion/core/src/datasource/physical_plan/json.rs b/datafusion/core/src/datasource/physical_plan/json.rs index f8a0dd711b19..4a7190f8b3e7 100644 --- a/datafusion/core/src/datasource/physical_plan/json.rs +++ b/datafusion/core/src/datasource/physical_plan/json.rs @@ -330,7 +330,6 @@ mod tests { async fn nd_json_exec_file_mixed_order_projection( file_compression_type: FileCompressionType, ) -> Result<()> { - use futures::StreamExt; let session_ctx = SessionContext::new(); From dc53de6d3b19cdab5030736e74b9e7d72499c110 Mon Sep 17 00:00:00 2001 From: blaginin Date: Fri, 21 Mar 2025 17:05:49 +0000 Subject: [PATCH 03/14] More fmt --- 
datafusion-examples/examples/advanced_parquet_index.rs | 3 +-- datafusion-examples/examples/parquet_index.rs | 4 +--- datafusion/proto/tests/cases/roundtrip_physical_plan.rs | 3 ++- 3 files changed, 4 insertions(+), 6 deletions(-) diff --git a/datafusion-examples/examples/advanced_parquet_index.rs b/datafusion-examples/examples/advanced_parquet_index.rs index 90e9e989ce0e..d3f2f04428f1 100644 --- a/datafusion-examples/examples/advanced_parquet_index.rs +++ b/datafusion-examples/examples/advanced_parquet_index.rs @@ -30,8 +30,7 @@ use datafusion::common::{ use datafusion::datasource::listing::PartitionedFile; use datafusion::datasource::physical_plan::parquet::ParquetAccessPlan; use datafusion::datasource::physical_plan::{ - FileMeta, FileScanConfigBuilder, ParquetFileReaderFactory, - ParquetSource, + FileMeta, FileScanConfigBuilder, ParquetFileReaderFactory, ParquetSource, }; use datafusion::datasource::TableProvider; use datafusion::execution::object_store::ObjectStoreUrl; diff --git a/datafusion-examples/examples/parquet_index.rs b/datafusion-examples/examples/parquet_index.rs index f35f4edb0e26..d85d7b25c12c 100644 --- a/datafusion-examples/examples/parquet_index.rs +++ b/datafusion-examples/examples/parquet_index.rs @@ -27,9 +27,7 @@ use datafusion::common::{ internal_datafusion_err, DFSchema, DataFusionError, Result, ScalarValue, }; use datafusion::datasource::listing::PartitionedFile; -use datafusion::datasource::physical_plan::{ - FileScanConfigBuilder, ParquetSource, -}; +use datafusion::datasource::physical_plan::{FileScanConfigBuilder, ParquetSource}; use datafusion::datasource::TableProvider; use datafusion::execution::object_store::ObjectStoreUrl; use datafusion::logical_expr::{ diff --git a/datafusion/proto/tests/cases/roundtrip_physical_plan.rs b/datafusion/proto/tests/cases/roundtrip_physical_plan.rs index 023980151a6d..4ca5af7b56d2 100644 --- a/datafusion/proto/tests/cases/roundtrip_physical_plan.rs +++ b/datafusion/proto/tests/cases/roundtrip_physical_plan.rs @@ -757,7 +757,8 @@ fn roundtrip_parquet_exec_with_pruning_predicate() -> Result<()> { column_statistics: Statistics::unknown_column(&Arc::new(Schema::new(vec![ Field::new("col", DataType::Utf8, false), ]))), - }).build(); + }) + .build(); roundtrip_test(Arc::new(DataSourceExec::new(Arc::new(scan_config)))) } From c42c90655e0ec69b64355f20879569762f13e720 Mon Sep 17 00:00:00 2001 From: blaginin Date: Mon, 24 Mar 2025 20:44:54 +0000 Subject: [PATCH 04/14] Clean `FileScanConfig::build` --- datafusion-examples/examples/parquet_index.rs | 3 ++- datafusion/core/src/datasource/mod.rs | 3 ++- .../core/src/datasource/physical_plan/avro.rs | 12 +++++---- .../core/src/datasource/physical_plan/csv.rs | 13 ++++----- .../core/src/datasource/physical_plan/json.rs | 9 ++++--- .../src/datasource/physical_plan/parquet.rs | 27 +++++++++++++------ datafusion/core/src/test/mod.rs | 2 +- datafusion/core/src/test_util/parquet.rs | 12 +++++---- datafusion/core/tests/fuzz_cases/pruning.rs | 8 +++--- .../core/tests/parquet/custom_reader.rs | 8 +++--- .../tests/parquet/external_access_plan.rs | 3 ++- .../core/tests/parquet/schema_coercion.rs | 8 ++++-- .../enforce_distribution.rs | 25 ++++++++++------- .../physical_optimizer/enforce_sorting.rs | 21 ++++++++++----- .../physical_optimizer/projection_pushdown.rs | 19 ++++++++----- .../tests/physical_optimizer/test_utils.rs | 14 ++++++---- datafusion/datasource-avro/src/file_format.rs | 7 ++--- datafusion/datasource-csv/src/file_format.rs | 4 +-- datafusion/datasource-csv/src/source.rs | 4 +-- 
datafusion/datasource-json/src/file_format.rs | 8 +++--- .../datasource-parquet/src/file_format.rs | 8 +++--- datafusion/datasource-parquet/src/source.rs | 10 +++---- datafusion/datasource/src/file_scan_config.rs | 5 +++- datafusion/proto/src/physical_plan/mod.rs | 8 +++--- .../tests/cases/roundtrip_physical_plan.rs | 6 ++--- .../substrait/src/physical_plan/consumer.rs | 4 +-- .../tests/cases/roundtrip_physical_plan.rs | 3 ++- 27 files changed, 158 insertions(+), 96 deletions(-) diff --git a/datafusion-examples/examples/parquet_index.rs b/datafusion-examples/examples/parquet_index.rs index d85d7b25c12c..dbd7d2c8c265 100644 --- a/datafusion-examples/examples/parquet_index.rs +++ b/datafusion-examples/examples/parquet_index.rs @@ -52,6 +52,7 @@ use std::sync::atomic::{AtomicUsize, Ordering}; use std::sync::Arc; use tempfile::TempDir; use url::Url; +use datafusion::datasource::memory::DataSourceExec; /// This example demonstrates building a secondary index over multiple Parquet /// files and using that index during query to skip ("prune") files that do not @@ -259,7 +260,7 @@ impl TableProvider for IndexTableProvider { file_size, )); } - Ok(file_scan_config.build()) + Ok(Arc::new(DataSourceExec::new(Arc::new(file_scan_config)))) } /// Tell DataFusion to push filters down to the scan method diff --git a/datafusion/core/src/datasource/mod.rs b/datafusion/core/src/datasource/mod.rs index a932ae76c621..2f8acae317e6 100644 --- a/datafusion/core/src/datasource/mod.rs +++ b/datafusion/core/src/datasource/mod.rs @@ -73,6 +73,7 @@ mod tests { use ::object_store::ObjectMeta; use datafusion_physical_plan::collect; use tempfile::TempDir; + use datafusion_datasource::source::DataSourceExec; #[tokio::test] async fn can_override_schema_adapter() { @@ -131,7 +132,7 @@ mod tests { FileScanConfig::new(ObjectStoreUrl::local_filesystem(), schema, source) .with_file(partitioned_file); - let parquet_exec = base_conf.build(); + let parquet_exec = Arc::new(DataSourceExec::new(Arc::new(base_conf))); let session_ctx = SessionContext::new(); let task_ctx = session_ctx.task_ctx(); diff --git a/datafusion/core/src/datasource/physical_plan/avro.rs b/datafusion/core/src/datasource/physical_plan/avro.rs index 85ec0b2d7661..32cb78c0d437 100644 --- a/datafusion/core/src/datasource/physical_plan/avro.rs +++ b/datafusion/core/src/datasource/physical_plan/avro.rs @@ -47,6 +47,7 @@ mod tests { use object_store::ObjectStore; use rstest::*; use url::Url; + use datafusion_datasource::source::DataSourceExec; #[tokio::test] async fn avro_exec_without_partition() -> Result<()> { @@ -90,7 +91,7 @@ mod tests { .with_projection(Some(vec![0, 1, 2])) .build(); - let source_exec = conf.build(); + let source_exec = Arc::new(DataSourceExec::new(Arc::new(conf))); assert_eq!( source_exec .properties() @@ -163,7 +164,7 @@ mod tests { .with_file(meta.into()) .with_projection(projection); - let source_exec = conf.build(); + let source_exec = Arc::new(DataSourceExec::new(Arc::new(conf))); assert_eq!( source_exec .properties() @@ -230,14 +231,15 @@ mod tests { let projection = Some(vec![0, 1, file_schema.fields().len(), 2]); let source = Arc::new(AvroSource::new()); - let conf = FileScanConfig::new(object_store_url, file_schema, source) + let conf = FileScanConfigBuilder::new(object_store_url, file_schema, source) // select specific columns of the files as well as the partitioning // column which is supposed to be the last column in the table schema. 
.with_projection(projection) .with_file(partitioned_file) - .with_table_partition_cols(vec![Field::new("date", DataType::Utf8, false)]); + .with_table_partition_cols(vec![Field::new("date", DataType::Utf8, false)]) + .build(); - let source_exec = conf.build(); + let source_exec = Arc::new(DataSourceExec::new(Arc::new(conf))); assert_eq!( source_exec diff --git a/datafusion/core/src/datasource/physical_plan/csv.rs b/datafusion/core/src/datasource/physical_plan/csv.rs index e80d04fe4b2f..8798c4269043 100644 --- a/datafusion/core/src/datasource/physical_plan/csv.rs +++ b/datafusion/core/src/datasource/physical_plan/csv.rs @@ -54,6 +54,7 @@ mod tests { use rstest::*; use tempfile::TempDir; use url::Url; + use datafusion_datasource::source::DataSourceExec; fn aggr_test_schema() -> SchemaRef { let mut f1 = Field::new("c1", DataType::Utf8, false); @@ -113,7 +114,7 @@ mod tests { .with_projection(Some(vec![0, 2, 4])); assert_eq!(13, config.file_schema.fields().len()); - let csv = config.build(); + let csv = Arc::new(DataSourceExec::new(Arc::new(config))); assert_eq!(3, csv.schema().fields().len()); @@ -175,7 +176,7 @@ mod tests { .with_file_compression_type(file_compression_type.to_owned()) .with_projection(Some(vec![4, 0, 2])); assert_eq!(13, config.file_schema.fields().len()); - let csv = config.build(); + let csv = Arc::new(DataSourceExec::new(Arc::new(config))); assert_eq!(3, csv.schema().fields().len()); let mut stream = csv.execute(0, task_ctx)?; @@ -238,7 +239,7 @@ mod tests { .with_file_compression_type(file_compression_type.to_owned()) .with_limit(Some(5)); assert_eq!(13, config.file_schema.fields().len()); - let csv = config.build(); + let csv = Arc::new(DataSourceExec::new(Arc::new(config))); assert_eq!(13, csv.schema().fields().len()); let mut it = csv.execute(0, task_ctx)?; @@ -296,7 +297,7 @@ mod tests { .with_file_compression_type(file_compression_type.to_owned()) .with_limit(Some(5)); assert_eq!(14, config.file_schema.fields().len()); - let csv = config.build(); + let csv = Arc::new(DataSourceExec::new(Arc::new(config))); assert_eq!(14, csv.schema().fields().len()); // errors due to https://github.com/apache/datafusion/issues/4918 @@ -357,7 +358,7 @@ mod tests { // partitions are resolved during scan anyway assert_eq!(13, config.file_schema.fields().len()); - let csv = config.build(); + let csv = Arc::new(DataSourceExec::new(Arc::new(config))); assert_eq!(2, csv.schema().fields().len()); let mut it = csv.execute(0, task_ctx)?; @@ -446,7 +447,7 @@ mod tests { let config = partitioned_csv_config(file_schema, file_groups, source) .with_newlines_in_values(false) .with_file_compression_type(file_compression_type.to_owned()); - let csv = config.build(); + let csv = Arc::new(DataSourceExec::new(Arc::new(config))); let it = csv.execute(0, task_ctx).unwrap(); let batches: Vec<_> = it.try_collect().await.unwrap(); diff --git a/datafusion/core/src/datasource/physical_plan/json.rs b/datafusion/core/src/datasource/physical_plan/json.rs index 4a7190f8b3e7..a15af4cb9e95 100644 --- a/datafusion/core/src/datasource/physical_plan/json.rs +++ b/datafusion/core/src/datasource/physical_plan/json.rs @@ -56,6 +56,7 @@ mod tests { use rstest::*; use tempfile::TempDir; use url::Url; + use datafusion_datasource::source::DataSourceExec; const TEST_DATA_BASE: &str = "tests/data"; @@ -180,7 +181,7 @@ mod tests { .with_file_groups(file_groups) .with_limit(Some(3)) .with_file_compression_type(file_compression_type.to_owned()); - let exec = conf.build(); + let exec = Arc::new(DataSourceExec::new(Arc::new(conf))); 
// TODO: this is not where schema inference should be tested @@ -254,7 +255,7 @@ mod tests { .with_file_groups(file_groups) .with_limit(Some(3)) .with_file_compression_type(file_compression_type.to_owned()); - let exec = conf.build(); + let exec = Arc::new(DataSourceExec::new(Arc::new(conf))); let mut it = exec.execute(0, task_ctx)?; let batch = it.next().await.unwrap()?; @@ -297,7 +298,7 @@ mod tests { .with_file_groups(file_groups) .with_projection(Some(vec![0, 2])) .with_file_compression_type(file_compression_type.to_owned()); - let exec = conf.build(); + let exec = Arc::new(DataSourceExec::new(Arc::new(conf))); let inferred_schema = exec.schema(); assert_eq!(inferred_schema.fields().len(), 2); @@ -345,7 +346,7 @@ mod tests { .with_projection(Some(vec![3, 0, 2])) .with_file_compression_type(file_compression_type.to_owned()) .build(); - let exec = conf.build(); + let exec = Arc::new(DataSourceExec::new(Arc::new(conf))); let inferred_schema = exec.schema(); assert_eq!(inferred_schema.fields().len(), 3); diff --git a/datafusion/core/src/datasource/physical_plan/parquet.rs b/datafusion/core/src/datasource/physical_plan/parquet.rs index b5534d6b3d1c..030c5262ed54 100644 --- a/datafusion/core/src/datasource/physical_plan/parquet.rs +++ b/datafusion/core/src/datasource/physical_plan/parquet.rs @@ -51,7 +51,7 @@ mod tests { }; use datafusion_datasource::file_format::FileFormat; use datafusion_datasource::file_meta::FileMeta; - use datafusion_datasource::file_scan_config::FileScanConfig; + use datafusion_datasource::file_scan_config::{FileScanConfig, FileScanConfigBuilder}; use datafusion_datasource::source::DataSourceExec; use datafusion_datasource::{FileRange, PartitionedFile}; @@ -178,17 +178,20 @@ mod tests { source = source.with_enable_page_index(true); } - let base_config = FileScanConfig::new( + let base_config = FileScanConfigBuilder::new( ObjectStoreUrl::local_filesystem(), file_schema, Arc::new(source.clone()), ) .with_file_group(file_group) - .with_projection(projection); + .with_projection(projection) + .build(); let session_ctx = SessionContext::new(); let task_ctx = session_ctx.task_ctx(); - let parquet_exec = base_config.clone().build(); + + + let parquet_exec = Arc::new(DataSourceExec::new(Arc::new(base_config.clone()))); RoundTripResult { batches: collect(parquet_exec.clone(), task_ctx).await, parquet_exec, @@ -1127,13 +1130,15 @@ mod tests { expected_row_num: Option, file_schema: SchemaRef, ) -> Result<()> { - let parquet_exec = FileScanConfig::new( + let config = FileScanConfigBuilder::new( ObjectStoreUrl::local_filesystem(), file_schema, Arc::new(ParquetSource::default()), ) .with_file_groups(file_groups) .build(); + + let parquet_exec = Arc::new(DataSourceExec::new(Arc::new(config))); assert_eq!( parquet_exec .properties() @@ -1231,7 +1236,7 @@ mod tests { ]); let source = Arc::new(ParquetSource::default()); - let parquet_exec = FileScanConfig::new(object_store_url, schema.clone(), source) + let config = FileScanConfigBuilder::new(object_store_url, schema.clone(), source) .with_file(partitioned_file) // file has 10 cols so index 12 should be month and 13 should be day .with_projection(Some(vec![0, 1, 2, 12, 13])) @@ -1248,6 +1253,8 @@ mod tests { ), ]) .build(); + + let parquet_exec = Arc::new(DataSourceExec::new(Arc::new(config))); let partition_count = parquet_exec .data_source() .output_partitioning() @@ -1304,7 +1311,7 @@ mod tests { }; let file_schema = Arc::new(Schema::empty()); - let parquet_exec = FileScanConfig::new( + let config = FileScanConfigBuilder::new( 
ObjectStoreUrl::local_filesystem(), file_schema, Arc::new(ParquetSource::default()), @@ -1312,6 +1319,8 @@ mod tests { .with_file(partitioned_file) .build(); + let parquet_exec = Arc::new(DataSourceExec::new(Arc::new(config))); + let mut results = parquet_exec.execute(0, state.task_ctx())?; let batch = results.next().await.unwrap(); // invalid file should produce an error to that effect @@ -1935,7 +1944,7 @@ mod tests { .with_parquet_file_reader_factory(reader_factory) .with_metadata_size_hint(456), ); - let exec = FileScanConfig::new(store_url, schema, source) + let config = FileScanConfigBuilder::new(store_url, schema, source) .with_file( PartitionedFile { object_meta: ObjectMeta { @@ -1969,6 +1978,8 @@ mod tests { }) .build(); + let exec = Arc::new(DataSourceExec::new(Arc::new(config))); + let res = collect(exec, ctx.task_ctx()).await.unwrap(); assert_eq!(res.len(), 2); diff --git a/datafusion/core/src/test/mod.rs b/datafusion/core/src/test/mod.rs index be707f7e19d0..a77721190fad 100644 --- a/datafusion/core/src/test/mod.rs +++ b/datafusion/core/src/test/mod.rs @@ -93,7 +93,7 @@ pub fn scan_partitioned_csv( let source = Arc::new(CsvSource::new(true, b'"', b'"')); let config = partitioned_csv_config(schema, file_groups, source) .with_file_compression_type(FileCompressionType::UNCOMPRESSED); - Ok(config.build()) + Ok(Arc::new(DataSourceExec::new(Arc::new(config)))) } /// Returns file groups [`Vec>`] for scanning `partitions` of `filename` diff --git a/datafusion/core/src/test_util/parquet.rs b/datafusion/core/src/test_util/parquet.rs index c0be13baf21a..b62d6898098a 100644 --- a/datafusion/core/src/test_util/parquet.rs +++ b/datafusion/core/src/test_util/parquet.rs @@ -37,7 +37,7 @@ use crate::physical_plan::metrics::MetricsSet; use crate::physical_plan::ExecutionPlan; use crate::prelude::{Expr, SessionConfig, SessionContext}; -use datafusion_datasource::file_scan_config::FileScanConfig; +use datafusion_datasource::file_scan_config::{FileScanConfig, FileScanConfigBuilder}; use datafusion_datasource::source::DataSourceExec; use object_store::path::Path; use object_store::ObjectMeta; @@ -157,7 +157,7 @@ impl TestParquetFile { ) -> Result> { let parquet_options = ctx.copied_table_options().parquet; let source = Arc::new(ParquetSource::new(parquet_options.clone())); - let scan_config = FileScanConfig::new( + let scan_config_builder = FileScanConfigBuilder::new( self.object_store_url.clone(), Arc::clone(&self.schema), source, @@ -183,15 +183,17 @@ impl TestParquetFile { create_physical_expr(&filter, &df_schema, &ExecutionProps::default())?; let source = Arc::new(ParquetSource::new(parquet_options).with_predicate( - Arc::clone(&scan_config.file_schema), + Arc::clone(&self.schema), Arc::clone(&physical_filter_expr), )); - let parquet_exec = scan_config.with_source(source).build(); + let config = scan_config_builder.with_source(source).build(); + let parquet_exec = Arc::new(DataSourceExec::new(Arc::new(config))); let exec = Arc::new(FilterExec::try_new(physical_filter_expr, parquet_exec)?); Ok(exec) } else { - Ok(scan_config.build()) + let config = scan_config_builder.build(); + Ok(Arc::new(DataSourceExec::new(Arc::new(config)))) } } diff --git a/datafusion/core/tests/fuzz_cases/pruning.rs b/datafusion/core/tests/fuzz_cases/pruning.rs index f87572631b28..1cd51d84748f 100644 --- a/datafusion/core/tests/fuzz_cases/pruning.rs +++ b/datafusion/core/tests/fuzz_cases/pruning.rs @@ -40,6 +40,8 @@ use parquet::{ use rand::seq::SliceRandom; use tokio::sync::Mutex; use url::Url; +use 
datafusion_datasource::file_scan_config::FileScanConfigBuilder; +use datafusion_datasource::source::DataSourceExec; #[tokio::test] async fn test_utf8_eq() { @@ -281,7 +283,7 @@ async fn execute_with_predicate( } else { ParquetSource::default() }; - let scan = FileScanConfig::new( + let config = FileScanConfigBuilder::new( ObjectStoreUrl::parse("memory://").unwrap(), schema.clone(), Arc::new(parquet_source), @@ -293,8 +295,8 @@ async fn execute_with_predicate( PartitionedFile::new(test_file.path.clone(), test_file.size as u64) }) .collect(), - ); - let exec = scan.build(); + ).build(); + let exec = Arc::new(DataSourceExec::new(Arc::new(config))); let exec = Arc::new(FilterExec::try_new(predicate, exec).unwrap()) as Arc; diff --git a/datafusion/core/tests/parquet/custom_reader.rs b/datafusion/core/tests/parquet/custom_reader.rs index 4a4059db2547..068016f05545 100644 --- a/datafusion/core/tests/parquet/custom_reader.rs +++ b/datafusion/core/tests/parquet/custom_reader.rs @@ -46,6 +46,8 @@ use parquet::arrow::async_reader::AsyncFileReader; use parquet::arrow::ArrowWriter; use parquet::errors::ParquetError; use parquet::file::metadata::ParquetMetaData; +use datafusion_datasource::file_scan_config::FileScanConfigBuilder; +use datafusion_datasource::source::DataSourceExec; const EXPECTED_USER_DEFINED_METADATA: &str = "some-user-defined-metadata"; @@ -83,15 +85,15 @@ async fn route_data_access_ops_to_parquet_file_reader_factory() { InMemoryParquetFileReaderFactory(Arc::clone(&in_memory_object_store)), )), ); - let base_config = FileScanConfig::new( + let base_config = FileScanConfigBuilder::new( // just any url that doesn't point to in memory object store ObjectStoreUrl::local_filesystem(), file_schema, source, ) - .with_file_group(file_group); + .with_file_group(file_group).build(); - let parquet_exec = base_config.build(); + let parquet_exec = Arc::new(DataSourceExec::new(Arc::new(base_config))); let session_ctx = SessionContext::new(); let task_ctx = session_ctx.task_ctx(); diff --git a/datafusion/core/tests/parquet/external_access_plan.rs b/datafusion/core/tests/parquet/external_access_plan.rs index 31c685378a21..343fbb6ca819 100644 --- a/datafusion/core/tests/parquet/external_access_plan.rs +++ b/datafusion/core/tests/parquet/external_access_plan.rs @@ -40,6 +40,7 @@ use parquet::arrow::arrow_reader::{RowSelection, RowSelector}; use parquet::arrow::ArrowWriter; use parquet::file::properties::WriterProperties; use tempfile::NamedTempFile; +use datafusion_datasource::source::DataSourceExec; #[tokio::test] async fn none() { @@ -351,7 +352,7 @@ impl TestFull { let config = FileScanConfig::new(object_store_url, schema.clone(), source) .with_file(partitioned_file); - let plan: Arc = config.build(); + let plan: Arc = Arc::new(DataSourceExec::new(Arc::new(config))); // run the DataSourceExec and collect the results let results = diff --git a/datafusion/core/tests/parquet/schema_coercion.rs b/datafusion/core/tests/parquet/schema_coercion.rs index 85bc1104795f..0e82e7a5d442 100644 --- a/datafusion/core/tests/parquet/schema_coercion.rs +++ b/datafusion/core/tests/parquet/schema_coercion.rs @@ -35,6 +35,8 @@ use object_store::ObjectMeta; use parquet::arrow::ArrowWriter; use parquet::file::properties::WriterProperties; use tempfile::NamedTempFile; +use datafusion_datasource::file_scan_config::FileScanConfigBuilder; +use datafusion_datasource::source::DataSourceExec; /// Test for reading data from multiple parquet files with different schemas and coercing them into a single schema. 
#[tokio::test] @@ -65,7 +67,7 @@ async fn multi_parquet_coercion() { FileScanConfig::new(ObjectStoreUrl::local_filesystem(), file_schema, source) .with_file_group(file_group); - let parquet_exec = conf.build(); + let parquet_exec = Arc::new(DataSourceExec::new(Arc::new(conf))); let session_ctx = SessionContext::new(); let task_ctx = session_ctx.task_ctx(); @@ -114,7 +116,7 @@ async fn multi_parquet_coercion_projection() { Field::new("c2", DataType::Int32, true), Field::new("c3", DataType::Float64, true), ])); - let parquet_exec = FileScanConfig::new( + let config = FileScanConfigBuilder::new( ObjectStoreUrl::local_filesystem(), file_schema, Arc::new(ParquetSource::default()), @@ -123,6 +125,8 @@ async fn multi_parquet_coercion_projection() { .with_projection(Some(vec![1, 0, 2])) .build(); + let parquet_exec = Arc::new(DataSourceExec::new(Arc::new(config))); + let session_ctx = SessionContext::new(); let task_ctx = session_ctx.task_ctx(); let read = collect(parquet_exec, task_ctx).await.unwrap(); diff --git a/datafusion/core/tests/physical_optimizer/enforce_distribution.rs b/datafusion/core/tests/physical_optimizer/enforce_distribution.rs index b71724b8f7cd..34fdc6d1724d 100644 --- a/datafusion/core/tests/physical_optimizer/enforce_distribution.rs +++ b/datafusion/core/tests/physical_optimizer/enforce_distribution.rs @@ -35,6 +35,7 @@ use datafusion::datasource::source::DataSourceExec; use datafusion_common::error::Result; use datafusion_common::tree_node::{Transformed, TransformedResult, TreeNode}; use datafusion_common::ScalarValue; +use datafusion_datasource::file_scan_config::FileScanConfigBuilder; use datafusion_expr::{JoinType, Operator}; use datafusion_physical_expr::expressions::{BinaryExpr, Column, Literal}; use datafusion_physical_expr::PhysicalExpr; @@ -183,7 +184,7 @@ fn parquet_exec_multiple() -> Arc { fn parquet_exec_multiple_sorted( output_ordering: Vec, ) -> Arc { - FileScanConfig::new( + let config = FileScanConfigBuilder::new( ObjectStoreUrl::parse("test:///").unwrap(), schema(), Arc::new(ParquetSource::default()), @@ -193,7 +194,9 @@ fn parquet_exec_multiple_sorted( vec![PartitionedFile::new("y".to_string(), 100)], ]) .with_output_ordering(output_ordering) - .build() + .build(); + + Arc::new(DataSourceExec::new(Arc::new(config))) } fn csv_exec() -> Arc { @@ -201,14 +204,16 @@ fn csv_exec() -> Arc { } fn csv_exec_with_sort(output_ordering: Vec) -> Arc { - FileScanConfig::new( + let config = FileScanConfigBuilder::new( ObjectStoreUrl::parse("test:///").unwrap(), schema(), Arc::new(CsvSource::new(false, b',', b'"')), ) .with_file(PartitionedFile::new("x".to_string(), 100)) .with_output_ordering(output_ordering) - .build() + .build(); + + Arc::new(DataSourceExec::new(Arc::new(config))) } fn csv_exec_multiple() -> Arc { @@ -217,7 +222,7 @@ fn csv_exec_multiple() -> Arc { // Created a sorted parquet exec with multiple files fn csv_exec_multiple_sorted(output_ordering: Vec) -> Arc { - FileScanConfig::new( + let config = FileScanConfig::new( ObjectStoreUrl::parse("test:///").unwrap(), schema(), Arc::new(CsvSource::new(false, b',', b'"')), @@ -226,8 +231,9 @@ fn csv_exec_multiple_sorted(output_ordering: Vec) -> Arc Result<()> { }; let plan = aggregate_exec_with_alias( - FileScanConfig::new( + Arc::new(DataSourceExec::new(Arc::new( + FileScanConfigBuilder::new( ObjectStoreUrl::parse("test:///").unwrap(), schema(), Arc::new(CsvSource::new(false, b',', b'"')), ) .with_file(PartitionedFile::new("x".to_string(), 100)) .with_file_compression_type(compression_type) - .build(), + 
.build()))), vec![("a".to_string(), "a".to_string())], ); let test_config = TestConfig::default() diff --git a/datafusion/core/tests/physical_optimizer/enforce_sorting.rs b/datafusion/core/tests/physical_optimizer/enforce_sorting.rs index bb77192e05b8..ecb12ef55ba8 100644 --- a/datafusion/core/tests/physical_optimizer/enforce_sorting.rs +++ b/datafusion/core/tests/physical_optimizer/enforce_sorting.rs @@ -57,6 +57,8 @@ use datafusion_functions_aggregate::count::count_udaf; use datafusion_functions_aggregate::min_max::{max_udaf, min_udaf}; use rstest::rstest; +use datafusion_datasource::file_scan_config::FileScanConfigBuilder; +use datafusion_datasource::source::DataSourceExec; /// Create a csv exec for tests fn csv_exec_ordered( @@ -65,14 +67,17 @@ fn csv_exec_ordered( ) -> Arc { let sort_exprs = sort_exprs.into_iter().collect(); - FileScanConfig::new( + let config = FileScanConfigBuilder::new( ObjectStoreUrl::parse("test:///").unwrap(), schema.clone(), Arc::new(CsvSource::new(true, 0, b'"')), ) .with_file(PartitionedFile::new("file_path".to_string(), 100)) .with_output_ordering(vec![sort_exprs]) - .build() + .build(); + + Arc::new( + DataSourceExec::new(Arc::new(config))) } /// Created a sorted parquet exec @@ -83,14 +88,16 @@ pub fn parquet_exec_sorted( let sort_exprs = sort_exprs.into_iter().collect(); let source = Arc::new(ParquetSource::default()); - FileScanConfig::new( + let config = FileScanConfigBuilder::new( ObjectStoreUrl::parse("test:///").unwrap(), schema.clone(), source, ) .with_file(PartitionedFile::new("x".to_string(), 100)) .with_output_ordering(vec![sort_exprs]) - .build() + .build(); + + Arc::new(DataSourceExec::new(Arc::new(config))) } /// Create a sorted Csv exec @@ -100,14 +107,16 @@ fn csv_exec_sorted( ) -> Arc { let sort_exprs = sort_exprs.into_iter().collect(); - FileScanConfig::new( + let config = FileScanConfigBuilder::new( ObjectStoreUrl::parse("test:///").unwrap(), schema.clone(), Arc::new(CsvSource::new(false, 0, 0)), ) .with_file(PartitionedFile::new("x".to_string(), 100)) .with_output_ordering(vec![sort_exprs]) - .build() + .build(); + + Arc::new(DataSourceExec::new(Arc::new(config))) } /// Runs the sort enforcement optimizer and asserts the plan diff --git a/datafusion/core/tests/physical_optimizer/projection_pushdown.rs b/datafusion/core/tests/physical_optimizer/projection_pushdown.rs index abe058df99d0..af0959c3f615 100644 --- a/datafusion/core/tests/physical_optimizer/projection_pushdown.rs +++ b/datafusion/core/tests/physical_optimizer/projection_pushdown.rs @@ -61,6 +61,7 @@ use datafusion_physical_plan::{get_plan_string, ExecutionPlan}; use datafusion_expr_common::columnar_value::ColumnarValue; use itertools::Itertools; +use datafusion_datasource::file_scan_config::FileScanConfigBuilder; /// Mocked UDF #[derive(Debug)] @@ -372,14 +373,16 @@ fn create_simple_csv_exec() -> Arc { Field::new("d", DataType::Int32, true), Field::new("e", DataType::Int32, true), ])); - FileScanConfig::new( + let config = FileScanConfigBuilder::new( ObjectStoreUrl::parse("test:///").unwrap(), schema, Arc::new(CsvSource::new(false, 0, 0)), ) .with_file(PartitionedFile::new("x".to_string(), 100)) .with_projection(Some(vec![0, 1, 2, 3, 4])) - .build() + .build(); + + Arc::new(DataSourceExec::new(Arc::new(config))) } fn create_projecting_csv_exec() -> Arc { @@ -389,14 +392,16 @@ fn create_projecting_csv_exec() -> Arc { Field::new("c", DataType::Int32, true), Field::new("d", DataType::Int32, true), ])); - FileScanConfig::new( + let config = FileScanConfigBuilder::new( 
ObjectStoreUrl::parse("test:///").unwrap(), schema, Arc::new(CsvSource::new(false, 0, 0)), ) .with_file(PartitionedFile::new("x".to_string(), 100)) .with_projection(Some(vec![3, 2, 1])) - .build() + .build(); + + Arc::new(DataSourceExec::new(Arc::new(config))) } fn create_projecting_memory_exec() -> Arc { @@ -1398,7 +1403,7 @@ fn partitioned_data_source() -> Arc { Field::new("string_col", DataType::Utf8, true), ])); - FileScanConfig::new( + let config = FileScanConfigBuilder::new( ObjectStoreUrl::parse("test:///").unwrap(), file_schema.clone(), Arc::new(CsvSource::default()), @@ -1406,7 +1411,9 @@ fn partitioned_data_source() -> Arc { .with_file(PartitionedFile::new("x".to_string(), 100)) .with_table_partition_cols(vec![Field::new("partition_col", DataType::Utf8, true)]) .with_projection(Some(vec![0, 1, 2])) - .build() + .build(); + + Arc::new(DataSourceExec::new(Arc::new(config))) } #[test] diff --git a/datafusion/core/tests/physical_optimizer/test_utils.rs b/datafusion/core/tests/physical_optimizer/test_utils.rs index 99a75e6e5067..4b3bfc611a5c 100644 --- a/datafusion/core/tests/physical_optimizer/test_utils.rs +++ b/datafusion/core/tests/physical_optimizer/test_utils.rs @@ -33,7 +33,7 @@ use datafusion_common::config::ConfigOptions; use datafusion_common::tree_node::{Transformed, TransformedResult, TreeNode}; use datafusion_common::utils::expr::COUNT_STAR_EXPANSION; use datafusion_common::{JoinType, Result}; -use datafusion_datasource::file_scan_config::FileScanConfig; +use datafusion_datasource::file_scan_config::{FileScanConfig, FileScanConfigBuilder}; use datafusion_execution::object_store::ObjectStoreUrl; use datafusion_execution::{SendableRecordBatchStream, TaskContext}; use datafusion_expr::{WindowFrame, WindowFunctionDefinition}; @@ -69,27 +69,31 @@ use datafusion_physical_plan::{ /// Create a non sorted parquet exec pub fn parquet_exec(schema: &SchemaRef) -> Arc { - FileScanConfig::new( + let config = FileScanConfigBuilder::new( ObjectStoreUrl::parse("test:///").unwrap(), schema.clone(), Arc::new(ParquetSource::default()), ) .with_file(PartitionedFile::new("x".to_string(), 100)) - .build() + .build(); + + Arc::new(DataSourceExec::new(Arc::new(config))) } /// Create a single parquet file that is sorted pub(crate) fn parquet_exec_with_sort( output_ordering: Vec, ) -> Arc { - FileScanConfig::new( + let config = FileScanConfigBuilder::new( ObjectStoreUrl::parse("test:///").unwrap(), schema(), Arc::new(ParquetSource::default()), ) .with_file(PartitionedFile::new("x".to_string(), 100)) .with_output_ordering(output_ordering) - .build() + .build(); + + Arc::new(DataSourceExec::new(Arc::new(config))) } pub fn schema() -> SchemaRef { diff --git a/datafusion/datasource-avro/src/file_format.rs b/datafusion/datasource-avro/src/file_format.rs index 00a96121aa3b..aad90e47dc7b 100644 --- a/datafusion/datasource-avro/src/file_format.rs +++ b/datafusion/datasource-avro/src/file_format.rs @@ -35,11 +35,11 @@ use datafusion_common::parsers::CompressionTypeVariant; use datafusion_common::GetExt; use datafusion_common::DEFAULT_AVRO_EXTENSION; use datafusion_datasource::file::FileSource; -use datafusion_datasource::file_scan_config::FileScanConfig; +use datafusion_datasource::file_scan_config::{FileScanConfig, FileScanConfigBuilder}; use datafusion_physical_expr::PhysicalExpr; use datafusion_physical_plan::ExecutionPlan; use object_store::{GetResultPayload, ObjectMeta, ObjectStore}; - +use datafusion_datasource::source::DataSourceExec; use crate::avro_to_arrow::read_avro_schema_from_reader; use 
crate::source::AvroSource; @@ -151,7 +151,8 @@ impl FileFormat for AvroFormat { conf: FileScanConfig, _filters: Option<&Arc>, ) -> Result> { - Ok(conf.with_source(self.file_source()).build()) + let config = FileScanConfigBuilder::from(conf).with_source(self.file_source()).build(); + Ok(Arc::new(DataSourceExec::new(Arc::new(config)))) } fn file_source(&self) -> Arc { diff --git a/datafusion/datasource-csv/src/file_format.rs b/datafusion/datasource-csv/src/file_format.rs index 4d3c3f565304..c59019182825 100644 --- a/datafusion/datasource-csv/src/file_format.rs +++ b/datafusion/datasource-csv/src/file_format.rs @@ -59,7 +59,7 @@ use futures::stream::BoxStream; use futures::{pin_mut, Stream, StreamExt, TryStreamExt}; use object_store::{delimited::newline_delimited_stream, ObjectMeta, ObjectStore}; use regex::Regex; - +use datafusion_datasource::source::DataSourceExec; use crate::source::CsvSource; #[derive(Default)] @@ -432,7 +432,7 @@ impl FileFormat for CsvFormat { let config = conf_builder.with_source(source).build(); - Ok(config.build()) + Ok(Arc::new(DataSourceExec::new(Arc::new(config)))) } async fn create_writer_physical_plan( diff --git a/datafusion/datasource-csv/src/source.rs b/datafusion/datasource-csv/src/source.rs index b9d974c88484..42da8bd92d20 100644 --- a/datafusion/datasource-csv/src/source.rs +++ b/datafusion/datasource-csv/src/source.rs @@ -424,10 +424,10 @@ impl ExecutionPlan for CsvExec { /// .with_terminator(Some(b'#') /// )); /// // Create a DataSourceExec for reading the first 100MB of `file1.csv` -/// let file_scan_config = FileScanConfig::new(object_store_url, file_schema, source) +/// let config = FileScanConfig::new(object_store_url, file_schema, source) /// .with_file(PartitionedFile::new("file1.csv", 100*1024*1024)) /// .with_newlines_in_values(true); // The file contains newlines in values; -/// let exec = file_scan_config.build(); +/// let exec = Arc::new(DataSourceExec::new(Arc::new(config))); /// ``` #[derive(Debug, Clone, Default)] pub struct CsvSource { diff --git a/datafusion/datasource-json/src/file_format.rs b/datafusion/datasource-json/src/file_format.rs index 9b6d5925fe81..f1ad5a5377ef 100644 --- a/datafusion/datasource-json/src/file_format.rs +++ b/datafusion/datasource-json/src/file_format.rs @@ -43,7 +43,7 @@ use datafusion_datasource::file_compression_type::FileCompressionType; use datafusion_datasource::file_format::{ FileFormat, FileFormatFactory, DEFAULT_SCHEMA_INFER_MAX_RECORD, }; -use datafusion_datasource::file_scan_config::FileScanConfig; +use datafusion_datasource::file_scan_config::{FileScanConfig, FileScanConfigBuilder}; use datafusion_datasource::file_sink_config::{FileSink, FileSinkConfig}; use datafusion_datasource::write::demux::DemuxedStreamReceiver; use datafusion_datasource::write::orchestration::spawn_writer_tasks_and_join; @@ -58,7 +58,7 @@ use async_trait::async_trait; use bytes::{Buf, Bytes}; use datafusion_physical_expr_common::sort_expr::LexRequirement; use object_store::{GetResultPayload, ObjectMeta, ObjectStore}; - +use datafusion_datasource::source::DataSourceExec; use crate::source::JsonSource; #[derive(Default)] @@ -250,8 +250,8 @@ impl FileFormat for JsonFormat { _filters: Option<&Arc>, ) -> Result> { let source = Arc::new(JsonSource::new()); - conf.file_compression_type = FileCompressionType::from(self.options.compression); - Ok(conf.with_source(source).build()) + let conf = FileScanConfigBuilder::from(conf).with_file_compression_type(FileCompressionType::from(self.options.compression)).with_source(source).build(); + 
Ok(Arc::new(DataSourceExec::new(Arc::new(conf)))) } async fn create_writer_physical_plan( diff --git a/datafusion/datasource-parquet/src/file_format.rs b/datafusion/datasource-parquet/src/file_format.rs index 8a78407c64d1..e9a812e5ce02 100644 --- a/datafusion/datasource-parquet/src/file_format.rs +++ b/datafusion/datasource-parquet/src/file_format.rs @@ -48,7 +48,7 @@ use datafusion_common::{HashMap, Statistics}; use datafusion_common_runtime::{JoinSet, SpawnedTask}; use datafusion_datasource::display::FileGroupDisplay; use datafusion_datasource::file::FileSource; -use datafusion_datasource::file_scan_config::FileScanConfig; +use datafusion_datasource::file_scan_config::{FileScanConfig, FileScanConfigBuilder}; use datafusion_execution::memory_pool::{MemoryConsumer, MemoryPool, MemoryReservation}; use datafusion_execution::{SendableRecordBatchStream, TaskContext}; use datafusion_expr::dml::InsertOp; @@ -82,7 +82,7 @@ use parquet::file::writer::SerializedFileWriter; use parquet::format::FileMetaData; use tokio::io::{AsyncWrite, AsyncWriteExt}; use tokio::sync::mpsc::{self, Receiver, Sender}; - +use datafusion_datasource::source::DataSourceExec; use crate::can_expr_be_pushed_down_with_schemas; use crate::source::ParquetSource; @@ -419,7 +419,9 @@ impl FileFormat for ParquetFormat { if let Some(metadata_size_hint) = metadata_size_hint { source = source.with_metadata_size_hint(metadata_size_hint) } - Ok(conf.with_source(Arc::new(source)).build()) + + let conf = FileScanConfigBuilder::from(conf).with_source(Arc::new(source)).build(); + Ok(Arc::new(DataSourceExec::new(Arc::new(conf)))) } async fn create_writer_physical_plan( diff --git a/datafusion/datasource-parquet/src/source.rs b/datafusion/datasource-parquet/src/source.rs index 47e692cb966d..50f49e7a83b2 100644 --- a/datafusion/datasource-parquet/src/source.rs +++ b/datafusion/datasource-parquet/src/source.rs @@ -93,9 +93,9 @@ use object_store::ObjectStore; /// .with_predicate(Arc::clone(&file_schema), predicate) /// ); /// // Create a DataSourceExec for reading `file1.parquet` with a file size of 100MB -/// let file_scan_config = FileScanConfig::new(object_store_url, file_schema, source) +/// let config = FileScanConfig::new(object_store_url, file_schema, source) /// .with_file(PartitionedFile::new("file1.parquet", 100*1024*1024)); -/// let exec = file_scan_config.build(); +/// let exec = Arc::new(DataSourceExec::new(Arc::new(config))); /// ``` /// /// # Features @@ -177,7 +177,7 @@ use object_store::ObjectStore; /// .clone() /// .with_file_groups(vec![file_group.clone()]); /// -/// new_config.build() +/// Arc::new(DataSourceExec::new(Arc::new(new_config))) /// }) /// .collect::>(); /// ``` @@ -216,11 +216,11 @@ use object_store::ObjectStore; /// let partitioned_file = PartitionedFile::new("my_file.parquet", 1234) /// .with_extensions(Arc::new(access_plan)); /// // create a FileScanConfig to scan this file -/// let file_scan_config = FileScanConfig::new(ObjectStoreUrl::local_filesystem(), schema(), Arc::new(ParquetSource::default())) +/// let config = FileScanConfig::new(ObjectStoreUrl::local_filesystem(), schema(), Arc::new(ParquetSource::default())) /// .with_file(partitioned_file); /// // this parquet DataSourceExec will not even try to read row groups 2 and 4. Additional /// // pruning based on predicates may also happen -/// let exec = file_scan_config.build(); +/// let exec = Arc::new(DataSourceExec::new(Arc::new(config))); /// ``` /// /// For a complete example, see the [`advanced_parquet_index` example]). 
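
[series note] The `FileFormat` implementations (avro, csv, json, parquet) receive an already-assembled `FileScanConfig` from the planner, so they migrate through the `From<FileScanConfig>` conversion instead of rebuilding from scratch: re-enter the builder, swap in the format's own `FileSource` (plus any per-format option such as compression), and build once. A rough sketch of that shared shape, under the same API as the hunks above (the free function `plan_from_existing_config` is illustrative; in the patch this lives inside each format's `create_physical_plan`):

use std::sync::Arc;

use datafusion_common::Result;
use datafusion_datasource::file::FileSource;
use datafusion_datasource::file_scan_config::{FileScanConfig, FileScanConfigBuilder};
use datafusion_datasource::source::DataSourceExec;
use datafusion_physical_plan::ExecutionPlan;

// Re-enter the builder from the planner-supplied config, attach this
// format's FileSource, build the immutable config, and wrap it.
fn plan_from_existing_config(
    conf: FileScanConfig,
    source: Arc<dyn FileSource>,
) -> Result<Arc<dyn ExecutionPlan>> {
    let conf = FileScanConfigBuilder::from(conf)
        .with_source(source)
        .build();
    Ok(Arc::new(DataSourceExec::new(Arc::new(conf))))
}

This is also why the json format no longer needs `mut conf`: the builder owns the mutation instead of the caller poking `conf.file_compression_type` directly.
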
diff --git a/datafusion/datasource/src/file_scan_config.rs b/datafusion/datasource/src/file_scan_config.rs index 514281944a81..91fd8014901c 100644 --- a/datafusion/datasource/src/file_scan_config.rs +++ b/datafusion/datasource/src/file_scan_config.rs @@ -74,6 +74,7 @@ use crate::{ /// # use datafusion_datasource::PartitionedFile; /// # use datafusion_datasource::file_scan_config::FileScanConfig; /// # use datafusion_datasource::file_stream::FileOpener; +/// # use datafusion_datasource::source::DataSourceExec; /// # use datafusion_execution::object_store::ObjectStoreUrl; /// # use datafusion_physical_plan::ExecutionPlan; /// # use datafusion_physical_plan::metrics::ExecutionPlanMetricsSet; @@ -116,7 +117,7 @@ use crate::{ /// PartitionedFile::new("file3.parquet", 78), /// ]); /// // create an execution plan from the config -/// let plan: Arc = config.build(); +/// let plan: Arc = Arc::new(DataSourceExec::new(Arc::new(config))); /// ``` #[derive(Clone)] pub struct FileScanConfig { @@ -549,6 +550,8 @@ impl FileScanConfig { /// # Parameters: /// * `object_store_url`: See [`Self::object_store_url`] /// * `file_schema`: See [`Self::file_schema`] + #[deprecated(since = "47.0.0", note = "use FileScanConfigBuilder instead")] + #[allow(deprecated)] // `new` will be removed same time as `with_source` pub fn new( object_store_url: ObjectStoreUrl, file_schema: SchemaRef, diff --git a/datafusion/proto/src/physical_plan/mod.rs b/datafusion/proto/src/physical_plan/mod.rs index 6562a9be458f..299bd5fd99d0 100644 --- a/datafusion/proto/src/physical_plan/mod.rs +++ b/datafusion/proto/src/physical_plan/mod.rs @@ -245,7 +245,7 @@ impl AsExecutionPlan for protobuf::PhysicalPlanNode { )? .with_newlines_in_values(scan.newlines_in_values) .with_file_compression_type(FileCompressionType::UNCOMPRESSED); - Ok(conf.build()) + Ok(Arc::new(DataSourceExec::new(Arc::new(conf)))) } PhysicalPlanType::JsonScan(scan) => { let scan_conf = parse_protobuf_file_scan_config( @@ -254,7 +254,7 @@ impl AsExecutionPlan for protobuf::PhysicalPlanNode { extension_codec, Arc::new(JsonSource::new()), )?; - Ok(scan_conf.build()) + Ok(Arc::new(DataSourceExec::new(Arc::new(scan_conf)))) } #[cfg_attr(not(feature = "parquet"), allow(unused_variables))] PhysicalPlanType::ParquetScan(scan) => { @@ -291,7 +291,7 @@ impl AsExecutionPlan for protobuf::PhysicalPlanNode { extension_codec, Arc::new(source), )?; - Ok(base_config.build()) + Ok(Arc::new(DataSourceExec::new(Arc::new(base_config)))) } #[cfg(not(feature = "parquet"))] panic!("Unable to process a Parquet PhysicalPlan when `parquet` feature is not enabled") @@ -306,7 +306,7 @@ impl AsExecutionPlan for protobuf::PhysicalPlanNode { extension_codec, Arc::new(AvroSource::new()), )?; - Ok(conf.build()) + Ok(Arc::new(DataSourceExec::new(Arc::new(conf)))) } #[cfg(not(feature = "avro"))] panic!("Unable to process a Avro PhysicalPlan when `avro` feature is not enabled") diff --git a/datafusion/proto/tests/cases/roundtrip_physical_plan.rs b/datafusion/proto/tests/cases/roundtrip_physical_plan.rs index 4ca5af7b56d2..f272338772c7 100644 --- a/datafusion/proto/tests/cases/roundtrip_physical_plan.rs +++ b/datafusion/proto/tests/cases/roundtrip_physical_plan.rs @@ -783,7 +783,7 @@ async fn roundtrip_parquet_exec_with_table_partition_cols() -> Result<()> { )]) .with_newlines_in_values(false); - roundtrip_test(scan_config.build()) + roundtrip_test(Arc::new(DataSourceExec::new(Arc::new(scan_config)))) } #[test] @@ -927,7 +927,7 @@ fn roundtrip_parquet_exec_with_custom_predicate_expr() -> Result<()> { } } - let 
exec_plan = scan_config.build(); + let exec_plan = Arc::new(DataSourceExec::new(Arc::new(scan_config))); let ctx = SessionContext::new(); roundtrip_test_and_return(exec_plan, &ctx, &CustomPhysicalExtensionCodec {})?; @@ -1629,7 +1629,7 @@ async fn roundtrip_projection_source() -> Result<()> { let filter = Arc::new( FilterExec::try_new( Arc::new(BinaryExpr::new(col("c", &schema)?, Operator::Eq, lit(1))), - scan_config.build(), + Arc::new(DataSourceExec::new(Arc::new(scan_config))), )? .with_projection(Some(vec![0, 1]))?, ); diff --git a/datafusion/substrait/src/physical_plan/consumer.rs b/datafusion/substrait/src/physical_plan/consumer.rs index 7bbdfc2a5d94..5ad093c30f9d 100644 --- a/datafusion/substrait/src/physical_plan/consumer.rs +++ b/datafusion/substrait/src/physical_plan/consumer.rs @@ -36,7 +36,7 @@ use substrait::proto::Type; use substrait::proto::{ expression::MaskExpression, read_rel::ReadType, rel::RelType, Rel, }; - +use datafusion::datasource::memory::DataSourceExec; use crate::variation_const::{ DEFAULT_CONTAINER_TYPE_VARIATION_REF, LARGE_CONTAINER_TYPE_VARIATION_REF, VIEW_CONTAINER_TYPE_VARIATION_REF, @@ -152,7 +152,7 @@ pub async fn from_substrait_rel( } } - Ok(base_config.build() as Arc) + Ok(Arc::new(DataSourceExec::new(Arc::new(base_config))) as Arc) } _ => not_impl_err!( "Only LocalFile reads are supported when parsing physical" diff --git a/datafusion/substrait/tests/cases/roundtrip_physical_plan.rs b/datafusion/substrait/tests/cases/roundtrip_physical_plan.rs index f1284db2ad46..498fa5f157bf 100644 --- a/datafusion/substrait/tests/cases/roundtrip_physical_plan.rs +++ b/datafusion/substrait/tests/cases/roundtrip_physical_plan.rs @@ -29,6 +29,7 @@ use datafusion::prelude::{ParquetReadOptions, SessionContext}; use datafusion_substrait::physical_plan::{consumer, producer}; use substrait::proto::extensions; +use datafusion::datasource::memory::DataSourceExec; #[tokio::test] async fn parquet_exec() -> Result<()> { @@ -49,7 +50,7 @@ async fn parquet_exec() -> Result<()> { 123, )], ]); - let parquet_exec: Arc = scan_config.build(); + let parquet_exec: Arc = Arc::new(DataSourceExec::new(Arc::new(scan_config))); let mut extension_info: ( Vec, From e13bfa9aa4cab4e8a7251d4a6ca69645345e6fb2 Mon Sep 17 00:00:00 2001 From: blaginin Date: Mon, 24 Mar 2025 21:04:27 +0000 Subject: [PATCH 05/14] Clean `FileScanConfig::new` --- .../examples/csv_json_opener.rs | 13 +++++++----- datafusion/core/src/datasource/mod.rs | 7 ++++--- .../core/src/datasource/physical_plan/avro.rs | 9 ++++---- .../core/src/datasource/physical_plan/json.rs | 21 +++++++++++-------- .../src/datasource/physical_plan/parquet.rs | 2 +- datafusion/core/src/test/mod.rs | 6 ++++-- datafusion/core/tests/fuzz_cases/pruning.rs | 2 +- .../core/tests/parquet/custom_reader.rs | 2 +- .../tests/parquet/external_access_plan.rs | 7 ++++--- datafusion/core/tests/parquet/page_pruning.rs | 5 +++-- .../core/tests/parquet/schema_coercion.rs | 6 +++--- .../enforce_distribution.rs | 7 ++++--- .../physical_optimizer/enforce_sorting.rs | 2 +- .../physical_optimizer/projection_pushdown.rs | 2 +- .../tests/physical_optimizer/test_utils.rs | 2 +- datafusion/datasource-csv/src/mod.rs | 5 +++-- datafusion/datasource-csv/src/source.rs | 7 ++++--- datafusion/datasource-json/src/file_format.rs | 2 +- datafusion/datasource-parquet/src/source.rs | 12 +++++------ datafusion/datasource/src/file_scan_config.rs | 16 ++++++-------- datafusion/proto/src/physical_plan/mod.rs | 9 ++++---- .../tests/cases/roundtrip_physical_plan.rs | 7 ++++--- 
.../tests/cases/roundtrip_physical_plan.rs | 6 +++--- 23 files changed, 84 insertions(+), 73 deletions(-) diff --git a/datafusion-examples/examples/csv_json_opener.rs b/datafusion-examples/examples/csv_json_opener.rs index 6dc38a436a0c..1fc79171ec99 100644 --- a/datafusion-examples/examples/csv_json_opener.rs +++ b/datafusion-examples/examples/csv_json_opener.rs @@ -25,7 +25,7 @@ use datafusion::{ listing::PartitionedFile, object_store::ObjectStoreUrl, physical_plan::{ - CsvSource, FileScanConfig, FileSource, FileStream, JsonOpener, JsonSource, + CsvSource, FileSource, FileStream, JsonOpener, JsonSource, }, }, error::Result, @@ -35,6 +35,7 @@ use datafusion::{ use futures::StreamExt; use object_store::{local::LocalFileSystem, memory::InMemory, ObjectStore}; +use datafusion::datasource::physical_plan::FileScanConfigBuilder; /// This example demonstrates using the low level [`FileStream`] / [`FileOpener`] APIs to directly /// read data from (CSV/JSON) into Arrow RecordBatches. @@ -56,14 +57,15 @@ async fn csv_opener() -> Result<()> { let path = std::path::Path::new(&path).canonicalize()?; - let scan_config = FileScanConfig::new( + let scan_config = FileScanConfigBuilder::new( ObjectStoreUrl::local_filesystem(), Arc::clone(&schema), Arc::new(CsvSource::default()), ) .with_projection(Some(vec![12, 0])) .with_limit(Some(5)) - .with_file(PartitionedFile::new(path.display().to_string(), 10)); + .with_file(PartitionedFile::new(path.display().to_string(), 10)) + .build(); let config = CsvSource::new(true, b',', b'"') .with_comment(Some(b'#')) @@ -121,14 +123,15 @@ async fn json_opener() -> Result<()> { Arc::new(object_store), ); - let scan_config = FileScanConfig::new( + let scan_config = FileScanConfigBuilder::new( ObjectStoreUrl::local_filesystem(), schema, Arc::new(JsonSource::default()), ) .with_projection(Some(vec![1, 0])) .with_limit(Some(5)) - .with_file(PartitionedFile::new(path.to_string(), 10)); + .with_file(PartitionedFile::new(path.to_string(), 10)) + .build(); let mut stream = FileStream::new( &scan_config, diff --git a/datafusion/core/src/datasource/mod.rs b/datafusion/core/src/datasource/mod.rs index 2f8acae317e6..bbafe7ba761a 100644 --- a/datafusion/core/src/datasource/mod.rs +++ b/datafusion/core/src/datasource/mod.rs @@ -60,7 +60,7 @@ mod tests { use arrow::datatypes::{DataType, Field, Schema, SchemaRef}; use arrow::record_batch::RecordBatch; use datafusion_common::assert_batches_sorted_eq; - use datafusion_datasource::file_scan_config::FileScanConfig; + use datafusion_datasource::file_scan_config::FileScanConfigBuilder; use datafusion_datasource::schema_adapter::{ DefaultSchemaAdapterFactory, SchemaAdapter, SchemaAdapterFactory, SchemaMapper, }; @@ -129,8 +129,9 @@ mod tests { .with_schema_adapter_factory(Arc::new(TestSchemaAdapterFactory {})), ); let base_conf = - FileScanConfig::new(ObjectStoreUrl::local_filesystem(), schema, source) - .with_file(partitioned_file); + FileScanConfigBuilder::new(ObjectStoreUrl::local_filesystem(), schema, source) + .with_file(partitioned_file) + .build(); let parquet_exec = Arc::new(DataSourceExec::new(Arc::new(base_conf))); diff --git a/datafusion/core/src/datasource/physical_plan/avro.rs b/datafusion/core/src/datasource/physical_plan/avro.rs index 32cb78c0d437..83ddac38408f 100644 --- a/datafusion/core/src/datasource/physical_plan/avro.rs +++ b/datafusion/core/src/datasource/physical_plan/avro.rs @@ -32,9 +32,7 @@ mod tests { use arrow::datatypes::{DataType, Field, SchemaBuilder}; use datafusion_common::{assert_batches_eq, test_util, Result, 
ScalarValue}; use datafusion_datasource::file_format::FileFormat; - use datafusion_datasource::file_scan_config::{ - FileScanConfig, FileScanConfigBuilder, - }; + use datafusion_datasource::file_scan_config::FileScanConfigBuilder; use datafusion_datasource::PartitionedFile; use datafusion_datasource_avro::source::AvroSource; use datafusion_datasource_avro::AvroFormat; @@ -160,9 +158,10 @@ mod tests { let projection = Some(vec![0, 1, 2, actual_schema.fields().len()]); let source = Arc::new(AvroSource::new()); - let conf = FileScanConfig::new(object_store_url, file_schema, source) + let conf = FileScanConfigBuilder::new(object_store_url, file_schema, source) .with_file(meta.into()) - .with_projection(projection); + .with_projection(projection) + .build(); let source_exec = Arc::new(DataSourceExec::new(Arc::new(conf))); assert_eq!( diff --git a/datafusion/core/src/datasource/physical_plan/json.rs b/datafusion/core/src/datasource/physical_plan/json.rs index a15af4cb9e95..ba211ba20f94 100644 --- a/datafusion/core/src/datasource/physical_plan/json.rs +++ b/datafusion/core/src/datasource/physical_plan/json.rs @@ -169,7 +169,7 @@ mod tests { let state = session_ctx.state(); let task_ctx = session_ctx.task_ctx(); use arrow::datatypes::DataType; - use datafusion_datasource::file_scan_config::FileScanConfig; + use futures::StreamExt; let tmp_dir = TempDir::new()?; @@ -177,10 +177,11 @@ mod tests { prepare_store(&state, file_compression_type.to_owned(), tmp_dir.path()).await; let source = Arc::new(JsonSource::new()); - let conf = FileScanConfig::new(object_store_url, file_schema, source) + let conf = FileScanConfigBuilder::new(object_store_url, file_schema, source) .with_file_groups(file_groups) .with_limit(Some(3)) - .with_file_compression_type(file_compression_type.to_owned()); + .with_file_compression_type(file_compression_type.to_owned()) + .build(); let exec = Arc::new(DataSourceExec::new(Arc::new(conf))); // TODO: this is not where schema inference should be tested @@ -233,7 +234,7 @@ mod tests { file_compression_type: FileCompressionType, ) -> Result<()> { use arrow::datatypes::DataType; - use datafusion_datasource::file_scan_config::FileScanConfig; + use futures::StreamExt; let session_ctx = SessionContext::new(); @@ -251,10 +252,11 @@ mod tests { let missing_field_idx = file_schema.fields.len() - 1; let source = Arc::new(JsonSource::new()); - let conf = FileScanConfig::new(object_store_url, file_schema, source) + let conf = FileScanConfigBuilder::new(object_store_url, file_schema, source) .with_file_groups(file_groups) .with_limit(Some(3)) - .with_file_compression_type(file_compression_type.to_owned()); + .with_file_compression_type(file_compression_type.to_owned()) + .build(); let exec = Arc::new(DataSourceExec::new(Arc::new(conf))); let mut it = exec.execute(0, task_ctx)?; @@ -283,7 +285,7 @@ mod tests { async fn nd_json_exec_file_projection( file_compression_type: FileCompressionType, ) -> Result<()> { - use datafusion_datasource::file_scan_config::FileScanConfig; + use futures::StreamExt; let session_ctx = SessionContext::new(); @@ -294,10 +296,11 @@ mod tests { prepare_store(&state, file_compression_type.to_owned(), tmp_dir.path()).await; let source = Arc::new(JsonSource::new()); - let conf = FileScanConfig::new(object_store_url, file_schema, source) + let conf = FileScanConfigBuilder::new(object_store_url, file_schema, source) .with_file_groups(file_groups) .with_projection(Some(vec![0, 2])) - .with_file_compression_type(file_compression_type.to_owned()); + 
.with_file_compression_type(file_compression_type.to_owned()) + .build(); let exec = Arc::new(DataSourceExec::new(Arc::new(conf))); let inferred_schema = exec.schema(); assert_eq!(inferred_schema.fields().len(), 2); diff --git a/datafusion/core/src/datasource/physical_plan/parquet.rs b/datafusion/core/src/datasource/physical_plan/parquet.rs index 030c5262ed54..1b9e3711e47e 100644 --- a/datafusion/core/src/datasource/physical_plan/parquet.rs +++ b/datafusion/core/src/datasource/physical_plan/parquet.rs @@ -51,7 +51,7 @@ mod tests { }; use datafusion_datasource::file_format::FileFormat; use datafusion_datasource::file_meta::FileMeta; - use datafusion_datasource::file_scan_config::{FileScanConfig, FileScanConfigBuilder}; + use datafusion_datasource::file_scan_config::FileScanConfigBuilder; use datafusion_datasource::source::DataSourceExec; use datafusion_datasource::{FileRange, PartitionedFile}; diff --git a/datafusion/core/src/test/mod.rs b/datafusion/core/src/test/mod.rs index a77721190fad..6406a1bc7259 100644 --- a/datafusion/core/src/test/mod.rs +++ b/datafusion/core/src/test/mod.rs @@ -56,6 +56,7 @@ use object_store::local_unpartitioned_file; use xz2::write::XzEncoder; #[cfg(feature = "compression")] use zstd::Encoder as ZstdEncoder; +use datafusion_datasource::file_scan_config::FileScanConfigBuilder; pub fn create_table_dual() -> Arc { let dual_schema = Arc::new(Schema::new(vec![ @@ -91,8 +92,9 @@ pub fn scan_partitioned_csv( work_dir, )?; let source = Arc::new(CsvSource::new(true, b'"', b'"')); - let config = partitioned_csv_config(schema, file_groups, source) - .with_file_compression_type(FileCompressionType::UNCOMPRESSED); + let config = FileScanConfigBuilder::from(partitioned_csv_config(schema, file_groups, source)) + .with_file_compression_type(FileCompressionType::UNCOMPRESSED) + .build(); Ok(Arc::new(DataSourceExec::new(Arc::new(config)))) } diff --git a/datafusion/core/tests/fuzz_cases/pruning.rs b/datafusion/core/tests/fuzz_cases/pruning.rs index 1cd51d84748f..afb1769b1b91 100644 --- a/datafusion/core/tests/fuzz_cases/pruning.rs +++ b/datafusion/core/tests/fuzz_cases/pruning.rs @@ -23,7 +23,7 @@ use bytes::{BufMut, Bytes, BytesMut}; use datafusion::{ datasource::{ listing::PartitionedFile, - physical_plan::{FileScanConfig, ParquetSource}, + physical_plan::ParquetSource, }, prelude::*, }; diff --git a/datafusion/core/tests/parquet/custom_reader.rs b/datafusion/core/tests/parquet/custom_reader.rs index 068016f05545..8e1af16168f0 100644 --- a/datafusion/core/tests/parquet/custom_reader.rs +++ b/datafusion/core/tests/parquet/custom_reader.rs @@ -27,7 +27,7 @@ use datafusion::datasource::file_format::parquet::fetch_parquet_metadata; use datafusion::datasource::listing::PartitionedFile; use datafusion::datasource::object_store::ObjectStoreUrl; use datafusion::datasource::physical_plan::{ - FileMeta, FileScanConfig, ParquetFileMetrics, ParquetFileReaderFactory, ParquetSource, + FileMeta, ParquetFileMetrics, ParquetFileReaderFactory, ParquetSource, }; use datafusion::physical_plan::collect; use datafusion::physical_plan::metrics::ExecutionPlanMetricsSet; diff --git a/datafusion/core/tests/parquet/external_access_plan.rs b/datafusion/core/tests/parquet/external_access_plan.rs index 343fbb6ca819..7ffe6268c8e4 100644 --- a/datafusion/core/tests/parquet/external_access_plan.rs +++ b/datafusion/core/tests/parquet/external_access_plan.rs @@ -27,7 +27,7 @@ use arrow::datatypes::SchemaRef; use arrow::util::pretty::pretty_format_batches; use datafusion::common::Result; use 
datafusion::datasource::listing::PartitionedFile; -use datafusion::datasource::physical_plan::{FileScanConfig, ParquetSource}; +use datafusion::datasource::physical_plan::ParquetSource; use datafusion::prelude::SessionContext; use datafusion_common::{assert_contains, DFSchema}; use datafusion_datasource_parquet::{ParquetAccessPlan, RowGroupAccess}; @@ -40,6 +40,7 @@ use parquet::arrow::arrow_reader::{RowSelection, RowSelector}; use parquet::arrow::ArrowWriter; use parquet::file::properties::WriterProperties; use tempfile::NamedTempFile; +use datafusion_datasource::file_scan_config::FileScanConfigBuilder; use datafusion_datasource::source::DataSourceExec; #[tokio::test] @@ -349,8 +350,8 @@ impl TestFull { } else { Arc::new(ParquetSource::default()) }; - let config = FileScanConfig::new(object_store_url, schema.clone(), source) - .with_file(partitioned_file); + let config = FileScanConfigBuilder::new(object_store_url, schema.clone(), source) + .with_file(partitioned_file).build(); let plan: Arc = Arc::new(DataSourceExec::new(Arc::new(config))); diff --git a/datafusion/core/tests/parquet/page_pruning.rs b/datafusion/core/tests/parquet/page_pruning.rs index fe96a2eb5e71..65ec6b107afe 100644 --- a/datafusion/core/tests/parquet/page_pruning.rs +++ b/datafusion/core/tests/parquet/page_pruning.rs @@ -24,7 +24,7 @@ use datafusion::datasource::file_format::parquet::ParquetFormat; use datafusion::datasource::file_format::FileFormat; use datafusion::datasource::listing::PartitionedFile; use datafusion::datasource::object_store::ObjectStoreUrl; -use datafusion::datasource::physical_plan::{FileScanConfig, ParquetSource}; +use datafusion::datasource::physical_plan::ParquetSource; use datafusion::datasource::source::DataSourceExec; use datafusion::execution::context::SessionState; use datafusion::physical_plan::metrics::MetricValue; @@ -38,6 +38,7 @@ use datafusion_physical_expr::create_physical_expr; use futures::StreamExt; use object_store::path::Path; use object_store::ObjectMeta; +use datafusion_datasource::file_scan_config::FileScanConfigBuilder; async fn get_parquet_exec(state: &SessionState, filter: Expr) -> DataSourceExec { let object_store_url = ObjectStoreUrl::local_filesystem(); @@ -80,7 +81,7 @@ async fn get_parquet_exec(state: &SessionState, filter: Expr) -> DataSourceExec .with_enable_page_index(true), ); let base_config = - FileScanConfig::new(object_store_url, schema, source).with_file(partitioned_file); + FileScanConfigBuilder::new(object_store_url, schema, source).with_file(partitioned_file).build(); DataSourceExec::new(Arc::new(base_config)) } diff --git a/datafusion/core/tests/parquet/schema_coercion.rs b/datafusion/core/tests/parquet/schema_coercion.rs index 0e82e7a5d442..13cbc314213c 100644 --- a/datafusion/core/tests/parquet/schema_coercion.rs +++ b/datafusion/core/tests/parquet/schema_coercion.rs @@ -22,7 +22,7 @@ use arrow::array::{ StringArray, }; use arrow::datatypes::{DataType, Field, Schema}; -use datafusion::datasource::physical_plan::{FileScanConfig, ParquetSource}; +use datafusion::datasource::physical_plan::ParquetSource; use datafusion::physical_plan::collect; use datafusion::prelude::SessionContext; use datafusion::test::object_store::local_unpartitioned_file; @@ -64,8 +64,8 @@ async fn multi_parquet_coercion() { ])); let source = Arc::new(ParquetSource::default()); let conf = - FileScanConfig::new(ObjectStoreUrl::local_filesystem(), file_schema, source) - .with_file_group(file_group); + FileScanConfigBuilder::new(ObjectStoreUrl::local_filesystem(), file_schema, source) 
+ .with_file_group(file_group).build(); let parquet_exec = Arc::new(DataSourceExec::new(Arc::new(conf))); diff --git a/datafusion/core/tests/physical_optimizer/enforce_distribution.rs b/datafusion/core/tests/physical_optimizer/enforce_distribution.rs index 34fdc6d1724d..76c186b303af 100644 --- a/datafusion/core/tests/physical_optimizer/enforce_distribution.rs +++ b/datafusion/core/tests/physical_optimizer/enforce_distribution.rs @@ -30,7 +30,7 @@ use datafusion::config::ConfigOptions; use datafusion::datasource::file_format::file_compression_type::FileCompressionType; use datafusion::datasource::listing::PartitionedFile; use datafusion::datasource::object_store::ObjectStoreUrl; -use datafusion::datasource::physical_plan::{CsvSource, FileScanConfig, ParquetSource}; +use datafusion::datasource::physical_plan::{CsvSource, ParquetSource}; use datafusion::datasource::source::DataSourceExec; use datafusion_common::error::Result; use datafusion_common::tree_node::{Transformed, TransformedResult, TreeNode}; @@ -222,7 +222,7 @@ fn csv_exec_multiple() -> Arc { // Created a sorted parquet exec with multiple files fn csv_exec_multiple_sorted(output_ordering: Vec) -> Arc { - let config = FileScanConfig::new( + let config = FileScanConfigBuilder::new( ObjectStoreUrl::parse("test:///").unwrap(), schema(), Arc::new(CsvSource::new(false, b',', b'"')), @@ -231,7 +231,8 @@ fn csv_exec_multiple_sorted(output_ordering: Vec) -> Arc>, file_source: Arc, ) -> FileScanConfig { - FileScanConfig::new(ObjectStoreUrl::local_filesystem(), schema, file_source) - .with_file_groups(file_groups) + FileScanConfigBuilder::new(ObjectStoreUrl::local_filesystem(), schema, file_source) + .with_file_groups(file_groups).build() } diff --git a/datafusion/datasource-csv/src/source.rs b/datafusion/datasource-csv/src/source.rs index 42da8bd92d20..2678c9f83ddb 100644 --- a/datafusion/datasource-csv/src/source.rs +++ b/datafusion/datasource-csv/src/source.rs @@ -407,7 +407,7 @@ impl ExecutionPlan for CsvExec { /// ``` /// # use std::sync::Arc; /// # use arrow::datatypes::Schema; -/// # use datafusion_datasource::file_scan_config::FileScanConfig; +/// # use datafusion_datasource::file_scan_config::{FileScanConfig, FileScanConfigBuilder}; /// # use datafusion_datasource::PartitionedFile; /// # use datafusion_datasource_csv::source::CsvSource; /// # use datafusion_execution::object_store::ObjectStoreUrl; @@ -424,9 +424,10 @@ impl ExecutionPlan for CsvExec { /// .with_terminator(Some(b'#') /// )); /// // Create a DataSourceExec for reading the first 100MB of `file1.csv` -/// let config = FileScanConfig::new(object_store_url, file_schema, source) +/// let config = FileScanConfigBuilder::new(object_store_url, file_schema, source) /// .with_file(PartitionedFile::new("file1.csv", 100*1024*1024)) -/// .with_newlines_in_values(true); // The file contains newlines in values; +/// .with_newlines_in_values(true) // The file contains newlines in values; +/// .build(); /// let exec = Arc::new(DataSourceExec::new(Arc::new(config))); /// ``` #[derive(Debug, Clone, Default)] diff --git a/datafusion/datasource-json/src/file_format.rs b/datafusion/datasource-json/src/file_format.rs index f1ad5a5377ef..b36598a963d3 100644 --- a/datafusion/datasource-json/src/file_format.rs +++ b/datafusion/datasource-json/src/file_format.rs @@ -246,7 +246,7 @@ impl FileFormat for JsonFormat { async fn create_physical_plan( &self, _state: &dyn Session, - mut conf: FileScanConfig, + conf: FileScanConfig, _filters: Option<&Arc>, ) -> Result> { let source = 
Arc::new(JsonSource::new()); diff --git a/datafusion/datasource-parquet/src/source.rs b/datafusion/datasource-parquet/src/source.rs index 50f49e7a83b2..e7d5c628c68e 100644 --- a/datafusion/datasource-parquet/src/source.rs +++ b/datafusion/datasource-parquet/src/source.rs @@ -77,7 +77,7 @@ use object_store::ObjectStore; /// ``` /// # use std::sync::Arc; /// # use arrow::datatypes::Schema; -/// # use datafusion_datasource::file_scan_config::FileScanConfig; +/// # use datafusion_datasource::file_scan_config::{FileScanConfig, FileScanConfigBuilder}; /// # use datafusion_datasource_parquet::source::ParquetSource; /// # use datafusion_datasource::PartitionedFile; /// # use datafusion_execution::object_store::ObjectStoreUrl; @@ -93,8 +93,8 @@ use object_store::ObjectStore; /// .with_predicate(Arc::clone(&file_schema), predicate) /// ); /// // Create a DataSourceExec for reading `file1.parquet` with a file size of 100MB -/// let config = FileScanConfig::new(object_store_url, file_schema, source) -/// .with_file(PartitionedFile::new("file1.parquet", 100*1024*1024)); +/// let config = FileScanConfigBuilder::new(object_store_url, file_schema, source) +/// .with_file(PartitionedFile::new("file1.parquet", 100*1024*1024)).build(); /// let exec = Arc::new(DataSourceExec::new(Arc::new(config))); /// ``` /// @@ -200,7 +200,7 @@ use object_store::ObjectStore; /// # use arrow::datatypes::{Schema, SchemaRef}; /// # use datafusion_datasource::PartitionedFile; /// # use datafusion_datasource_parquet::ParquetAccessPlan; -/// # use datafusion_datasource::file_scan_config::FileScanConfig; +/// # use datafusion_datasource::file_scan_config::{FileScanConfig, FileScanConfigBuilder}; /// # use datafusion_datasource_parquet::source::ParquetSource; /// # use datafusion_execution::object_store::ObjectStoreUrl; /// # use datafusion_datasource::source::DataSourceExec; @@ -216,8 +216,8 @@ use object_store::ObjectStore; /// let partitioned_file = PartitionedFile::new("my_file.parquet", 1234) /// .with_extensions(Arc::new(access_plan)); /// // create a FileScanConfig to scan this file -/// let config = FileScanConfig::new(ObjectStoreUrl::local_filesystem(), schema(), Arc::new(ParquetSource::default())) -/// .with_file(partitioned_file); +/// let config = FileScanConfigBuilder::new(ObjectStoreUrl::local_filesystem(), schema(), Arc::new(ParquetSource::default())) +/// .with_file(partitioned_file).build(); /// // this parquet DataSourceExec will not even try to read row groups 2 and 4. 
Additional /// // pruning based on predicates may also happen /// let exec = Arc::new(DataSourceExec::new(Arc::new(config))); diff --git a/datafusion/datasource/src/file_scan_config.rs b/datafusion/datasource/src/file_scan_config.rs index 91fd8014901c..029767dcc303 100644 --- a/datafusion/datasource/src/file_scan_config.rs +++ b/datafusion/datasource/src/file_scan_config.rs @@ -72,7 +72,7 @@ use crate::{ /// # use datafusion_common::Statistics; /// # use datafusion_datasource::file::FileSource; /// # use datafusion_datasource::PartitionedFile; -/// # use datafusion_datasource::file_scan_config::FileScanConfig; +/// # use datafusion_datasource::file_scan_config::{FileScanConfig, FileScanConfigBuilder}; /// # use datafusion_datasource::file_stream::FileOpener; /// # use datafusion_datasource::source::DataSourceExec; /// # use datafusion_execution::object_store::ObjectStoreUrl; @@ -105,7 +105,7 @@ use crate::{ /// // create FileScan config for reading parquet files from file:// /// let object_store_url = ObjectStoreUrl::local_filesystem(); /// let file_source = Arc::new(ParquetSource::new()); -/// let config = FileScanConfig::new(object_store_url, file_schema, file_source) +/// let config = FileScanConfigBuilder::new(object_store_url, file_schema, file_source) /// .with_limit(Some(1000)) // read only the first 1000 records /// .with_projection(Some(vec![2, 3])) // project columns 2 and 3 /// // Read /tmp/file1.parquet with known size of 1234 bytes in a single group @@ -115,7 +115,7 @@ use crate::{ /// .with_file_group(vec![ /// PartitionedFile::new("file2.parquet", 56), /// PartitionedFile::new("file3.parquet", 78), -/// ]); +/// ]).build(); /// // create an execution plan from the config /// let plan: Arc = Arc::new(DataSourceExec::new(Arc::new(config))); /// ``` @@ -550,7 +550,6 @@ impl FileScanConfig { /// # Parameters: /// * `object_store_url`: See [`Self::object_store_url`] /// * `file_schema`: See [`Self::file_schema`] - #[deprecated(since = "47.0.0", note = "use FileScanConfigBuilder instead")] #[allow(deprecated)] // `new` will be removed same time as `with_source` pub fn new( object_store_url: ObjectStoreUrl, @@ -558,8 +557,8 @@ impl FileScanConfig { file_source: Arc, ) -> Self { let statistics = Statistics::new_unknown(&file_schema); - - let mut config = Self { + let file_source = file_source.with_statistics(statistics.clone()); + Self { object_store_url, file_schema, file_groups: vec![], @@ -573,10 +572,7 @@ impl FileScanConfig { new_lines_in_values: false, file_source: Arc::clone(&file_source), batch_size: None, - }; - - config = config.with_source(Arc::clone(&file_source)); - config + } } /// Set the file source diff --git a/datafusion/proto/src/physical_plan/mod.rs b/datafusion/proto/src/physical_plan/mod.rs index 299bd5fd99d0..6b91eb394bd0 100644 --- a/datafusion/proto/src/physical_plan/mod.rs +++ b/datafusion/proto/src/physical_plan/mod.rs @@ -33,7 +33,7 @@ use datafusion::datasource::file_format::parquet::ParquetSink; use datafusion::datasource::physical_plan::AvroSource; #[cfg(feature = "parquet")] use datafusion::datasource::physical_plan::ParquetSource; -use datafusion::datasource::physical_plan::{CsvSource, FileScanConfig, JsonSource}; +use datafusion::datasource::physical_plan::{CsvSource, FileScanConfig, FileScanConfigBuilder, JsonSource}; use datafusion::datasource::source::DataSourceExec; use datafusion::execution::runtime_env::RuntimeEnv; use datafusion::execution::FunctionRegistry; @@ -237,14 +237,15 @@ impl AsExecutionPlan for protobuf::PhysicalPlanNode { 
.with_comment(comment), ); - let conf = parse_protobuf_file_scan_config( + let conf = FileScanConfigBuilder::from(parse_protobuf_file_scan_config( scan.base_conf.as_ref().unwrap(), registry, extension_codec, source, - )? + )?) .with_newlines_in_values(scan.newlines_in_values) - .with_file_compression_type(FileCompressionType::UNCOMPRESSED); + .with_file_compression_type(FileCompressionType::UNCOMPRESSED) + .build(); Ok(Arc::new(DataSourceExec::new(Arc::new(conf)))) } PhysicalPlanType::JsonScan(scan) => { diff --git a/datafusion/proto/tests/cases/roundtrip_physical_plan.rs b/datafusion/proto/tests/cases/roundtrip_physical_plan.rs index f272338772c7..c3025afbb4ea 100644 --- a/datafusion/proto/tests/cases/roundtrip_physical_plan.rs +++ b/datafusion/proto/tests/cases/roundtrip_physical_plan.rs @@ -46,7 +46,7 @@ use datafusion::datasource::file_format::parquet::ParquetSink; use datafusion::datasource::listing::{ListingTableUrl, PartitionedFile}; use datafusion::datasource::object_store::ObjectStoreUrl; use datafusion::datasource::physical_plan::{ - wrap_partition_type_in_dict, wrap_partition_value_in_dict, FileScanConfig, + wrap_partition_type_in_dict, wrap_partition_value_in_dict, FileScanConfigBuilder, FileSinkConfig, FileSource, ParquetSource, }; use datafusion::datasource::source::DataSourceExec; @@ -773,7 +773,7 @@ async fn roundtrip_parquet_exec_with_table_partition_cols() -> Result<()> { let file_source = Arc::new(ParquetSource::default()); let scan_config = - FileScanConfig::new(ObjectStoreUrl::local_filesystem(), schema, file_source) + FileScanConfigBuilder::new(ObjectStoreUrl::local_filesystem(), schema, file_source) .with_projection(Some(vec![0, 1])) .with_file_group(vec![file_group]) .with_table_partition_cols(vec![Field::new( @@ -781,7 +781,8 @@ async fn roundtrip_parquet_exec_with_table_partition_cols() -> Result<()> { wrap_partition_type_in_dict(DataType::Int16), false, )]) - .with_newlines_in_values(false); + .with_newlines_in_values(false) + .build(); roundtrip_test(Arc::new(DataSourceExec::new(Arc::new(scan_config)))) } diff --git a/datafusion/substrait/tests/cases/roundtrip_physical_plan.rs b/datafusion/substrait/tests/cases/roundtrip_physical_plan.rs index 498fa5f157bf..d36b8321f6a9 100644 --- a/datafusion/substrait/tests/cases/roundtrip_physical_plan.rs +++ b/datafusion/substrait/tests/cases/roundtrip_physical_plan.rs @@ -22,7 +22,7 @@ use datafusion::arrow::datatypes::Schema; use datafusion::dataframe::DataFrame; use datafusion::datasource::listing::PartitionedFile; use datafusion::datasource::object_store::ObjectStoreUrl; -use datafusion::datasource::physical_plan::{FileScanConfig, ParquetSource}; +use datafusion::datasource::physical_plan::{FileScanConfigBuilder, ParquetSource}; use datafusion::error::Result; use datafusion::physical_plan::{displayable, ExecutionPlan}; use datafusion::prelude::{ParquetReadOptions, SessionContext}; @@ -35,7 +35,7 @@ use datafusion::datasource::memory::DataSourceExec; async fn parquet_exec() -> Result<()> { let source = Arc::new(ParquetSource::default()); - let scan_config = FileScanConfig::new( + let scan_config = FileScanConfigBuilder::new( ObjectStoreUrl::local_filesystem(), Arc::new(Schema::empty()), source, @@ -49,7 +49,7 @@ async fn parquet_exec() -> Result<()> { "file://foo/part-1.parquet".to_string(), 123, )], - ]); + ]).build(); let parquet_exec: Arc = Arc::new(DataSourceExec::new(Arc::new(scan_config))); let mut extension_info: ( From 3c216b81edfd1adfe55c0a1169f09914c59938e2 Mon Sep 17 00:00:00 2001 From: blaginin Date: Mon, 24 
Mar 2025 21:08:13 +0000 Subject: [PATCH 06/14] Fix csv + fmt --- datafusion/core/src/datasource/mod.rs | 13 ++-- .../core/src/datasource/physical_plan/avro.rs | 2 +- .../core/src/datasource/physical_plan/csv.rs | 77 +++++++++++++------ .../core/src/datasource/physical_plan/json.rs | 7 +- .../src/datasource/physical_plan/parquet.rs | 6 +- datafusion/core/src/test/mod.rs | 9 ++- datafusion/core/tests/fuzz_cases/pruning.rs | 12 ++- .../core/tests/parquet/custom_reader.rs | 7 +- .../tests/parquet/external_access_plan.rs | 10 ++- datafusion/core/tests/parquet/page_pruning.rs | 7 +- .../core/tests/parquet/schema_coercion.rs | 14 ++-- .../enforce_distribution.rs | 19 ++--- .../physical_optimizer/enforce_sorting.rs | 5 +- .../physical_optimizer/projection_pushdown.rs | 2 +- datafusion/datasource-avro/src/file_format.rs | 10 ++- datafusion/datasource-csv/src/file_format.rs | 4 +- datafusion/datasource-csv/src/mod.rs | 5 +- datafusion/datasource-json/src/file_format.rs | 11 ++- .../datasource-parquet/src/file_format.rs | 10 ++- datafusion/datasource/src/file_scan_config.rs | 1 - datafusion/proto/src/physical_plan/mod.rs | 6 +- .../tests/cases/roundtrip_physical_plan.rs | 29 +++---- 22 files changed, 160 insertions(+), 106 deletions(-) diff --git a/datafusion/core/src/datasource/mod.rs b/datafusion/core/src/datasource/mod.rs index bbafe7ba761a..cef96bd76c1a 100644 --- a/datafusion/core/src/datasource/mod.rs +++ b/datafusion/core/src/datasource/mod.rs @@ -71,9 +71,9 @@ mod tests { use ::object_store::path::Path; use ::object_store::ObjectMeta; + use datafusion_datasource::source::DataSourceExec; use datafusion_physical_plan::collect; use tempfile::TempDir; - use datafusion_datasource::source::DataSourceExec; #[tokio::test] async fn can_override_schema_adapter() { @@ -128,10 +128,13 @@ mod tests { ParquetSource::default() .with_schema_adapter_factory(Arc::new(TestSchemaAdapterFactory {})), ); - let base_conf = - FileScanConfigBuilder::new(ObjectStoreUrl::local_filesystem(), schema, source) - .with_file(partitioned_file) - .build(); + let base_conf = FileScanConfigBuilder::new( + ObjectStoreUrl::local_filesystem(), + schema, + source, + ) + .with_file(partitioned_file) + .build(); let parquet_exec = Arc::new(DataSourceExec::new(Arc::new(base_conf))); diff --git a/datafusion/core/src/datasource/physical_plan/avro.rs b/datafusion/core/src/datasource/physical_plan/avro.rs index 83ddac38408f..58d5115f1d45 100644 --- a/datafusion/core/src/datasource/physical_plan/avro.rs +++ b/datafusion/core/src/datasource/physical_plan/avro.rs @@ -39,13 +39,13 @@ mod tests { use datafusion_execution::object_store::ObjectStoreUrl; use datafusion_physical_plan::ExecutionPlan; + use datafusion_datasource::source::DataSourceExec; use futures::StreamExt; use object_store::chunked::ChunkedStore; use object_store::local::LocalFileSystem; use object_store::ObjectStore; use rstest::*; use url::Url; - use datafusion_datasource::source::DataSourceExec; #[tokio::test] async fn avro_exec_without_partition() -> Result<()> { diff --git a/datafusion/core/src/datasource/physical_plan/csv.rs b/datafusion/core/src/datasource/physical_plan/csv.rs index 8798c4269043..ab94293bd3e5 100644 --- a/datafusion/core/src/datasource/physical_plan/csv.rs +++ b/datafusion/core/src/datasource/physical_plan/csv.rs @@ -49,12 +49,13 @@ mod tests { use arrow::datatypes::*; use bytes::Bytes; + use datafusion_datasource::file_scan_config::FileScanConfigBuilder; + use datafusion_datasource::source::DataSourceExec; use object_store::chunked::ChunkedStore; use 
object_store::local::LocalFileSystem; use rstest::*; use tempfile::TempDir; use url::Url; - use datafusion_datasource::source::DataSourceExec; fn aggr_test_schema() -> SchemaRef { let mut f1 = Field::new("c1", DataType::Utf8, false); @@ -108,10 +109,15 @@ mod tests { )?; let source = Arc::new(CsvSource::new(true, b',', b'"')); - let config = partitioned_csv_config(file_schema, file_groups, source) - .with_file_compression_type(file_compression_type) - .with_newlines_in_values(false) - .with_projection(Some(vec![0, 2, 4])); + let config = FileScanConfigBuilder::from(partitioned_csv_config( + file_schema, + file_groups, + source, + )) + .with_file_compression_type(file_compression_type) + .with_newlines_in_values(false) + .with_projection(Some(vec![0, 2, 4])) + .build(); assert_eq!(13, config.file_schema.fields().len()); let csv = Arc::new(DataSourceExec::new(Arc::new(config))); @@ -171,10 +177,15 @@ mod tests { )?; let source = Arc::new(CsvSource::new(true, b',', b'"')); - let config = partitioned_csv_config(file_schema, file_groups, source) - .with_newlines_in_values(false) - .with_file_compression_type(file_compression_type.to_owned()) - .with_projection(Some(vec![4, 0, 2])); + let config = FileScanConfigBuilder::from(partitioned_csv_config( + file_schema, + file_groups, + source, + )) + .with_newlines_in_values(false) + .with_file_compression_type(file_compression_type.to_owned()) + .with_projection(Some(vec![4, 0, 2])) + .build(); assert_eq!(13, config.file_schema.fields().len()); let csv = Arc::new(DataSourceExec::new(Arc::new(config))); assert_eq!(3, csv.schema().fields().len()); @@ -234,10 +245,15 @@ mod tests { )?; let source = Arc::new(CsvSource::new(true, b',', b'"')); - let config = partitioned_csv_config(file_schema, file_groups, source) - .with_newlines_in_values(false) - .with_file_compression_type(file_compression_type.to_owned()) - .with_limit(Some(5)); + let config = FileScanConfigBuilder::from(partitioned_csv_config( + file_schema, + file_groups, + source, + )) + .with_newlines_in_values(false) + .with_file_compression_type(file_compression_type.to_owned()) + .with_limit(Some(5)) + .build(); assert_eq!(13, config.file_schema.fields().len()); let csv = Arc::new(DataSourceExec::new(Arc::new(config))); assert_eq!(13, csv.schema().fields().len()); @@ -292,10 +308,15 @@ mod tests { )?; let source = Arc::new(CsvSource::new(true, b',', b'"')); - let config = partitioned_csv_config(file_schema, file_groups, source) - .with_newlines_in_values(false) - .with_file_compression_type(file_compression_type.to_owned()) - .with_limit(Some(5)); + let config = FileScanConfigBuilder::from(partitioned_csv_config( + file_schema, + file_groups, + source, + )) + .with_newlines_in_values(false) + .with_file_compression_type(file_compression_type.to_owned()) + .with_limit(Some(5)) + .build(); assert_eq!(14, config.file_schema.fields().len()); let csv = Arc::new(DataSourceExec::new(Arc::new(config))); assert_eq!(14, csv.schema().fields().len()); @@ -342,9 +363,14 @@ mod tests { )?; let source = Arc::new(CsvSource::new(true, b',', b'"')); - let mut config = partitioned_csv_config(file_schema, file_groups, source) - .with_newlines_in_values(false) - .with_file_compression_type(file_compression_type.to_owned()); + let mut config = FileScanConfigBuilder::from(partitioned_csv_config( + file_schema, + file_groups, + source, + )) + .with_newlines_in_values(false) + .with_file_compression_type(file_compression_type.to_owned()) + .build(); // Add partition columns config.table_partition_cols = 
vec![Field::new("date", DataType::Utf8, false)]; @@ -444,9 +470,14 @@ mod tests { .unwrap(); let source = Arc::new(CsvSource::new(true, b',', b'"')); - let config = partitioned_csv_config(file_schema, file_groups, source) - .with_newlines_in_values(false) - .with_file_compression_type(file_compression_type.to_owned()); + let config = FileScanConfigBuilder::from(partitioned_csv_config( + file_schema, + file_groups, + source, + )) + .with_newlines_in_values(false) + .with_file_compression_type(file_compression_type.to_owned()) + .build(); let csv = Arc::new(DataSourceExec::new(Arc::new(config))); let it = csv.execute(0, task_ctx).unwrap(); diff --git a/datafusion/core/src/datasource/physical_plan/json.rs b/datafusion/core/src/datasource/physical_plan/json.rs index ba211ba20f94..a70ff71b3ede 100644 --- a/datafusion/core/src/datasource/physical_plan/json.rs +++ b/datafusion/core/src/datasource/physical_plan/json.rs @@ -50,13 +50,13 @@ mod tests { use arrow::datatypes::SchemaRef; use arrow::datatypes::{Field, SchemaBuilder}; use datafusion_datasource::file_scan_config::FileScanConfigBuilder; + use datafusion_datasource::source::DataSourceExec; use object_store::chunked::ChunkedStore; use object_store::local::LocalFileSystem; use object_store::ObjectStore; use rstest::*; use tempfile::TempDir; use url::Url; - use datafusion_datasource::source::DataSourceExec; const TEST_DATA_BASE: &str = "tests/data"; @@ -169,7 +169,7 @@ mod tests { let state = session_ctx.state(); let task_ctx = session_ctx.task_ctx(); use arrow::datatypes::DataType; - + use futures::StreamExt; let tmp_dir = TempDir::new()?; @@ -234,7 +234,7 @@ mod tests { file_compression_type: FileCompressionType, ) -> Result<()> { use arrow::datatypes::DataType; - + use futures::StreamExt; let session_ctx = SessionContext::new(); @@ -285,7 +285,6 @@ mod tests { async fn nd_json_exec_file_projection( file_compression_type: FileCompressionType, ) -> Result<()> { - use futures::StreamExt; let session_ctx = SessionContext::new(); diff --git a/datafusion/core/src/datasource/physical_plan/parquet.rs b/datafusion/core/src/datasource/physical_plan/parquet.rs index 1b9e3711e47e..e18b2372df2a 100644 --- a/datafusion/core/src/datasource/physical_plan/parquet.rs +++ b/datafusion/core/src/datasource/physical_plan/parquet.rs @@ -185,13 +185,13 @@ mod tests { ) .with_file_group(file_group) .with_projection(projection) - .build(); + .build(); let session_ctx = SessionContext::new(); let task_ctx = session_ctx.task_ctx(); - - let parquet_exec = Arc::new(DataSourceExec::new(Arc::new(base_config.clone()))); + let parquet_exec = + Arc::new(DataSourceExec::new(Arc::new(base_config.clone()))); RoundTripResult { batches: collect(parquet_exec.clone(), task_ctx).await, parquet_exec, diff --git a/datafusion/core/src/test/mod.rs b/datafusion/core/src/test/mod.rs index 6406a1bc7259..7beab1490254 100644 --- a/datafusion/core/src/test/mod.rs +++ b/datafusion/core/src/test/mod.rs @@ -46,6 +46,7 @@ use datafusion_datasource::source::DataSourceExec; use bzip2::write::BzEncoder; #[cfg(feature = "compression")] use bzip2::Compression as BzCompression; +use datafusion_datasource::file_scan_config::FileScanConfigBuilder; use datafusion_datasource_csv::partitioned_csv_config; #[cfg(feature = "compression")] use flate2::write::GzEncoder; @@ -56,7 +57,6 @@ use object_store::local_unpartitioned_file; use xz2::write::XzEncoder; #[cfg(feature = "compression")] use zstd::Encoder as ZstdEncoder; -use datafusion_datasource::file_scan_config::FileScanConfigBuilder; pub fn 
create_table_dual() -> Arc { let dual_schema = Arc::new(Schema::new(vec![ @@ -92,9 +92,10 @@ pub fn scan_partitioned_csv( work_dir, )?; let source = Arc::new(CsvSource::new(true, b'"', b'"')); - let config = FileScanConfigBuilder::from(partitioned_csv_config(schema, file_groups, source)) - .with_file_compression_type(FileCompressionType::UNCOMPRESSED) - .build(); + let config = + FileScanConfigBuilder::from(partitioned_csv_config(schema, file_groups, source)) + .with_file_compression_type(FileCompressionType::UNCOMPRESSED) + .build(); Ok(Arc::new(DataSourceExec::new(Arc::new(config)))) } diff --git a/datafusion/core/tests/fuzz_cases/pruning.rs b/datafusion/core/tests/fuzz_cases/pruning.rs index afb1769b1b91..3211065ec902 100644 --- a/datafusion/core/tests/fuzz_cases/pruning.rs +++ b/datafusion/core/tests/fuzz_cases/pruning.rs @@ -21,13 +21,12 @@ use arrow::array::{Array, RecordBatch, StringArray}; use arrow::datatypes::{DataType, Field, Schema}; use bytes::{BufMut, Bytes, BytesMut}; use datafusion::{ - datasource::{ - listing::PartitionedFile, - physical_plan::ParquetSource, - }, + datasource::{listing::PartitionedFile, physical_plan::ParquetSource}, prelude::*, }; use datafusion_common::DFSchema; +use datafusion_datasource::file_scan_config::FileScanConfigBuilder; +use datafusion_datasource::source::DataSourceExec; use datafusion_execution::object_store::ObjectStoreUrl; use datafusion_physical_expr::PhysicalExpr; use datafusion_physical_plan::{collect, filter::FilterExec, ExecutionPlan}; @@ -40,8 +39,6 @@ use parquet::{ use rand::seq::SliceRandom; use tokio::sync::Mutex; use url::Url; -use datafusion_datasource::file_scan_config::FileScanConfigBuilder; -use datafusion_datasource::source::DataSourceExec; #[tokio::test] async fn test_utf8_eq() { @@ -295,7 +292,8 @@ async fn execute_with_predicate( PartitionedFile::new(test_file.path.clone(), test_file.size as u64) }) .collect(), - ).build(); + ) + .build(); let exec = Arc::new(DataSourceExec::new(Arc::new(config))); let exec = Arc::new(FilterExec::try_new(predicate, exec).unwrap()) as Arc; diff --git a/datafusion/core/tests/parquet/custom_reader.rs b/datafusion/core/tests/parquet/custom_reader.rs index 8e1af16168f0..68d478f037bc 100644 --- a/datafusion/core/tests/parquet/custom_reader.rs +++ b/datafusion/core/tests/parquet/custom_reader.rs @@ -36,6 +36,8 @@ use datafusion_common::test_util::batches_to_sort_string; use datafusion_common::Result; use bytes::Bytes; +use datafusion_datasource::file_scan_config::FileScanConfigBuilder; +use datafusion_datasource::source::DataSourceExec; use futures::future::BoxFuture; use futures::{FutureExt, TryFutureExt}; use insta::assert_snapshot; @@ -46,8 +48,6 @@ use parquet::arrow::async_reader::AsyncFileReader; use parquet::arrow::ArrowWriter; use parquet::errors::ParquetError; use parquet::file::metadata::ParquetMetaData; -use datafusion_datasource::file_scan_config::FileScanConfigBuilder; -use datafusion_datasource::source::DataSourceExec; const EXPECTED_USER_DEFINED_METADATA: &str = "some-user-defined-metadata"; @@ -91,7 +91,8 @@ async fn route_data_access_ops_to_parquet_file_reader_factory() { file_schema, source, ) - .with_file_group(file_group).build(); + .with_file_group(file_group) + .build(); let parquet_exec = Arc::new(DataSourceExec::new(Arc::new(base_config))); diff --git a/datafusion/core/tests/parquet/external_access_plan.rs b/datafusion/core/tests/parquet/external_access_plan.rs index 7ffe6268c8e4..ced83bae4151 100644 --- a/datafusion/core/tests/parquet/external_access_plan.rs +++ 
b/datafusion/core/tests/parquet/external_access_plan.rs @@ -36,12 +36,12 @@ use datafusion_expr::{col, lit, Expr}; use datafusion_physical_plan::metrics::MetricsSet; use datafusion_physical_plan::ExecutionPlan; +use datafusion_datasource::file_scan_config::FileScanConfigBuilder; +use datafusion_datasource::source::DataSourceExec; use parquet::arrow::arrow_reader::{RowSelection, RowSelector}; use parquet::arrow::ArrowWriter; use parquet::file::properties::WriterProperties; use tempfile::NamedTempFile; -use datafusion_datasource::file_scan_config::FileScanConfigBuilder; -use datafusion_datasource::source::DataSourceExec; #[tokio::test] async fn none() { @@ -351,9 +351,11 @@ impl TestFull { Arc::new(ParquetSource::default()) }; let config = FileScanConfigBuilder::new(object_store_url, schema.clone(), source) - .with_file(partitioned_file).build(); + .with_file(partitioned_file) + .build(); - let plan: Arc = Arc::new(DataSourceExec::new(Arc::new(config))); + let plan: Arc = + Arc::new(DataSourceExec::new(Arc::new(config))); // run the DataSourceExec and collect the results let results = diff --git a/datafusion/core/tests/parquet/page_pruning.rs b/datafusion/core/tests/parquet/page_pruning.rs index 65ec6b107afe..7006bf083eee 100644 --- a/datafusion/core/tests/parquet/page_pruning.rs +++ b/datafusion/core/tests/parquet/page_pruning.rs @@ -35,10 +35,10 @@ use datafusion_expr::execution_props::ExecutionProps; use datafusion_expr::{col, lit, Expr}; use datafusion_physical_expr::create_physical_expr; +use datafusion_datasource::file_scan_config::FileScanConfigBuilder; use futures::StreamExt; use object_store::path::Path; use object_store::ObjectMeta; -use datafusion_datasource::file_scan_config::FileScanConfigBuilder; async fn get_parquet_exec(state: &SessionState, filter: Expr) -> DataSourceExec { let object_store_url = ObjectStoreUrl::local_filesystem(); @@ -80,8 +80,9 @@ async fn get_parquet_exec(state: &SessionState, filter: Expr) -> DataSourceExec .with_predicate(Arc::clone(&schema), predicate) .with_enable_page_index(true), ); - let base_config = - FileScanConfigBuilder::new(object_store_url, schema, source).with_file(partitioned_file).build(); + let base_config = FileScanConfigBuilder::new(object_store_url, schema, source) + .with_file(partitioned_file) + .build(); DataSourceExec::new(Arc::new(base_config)) } diff --git a/datafusion/core/tests/parquet/schema_coercion.rs b/datafusion/core/tests/parquet/schema_coercion.rs index 13cbc314213c..7214e36890e9 100644 --- a/datafusion/core/tests/parquet/schema_coercion.rs +++ b/datafusion/core/tests/parquet/schema_coercion.rs @@ -30,13 +30,13 @@ use datafusion_common::test_util::batches_to_sort_string; use datafusion_common::Result; use datafusion_execution::object_store::ObjectStoreUrl; +use datafusion_datasource::file_scan_config::FileScanConfigBuilder; +use datafusion_datasource::source::DataSourceExec; use insta::assert_snapshot; use object_store::ObjectMeta; use parquet::arrow::ArrowWriter; use parquet::file::properties::WriterProperties; use tempfile::NamedTempFile; -use datafusion_datasource::file_scan_config::FileScanConfigBuilder; -use datafusion_datasource::source::DataSourceExec; /// Test for reading data from multiple parquet files with different schemas and coercing them into a single schema. 
#[tokio::test] @@ -63,9 +63,13 @@ async fn multi_parquet_coercion() { Field::new("c3", DataType::Float64, true), ])); let source = Arc::new(ParquetSource::default()); - let conf = - FileScanConfigBuilder::new(ObjectStoreUrl::local_filesystem(), file_schema, source) - .with_file_group(file_group).build(); + let conf = FileScanConfigBuilder::new( + ObjectStoreUrl::local_filesystem(), + file_schema, + source, + ) + .with_file_group(file_group) + .build(); let parquet_exec = Arc::new(DataSourceExec::new(Arc::new(conf))); diff --git a/datafusion/core/tests/physical_optimizer/enforce_distribution.rs b/datafusion/core/tests/physical_optimizer/enforce_distribution.rs index 76c186b303af..13801719d10e 100644 --- a/datafusion/core/tests/physical_optimizer/enforce_distribution.rs +++ b/datafusion/core/tests/physical_optimizer/enforce_distribution.rs @@ -232,7 +232,7 @@ fn csv_exec_multiple_sorted(output_ordering: Vec) -> Arc Result<()> { let plan = aggregate_exec_with_alias( Arc::new(DataSourceExec::new(Arc::new( - FileScanConfigBuilder::new( - ObjectStoreUrl::parse("test:///").unwrap(), - schema(), - Arc::new(CsvSource::new(false, b',', b'"')), - ) - .with_file(PartitionedFile::new("x".to_string(), 100)) - .with_file_compression_type(compression_type) - .build()))), + FileScanConfigBuilder::new( + ObjectStoreUrl::parse("test:///").unwrap(), + schema(), + Arc::new(CsvSource::new(false, b',', b'"')), + ) + .with_file(PartitionedFile::new("x".to_string(), 100)) + .with_file_compression_type(compression_type) + .build(), + ))), vec![("a".to_string(), "a".to_string())], ); let test_config = TestConfig::default() diff --git a/datafusion/core/tests/physical_optimizer/enforce_sorting.rs b/datafusion/core/tests/physical_optimizer/enforce_sorting.rs index dc8c41484500..733e9a3e9e20 100644 --- a/datafusion/core/tests/physical_optimizer/enforce_sorting.rs +++ b/datafusion/core/tests/physical_optimizer/enforce_sorting.rs @@ -56,9 +56,9 @@ use datafusion_functions_aggregate::average::avg_udaf; use datafusion_functions_aggregate::count::count_udaf; use datafusion_functions_aggregate::min_max::{max_udaf, min_udaf}; -use rstest::rstest; use datafusion_datasource::file_scan_config::FileScanConfigBuilder; use datafusion_datasource::source::DataSourceExec; +use rstest::rstest; /// Create a csv exec for tests fn csv_exec_ordered( @@ -76,8 +76,7 @@ fn csv_exec_ordered( .with_output_ordering(vec![sort_exprs]) .build(); - Arc::new( - DataSourceExec::new(Arc::new(config))) + Arc::new(DataSourceExec::new(Arc::new(config))) } /// Created a sorted parquet exec diff --git a/datafusion/core/tests/physical_optimizer/projection_pushdown.rs b/datafusion/core/tests/physical_optimizer/projection_pushdown.rs index 0f73a9dbc792..95037a9da91c 100644 --- a/datafusion/core/tests/physical_optimizer/projection_pushdown.rs +++ b/datafusion/core/tests/physical_optimizer/projection_pushdown.rs @@ -59,9 +59,9 @@ use datafusion_physical_plan::streaming::StreamingTableExec; use datafusion_physical_plan::union::UnionExec; use datafusion_physical_plan::{get_plan_string, ExecutionPlan}; +use datafusion_datasource::file_scan_config::FileScanConfigBuilder; use datafusion_expr_common::columnar_value::ColumnarValue; use itertools::Itertools; -use datafusion_datasource::file_scan_config::FileScanConfigBuilder; /// Mocked UDF #[derive(Debug)] diff --git a/datafusion/datasource-avro/src/file_format.rs b/datafusion/datasource-avro/src/file_format.rs index aad90e47dc7b..cceaed413e1f 100644 --- a/datafusion/datasource-avro/src/file_format.rs +++ 
b/datafusion/datasource-avro/src/file_format.rs @@ -26,6 +26,8 @@ use datafusion_common::{Result, Statistics}; use datafusion_datasource::file_compression_type::FileCompressionType; use datafusion_datasource::file_format::{FileFormat, FileFormatFactory}; +use crate::avro_to_arrow::read_avro_schema_from_reader; +use crate::source::AvroSource; use arrow::datatypes::Schema; use arrow::datatypes::SchemaRef; use async_trait::async_trait; @@ -36,12 +38,10 @@ use datafusion_common::GetExt; use datafusion_common::DEFAULT_AVRO_EXTENSION; use datafusion_datasource::file::FileSource; use datafusion_datasource::file_scan_config::{FileScanConfig, FileScanConfigBuilder}; +use datafusion_datasource::source::DataSourceExec; use datafusion_physical_expr::PhysicalExpr; use datafusion_physical_plan::ExecutionPlan; use object_store::{GetResultPayload, ObjectMeta, ObjectStore}; -use datafusion_datasource::source::DataSourceExec; -use crate::avro_to_arrow::read_avro_schema_from_reader; -use crate::source::AvroSource; #[derive(Default)] /// Factory struct used to create [`AvroFormat`] @@ -151,7 +151,9 @@ impl FileFormat for AvroFormat { conf: FileScanConfig, _filters: Option<&Arc>, ) -> Result> { - let config = FileScanConfigBuilder::from(conf).with_source(self.file_source()).build(); + let config = FileScanConfigBuilder::from(conf) + .with_source(self.file_source()) + .build(); Ok(Arc::new(DataSourceExec::new(Arc::new(config)))) } diff --git a/datafusion/datasource-csv/src/file_format.rs b/datafusion/datasource-csv/src/file_format.rs index c59019182825..ece807fc6517 100644 --- a/datafusion/datasource-csv/src/file_format.rs +++ b/datafusion/datasource-csv/src/file_format.rs @@ -51,16 +51,16 @@ use datafusion_expr::dml::InsertOp; use datafusion_physical_expr::PhysicalExpr; use datafusion_physical_expr_common::sort_expr::LexRequirement; +use crate::source::CsvSource; use async_trait::async_trait; use bytes::{Buf, Bytes}; +use datafusion_datasource::source::DataSourceExec; use datafusion_physical_plan::insert::{DataSink, DataSinkExec}; use datafusion_physical_plan::{DisplayAs, DisplayFormatType, ExecutionPlan}; use futures::stream::BoxStream; use futures::{pin_mut, Stream, StreamExt, TryStreamExt}; use object_store::{delimited::newline_delimited_stream, ObjectMeta, ObjectStore}; use regex::Regex; -use datafusion_datasource::source::DataSourceExec; -use crate::source::CsvSource; #[derive(Default)] /// Factory used to create [`CsvFormat`] diff --git a/datafusion/datasource-csv/src/mod.rs b/datafusion/datasource-csv/src/mod.rs index 2c39a83d282e..4f479353508a 100644 --- a/datafusion/datasource-csv/src/mod.rs +++ b/datafusion/datasource-csv/src/mod.rs @@ -25,10 +25,10 @@ pub mod source; use std::sync::Arc; use arrow::datatypes::SchemaRef; +use datafusion_datasource::file_scan_config::FileScanConfigBuilder; use datafusion_datasource::{ file::FileSource, file_scan_config::FileScanConfig, PartitionedFile, }; -use datafusion_datasource::file_scan_config::FileScanConfigBuilder; use datafusion_execution::object_store::ObjectStoreUrl; pub use file_format::*; @@ -39,5 +39,6 @@ pub fn partitioned_csv_config( file_source: Arc, ) -> FileScanConfig { FileScanConfigBuilder::new(ObjectStoreUrl::local_filesystem(), schema, file_source) - .with_file_groups(file_groups).build() + .with_file_groups(file_groups) + .build() } diff --git a/datafusion/datasource-json/src/file_format.rs b/datafusion/datasource-json/src/file_format.rs index b36598a963d3..9178d7011e14 100644 --- a/datafusion/datasource-json/src/file_format.rs +++ 
b/datafusion/datasource-json/src/file_format.rs @@ -54,12 +54,12 @@ use datafusion_physical_expr::PhysicalExpr; use datafusion_physical_plan::insert::{DataSink, DataSinkExec}; use datafusion_physical_plan::{DisplayAs, DisplayFormatType, ExecutionPlan}; +use crate::source::JsonSource; use async_trait::async_trait; use bytes::{Buf, Bytes}; +use datafusion_datasource::source::DataSourceExec; use datafusion_physical_expr_common::sort_expr::LexRequirement; use object_store::{GetResultPayload, ObjectMeta, ObjectStore}; -use datafusion_datasource::source::DataSourceExec; -use crate::source::JsonSource; #[derive(Default)] /// Factory struct used to create [JsonFormat] @@ -250,7 +250,12 @@ impl FileFormat for JsonFormat { _filters: Option<&Arc>, ) -> Result> { let source = Arc::new(JsonSource::new()); - let conf = FileScanConfigBuilder::from(conf).with_file_compression_type(FileCompressionType::from(self.options.compression)).with_source(source).build(); + let conf = FileScanConfigBuilder::from(conf) + .with_file_compression_type(FileCompressionType::from( + self.options.compression, + )) + .with_source(source) + .build(); Ok(Arc::new(DataSourceExec::new(Arc::new(conf)))) } diff --git a/datafusion/datasource-parquet/src/file_format.rs b/datafusion/datasource-parquet/src/file_format.rs index e9a812e5ce02..d0a9b51cd476 100644 --- a/datafusion/datasource-parquet/src/file_format.rs +++ b/datafusion/datasource-parquet/src/file_format.rs @@ -58,8 +58,11 @@ use datafusion_physical_expr::PhysicalExpr; use datafusion_physical_expr_common::sort_expr::LexRequirement; use datafusion_physical_plan::Accumulator; +use crate::can_expr_be_pushed_down_with_schemas; +use crate::source::ParquetSource; use async_trait::async_trait; use bytes::Bytes; +use datafusion_datasource::source::DataSourceExec; use datafusion_physical_plan::insert::{DataSink, DataSinkExec}; use datafusion_physical_plan::{DisplayAs, DisplayFormatType, ExecutionPlan}; use futures::future::BoxFuture; @@ -82,9 +85,6 @@ use parquet::file::writer::SerializedFileWriter; use parquet::format::FileMetaData; use tokio::io::{AsyncWrite, AsyncWriteExt}; use tokio::sync::mpsc::{self, Receiver, Sender}; -use datafusion_datasource::source::DataSourceExec; -use crate::can_expr_be_pushed_down_with_schemas; -use crate::source::ParquetSource; /// Initial writing buffer size. Note this is just a size hint for efficiency. It /// will grow beyond the set value if needed. 
@@ -420,7 +420,9 @@ impl FileFormat for ParquetFormat { source = source.with_metadata_size_hint(metadata_size_hint) } - let conf = FileScanConfigBuilder::from(conf).with_source(Arc::new(source)).build(); + let conf = FileScanConfigBuilder::from(conf) + .with_source(Arc::new(source)) + .build(); Ok(Arc::new(DataSourceExec::new(Arc::new(conf)))) } diff --git a/datafusion/datasource/src/file_scan_config.rs b/datafusion/datasource/src/file_scan_config.rs index 029767dcc303..707d021ffed4 100644 --- a/datafusion/datasource/src/file_scan_config.rs +++ b/datafusion/datasource/src/file_scan_config.rs @@ -198,7 +198,6 @@ pub struct FileScanConfigBuilder { new_lines_in_values: Option, pub batch_size: Option, - } impl FileScanConfigBuilder { diff --git a/datafusion/proto/src/physical_plan/mod.rs b/datafusion/proto/src/physical_plan/mod.rs index 6b91eb394bd0..540e84f944ba 100644 --- a/datafusion/proto/src/physical_plan/mod.rs +++ b/datafusion/proto/src/physical_plan/mod.rs @@ -33,7 +33,9 @@ use datafusion::datasource::file_format::parquet::ParquetSink; use datafusion::datasource::physical_plan::AvroSource; #[cfg(feature = "parquet")] use datafusion::datasource::physical_plan::ParquetSource; -use datafusion::datasource::physical_plan::{CsvSource, FileScanConfig, FileScanConfigBuilder, JsonSource}; +use datafusion::datasource::physical_plan::{ + CsvSource, FileScanConfig, FileScanConfigBuilder, JsonSource, +}; use datafusion::datasource::source::DataSourceExec; use datafusion::execution::runtime_env::RuntimeEnv; use datafusion::execution::FunctionRegistry; @@ -245,7 +247,7 @@ impl AsExecutionPlan for protobuf::PhysicalPlanNode { )?) .with_newlines_in_values(scan.newlines_in_values) .with_file_compression_type(FileCompressionType::UNCOMPRESSED) - .build(); + .build(); Ok(Arc::new(DataSourceExec::new(Arc::new(conf)))) } PhysicalPlanType::JsonScan(scan) => { diff --git a/datafusion/proto/tests/cases/roundtrip_physical_plan.rs b/datafusion/proto/tests/cases/roundtrip_physical_plan.rs index c3025afbb4ea..b5821fd5db89 100644 --- a/datafusion/proto/tests/cases/roundtrip_physical_plan.rs +++ b/datafusion/proto/tests/cases/roundtrip_physical_plan.rs @@ -46,8 +46,8 @@ use datafusion::datasource::file_format::parquet::ParquetSink; use datafusion::datasource::listing::{ListingTableUrl, PartitionedFile}; use datafusion::datasource::object_store::ObjectStoreUrl; use datafusion::datasource::physical_plan::{ - wrap_partition_type_in_dict, wrap_partition_value_in_dict, - FileScanConfigBuilder, FileSinkConfig, FileSource, ParquetSource, + wrap_partition_type_in_dict, wrap_partition_value_in_dict, FileScanConfigBuilder, + FileSinkConfig, FileSource, ParquetSource, }; use datafusion::datasource::source::DataSourceExec; use datafusion::execution::FunctionRegistry; @@ -772,17 +772,20 @@ async fn roundtrip_parquet_exec_with_table_partition_cols() -> Result<()> { let schema = Arc::new(Schema::new(vec![Field::new("col", DataType::Utf8, false)])); let file_source = Arc::new(ParquetSource::default()); - let scan_config = - FileScanConfigBuilder::new(ObjectStoreUrl::local_filesystem(), schema, file_source) - .with_projection(Some(vec![0, 1])) - .with_file_group(vec![file_group]) - .with_table_partition_cols(vec![Field::new( - "part".to_string(), - wrap_partition_type_in_dict(DataType::Int16), - false, - )]) - .with_newlines_in_values(false) - .build(); + let scan_config = FileScanConfigBuilder::new( + ObjectStoreUrl::local_filesystem(), + schema, + file_source, + ) + .with_projection(Some(vec![0, 1])) + 
.with_file_group(vec![file_group]) + .with_table_partition_cols(vec![Field::new( + "part".to_string(), + wrap_partition_type_in_dict(DataType::Int16), + false, + )]) + .with_newlines_in_values(false) + .build(); roundtrip_test(Arc::new(DataSourceExec::new(Arc::new(scan_config)))) } From 1fc4b3d405b3135e4b8bfb7018503bd357cd7406 Mon Sep 17 00:00:00 2001 From: blaginin Date: Mon, 24 Mar 2025 21:11:57 +0000 Subject: [PATCH 07/14] More fixes --- datafusion-examples/examples/csv_json_opener.rs | 10 ++++------ datafusion-examples/examples/parquet_index.rs | 2 +- datafusion/substrait/src/physical_plan/consumer.rs | 13 +++++++------ .../tests/cases/roundtrip_physical_plan.rs | 8 +++++--- 4 files changed, 17 insertions(+), 16 deletions(-) diff --git a/datafusion-examples/examples/csv_json_opener.rs b/datafusion-examples/examples/csv_json_opener.rs index 1fc79171ec99..1a2c2cbff418 100644 --- a/datafusion-examples/examples/csv_json_opener.rs +++ b/datafusion-examples/examples/csv_json_opener.rs @@ -24,18 +24,16 @@ use datafusion::{ file_format::file_compression_type::FileCompressionType, listing::PartitionedFile, object_store::ObjectStoreUrl, - physical_plan::{ - CsvSource, FileSource, FileStream, JsonOpener, JsonSource, - }, + physical_plan::{CsvSource, FileSource, FileStream, JsonOpener, JsonSource}, }, error::Result, physical_plan::metrics::ExecutionPlanMetricsSet, test_util::aggr_test_schema, }; +use datafusion::datasource::physical_plan::FileScanConfigBuilder; use futures::StreamExt; use object_store::{local::LocalFileSystem, memory::InMemory, ObjectStore}; -use datafusion::datasource::physical_plan::FileScanConfigBuilder; /// This example demonstrates using the low level [`FileStream`] / [`FileOpener`] APIs to directly /// read data from (CSV/JSON) into Arrow RecordBatches. 
@@ -65,7 +63,7 @@ async fn csv_opener() -> Result<()> { .with_projection(Some(vec![12, 0])) .with_limit(Some(5)) .with_file(PartitionedFile::new(path.display().to_string(), 10)) - .build(); + .build(); let config = CsvSource::new(true, b',', b'"') .with_comment(Some(b'#')) @@ -131,7 +129,7 @@ async fn json_opener() -> Result<()> { .with_projection(Some(vec![1, 0])) .with_limit(Some(5)) .with_file(PartitionedFile::new(path.to_string(), 10)) - .build(); + .build(); let mut stream = FileStream::new( &scan_config, diff --git a/datafusion-examples/examples/parquet_index.rs b/datafusion-examples/examples/parquet_index.rs index dbd7d2c8c265..4934c2e87d85 100644 --- a/datafusion-examples/examples/parquet_index.rs +++ b/datafusion-examples/examples/parquet_index.rs @@ -27,6 +27,7 @@ use datafusion::common::{ internal_datafusion_err, DFSchema, DataFusionError, Result, ScalarValue, }; use datafusion::datasource::listing::PartitionedFile; +use datafusion::datasource::memory::DataSourceExec; use datafusion::datasource::physical_plan::{FileScanConfigBuilder, ParquetSource}; use datafusion::datasource::TableProvider; use datafusion::execution::object_store::ObjectStoreUrl; @@ -52,7 +53,6 @@ use std::sync::atomic::{AtomicUsize, Ordering}; use std::sync::Arc; use tempfile::TempDir; use url::Url; -use datafusion::datasource::memory::DataSourceExec; /// This example demonstrates building a secondary index over multiple Parquet /// files and using that index during query to skip ("prune") files that do not diff --git a/datafusion/substrait/src/physical_plan/consumer.rs b/datafusion/substrait/src/physical_plan/consumer.rs index 5ad093c30f9d..312fe265f1f9 100644 --- a/datafusion/substrait/src/physical_plan/consumer.rs +++ b/datafusion/substrait/src/physical_plan/consumer.rs @@ -27,8 +27,13 @@ use datafusion::error::{DataFusionError, Result}; use datafusion::physical_plan::ExecutionPlan; use datafusion::prelude::SessionContext; +use crate::variation_const::{ + DEFAULT_CONTAINER_TYPE_VARIATION_REF, LARGE_CONTAINER_TYPE_VARIATION_REF, + VIEW_CONTAINER_TYPE_VARIATION_REF, +}; use async_recursion::async_recursion; use chrono::DateTime; +use datafusion::datasource::memory::DataSourceExec; use object_store::ObjectMeta; use substrait::proto::r#type::{Kind, Nullability}; use substrait::proto::read_rel::local_files::file_or_files::PathType; @@ -36,11 +41,6 @@ use substrait::proto::Type; use substrait::proto::{ expression::MaskExpression, read_rel::ReadType, rel::RelType, Rel, }; -use datafusion::datasource::memory::DataSourceExec; -use crate::variation_const::{ - DEFAULT_CONTAINER_TYPE_VARIATION_REF, LARGE_CONTAINER_TYPE_VARIATION_REF, - VIEW_CONTAINER_TYPE_VARIATION_REF, -}; /// Convert Substrait Rel to DataFusion ExecutionPlan #[async_recursion] @@ -152,7 +152,8 @@ pub async fn from_substrait_rel( } } - Ok(Arc::new(DataSourceExec::new(Arc::new(base_config))) as Arc) + Ok(Arc::new(DataSourceExec::new(Arc::new(base_config))) + as Arc) } _ => not_impl_err!( "Only LocalFile reads are supported when parsing physical" diff --git a/datafusion/substrait/tests/cases/roundtrip_physical_plan.rs b/datafusion/substrait/tests/cases/roundtrip_physical_plan.rs index d36b8321f6a9..6aa509ce3770 100644 --- a/datafusion/substrait/tests/cases/roundtrip_physical_plan.rs +++ b/datafusion/substrait/tests/cases/roundtrip_physical_plan.rs @@ -28,8 +28,8 @@ use datafusion::physical_plan::{displayable, ExecutionPlan}; use datafusion::prelude::{ParquetReadOptions, SessionContext}; use datafusion_substrait::physical_plan::{consumer, producer}; -use 
substrait::proto::extensions; use datafusion::datasource::memory::DataSourceExec; +use substrait::proto::extensions; #[tokio::test] async fn parquet_exec() -> Result<()> { @@ -49,8 +49,10 @@ async fn parquet_exec() -> Result<()> { "file://foo/part-1.parquet".to_string(), 123, )], - ]).build(); - let parquet_exec: Arc = Arc::new(DataSourceExec::new(Arc::new(scan_config))); + ]) + .build(); + let parquet_exec: Arc = + Arc::new(DataSourceExec::new(Arc::new(scan_config))); let mut extension_info: ( Vec, From ae32a014903eafda485c8bd3122d557422ed7c00 Mon Sep 17 00:00:00 2001 From: blaginin Date: Mon, 24 Mar 2025 21:58:45 +0000 Subject: [PATCH 08/14] Remove pub --- datafusion/datasource/src/file_scan_config.rs | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/datafusion/datasource/src/file_scan_config.rs b/datafusion/datasource/src/file_scan_config.rs index 707d021ffed4..40cf4093825f 100644 --- a/datafusion/datasource/src/file_scan_config.rs +++ b/datafusion/datasource/src/file_scan_config.rs @@ -183,21 +183,12 @@ pub struct FileScanConfigBuilder { projection: Option>, table_partition_cols: Vec, constraints: Option, - file_groups: Vec>, - - /// Estimated overall statistics of the files, taking `filters` into account. - /// Defaults to [`Statistics::new_unknown`]. statistics: Option, - - /// All equivalent lexicographical orderings that describe the schema. output_ordering: Vec, - file_compression_type: Option, - new_lines_in_values: Option, - - pub batch_size: Option, + batch_size: Option, } impl FileScanConfigBuilder { From 7e3521c5d6bb9342ad276102c3bbcd320ab8639d Mon Sep 17 00:00:00 2001 From: blaginin Date: Thu, 27 Mar 2025 13:44:17 +0000 Subject: [PATCH 09/14] Remove todo --- datafusion/datasource/src/file_scan_config.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/datafusion/datasource/src/file_scan_config.rs b/datafusion/datasource/src/file_scan_config.rs index 8c9b9f9886ff..7fc3f1273dcf 100644 --- a/datafusion/datasource/src/file_scan_config.rs +++ b/datafusion/datasource/src/file_scan_config.rs @@ -873,7 +873,6 @@ impl FileScanConfig { .collect()) } - // TODO: This function should be moved into DataSourceExec once FileScanConfig moved out of datafusion/core /// Returns a new [`DataSourceExec`] to scan the files specified by this config #[deprecated(since = "47.0.0", note = "use DataSourceExec::new instead")] pub fn build(self) -> Arc { From d13dfc525b2c37abcacb36d7c9b1b49d2481cfb7 Mon Sep 17 00:00:00 2001 From: blaginin Date: Thu, 27 Mar 2025 13:57:07 +0000 Subject: [PATCH 10/14] Add usage example --- datafusion/datasource/src/file_scan_config.rs | 49 +++++++++++++++++++ 1 file changed, 49 insertions(+) diff --git a/datafusion/datasource/src/file_scan_config.rs b/datafusion/datasource/src/file_scan_config.rs index 7fc3f1273dcf..fb2ddae2e00b 100644 --- a/datafusion/datasource/src/file_scan_config.rs +++ b/datafusion/datasource/src/file_scan_config.rs @@ -175,6 +175,55 @@ pub struct FileScanConfig { pub batch_size: Option, } +/// A builder for [`FileScanConfig`]'s. 
+///
+/// Example:
+///
+/// ```rust
+/// # use std::sync::Arc;
+/// # use arrow::datatypes::{DataType, Field, Schema};
+/// # use datafusion_datasource::file_scan_config::{FileScanConfigBuilder, FileScanConfig};
+/// # use datafusion_datasource::file_compression_type::FileCompressionType;
+/// # use datafusion_datasource::file_groups::FileGroup;
+/// # use datafusion_datasource::PartitionedFile;
+/// # use datafusion_execution::object_store::ObjectStoreUrl;
+/// # use datafusion_common::Statistics;
+/// # use datafusion_datasource::file::FileSource;
+///
+/// # fn main() {
+/// # fn set_source(file_source: impl FileSource) {
+/// // Create a schema for our Parquet files
+/// let schema = Arc::new(Schema::new(vec![
+///     Field::new("id", DataType::Int32, false),
+///     Field::new("value", DataType::Utf8, false),
+/// ]));
+///
+/// // Create a builder for scanning Parquet files from a local filesystem
+/// let config = FileScanConfigBuilder::new(
+///     ObjectStoreUrl::local_filesystem(),
+///     schema,
+///     file_source,
+/// )
+/// // Set a limit of 1000 rows
+/// .with_limit(Some(1000))
+/// // Project only the first column
+/// .with_projection(Some(vec![0]))
+/// // Add partition columns
+/// .with_table_partition_cols(vec![
+///     Field::new("date", DataType::Utf8, false),
+/// ])
+/// // Add a file group with two files
+/// .with_file_group(FileGroup::new(vec![
+///     PartitionedFile::new("data/date=2024-01-01/file1.parquet", 1024),
+///     PartitionedFile::new("data/date=2024-01-01/file2.parquet", 2048),
+/// ]))
+/// // Set compression type
+/// .with_file_compression_type(FileCompressionType::UNCOMPRESSED)
+/// // Build the final config
+/// .build();
+/// # }
+/// # }
+/// ```
 #[derive(Clone)]
 pub struct FileScanConfigBuilder {

From e1a056d3933ca384194ed75dcc26903299a8c6b2 Mon Sep 17 00:00:00 2001
From: blaginin
Date: Thu, 27 Mar 2025 14:06:08 +0000
Subject: [PATCH 11/14] Fix input type

---
 datafusion/datasource/src/file_scan_config.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/datafusion/datasource/src/file_scan_config.rs b/datafusion/datasource/src/file_scan_config.rs
index fb2ddae2e00b..839e0745d50f 100644
--- a/datafusion/datasource/src/file_scan_config.rs
+++ b/datafusion/datasource/src/file_scan_config.rs
@@ -191,7 +191,7 @@ pub struct FileScanConfig {
 /// # use datafusion_datasource::file::FileSource;
 ///
 /// # fn main() {
-/// # fn set_source(file_source: impl FileSource) {
+/// # fn with_source(file_source: Arc<dyn FileSource>) {

From 748a3354faf7e222c8acd4f28e21cc04010fb152 Mon Sep 17 00:00:00 2001
From: blaginin
Date: Thu, 27 Mar 2025 14:33:44 +0000
Subject: [PATCH 12/14] Add `from_data_source`

---
 datafusion-examples/examples/advanced_parquet_index.rs | 2 +-
 datafusion-examples/examples/parquet_index.rs | 4 +---
 datafusion/core/src/datasource/file_format/arrow.rs | 2 +-
 datafusion/core/src/datasource/memory.rs | 2 +-
 datafusion/core/src/datasource/mod.rs | 2 +-
 datafusion/core/src/datasource/physical_plan/avro.rs | 6 +++---
 datafusion/core/src/datasource/physical_plan/csv.rs | 12 ++++++------
 datafusion/core/src/datasource/physical_plan/json.rs | 8 ++++----
 datafusion/core/src/datasource/physical_plan/parquet.rs | 11 +++++------
 datafusion/core/src/test/mod.rs | 2 +-
 datafusion/core/src/test_util/parquet.rs | 4 ++--
 datafusion/core/tests/fuzz_cases/aggregate_fuzz.rs | 4 ++--
 datafusion/core/tests/fuzz_cases/pruning.rs | 2 +-
datafusion/core/tests/fuzz_cases/window_fuzz.rs | 8 ++++---- datafusion/core/tests/memory_limit/mod.rs | 2 +- datafusion/core/tests/parquet/custom_reader.rs | 2 +- .../core/tests/parquet/external_access_plan.rs | 3 +-- datafusion/core/tests/parquet/schema_coercion.rs | 4 ++-- .../tests/physical_optimizer/enforce_distribution.rs | 10 +++++----- .../core/tests/physical_optimizer/enforce_sorting.rs | 6 +++--- .../tests/physical_optimizer/projection_pushdown.rs | 6 +++--- .../core/tests/physical_optimizer/test_utils.rs | 4 ++-- datafusion/datasource-avro/src/file_format.rs | 2 +- datafusion/datasource-csv/src/file_format.rs | 2 +- datafusion/datasource-csv/src/source.rs | 2 +- datafusion/datasource-json/src/file_format.rs | 2 +- datafusion/datasource-parquet/src/file_format.rs | 2 +- datafusion/datasource-parquet/src/source.rs | 6 +++--- datafusion/datasource/src/file_scan_config.rs | 8 ++++---- datafusion/datasource/src/memory.rs | 8 ++++---- datafusion/datasource/src/source.rs | 4 ++++ datafusion/proto/src/physical_plan/mod.rs | 8 ++++---- .../proto/tests/cases/roundtrip_physical_plan.rs | 8 ++++---- datafusion/substrait/src/physical_plan/consumer.rs | 7 ++++--- .../substrait/tests/cases/roundtrip_physical_plan.rs | 2 +- 35 files changed, 84 insertions(+), 83 deletions(-) diff --git a/datafusion-examples/examples/advanced_parquet_index.rs b/datafusion-examples/examples/advanced_parquet_index.rs index d3f2f04428f1..b8c303e22161 100644 --- a/datafusion-examples/examples/advanced_parquet_index.rs +++ b/datafusion-examples/examples/advanced_parquet_index.rs @@ -507,7 +507,7 @@ impl TableProvider for IndexTableProvider { .build(); // Finally, put it all together into a DataSourceExec - Ok(Arc::new(DataSourceExec::new(Arc::new(file_scan_config)))) + Ok(DataSourceExec::from_data_source(file_scan_config)) } /// Tell DataFusion to push filters down to the scan method diff --git a/datafusion-examples/examples/parquet_index.rs b/datafusion-examples/examples/parquet_index.rs index 9772aa3b5a32..3e0bf492625e 100644 --- a/datafusion-examples/examples/parquet_index.rs +++ b/datafusion-examples/examples/parquet_index.rs @@ -258,9 +258,7 @@ impl TableProvider for IndexTableProvider { PartitionedFile::new(canonical_path.display().to_string(), file_size), ); } - Ok(Arc::new(DataSourceExec::new(Arc::new( - file_scan_config_builder.build(), - )))) + Ok(DataSourceExec::from_data_source(file_scan_config_builder.build())) } /// Tell DataFusion to push filters down to the scan method diff --git a/datafusion/core/src/datasource/file_format/arrow.rs b/datafusion/core/src/datasource/file_format/arrow.rs index e2eb21f5c554..6c7c9463cf3b 100644 --- a/datafusion/core/src/datasource/file_format/arrow.rs +++ b/datafusion/core/src/datasource/file_format/arrow.rs @@ -179,7 +179,7 @@ impl FileFormat for ArrowFormat { .with_source(source) .build(); - Ok(Arc::new(DataSourceExec::new(Arc::new(config)))) + Ok(DataSourceExec::from_data_source(config)) } async fn create_writer_physical_plan( diff --git a/datafusion/core/src/datasource/memory.rs b/datafusion/core/src/datasource/memory.rs index 27352c5146ce..0288cd3e8bc7 100644 --- a/datafusion/core/src/datasource/memory.rs +++ b/datafusion/core/src/datasource/memory.rs @@ -251,7 +251,7 @@ impl TableProvider for MemTable { source = source.try_with_sort_information(file_sort_order)?; } - Ok(Arc::new(DataSourceExec::new(Arc::new(source)))) + Ok(DataSourceExec::from_data_source(source)) } /// Returns an ExecutionPlan that inserts the execution results of a given [`ExecutionPlan`] into 
this [`MemTable`]. diff --git a/datafusion/core/src/datasource/mod.rs b/datafusion/core/src/datasource/mod.rs index 59567cfa8204..35a451cbc803 100644 --- a/datafusion/core/src/datasource/mod.rs +++ b/datafusion/core/src/datasource/mod.rs @@ -137,7 +137,7 @@ mod tests { .with_file(partitioned_file) .build(); - let parquet_exec = Arc::new(DataSourceExec::new(Arc::new(base_conf))); + let parquet_exec = DataSourceExec::from_data_source(base_conf); let session_ctx = SessionContext::new(); let task_ctx = session_ctx.task_ctx(); diff --git a/datafusion/core/src/datasource/physical_plan/avro.rs b/datafusion/core/src/datasource/physical_plan/avro.rs index f5eaa7434940..8a00af959ccc 100644 --- a/datafusion/core/src/datasource/physical_plan/avro.rs +++ b/datafusion/core/src/datasource/physical_plan/avro.rs @@ -91,7 +91,7 @@ mod tests { .with_projection(Some(vec![0, 1, 2])) .build(); - let source_exec = Arc::new(DataSourceExec::new(Arc::new(conf))); + let source_exec = DataSourceExec::from_data_source(conf); assert_eq!( source_exec .properties() @@ -163,7 +163,7 @@ mod tests { .with_projection(projection) .build(); - let source_exec = Arc::new(DataSourceExec::new(Arc::new(conf))); + let source_exec = DataSourceExec::from_data_source(conf); assert_eq!( source_exec .properties() @@ -236,7 +236,7 @@ mod tests { .with_table_partition_cols(vec![Field::new("date", DataType::Utf8, false)]) .build(); - let source_exec = Arc::new(DataSourceExec::new(Arc::new(conf))); + let source_exec = DataSourceExec::from_data_source(conf); assert_eq!( source_exec diff --git a/datafusion/core/src/datasource/physical_plan/csv.rs b/datafusion/core/src/datasource/physical_plan/csv.rs index 4383fa34bf69..5914924797dc 100644 --- a/datafusion/core/src/datasource/physical_plan/csv.rs +++ b/datafusion/core/src/datasource/physical_plan/csv.rs @@ -122,7 +122,7 @@ mod tests { .build(); assert_eq!(13, config.file_schema.fields().len()); - let csv = Arc::new(DataSourceExec::new(Arc::new(config))); + let csv = DataSourceExec::from_data_source(config); assert_eq!(3, csv.schema().fields().len()); @@ -186,7 +186,7 @@ mod tests { .with_projection(Some(vec![4, 0, 2])) .build(); assert_eq!(13, config.file_schema.fields().len()); - let csv = Arc::new(DataSourceExec::new(Arc::new(config))); + let csv = DataSourceExec::from_data_source(config); assert_eq!(3, csv.schema().fields().len()); let mut stream = csv.execute(0, task_ctx)?; @@ -251,7 +251,7 @@ mod tests { .with_limit(Some(5)) .build(); assert_eq!(13, config.file_schema.fields().len()); - let csv = Arc::new(DataSourceExec::new(Arc::new(config))); + let csv = DataSourceExec::from_data_source(config); assert_eq!(13, csv.schema().fields().len()); let mut it = csv.execute(0, task_ctx)?; @@ -314,7 +314,7 @@ mod tests { .with_limit(Some(5)) .build(); assert_eq!(14, config.file_schema.fields().len()); - let csv = Arc::new(DataSourceExec::new(Arc::new(config))); + let csv = DataSourceExec::from_data_source(config); assert_eq!(14, csv.schema().fields().len()); // errors due to https://github.com/apache/datafusion/issues/4918 @@ -380,7 +380,7 @@ mod tests { // partitions are resolved during scan anyway assert_eq!(13, config.file_schema.fields().len()); - let csv = Arc::new(DataSourceExec::new(Arc::new(config))); + let csv = DataSourceExec::from_data_source(config); assert_eq!(2, csv.schema().fields().len()); let mut it = csv.execute(0, task_ctx)?; @@ -472,7 +472,7 @@ mod tests { .with_newlines_in_values(false) .with_file_compression_type(file_compression_type.to_owned()) .build(); - let csv = 
Arc::new(DataSourceExec::new(Arc::new(config))); + let csv = DataSourceExec::from_data_source(config); let it = csv.execute(0, task_ctx).unwrap(); let batches: Vec<_> = it.try_collect().await.unwrap(); diff --git a/datafusion/core/src/datasource/physical_plan/json.rs b/datafusion/core/src/datasource/physical_plan/json.rs index 6841d86803cd..910c4316d973 100644 --- a/datafusion/core/src/datasource/physical_plan/json.rs +++ b/datafusion/core/src/datasource/physical_plan/json.rs @@ -183,7 +183,7 @@ mod tests { .with_limit(Some(3)) .with_file_compression_type(file_compression_type.to_owned()) .build(); - let exec = Arc::new(DataSourceExec::new(Arc::new(conf))); + let exec = DataSourceExec::from_data_source(conf); // TODO: this is not where schema inference should be tested @@ -258,7 +258,7 @@ mod tests { .with_limit(Some(3)) .with_file_compression_type(file_compression_type.to_owned()) .build(); - let exec = Arc::new(DataSourceExec::new(Arc::new(conf))); + let exec = DataSourceExec::from_data_source(conf); let mut it = exec.execute(0, task_ctx)?; let batch = it.next().await.unwrap()?; @@ -301,7 +301,7 @@ mod tests { .with_projection(Some(vec![0, 2])) .with_file_compression_type(file_compression_type.to_owned()) .build(); - let exec = Arc::new(DataSourceExec::new(Arc::new(conf))); + let exec = DataSourceExec::from_data_source(conf); let inferred_schema = exec.schema(); assert_eq!(inferred_schema.fields().len(), 2); @@ -349,7 +349,7 @@ mod tests { .with_projection(Some(vec![3, 0, 2])) .with_file_compression_type(file_compression_type.to_owned()) .build(); - let exec = Arc::new(DataSourceExec::new(Arc::new(conf))); + let exec = DataSourceExec::from_data_source(conf); let inferred_schema = exec.schema(); assert_eq!(inferred_schema.fields().len(), 3); diff --git a/datafusion/core/src/datasource/physical_plan/parquet.rs b/datafusion/core/src/datasource/physical_plan/parquet.rs index 57e755e391c6..9e1b2822e854 100644 --- a/datafusion/core/src/datasource/physical_plan/parquet.rs +++ b/datafusion/core/src/datasource/physical_plan/parquet.rs @@ -192,8 +192,7 @@ mod tests { let session_ctx = SessionContext::new(); let task_ctx = session_ctx.task_ctx(); - let parquet_exec = - Arc::new(DataSourceExec::new(Arc::new(base_config.clone()))); + let parquet_exec = DataSourceExec::from_data_source(base_config.clone()); RoundTripResult { batches: collect(parquet_exec.clone(), task_ctx).await, parquet_exec, @@ -1129,7 +1128,7 @@ mod tests { .with_file_groups(file_groups) .build(); - let parquet_exec = Arc::new(DataSourceExec::new(Arc::new(config))); + let parquet_exec = DataSourceExec::from_data_source(config); assert_eq!( parquet_exec .properties() @@ -1245,7 +1244,7 @@ mod tests { ]) .build(); - let parquet_exec = Arc::new(DataSourceExec::new(Arc::new(config))); + let parquet_exec = DataSourceExec::from_data_source(config); let partition_count = parquet_exec .data_source() .output_partitioning() @@ -1310,7 +1309,7 @@ mod tests { .with_file(partitioned_file) .build(); - let parquet_exec = Arc::new(DataSourceExec::new(Arc::new(config))); + let parquet_exec = DataSourceExec::from_data_source(config); let mut results = parquet_exec.execute(0, state.task_ctx())?; let batch = results.next().await.unwrap(); @@ -1968,7 +1967,7 @@ mod tests { }) .build(); - let exec = Arc::new(DataSourceExec::new(Arc::new(config))); + let exec = DataSourceExec::from_data_source(config); let res = collect(exec, ctx.task_ctx()).await.unwrap(); assert_eq!(res.len(), 2); diff --git a/datafusion/core/src/test/mod.rs 
b/datafusion/core/src/test/mod.rs index a9980fc58bfb..8719a16f4919 100644 --- a/datafusion/core/src/test/mod.rs +++ b/datafusion/core/src/test/mod.rs @@ -96,7 +96,7 @@ pub fn scan_partitioned_csv( FileScanConfigBuilder::from(partitioned_csv_config(schema, file_groups, source)) .with_file_compression_type(FileCompressionType::UNCOMPRESSED) .build(); - Ok(Arc::new(DataSourceExec::new(Arc::new(config)))) + Ok(DataSourceExec::from_data_source(config)) } /// Returns file groups [`Vec`] for scanning `partitions` of `filename` diff --git a/datafusion/core/src/test_util/parquet.rs b/datafusion/core/src/test_util/parquet.rs index 3fbba8e0c6f4..084554eecbdb 100644 --- a/datafusion/core/src/test_util/parquet.rs +++ b/datafusion/core/src/test_util/parquet.rs @@ -187,13 +187,13 @@ impl TestParquetFile { Arc::clone(&physical_filter_expr), )); let config = scan_config_builder.with_source(source).build(); - let parquet_exec = Arc::new(DataSourceExec::new(Arc::new(config))); + let parquet_exec = DataSourceExec::from_data_source(config); let exec = Arc::new(FilterExec::try_new(physical_filter_expr, parquet_exec)?); Ok(exec) } else { let config = scan_config_builder.build(); - Ok(Arc::new(DataSourceExec::new(Arc::new(config)))) + Ok(DataSourceExec::from_data_source(config)) } } diff --git a/datafusion/core/tests/fuzz_cases/aggregate_fuzz.rs b/datafusion/core/tests/fuzz_cases/aggregate_fuzz.rs index 1b98a19581ea..dcf477135a37 100644 --- a/datafusion/core/tests/fuzz_cases/aggregate_fuzz.rs +++ b/datafusion/core/tests/fuzz_cases/aggregate_fuzz.rs @@ -353,12 +353,12 @@ async fn run_aggregate_test(input1: Vec, group_by_columns: Vec<&str ) .unwrap(); - let running_source = Arc::new(DataSourceExec::new(Arc::new( + let running_source = DataSourceExec::from_data_source( MemorySourceConfig::try_new(&[input1.clone()], schema.clone(), None) .unwrap() .try_with_sort_information(vec![sort_keys]) .unwrap(), - ))); + ); let aggregate_expr = vec![ diff --git a/datafusion/core/tests/fuzz_cases/pruning.rs b/datafusion/core/tests/fuzz_cases/pruning.rs index 3211065ec902..11dd961a54ee 100644 --- a/datafusion/core/tests/fuzz_cases/pruning.rs +++ b/datafusion/core/tests/fuzz_cases/pruning.rs @@ -294,7 +294,7 @@ async fn execute_with_predicate( .collect(), ) .build(); - let exec = Arc::new(DataSourceExec::new(Arc::new(config))); + let exec = DataSourceExec::from_data_source(config); let exec = Arc::new(FilterExec::try_new(predicate, exec).unwrap()) as Arc; diff --git a/datafusion/core/tests/fuzz_cases/window_fuzz.rs b/datafusion/core/tests/fuzz_cases/window_fuzz.rs index a7f9e38c9ae3..6b166dd32782 100644 --- a/datafusion/core/tests/fuzz_cases/window_fuzz.rs +++ b/datafusion/core/tests/fuzz_cases/window_fuzz.rs @@ -636,10 +636,10 @@ async fn run_window_test( options: Default::default(), }, ]); - let mut exec1 = Arc::new(DataSourceExec::new(Arc::new( + let mut exec1 = DataSourceExec::from_data_source( MemorySourceConfig::try_new(&[vec![concat_input_record]], schema.clone(), None)? .try_with_sort_information(vec![source_sort_keys.clone()])?, - ))) as _; + ) as _; // Table is ordered according to ORDER BY a, b, c In linear test we use PARTITION BY b, ORDER BY a // For WindowAggExec to produce correct result it need table to be ordered by b,a. Hence add a sort. if is_linear { @@ -662,10 +662,10 @@ async fn run_window_test( exec1, false, )?) as _; - let exec2 = Arc::new(DataSourceExec::new(Arc::new( + let exec2 = DataSourceExec::from_data_source( MemorySourceConfig::try_new(&[input1.clone()], schema.clone(), None)? 
.try_with_sort_information(vec![source_sort_keys.clone()])?, - ))); + ); let running_window_exec = Arc::new(BoundedWindowAggExec::try_new( vec![create_window_expr( &window_fn, diff --git a/datafusion/core/tests/memory_limit/mod.rs b/datafusion/core/tests/memory_limit/mod.rs index 6a0a797d4ded..dd5acc8d8908 100644 --- a/datafusion/core/tests/memory_limit/mod.rs +++ b/datafusion/core/tests/memory_limit/mod.rs @@ -943,6 +943,6 @@ impl TableProvider for SortedTableProvider { )? .try_with_sort_information(self.sort_information.clone())?; - Ok(Arc::new(DataSourceExec::new(Arc::new(mem_conf)))) + Ok(DataSourceExec::from_data_source(mem_conf)) } } diff --git a/datafusion/core/tests/parquet/custom_reader.rs b/datafusion/core/tests/parquet/custom_reader.rs index 68d478f037bc..ce5c0d720174 100644 --- a/datafusion/core/tests/parquet/custom_reader.rs +++ b/datafusion/core/tests/parquet/custom_reader.rs @@ -94,7 +94,7 @@ async fn route_data_access_ops_to_parquet_file_reader_factory() { .with_file_group(file_group) .build(); - let parquet_exec = Arc::new(DataSourceExec::new(Arc::new(base_config))); + let parquet_exec = DataSourceExec::from_data_source(base_config); let session_ctx = SessionContext::new(); let task_ctx = session_ctx.task_ctx(); diff --git a/datafusion/core/tests/parquet/external_access_plan.rs b/datafusion/core/tests/parquet/external_access_plan.rs index ced83bae4151..bbef073345b7 100644 --- a/datafusion/core/tests/parquet/external_access_plan.rs +++ b/datafusion/core/tests/parquet/external_access_plan.rs @@ -354,8 +354,7 @@ impl TestFull { .with_file(partitioned_file) .build(); - let plan: Arc = - Arc::new(DataSourceExec::new(Arc::new(config))); + let plan: Arc = DataSourceExec::from_data_source(config); // run the DataSourceExec and collect the results let results = diff --git a/datafusion/core/tests/parquet/schema_coercion.rs b/datafusion/core/tests/parquet/schema_coercion.rs index 7214e36890e9..59cbf4b0872e 100644 --- a/datafusion/core/tests/parquet/schema_coercion.rs +++ b/datafusion/core/tests/parquet/schema_coercion.rs @@ -71,7 +71,7 @@ async fn multi_parquet_coercion() { .with_file_group(file_group) .build(); - let parquet_exec = Arc::new(DataSourceExec::new(Arc::new(conf))); + let parquet_exec = DataSourceExec::from_data_source(conf); let session_ctx = SessionContext::new(); let task_ctx = session_ctx.task_ctx(); @@ -129,7 +129,7 @@ async fn multi_parquet_coercion_projection() { .with_projection(Some(vec![1, 0, 2])) .build(); - let parquet_exec = Arc::new(DataSourceExec::new(Arc::new(config))); + let parquet_exec = DataSourceExec::from_data_source(config); let session_ctx = SessionContext::new(); let task_ctx = session_ctx.task_ctx(); diff --git a/datafusion/core/tests/physical_optimizer/enforce_distribution.rs b/datafusion/core/tests/physical_optimizer/enforce_distribution.rs index 26d3b7f26148..9898f6204e88 100644 --- a/datafusion/core/tests/physical_optimizer/enforce_distribution.rs +++ b/datafusion/core/tests/physical_optimizer/enforce_distribution.rs @@ -197,7 +197,7 @@ fn parquet_exec_multiple_sorted( .with_output_ordering(output_ordering) .build(); - Arc::new(DataSourceExec::new(Arc::new(config))) + DataSourceExec::from_data_source(config) } fn csv_exec() -> Arc { @@ -214,7 +214,7 @@ fn csv_exec_with_sort(output_ordering: Vec) -> Arc .with_output_ordering(output_ordering) .build(); - Arc::new(DataSourceExec::new(Arc::new(config))) + DataSourceExec::from_data_source(config) } fn csv_exec_multiple() -> Arc { @@ -235,7 +235,7 @@ fn csv_exec_multiple_sorted(output_ordering: 
Vec) -> Arc Result<()> { }; let plan = aggregate_exec_with_alias( - Arc::new(DataSourceExec::new(Arc::new( + DataSourceExec::from_data_source( FileScanConfigBuilder::new( ObjectStoreUrl::parse("test:///").unwrap(), schema(), @@ -2544,7 +2544,7 @@ fn parallelization_compressed_csv() -> Result<()> { .with_file(PartitionedFile::new("x".to_string(), 100)) .with_file_compression_type(compression_type) .build(), - ))), + ), vec![("a".to_string(), "a".to_string())], ); let test_config = TestConfig::default() diff --git a/datafusion/core/tests/physical_optimizer/enforce_sorting.rs b/datafusion/core/tests/physical_optimizer/enforce_sorting.rs index 733e9a3e9e20..4d2c875d3f1d 100644 --- a/datafusion/core/tests/physical_optimizer/enforce_sorting.rs +++ b/datafusion/core/tests/physical_optimizer/enforce_sorting.rs @@ -76,7 +76,7 @@ fn csv_exec_ordered( .with_output_ordering(vec![sort_exprs]) .build(); - Arc::new(DataSourceExec::new(Arc::new(config))) + DataSourceExec::from_data_source(config) } /// Created a sorted parquet exec @@ -96,7 +96,7 @@ pub fn parquet_exec_sorted( .with_output_ordering(vec![sort_exprs]) .build(); - Arc::new(DataSourceExec::new(Arc::new(config))) + DataSourceExec::from_data_source(config) } /// Create a sorted Csv exec @@ -115,7 +115,7 @@ fn csv_exec_sorted( .with_output_ordering(vec![sort_exprs]) .build(); - Arc::new(DataSourceExec::new(Arc::new(config))) + DataSourceExec::from_data_source(config) } /// Runs the sort enforcement optimizer and asserts the plan diff --git a/datafusion/core/tests/physical_optimizer/projection_pushdown.rs b/datafusion/core/tests/physical_optimizer/projection_pushdown.rs index 95037a9da91c..911d2c0cee05 100644 --- a/datafusion/core/tests/physical_optimizer/projection_pushdown.rs +++ b/datafusion/core/tests/physical_optimizer/projection_pushdown.rs @@ -382,7 +382,7 @@ fn create_simple_csv_exec() -> Arc { .with_projection(Some(vec![0, 1, 2, 3, 4])) .build(); - Arc::new(DataSourceExec::new(Arc::new(config))) + DataSourceExec::from_data_source(config) } fn create_projecting_csv_exec() -> Arc { @@ -401,7 +401,7 @@ fn create_projecting_csv_exec() -> Arc { .with_projection(Some(vec![3, 2, 1])) .build(); - Arc::new(DataSourceExec::new(Arc::new(config))) + DataSourceExec::from_data_source(config) } fn create_projecting_memory_exec() -> Arc { @@ -1413,7 +1413,7 @@ fn partitioned_data_source() -> Arc { .with_projection(Some(vec![0, 1, 2])) .build(); - Arc::new(DataSourceExec::new(Arc::new(config))) + DataSourceExec::from_data_source(config) } #[test] diff --git a/datafusion/core/tests/physical_optimizer/test_utils.rs b/datafusion/core/tests/physical_optimizer/test_utils.rs index 2ef2eaed62f8..4587f99989d3 100644 --- a/datafusion/core/tests/physical_optimizer/test_utils.rs +++ b/datafusion/core/tests/physical_optimizer/test_utils.rs @@ -77,7 +77,7 @@ pub fn parquet_exec(schema: &SchemaRef) -> Arc { .with_file(PartitionedFile::new("x".to_string(), 100)) .build(); - Arc::new(DataSourceExec::new(Arc::new(config))) + DataSourceExec::from_data_source(config) } /// Create a single parquet file that is sorted @@ -93,7 +93,7 @@ pub(crate) fn parquet_exec_with_sort( .with_output_ordering(output_ordering) .build(); - Arc::new(DataSourceExec::new(Arc::new(config))) + DataSourceExec::from_data_source(config) } pub fn schema() -> SchemaRef { diff --git a/datafusion/datasource-avro/src/file_format.rs b/datafusion/datasource-avro/src/file_format.rs index 5a2027acd287..4b50fee1d326 100644 --- a/datafusion/datasource-avro/src/file_format.rs +++ 
b/datafusion/datasource-avro/src/file_format.rs @@ -155,7 +155,7 @@ impl FileFormat for AvroFormat { let config = FileScanConfigBuilder::from(conf) .with_source(self.file_source()) .build(); - Ok(Arc::new(DataSourceExec::new(Arc::new(config)))) + Ok(DataSourceExec::from_data_source(config)) } fn file_source(&self) -> Arc { diff --git a/datafusion/datasource-csv/src/file_format.rs b/datafusion/datasource-csv/src/file_format.rs index 2f9ff7ec0b5c..76f3c50a70a7 100644 --- a/datafusion/datasource-csv/src/file_format.rs +++ b/datafusion/datasource-csv/src/file_format.rs @@ -433,7 +433,7 @@ impl FileFormat for CsvFormat { let config = conf_builder.with_source(source).build(); - Ok(Arc::new(DataSourceExec::new(Arc::new(config)))) + Ok(DataSourceExec::from_data_source(config)) } async fn create_writer_physical_plan( diff --git a/datafusion/datasource-csv/src/source.rs b/datafusion/datasource-csv/src/source.rs index e6e151d62270..6db4d1870320 100644 --- a/datafusion/datasource-csv/src/source.rs +++ b/datafusion/datasource-csv/src/source.rs @@ -428,7 +428,7 @@ impl ExecutionPlan for CsvExec { /// .with_file(PartitionedFile::new("file1.csv", 100*1024*1024)) /// .with_newlines_in_values(true) // The file contains newlines in values; /// .build(); -/// let exec = Arc::new(DataSourceExec::new(Arc::new(config))); +/// let exec = (DataSourceExec::from_data_source(config)); /// ``` #[derive(Debug, Clone, Default)] pub struct CsvSource { diff --git a/datafusion/datasource-json/src/file_format.rs b/datafusion/datasource-json/src/file_format.rs index 81ba9f061b70..a6c52312e412 100644 --- a/datafusion/datasource-json/src/file_format.rs +++ b/datafusion/datasource-json/src/file_format.rs @@ -257,7 +257,7 @@ impl FileFormat for JsonFormat { )) .with_source(source) .build(); - Ok(Arc::new(DataSourceExec::new(Arc::new(conf)))) + Ok(DataSourceExec::from_data_source(conf)) } async fn create_writer_physical_plan( diff --git a/datafusion/datasource-parquet/src/file_format.rs b/datafusion/datasource-parquet/src/file_format.rs index 7b3954d4204e..1d9a67fd2eb6 100644 --- a/datafusion/datasource-parquet/src/file_format.rs +++ b/datafusion/datasource-parquet/src/file_format.rs @@ -421,7 +421,7 @@ impl FileFormat for ParquetFormat { let conf = FileScanConfigBuilder::from(conf) .with_source(Arc::new(source)) .build(); - Ok(Arc::new(DataSourceExec::new(Arc::new(conf)))) + Ok(DataSourceExec::from_data_source(conf)) } async fn create_writer_physical_plan( diff --git a/datafusion/datasource-parquet/src/source.rs b/datafusion/datasource-parquet/src/source.rs index e7d5c628c68e..d67d0698a959 100644 --- a/datafusion/datasource-parquet/src/source.rs +++ b/datafusion/datasource-parquet/src/source.rs @@ -95,7 +95,7 @@ use object_store::ObjectStore; /// // Create a DataSourceExec for reading `file1.parquet` with a file size of 100MB /// let config = FileScanConfigBuilder::new(object_store_url, file_schema, source) /// .with_file(PartitionedFile::new("file1.parquet", 100*1024*1024)).build(); -/// let exec = Arc::new(DataSourceExec::new(Arc::new(config))); +/// let exec = DataSourceExec::from_data_source(config); /// ``` /// /// # Features @@ -177,7 +177,7 @@ use object_store::ObjectStore; /// .clone() /// .with_file_groups(vec![file_group.clone()]); /// -/// Arc::new(DataSourceExec::new(Arc::new(new_config))) +/// (DataSourceExec::from_data_source(new_config)) /// }) /// .collect::>(); /// ``` @@ -220,7 +220,7 @@ use object_store::ObjectStore; /// .with_file(partitioned_file).build(); /// // this parquet DataSourceExec will not even try 
to read row groups 2 and 4. Additional /// // pruning based on predicates may also happen -/// let exec = Arc::new(DataSourceExec::new(Arc::new(config))); +/// let exec = DataSourceExec::from_data_source(config); /// ``` /// /// For a complete example, see the [`advanced_parquet_index` example]). diff --git a/datafusion/datasource/src/file_scan_config.rs b/datafusion/datasource/src/file_scan_config.rs index 839e0745d50f..dcf8ba02a9ef 100644 --- a/datafusion/datasource/src/file_scan_config.rs +++ b/datafusion/datasource/src/file_scan_config.rs @@ -119,7 +119,7 @@ use crate::{ /// PartitionedFile::new("file3.parquet", 78), /// ])).build(); /// // create an execution plan from the config -/// let plan: Arc = Arc::new(DataSourceExec::new(Arc::new(config))); +/// let plan: Arc = DataSourceExec::from_data_source(config); /// ``` #[derive(Clone)] pub struct FileScanConfig { @@ -571,13 +571,13 @@ impl DataSource for FileScanConfig { .clone() .unwrap_or((0..self.file_schema.fields().len()).collect()), ); - Arc::new(DataSourceExec::new(Arc::new( + DataSourceExec::from_data_source( FileScanConfigBuilder::from(file_scan) // Assign projected statistics to source .with_projection(Some(new_projections)) .with_source(source) .build(), - ))) as _ + ) as _ })) } } @@ -925,7 +925,7 @@ impl FileScanConfig { /// Returns a new [`DataSourceExec`] to scan the files specified by this config #[deprecated(since = "47.0.0", note = "use DataSourceExec::new instead")] pub fn build(self) -> Arc { - Arc::new(DataSourceExec::new(Arc::new(self))) + DataSourceExec::from_data_source(self) } /// Write the data_type based on file_source diff --git a/datafusion/datasource/src/memory.rs b/datafusion/datasource/src/memory.rs index 64fd56971b29..f2e36672cd5c 100644 --- a/datafusion/datasource/src/memory.rs +++ b/datafusion/datasource/src/memory.rs @@ -521,7 +521,7 @@ impl MemorySourceConfig { projection: Option>, ) -> Result> { let source = Self::try_new(partitions, schema, projection)?; - Ok(Arc::new(DataSourceExec::new(Arc::new(source)))) + Ok(DataSourceExec::from_data_source(source)) } /// Create a new execution plan from a list of constant values (`ValuesExec`) @@ -611,7 +611,7 @@ impl MemorySourceConfig { show_sizes: true, fetch: None, }; - Ok(Arc::new(DataSourceExec::new(Arc::new(source)))) + Ok(DataSourceExec::from_data_source(source)) } /// Set the limit of the files @@ -760,10 +760,10 @@ mod memory_source_tests { expected_output_order.extend(sort2.clone()); let sort_information = vec![sort1.clone(), sort2.clone()]; - let mem_exec = Arc::new(DataSourceExec::new(Arc::new( + let mem_exec = DataSourceExec::from_data_source( MemorySourceConfig::try_new(&[vec![]], schema, None)? 
.try_with_sort_information(sort_information)?, - ))); + ); assert_eq!( mem_exec.properties().output_ordering().unwrap(),
diff --git a/datafusion/datasource/src/source.rs b/datafusion/datasource/src/source.rs
index 74fd123e8df9..6c9122ce1ac1 100644
--- a/datafusion/datasource/src/source.rs
+++ b/datafusion/datasource/src/source.rs
@@ -195,6 +195,10 @@ impl ExecutionPlan for DataSourceExec { } impl DataSourceExec { + pub fn from_data_source(data_source: impl DataSource + 'static) -> Arc<Self> { + Arc::new(Self::new(Arc::new(data_source))) + } + pub fn new(data_source: Arc<dyn DataSource>) -> Self { let cache = Self::compute_properties(Arc::clone(&data_source)); Self { data_source, cache }
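The added constructor above is the heart of the series: it folds the `Arc::new(DataSourceExec::new(Arc::new(..)))` nesting that every call site used to spell out into a single call, taking any `impl DataSource + 'static` by value so callers never name the intermediate `Arc<dyn DataSource>`. A minimal sketch of the two equivalent spellings, using `MemorySourceConfig` as the data source — the schema and batch are illustrative, and the import paths assume the `datafusion::datasource::memory` re-exports used earlier in this series:

```rust
use std::sync::Arc;

use arrow::array::{ArrayRef, Int32Array, RecordBatch};
use arrow::datatypes::{DataType, Field, Schema};
use datafusion::datasource::memory::{DataSourceExec, MemorySourceConfig};
use datafusion::error::Result;

fn main() -> Result<()> {
    // Illustrative single-column, single-batch in-memory source.
    let schema = Arc::new(Schema::new(vec![Field::new("id", DataType::Int32, false)]));
    let columns: Vec<ArrayRef> = vec![Arc::new(Int32Array::from(vec![1, 2, 3]))];
    let batch = RecordBatch::try_new(Arc::clone(&schema), columns)?;
    let source = MemorySourceConfig::try_new(&[vec![batch]], schema, None)?;

    // Equivalent to the old spelling removed throughout this patch:
    //     Arc::new(DataSourceExec::new(Arc::new(source)))
    // but both Arc allocations now happen inside the constructor.
    let _exec = DataSourceExec::from_data_source(source);
    Ok(())
}
```

Accepting `impl DataSource + 'static` rather than `Arc<dyn DataSource>` also lets the compiler infer the concrete source type, which is what makes the one-line call sites in the proto and substrait hunks below possible.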
diff --git a/datafusion/proto/src/physical_plan/mod.rs b/datafusion/proto/src/physical_plan/mod.rs
index b3958679bfa7..24cc0d5b3b02 100644
--- a/datafusion/proto/src/physical_plan/mod.rs
+++ b/datafusion/proto/src/physical_plan/mod.rs
@@ -247,7 +247,7 @@ impl AsExecutionPlan for protobuf::PhysicalPlanNode { .with_newlines_in_values(scan.newlines_in_values) .with_file_compression_type(FileCompressionType::UNCOMPRESSED) .build(); - Ok(Arc::new(DataSourceExec::new(Arc::new(conf)))) + Ok(DataSourceExec::from_data_source(conf)) } PhysicalPlanType::JsonScan(scan) => { let scan_conf = parse_protobuf_file_scan_config(
@@ -256,7 +256,7 @@ impl AsExecutionPlan for protobuf::PhysicalPlanNode { extension_codec, Arc::new(JsonSource::new()), )?; - Ok(Arc::new(DataSourceExec::new(Arc::new(scan_conf)))) + Ok(DataSourceExec::from_data_source(scan_conf)) } #[cfg_attr(not(feature = "parquet"), allow(unused_variables))] PhysicalPlanType::ParquetScan(scan) => {
@@ -293,7 +293,7 @@ impl AsExecutionPlan for protobuf::PhysicalPlanNode { extension_codec, Arc::new(source), )?; - Ok(Arc::new(DataSourceExec::new(Arc::new(base_config)))) + Ok(DataSourceExec::from_data_source(base_config)) } #[cfg(not(feature = "parquet"))] panic!("Unable to process a Parquet PhysicalPlan when `parquet` feature is not enabled")
@@ -308,7 +308,7 @@ impl AsExecutionPlan for protobuf::PhysicalPlanNode { extension_codec, Arc::new(AvroSource::new()), )?; - Ok(Arc::new(DataSourceExec::new(Arc::new(conf)))) + Ok(DataSourceExec::from_data_source(conf)) } #[cfg(not(feature = "avro"))] panic!("Unable to process a Avro PhysicalPlan when `avro` feature is not enabled")
diff --git a/datafusion/proto/tests/cases/roundtrip_physical_plan.rs b/datafusion/proto/tests/cases/roundtrip_physical_plan.rs
index d9a559b00cbc..e3d411e4d40e 100644
--- a/datafusion/proto/tests/cases/roundtrip_physical_plan.rs
+++ b/datafusion/proto/tests/cases/roundtrip_physical_plan.rs
@@ -761,7 +761,7 @@ fn roundtrip_parquet_exec_with_pruning_predicate() -> Result<()> { }) .build(); - roundtrip_test(Arc::new(DataSourceExec::new(Arc::new(scan_config)))) + roundtrip_test(DataSourceExec::from_data_source(scan_config)) } #[tokio::test]
@@ -788,7 +788,7 @@ async fn roundtrip_parquet_exec_with_table_partition_cols() -> Result<()> { .with_newlines_in_values(false) .build(); - roundtrip_test(Arc::new(DataSourceExec::new(Arc::new(scan_config)))) + roundtrip_test(DataSourceExec::from_data_source(scan_config)) } #[test]
@@ -932,7 +932,7 @@ fn roundtrip_parquet_exec_with_custom_predicate_expr() -> Result<()> { } } - let exec_plan = Arc::new(DataSourceExec::new(Arc::new(scan_config))); + let exec_plan = DataSourceExec::from_data_source(scan_config); let ctx = SessionContext::new(); roundtrip_test_and_return(exec_plan, &ctx, &CustomPhysicalExtensionCodec {})?;
@@ -1634,7 +1634,7 @@ async fn roundtrip_projection_source() -> Result<()> { let filter = Arc::new( FilterExec::try_new( Arc::new(BinaryExpr::new(col("c", &schema)?, Operator::Eq, lit(1))), - Arc::new(DataSourceExec::new(Arc::new(scan_config))), + DataSourceExec::from_data_source(scan_config), )? .with_projection(Some(vec![0, 1]))?, );
diff --git a/datafusion/substrait/src/physical_plan/consumer.rs b/datafusion/substrait/src/physical_plan/consumer.rs
index bb90acca4e6e..4990054ac7fc 100644
--- a/datafusion/substrait/src/physical_plan/consumer.rs
+++ b/datafusion/substrait/src/physical_plan/consumer.rs
@@ -156,9 +156,10 @@ pub async fn from_substrait_rel( } } - Ok(Arc::new(DataSourceExec::new(Arc::new( - base_config_builder.build(), - ))) as Arc<dyn ExecutionPlan>) + Ok( + DataSourceExec::from_data_source(base_config_builder.build()) + as Arc<dyn ExecutionPlan>, + ) } _ => not_impl_err!( "Only LocalFile reads are supported when parsing physical"
diff --git a/datafusion/substrait/tests/cases/roundtrip_physical_plan.rs b/datafusion/substrait/tests/cases/roundtrip_physical_plan.rs
index 8dd6076f6dd8..64599465f96f 100644
--- a/datafusion/substrait/tests/cases/roundtrip_physical_plan.rs
+++ b/datafusion/substrait/tests/cases/roundtrip_physical_plan.rs
@@ -54,7 +54,7 @@ async fn parquet_exec() -> Result<()> { ]) .build(); let parquet_exec: Arc<dyn ExecutionPlan> = - Arc::new(DataSourceExec::new(Arc::new(scan_config))); + DataSourceExec::from_data_source(scan_config); let mut extension_info: ( Vec,

From 958f81e5987fd42511a6888f74a25305a0f86897 Mon Sep 17 00:00:00 2001
From: blaginin
Date: Thu, 27 Mar 2025 14:34:45 +0000
Subject: [PATCH 13/14] Fmt

--- datafusion-examples/examples/parquet_index.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/datafusion-examples/examples/parquet_index.rs b/datafusion-examples/examples/parquet_index.rs
index 3e0bf492625e..0b6bccc27b1d 100644
--- a/datafusion-examples/examples/parquet_index.rs
+++ b/datafusion-examples/examples/parquet_index.rs
@@ -258,7 +258,9 @@ impl TableProvider for IndexTableProvider { PartitionedFile::new(canonical_path.display().to_string(), file_size), ); } - Ok(DataSourceExec::from_data_source(file_scan_config_builder.build())) + Ok(DataSourceExec::from_data_source( + file_scan_config_builder.build(), + )) } /// Tell DataFusion to push filters down to the scan method

From 45b61751d7dd6b93c821a2be8503be87a8a67d32 Mon Sep 17 00:00:00 2001
From: blaginin
Date: Thu, 27 Mar 2025 14:49:06 +0000
Subject: [PATCH 14/14] Add docs for `with_source`

--- datafusion/datasource/src/file_scan_config.rs | 4 ++++ 1 file changed, 4 insertions(+)

diff --git a/datafusion/datasource/src/file_scan_config.rs b/datafusion/datasource/src/file_scan_config.rs
index dcf8ba02a9ef..5172dafb1f91 100644
--- a/datafusion/datasource/src/file_scan_config.rs
+++ b/datafusion/datasource/src/file_scan_config.rs
@@ -278,6 +278,10 @@ impl FileScanConfigBuilder { self } + /// Set the file source for scanning files. + /// + /// This method allows you to change the file source implementation (e.g. ParquetSource, CsvSource, etc.) + /// after the builder has been created. pub fn with_source(mut self, file_source: Arc<dyn FileSource>) -> Self { self.file_source = file_source; self
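A sketch of the behaviour the new `with_source` doc comment describes: the source chosen at construction can be swapped before `build()`. The CSV-to-JSON swap, the column, and the import paths here are illustrative only; `CsvSource::default()` and `JsonSource::new()` are the constructors already used elsewhere in this series:

```rust
use std::sync::Arc;

use arrow::datatypes::{DataType, Field, Schema};
use datafusion::datasource::physical_plan::{
    CsvSource, FileScanConfigBuilder, JsonSource,
};
use datafusion::execution::object_store::ObjectStoreUrl;

fn main() {
    let schema = Arc::new(Schema::new(vec![Field::new("c1", DataType::Utf8, false)]));

    // Builder starts out configured for CSV...
    let builder = FileScanConfigBuilder::new(
        ObjectStoreUrl::local_filesystem(),
        schema,
        Arc::new(CsvSource::default()),
    );

    // ...and with_source retargets it at newline-delimited JSON
    // before the final FileScanConfig is built.
    let config = builder.with_source(Arc::new(JsonSource::new())).build();
    let _ = config;
}
```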