From 37b55265904414368c81e227072c9954c0fa48cc Mon Sep 17 00:00:00 2001 From: Jia-Xuan Liu Date: Fri, 21 Jun 2024 01:44:42 +0800 Subject: [PATCH 01/53] early draft --- .../examples/dynamic_csv_sql.rs | 47 ++++++++ datafusion/core/Cargo.toml | 1 + .../core/src/catalog/dynamic_file_schema.rs | 111 ++++++++++++++++++ datafusion/core/src/catalog/mod.rs | 1 + datafusion/core/src/execution/context/mod.rs | 9 +- .../core/src/execution/session_state.rs | 6 +- 6 files changed, 172 insertions(+), 3 deletions(-) create mode 100644 datafusion-examples/examples/dynamic_csv_sql.rs create mode 100644 datafusion/core/src/catalog/dynamic_file_schema.rs diff --git a/datafusion-examples/examples/dynamic_csv_sql.rs b/datafusion-examples/examples/dynamic_csv_sql.rs new file mode 100644 index 000000000000..d2755c494019 --- /dev/null +++ b/datafusion-examples/examples/dynamic_csv_sql.rs @@ -0,0 +1,47 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use datafusion::error::Result; +use datafusion::prelude::*; + +/// This example demonstrates executing a simple query against an Arrow data source (CSV) and +/// fetching results +#[tokio::main] +async fn main() -> Result<()> { + // create local execution context + let ctx = SessionContext::new(); + + let testdata = datafusion::test_util::arrow_test_data(); + let path = &format!("file:///{testdata}/csv/aggregate_test_100.csv"); + // execute the query + let df = ctx + .sql( + format!( + r#"SELECT column_1, MIN(column_12), MAX(column_12) + FROM '{}' + WHERE column_11 > 0.1 AND column_11 < 0.9 + GROUP BY column_1"#, + path + ) + .as_str(), + ) + .await?; + + // print the results + df.show().await?; + Ok(()) +} diff --git a/datafusion/core/Cargo.toml b/datafusion/core/Cargo.toml index 45617d88dc0c..1574f40ff92d 100644 --- a/datafusion/core/Cargo.toml +++ b/datafusion/core/Cargo.toml @@ -108,6 +108,7 @@ datafusion-physical-expr = { workspace = true } datafusion-physical-expr-common = { workspace = true } datafusion-physical-plan = { workspace = true } datafusion-sql = { workspace = true } +dirs = "4.0.0" flate2 = { version = "1.0.24", optional = true } futures = { workspace = true } glob = "0.3.0" diff --git a/datafusion/core/src/catalog/dynamic_file_schema.rs b/datafusion/core/src/catalog/dynamic_file_schema.rs new file mode 100644 index 000000000000..d91b0711bd4b --- /dev/null +++ b/datafusion/core/src/catalog/dynamic_file_schema.rs @@ -0,0 +1,111 @@ +use std::any::Any; +use std::sync::{Arc, Weak}; + +use async_trait::async_trait; +use dirs::home_dir; +use parking_lot::{Mutex, RwLock}; + +use datafusion_common::plan_datafusion_err; + +use crate::catalog::schema::SchemaProvider; +use crate::datasource::listing::{ListingTable, ListingTableConfig, ListingTableUrl}; +use crate::datasource::TableProvider; +use crate::error::Result; +use crate::execution::context::SessionState; + +/// Wraps another schema provider +pub struct DynamicFileSchemaProvider { + inner: Arc, + 
state_store: StateStore +} + + +impl DynamicFileSchemaProvider { + pub fn new(inner: Arc) -> Self { + Self { + inner, + state_store: StateStore::new(), + } + } + + pub fn with_state(&self, state: Weak>) { + self.state_store.with_state(state); + } +} + +#[async_trait] +impl SchemaProvider for DynamicFileSchemaProvider { + fn as_any(&self) -> &dyn Any { + self + } + + fn table_names(&self) -> Vec { + self.inner.table_names() + } + + fn register_table( + &self, + name: String, + table: Arc, + ) -> Result>> { + self.inner.register_table(name, table) + } + + async fn table(&self, name: &str) -> Result>> { + let inner_table = self.inner.table(name).await?; + if inner_table.is_some() { + return Ok(inner_table); + } + let optimized_url = substitute_tilde(name.to_owned()); + let table_url = ListingTableUrl::parse(optimized_url.as_str())?; + let state = &self.state_store.get_state() + .upgrade() + .ok_or_else(|| plan_datafusion_err!("locking error"))? + .read() + .clone(); + let cfg = ListingTableConfig::new(table_url.clone()) + .infer(&state) + .await?; + + Ok(Some(Arc::new(ListingTable::try_new(cfg)?))) + } + + fn deregister_table(&self, name: &str) -> Result>> { + self.inner.deregister_table(name) + } + + fn table_exist(&self, name: &str) -> bool { + self.inner.table_exist(name) + } +} +fn substitute_tilde(cur: String) -> String { + if let Some(usr_dir_path) = home_dir() { + if let Some(usr_dir) = usr_dir_path.to_str() { + if cur.starts_with('~') && !usr_dir.is_empty() { + return cur.replacen('~', usr_dir, 1); + } + } + } + cur +} + +pub struct StateStore { + state: Arc>>>> +} + +impl StateStore { + pub fn new() -> Self { + Self { + state: Arc::new(Mutex::new(None)) + } + } + + pub fn with_state(&self, state: Weak>) { + let mut lock = self.state.lock(); + *lock = Some(state); + } + + pub fn get_state(&self) -> Weak> { + self.state.lock().clone().unwrap() + } +} \ No newline at end of file diff --git a/datafusion/core/src/catalog/mod.rs b/datafusion/core/src/catalog/mod.rs 
index 53b133339924..5ddb8de76efa 100644 --- a/datafusion/core/src/catalog/mod.rs +++ b/datafusion/core/src/catalog/mod.rs @@ -17,6 +17,7 @@ //! Interfaces and default implementations of catalogs and schemas. +pub mod dynamic_file_schema; pub mod information_schema; pub mod listing_schema; pub mod schema; diff --git a/datafusion/core/src/execution/context/mod.rs b/datafusion/core/src/execution/context/mod.rs index 6fa83d3d931e..1e39e619c5bb 100644 --- a/datafusion/core/src/execution/context/mod.rs +++ b/datafusion/core/src/execution/context/mod.rs @@ -75,6 +75,7 @@ use url::Url; pub use datafusion_execution::config::SessionConfig; pub use datafusion_execution::TaskContext; pub use datafusion_expr::execution_props::ExecutionProps; +use crate::catalog::dynamic_file_schema::DynamicFileSchemaProvider; mod avro; mod csv; @@ -305,10 +306,14 @@ impl SessionContext { /// Creates a new `SessionContext` using the provided [`SessionState`] pub fn new_with_state(state: SessionState) -> Self { + let state_ref = Arc::new(RwLock::new(state.clone())); + state.schema_for_ref("datafusion.public.xx").unwrap() + .as_any().downcast_ref::().unwrap() + .with_state(Arc::downgrade(&state_ref)); Self { - session_id: state.session_id().to_string(), + session_id: state_ref.clone().read().session_id().to_string(), session_start_time: Utc::now(), - state: Arc::new(RwLock::new(state)), + state: state_ref, } } diff --git a/datafusion/core/src/execution/session_state.rs b/datafusion/core/src/execution/session_state.rs index 1df77a1f9e0b..21f4f86e4007 100644 --- a/datafusion/core/src/execution/session_state.rs +++ b/datafusion/core/src/execution/session_state.rs @@ -17,6 +17,7 @@ //! 
[`SessionState`]: information required to run queries in a session +use crate::catalog::dynamic_file_schema::{DynamicFileSchemaProvider}; use crate::catalog::information_schema::{InformationSchemaProvider, INFORMATION_SCHEMA}; use crate::catalog::listing_schema::ListingSchemaProvider; use crate::catalog::schema::{MemorySchemaProvider, SchemaProvider}; @@ -196,11 +197,14 @@ impl SessionState { if config.create_default_catalog_and_schema() { let default_catalog = MemoryCatalogProvider::new(); + let schema = DynamicFileSchemaProvider::new( + Arc::new(MemorySchemaProvider::new()), + ); default_catalog .register_schema( &config.options().catalog.default_schema, - Arc::new(MemorySchemaProvider::new()), + Arc::new(schema), ) .expect("memory catalog provider can register schema"); From ad1a854a2516433c547f40e0059c90a1088a0f0e Mon Sep 17 00:00:00 2001 From: Jia-Xuan Liu Date: Fri, 21 Jun 2024 01:50:28 +0800 Subject: [PATCH 02/53] fmt --- .../core/src/catalog/dynamic_file_schema.rs | 19 +++++++++++++------ datafusion/core/src/execution/context/mod.rs | 12 ++++++++---- .../core/src/execution/session_state.rs | 7 +++---- 3 files changed, 24 insertions(+), 14 deletions(-) diff --git a/datafusion/core/src/catalog/dynamic_file_schema.rs b/datafusion/core/src/catalog/dynamic_file_schema.rs index d91b0711bd4b..6e151af1469f 100644 --- a/datafusion/core/src/catalog/dynamic_file_schema.rs +++ b/datafusion/core/src/catalog/dynamic_file_schema.rs @@ -16,10 +16,9 @@ use crate::execution::context::SessionState; /// Wraps another schema provider pub struct DynamicFileSchemaProvider { inner: Arc, - state_store: StateStore + state_store: StateStore, } - impl DynamicFileSchemaProvider { pub fn new(inner: Arc) -> Self { Self { @@ -58,13 +57,15 @@ impl SchemaProvider for DynamicFileSchemaProvider { } let optimized_url = substitute_tilde(name.to_owned()); let table_url = ListingTableUrl::parse(optimized_url.as_str())?; - let state = &self.state_store.get_state() + let state = &self + 
.state_store + .get_state() .upgrade() .ok_or_else(|| plan_datafusion_err!("locking error"))? .read() .clone(); let cfg = ListingTableConfig::new(table_url.clone()) - .infer(&state) + .infer(state) .await?; Ok(Some(Arc::new(ListingTable::try_new(cfg)?))) @@ -90,13 +91,13 @@ fn substitute_tilde(cur: String) -> String { } pub struct StateStore { - state: Arc>>>> + state: Arc>>>>, } impl StateStore { pub fn new() -> Self { Self { - state: Arc::new(Mutex::new(None)) + state: Arc::new(Mutex::new(None)), } } @@ -108,4 +109,10 @@ impl StateStore { pub fn get_state(&self) -> Weak> { self.state.lock().clone().unwrap() } +} + +impl Default for StateStore { + fn default() -> Self { + Self::new() + } } \ No newline at end of file diff --git a/datafusion/core/src/execution/context/mod.rs b/datafusion/core/src/execution/context/mod.rs index 1e39e619c5bb..61fd570af2f9 100644 --- a/datafusion/core/src/execution/context/mod.rs +++ b/datafusion/core/src/execution/context/mod.rs @@ -72,10 +72,10 @@ use object_store::ObjectStore; use parking_lot::RwLock; use url::Url; +use crate::catalog::dynamic_file_schema::DynamicFileSchemaProvider; pub use datafusion_execution::config::SessionConfig; pub use datafusion_execution::TaskContext; pub use datafusion_expr::execution_props::ExecutionProps; -use crate::catalog::dynamic_file_schema::DynamicFileSchemaProvider; mod avro; mod csv; @@ -306,9 +306,13 @@ impl SessionContext { /// Creates a new `SessionContext` using the provided [`SessionState`] pub fn new_with_state(state: SessionState) -> Self { - let state_ref = Arc::new(RwLock::new(state.clone())); - state.schema_for_ref("datafusion.public.xx").unwrap() - .as_any().downcast_ref::().unwrap() + let state_ref = Arc::new(RwLock::new(state.clone())); + state + .schema_for_ref("datafusion.public.xx") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap() .with_state(Arc::downgrade(&state_ref)); Self { session_id: state_ref.clone().read().session_id().to_string(), diff --git 
a/datafusion/core/src/execution/session_state.rs b/datafusion/core/src/execution/session_state.rs index 21f4f86e4007..614bac4acfb7 100644 --- a/datafusion/core/src/execution/session_state.rs +++ b/datafusion/core/src/execution/session_state.rs @@ -17,7 +17,7 @@ //! [`SessionState`]: information required to run queries in a session -use crate::catalog::dynamic_file_schema::{DynamicFileSchemaProvider}; +use crate::catalog::dynamic_file_schema::DynamicFileSchemaProvider; use crate::catalog::information_schema::{InformationSchemaProvider, INFORMATION_SCHEMA}; use crate::catalog::listing_schema::ListingSchemaProvider; use crate::catalog::schema::{MemorySchemaProvider, SchemaProvider}; @@ -197,9 +197,8 @@ impl SessionState { if config.create_default_catalog_and_schema() { let default_catalog = MemoryCatalogProvider::new(); - let schema = DynamicFileSchemaProvider::new( - Arc::new(MemorySchemaProvider::new()), - ); + let schema = + DynamicFileSchemaProvider::new(Arc::new(MemorySchemaProvider::new())); default_catalog .register_schema( From 97ea11c338d964ecd4925d03945ba119cc34aa97 Mon Sep 17 00:00:00 2001 From: Jia-Xuan Liu Date: Fri, 21 Jun 2024 23:44:33 +0800 Subject: [PATCH 03/53] add example for dynamic file query --- datafusion-examples/examples/csv_sql.rs | 38 ++++++++++----- .../examples/dynamic_csv_sql.rs | 47 ------------------- .../external_dependency/query-aws-s3.rs | 8 ++++ 3 files changed, 34 insertions(+), 59 deletions(-) delete mode 100644 datafusion-examples/examples/dynamic_csv_sql.rs diff --git a/datafusion-examples/examples/csv_sql.rs b/datafusion-examples/examples/csv_sql.rs index 851fdcb626d2..d19cfa16499e 100644 --- a/datafusion-examples/examples/csv_sql.rs +++ b/datafusion-examples/examples/csv_sql.rs @@ -24,25 +24,39 @@ use datafusion::prelude::*; #[tokio::main] async fn main() -> Result<()> { // create local execution context - let ctx = SessionContext::new(); + let cfg = SessionConfig::new().set_str("datafusion.catalog.has_header", "true"); + let 
ctx = SessionContext::new_with_config(cfg); let testdata = datafusion::test_util::arrow_test_data(); - + let path = &format!("{testdata}/csv/aggregate_test_100.csv"); // register csv file with the execution context - ctx.register_csv( - "aggregate_test_100", - &format!("{testdata}/csv/aggregate_test_100.csv"), - CsvReadOptions::new(), - ) - .await?; + ctx.register_csv("aggregate_test_100", &path, CsvReadOptions::new()) + .await?; // execute the query let df = ctx .sql( - "SELECT c1, MIN(c12), MAX(c12) \ - FROM aggregate_test_100 \ - WHERE c11 > 0.1 AND c11 < 0.9 \ - GROUP BY c1", + r#"SELECT c1, MIN(c12), MAX(c12) + FROM aggregate_test_100 + WHERE c11 > 0.1 AND c11 < 0.9 + GROUP BY c1"#, + ) + .await?; + + // print the results + df.show().await?; + + // query the file by the path dynamically. + let df = ctx + .sql( + format!( + r#"SELECT c1, MIN(c12), MAX(c12) + FROM '{}' + WHERE c11 > 0.1 AND c11 < 0.9 + GROUP BY c1"#, + &path + ) + .as_str(), ) .await?; diff --git a/datafusion-examples/examples/dynamic_csv_sql.rs b/datafusion-examples/examples/dynamic_csv_sql.rs deleted file mode 100644 index d2755c494019..000000000000 --- a/datafusion-examples/examples/dynamic_csv_sql.rs +++ /dev/null @@ -1,47 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. 
See the License for the -// specific language governing permissions and limitations -// under the License. - -use datafusion::error::Result; -use datafusion::prelude::*; - -/// This example demonstrates executing a simple query against an Arrow data source (CSV) and -/// fetching results -#[tokio::main] -async fn main() -> Result<()> { - // create local execution context - let ctx = SessionContext::new(); - - let testdata = datafusion::test_util::arrow_test_data(); - let path = &format!("file:///{testdata}/csv/aggregate_test_100.csv"); - // execute the query - let df = ctx - .sql( - format!( - r#"SELECT column_1, MIN(column_12), MAX(column_12) - FROM '{}' - WHERE column_11 > 0.1 AND column_11 < 0.9 - GROUP BY column_1"#, - path - ) - .as_str(), - ) - .await?; - - // print the results - df.show().await?; - Ok(()) -} diff --git a/datafusion-examples/examples/external_dependency/query-aws-s3.rs b/datafusion-examples/examples/external_dependency/query-aws-s3.rs index e32286e30e4f..128d04df213e 100644 --- a/datafusion-examples/examples/external_dependency/query-aws-s3.rs +++ b/datafusion-examples/examples/external_dependency/query-aws-s3.rs @@ -63,5 +63,13 @@ async fn main() -> Result<()> { // print the results df.show().await?; + // dynamic query by the file path + let df = ctx + .sql(format!(r#"SELECT * FROM '{}' LIMIT 10"#, &path).as_str()) + .await?; + + // print the results + df.show().await?; + Ok(()) } From 2729c49675366c648f7b377faad8f44d72538509 Mon Sep 17 00:00:00 2001 From: Jia-Xuan Liu Date: Fri, 21 Jun 2024 23:44:50 +0800 Subject: [PATCH 04/53] add test and refactor --- .../core/src/catalog/dynamic_file_schema.rs | 21 +++++++++- datafusion/core/src/execution/context/mod.rs | 40 ++++++++++++++++++- 2 files changed, 58 insertions(+), 3 deletions(-) diff --git a/datafusion/core/src/catalog/dynamic_file_schema.rs b/datafusion/core/src/catalog/dynamic_file_schema.rs index 6e151af1469f..7cae164a2324 100644 --- a/datafusion/core/src/catalog/dynamic_file_schema.rs 
+++ b/datafusion/core/src/catalog/dynamic_file_schema.rs @@ -1,3 +1,20 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + use std::any::Any; use std::sync::{Arc, Weak}; @@ -90,7 +107,7 @@ fn substitute_tilde(cur: String) -> String { cur } -pub struct StateStore { +pub(crate) struct StateStore { state: Arc>>>>, } @@ -115,4 +132,4 @@ impl Default for StateStore { fn default() -> Self { Self::new() } -} \ No newline at end of file +} diff --git a/datafusion/core/src/execution/context/mod.rs b/datafusion/core/src/execution/context/mod.rs index 61fd570af2f9..7182dcc75bc6 100644 --- a/datafusion/core/src/execution/context/mod.rs +++ b/datafusion/core/src/execution/context/mod.rs @@ -308,7 +308,12 @@ impl SessionContext { pub fn new_with_state(state: SessionState) -> Self { let state_ref = Arc::new(RwLock::new(state.clone())); state - .schema_for_ref("datafusion.public.xx") + // provide a fake table reference to get the default schema provider. 
+ .schema_for_ref(TableReference::full( + state.config_options().catalog.default_catalog.as_str(), + state.config_options().catalog.default_schema.as_str(), + UNNAMED_TABLE, + )) .unwrap() .as_any() .downcast_ref::() @@ -1658,6 +1663,39 @@ mod tests { Ok(()) } + #[tokio::test] + async fn test_dynamic_file_query() -> Result<()> { + let path = PathBuf::from(env!("CARGO_MANIFEST_DIR")); + let path = path.join("tests/tpch-csv/customer.csv"); + let url = format!("file://{}", path.display()); + + let rt_cfg = RuntimeConfig::new(); + let runtime = Arc::new(RuntimeEnv::new(rt_cfg).unwrap()); + let cfg = SessionConfig::new().set_str("datafusion.catalog.has_header", "true"); + let session_state = SessionState::new_with_config_rt(cfg, runtime); + let ctx = SessionContext::new_with_state(session_state); + + let result = plan_and_collect( + &ctx, + format!("select c_name from '{}' limit 3;", &url).as_str(), + ) + .await?; + + let actual = arrow::util::pretty::pretty_format_batches(&result) + .unwrap() + .to_string(); + let expected = r#"+--------------------+ +| c_name | ++--------------------+ +| Customer#000000002 | +| Customer#000000003 | +| Customer#000000004 | ++--------------------+"#; + assert_eq!(actual, expected); + + Ok(()) + } + #[tokio::test] async fn custom_query_planner() -> Result<()> { let runtime = Arc::new(RuntimeEnv::default()); From 6f865778586486367728a9eec25491402dcfca58 Mon Sep 17 00:00:00 2001 From: Jia-Xuan Liu Date: Sat, 22 Jun 2024 01:11:37 +0800 Subject: [PATCH 05/53] clippy and add doc --- datafusion-cli/Cargo.lock | 1 + datafusion-cli/src/catalog.rs | 82 ++++--------------- .../core/src/catalog/dynamic_file_schema.rs | 54 +++++++++++- 3 files changed, 72 insertions(+), 65 deletions(-) diff --git a/datafusion-cli/Cargo.lock b/datafusion-cli/Cargo.lock index c5b34df4f1cf..2a0874ba3431 100644 --- a/datafusion-cli/Cargo.lock +++ b/datafusion-cli/Cargo.lock @@ -1149,6 +1149,7 @@ dependencies = [ "datafusion-physical-expr-common", 
"datafusion-physical-plan", "datafusion-sql", + "dirs", "flate2", "futures", "glob", diff --git a/datafusion-cli/src/catalog.rs b/datafusion-cli/src/catalog.rs index faa657da6511..957de1a31d26 100644 --- a/datafusion-cli/src/catalog.rs +++ b/datafusion-cli/src/catalog.rs @@ -18,22 +18,19 @@ use std::any::Any; use std::sync::{Arc, Weak}; -use crate::object_storage::{get_object_store, AwsOptions, GcpOptions}; - -use datafusion::catalog::schema::SchemaProvider; +use async_trait::async_trait; use datafusion::catalog::{CatalogProvider, CatalogProviderList}; +use datafusion::catalog::schema::SchemaProvider; use datafusion::common::plan_datafusion_err; -use datafusion::datasource::listing::{ - ListingTable, ListingTableConfig, ListingTableUrl, -}; +use datafusion::datasource::listing::ListingTableUrl; use datafusion::datasource::TableProvider; use datafusion::error::Result; use datafusion::execution::context::SessionState; - -use async_trait::async_trait; use dirs::home_dir; use parking_lot::RwLock; +use crate::object_storage::{AwsOptions, GcpOptions, get_object_store}; + /// Wraps another catalog, automatically creating table providers /// for local files if needed pub struct DynamicFileCatalog { @@ -115,7 +112,8 @@ impl CatalogProvider for DynamicFileCatalogProvider { } } -/// Wraps another schema provider +/// Wraps another schema provider. [DynamicFileSchemaProvider] is responsible for registering the required +/// object stores for the file locations. struct DynamicFileSchemaProvider { inner: Arc, state: Weak>, @@ -149,9 +147,11 @@ impl SchemaProvider for DynamicFileSchemaProvider { } async fn table(&self, name: &str) -> Result>> { - let inner_table = self.inner.table(name).await?; - if inner_table.is_some() { - return Ok(inner_table); + let inner_table = self.inner.table(name).await; + if inner_table.is_ok() { + if let Some(inner_table) = inner_table? 
{ + return Ok(Some(inner_table)); + } } // if the inner schema provider didn't have a table by @@ -195,16 +195,7 @@ impl SchemaProvider for DynamicFileSchemaProvider { state.runtime_env().register_object_store(url, store); } } - - let config = match ListingTableConfig::new(table_url).infer(&state).await { - Ok(cfg) => cfg, - Err(_) => { - // treat as non-existing - return Ok(None); - } - }; - - Ok(Some(Arc::new(ListingTable::try_new(config)?))) + self.inner.table(name).await } fn deregister_table(&self, name: &str) -> Result>> { @@ -228,11 +219,11 @@ fn substitute_tilde(cur: String) -> String { #[cfg(test)] mod tests { - use super::*; - use datafusion::catalog::schema::SchemaProvider; use datafusion::prelude::SessionContext; + use super::*; + fn setup_context() -> (SessionContext, Arc) { let mut ctx = SessionContext::new(); ctx.register_catalog_list(Arc::new(DynamicFileCatalog::new( @@ -262,7 +253,7 @@ mod tests { let (ctx, schema) = setup_context(); // That's a non registered table so expecting None here - let table = schema.table(&location).await.unwrap(); + let table = schema.table(&location).await.ok(); assert!(table.is_none()); // It should still create an object store for the location in the SessionState @@ -286,7 +277,7 @@ mod tests { let (ctx, schema) = setup_context(); - let table = schema.table(&location).await.unwrap(); + let table = schema.table(&location).await.ok(); assert!(table.is_none()); let store = ctx @@ -308,7 +299,7 @@ mod tests { let (ctx, schema) = setup_context(); - let table = schema.table(&location).await.unwrap(); + let table = schema.table(&location).await.ok(); assert!(table.is_none()); let store = ctx @@ -330,41 +321,4 @@ mod tests { assert!(schema.table(location).await.is_err()); } - #[cfg(not(target_os = "windows"))] - #[test] - fn test_substitute_tilde() { - use std::env; - use std::path::MAIN_SEPARATOR; - let original_home = home_dir(); - let test_home_path = if cfg!(windows) { - "C:\\Users\\user" - } else { - "/home/user" - }; - 
env::set_var( - if cfg!(windows) { "USERPROFILE" } else { "HOME" }, - test_home_path, - ); - let input = "~/Code/datafusion/benchmarks/data/tpch_sf1/part/part-0.parquet"; - let expected = format!( - "{}{}Code{}datafusion{}benchmarks{}data{}tpch_sf1{}part{}part-0.parquet", - test_home_path, - MAIN_SEPARATOR, - MAIN_SEPARATOR, - MAIN_SEPARATOR, - MAIN_SEPARATOR, - MAIN_SEPARATOR, - MAIN_SEPARATOR, - MAIN_SEPARATOR - ); - let actual = substitute_tilde(input.to_string()); - assert_eq!(actual, expected); - match original_home { - Some(home_path) => env::set_var( - if cfg!(windows) { "USERPROFILE" } else { "HOME" }, - home_path.to_str().unwrap(), - ), - None => env::remove_var(if cfg!(windows) { "USERPROFILE" } else { "HOME" }), - } - } } diff --git a/datafusion/core/src/catalog/dynamic_file_schema.rs b/datafusion/core/src/catalog/dynamic_file_schema.rs index 7cae164a2324..af564f47fbb3 100644 --- a/datafusion/core/src/catalog/dynamic_file_schema.rs +++ b/datafusion/core/src/catalog/dynamic_file_schema.rs @@ -15,6 +15,8 @@ // specific language governing permissions and limitations // under the License. +//! dynamic_file_schema contains a SchemaProvider that creates tables from file paths + use std::any::Any; use std::sync::{Arc, Weak}; @@ -30,13 +32,17 @@ use crate::datasource::TableProvider; use crate::error::Result; use crate::execution::context::SessionState; -/// Wraps another schema provider +/// Implements the [DynamicFileSchemaProvider] that can create tables provider from the file path. +/// +/// The provider will try to create a table provider from the file path if the table provider +/// isn't exist in the inner schema provider. The required object store must be registered in the session context. pub struct DynamicFileSchemaProvider { inner: Arc, state_store: StateStore, } impl DynamicFileSchemaProvider { + /// Create a new [DynamicFileSchemaProvider] with the given inner schema provider. 
pub fn new(inner: Arc) -> Self { Self { inner, @@ -44,6 +50,7 @@ impl DynamicFileSchemaProvider { } } + /// register the state store to the schema provider. pub fn with_state(&self, state: Weak>) { self.state_store.with_state(state); } @@ -107,6 +114,7 @@ fn substitute_tilde(cur: String) -> String { cur } +/// The state store that stores the reference of the runtime session state. pub(crate) struct StateStore { state: Arc>>>>, } @@ -133,3 +141,47 @@ impl Default for StateStore { Self::new() } } + +#[cfg(test)] +mod tests { + use dirs::home_dir; + use crate::catalog::dynamic_file_schema::substitute_tilde; + + #[cfg(not(target_os = "windows"))] + #[test] + fn test_substitute_tilde() { + use std::env; + use std::path::MAIN_SEPARATOR; + let original_home = home_dir(); + let test_home_path = if cfg!(windows) { + "C:\\Users\\user" + } else { + "/home/user" + }; + env::set_var( + if cfg!(windows) { "USERPROFILE" } else { "HOME" }, + test_home_path, + ); + let input = "~/Code/datafusion/benchmarks/data/tpch_sf1/part/part-0.parquet"; + let expected = format!( + "{}{}Code{}datafusion{}benchmarks{}data{}tpch_sf1{}part{}part-0.parquet", + test_home_path, + MAIN_SEPARATOR, + MAIN_SEPARATOR, + MAIN_SEPARATOR, + MAIN_SEPARATOR, + MAIN_SEPARATOR, + MAIN_SEPARATOR, + MAIN_SEPARATOR + ); + let actual = substitute_tilde(input.to_string()); + assert_eq!(actual, expected); + match original_home { + Some(home_path) => env::set_var( + if cfg!(windows) { "USERPROFILE" } else { "HOME" }, + home_path.to_str().unwrap(), + ), + None => env::remove_var(if cfg!(windows) { "USERPROFILE" } else { "HOME" }), + } + } +} From c91cdc6697023d29e3b74a5fd086257a790c3bf1 Mon Sep 17 00:00:00 2001 From: Jia-Xuan Liu Date: Sat, 22 Jun 2024 01:13:32 +0800 Subject: [PATCH 06/53] cargo fmt --- datafusion/core/src/catalog/dynamic_file_schema.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/core/src/catalog/dynamic_file_schema.rs b/datafusion/core/src/catalog/dynamic_file_schema.rs 
index af564f47fbb3..441d6bd06343 100644 --- a/datafusion/core/src/catalog/dynamic_file_schema.rs +++ b/datafusion/core/src/catalog/dynamic_file_schema.rs @@ -144,8 +144,8 @@ impl Default for StateStore { #[cfg(test)] mod tests { - use dirs::home_dir; use crate::catalog::dynamic_file_schema::substitute_tilde; + use dirs::home_dir; #[cfg(not(target_os = "windows"))] #[test] From 3306df64ee9765eba411cd967ece500144474b8c Mon Sep 17 00:00:00 2001 From: Jia-Xuan Liu Date: Sat, 22 Jun 2024 01:21:13 +0800 Subject: [PATCH 07/53] extract substitute_tilde function --- datafusion-cli/src/catalog.rs | 12 +----------- datafusion/core/src/catalog/dynamic_file_schema.rs | 4 +++- 2 files changed, 4 insertions(+), 12 deletions(-) diff --git a/datafusion-cli/src/catalog.rs b/datafusion-cli/src/catalog.rs index 957de1a31d26..d21ed9cb2939 100644 --- a/datafusion-cli/src/catalog.rs +++ b/datafusion-cli/src/catalog.rs @@ -20,13 +20,13 @@ use std::sync::{Arc, Weak}; use async_trait::async_trait; use datafusion::catalog::{CatalogProvider, CatalogProviderList}; +use datafusion::catalog::dynamic_file_schema::substitute_tilde; use datafusion::catalog::schema::SchemaProvider; use datafusion::common::plan_datafusion_err; use datafusion::datasource::listing::ListingTableUrl; use datafusion::datasource::TableProvider; use datafusion::error::Result; use datafusion::execution::context::SessionState; -use dirs::home_dir; use parking_lot::RwLock; use crate::object_storage::{AwsOptions, GcpOptions, get_object_store}; @@ -206,16 +206,6 @@ impl SchemaProvider for DynamicFileSchemaProvider { self.inner.table_exist(name) } } -fn substitute_tilde(cur: String) -> String { - if let Some(usr_dir_path) = home_dir() { - if let Some(usr_dir) = usr_dir_path.to_str() { - if cur.starts_with('~') && !usr_dir.is_empty() { - return cur.replacen('~', usr_dir, 1); - } - } - } - cur -} #[cfg(test)] mod tests { diff --git a/datafusion/core/src/catalog/dynamic_file_schema.rs 
b/datafusion/core/src/catalog/dynamic_file_schema.rs index 441d6bd06343..098eef1317a6 100644 --- a/datafusion/core/src/catalog/dynamic_file_schema.rs +++ b/datafusion/core/src/catalog/dynamic_file_schema.rs @@ -103,7 +103,9 @@ impl SchemaProvider for DynamicFileSchemaProvider { self.inner.table_exist(name) } } -fn substitute_tilde(cur: String) -> String { + +/// Substitute the tilde character in the file path with the user home directory. +pub fn substitute_tilde(cur: String) -> String { if let Some(usr_dir_path) = home_dir() { if let Some(usr_dir) = usr_dir_path.to_str() { if cur.starts_with('~') && !usr_dir.is_empty() { From d82b273f7e68af2ba833c8559bea89eed64070dc Mon Sep 17 00:00:00 2001 From: Jia-Xuan Liu Date: Sat, 22 Jun 2024 02:04:45 +0800 Subject: [PATCH 08/53] fix the error handling --- .../core/src/catalog/dynamic_file_schema.rs | 23 ++++++++++++------- datafusion/core/src/execution/context/mod.rs | 17 +++++++++----- 2 files changed, 26 insertions(+), 14 deletions(-) diff --git a/datafusion/core/src/catalog/dynamic_file_schema.rs b/datafusion/core/src/catalog/dynamic_file_schema.rs index 098eef1317a6..ae971f90b467 100644 --- a/datafusion/core/src/catalog/dynamic_file_schema.rs +++ b/datafusion/core/src/catalog/dynamic_file_schema.rs @@ -75,12 +75,15 @@ impl SchemaProvider for DynamicFileSchemaProvider { } async fn table(&self, name: &str) -> Result>> { - let inner_table = self.inner.table(name).await?; - if inner_table.is_some() { - return Ok(inner_table); + if let Ok(Some(inner_table)) = self.inner.table(name).await { + return Ok(Some(inner_table)); } + let optimized_url = substitute_tilde(name.to_owned()); - let table_url = ListingTableUrl::parse(optimized_url.as_str())?; + let Ok(table_url) = ListingTableUrl::parse(optimized_url.as_str()) else { + return Ok(None); + }; + let state = &self .state_store .get_state() @@ -88,11 +91,15 @@ impl SchemaProvider for DynamicFileSchemaProvider { .ok_or_else(|| plan_datafusion_err!("locking error"))? 
.read() .clone(); - let cfg = ListingTableConfig::new(table_url.clone()) + if let Ok(cfg) = ListingTableConfig::new(table_url.clone()) .infer(state) - .await?; - - Ok(Some(Arc::new(ListingTable::try_new(cfg)?))) + .await + { + ListingTable::try_new(cfg) + .map(|table| Some(Arc::new(table) as Arc)) + } else { + Ok(None) + } } fn deregister_table(&self, name: &str) -> Result>> { diff --git a/datafusion/core/src/execution/context/mod.rs b/datafusion/core/src/execution/context/mod.rs index 7182dcc75bc6..10ba96a660cd 100644 --- a/datafusion/core/src/execution/context/mod.rs +++ b/datafusion/core/src/execution/context/mod.rs @@ -307,18 +307,23 @@ impl SessionContext { /// Creates a new `SessionContext` using the provided [`SessionState`] pub fn new_with_state(state: SessionState) -> Self { let state_ref = Arc::new(RwLock::new(state.clone())); - state + + if let Ok(provider) = state // provide a fake table reference to get the default schema provider. .schema_for_ref(TableReference::full( state.config_options().catalog.default_catalog.as_str(), state.config_options().catalog.default_schema.as_str(), UNNAMED_TABLE, )) - .unwrap() - .as_any() - .downcast_ref::() - .unwrap() - .with_state(Arc::downgrade(&state_ref)); + { + if let Some(provider) = provider + .as_any() + .downcast_ref::() + { + provider.with_state(Arc::downgrade(&state_ref)); + } + } + Self { session_id: state_ref.clone().read().session_id().to_string(), session_start_time: Utc::now(), From c0491d5773817ff821b2e321d15435301cd1c51c Mon Sep 17 00:00:00 2001 From: Jia-Xuan Liu Date: Sat, 22 Jun 2024 02:07:35 +0800 Subject: [PATCH 09/53] fmt and clippy --- datafusion-cli/src/catalog.rs | 4 ++-- datafusion-examples/examples/csv_sql.rs | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/datafusion-cli/src/catalog.rs b/datafusion-cli/src/catalog.rs index d21ed9cb2939..0c75a9875198 100644 --- a/datafusion-cli/src/catalog.rs +++ b/datafusion-cli/src/catalog.rs @@ -19,9 +19,9 @@ use std::any::Any; use 
std::sync::{Arc, Weak}; use async_trait::async_trait; -use datafusion::catalog::{CatalogProvider, CatalogProviderList}; use datafusion::catalog::dynamic_file_schema::substitute_tilde; use datafusion::catalog::schema::SchemaProvider; +use datafusion::catalog::{CatalogProvider, CatalogProviderList}; use datafusion::common::plan_datafusion_err; use datafusion::datasource::listing::ListingTableUrl; use datafusion::datasource::TableProvider; @@ -29,7 +29,7 @@ use datafusion::error::Result; use datafusion::execution::context::SessionState; use parking_lot::RwLock; -use crate::object_storage::{AwsOptions, GcpOptions, get_object_store}; +use crate::object_storage::{get_object_store, AwsOptions, GcpOptions}; /// Wraps another catalog, automatically creating table providers /// for local files if needed diff --git a/datafusion-examples/examples/csv_sql.rs b/datafusion-examples/examples/csv_sql.rs index d19cfa16499e..f12a8c938b07 100644 --- a/datafusion-examples/examples/csv_sql.rs +++ b/datafusion-examples/examples/csv_sql.rs @@ -30,7 +30,7 @@ async fn main() -> Result<()> { let testdata = datafusion::test_util::arrow_test_data(); let path = &format!("{testdata}/csv/aggregate_test_100.csv"); // register csv file with the execution context - ctx.register_csv("aggregate_test_100", &path, CsvReadOptions::new()) + ctx.register_csv("aggregate_test_100", path, CsvReadOptions::new()) .await?; // execute the query @@ -54,7 +54,7 @@ async fn main() -> Result<()> { FROM '{}' WHERE c11 > 0.1 AND c11 < 0.9 GROUP BY c1"#, - &path + path ) .as_str(), ) From a60eeea1dc8ea63220b8e343cc9aa90885e59d72 Mon Sep 17 00:00:00 2001 From: Jia-Xuan Liu Date: Sat, 22 Jun 2024 02:49:36 +0800 Subject: [PATCH 10/53] fix test --- datafusion-cli/src/catalog.rs | 6 +-- .../sqllogictest/test_files/array_query.slt | 44 +++++++++---------- 2 files changed, 25 insertions(+), 25 deletions(-) diff --git a/datafusion-cli/src/catalog.rs b/datafusion-cli/src/catalog.rs index 0c75a9875198..508d698c8ac5 100644 --- 
a/datafusion-cli/src/catalog.rs +++ b/datafusion-cli/src/catalog.rs @@ -243,7 +243,7 @@ mod tests { let (ctx, schema) = setup_context(); // That's a non registered table so expecting None here - let table = schema.table(&location).await.ok(); + let table = schema.table(&location).await?; assert!(table.is_none()); // It should still create an object store for the location in the SessionState @@ -267,7 +267,7 @@ mod tests { let (ctx, schema) = setup_context(); - let table = schema.table(&location).await.ok(); + let table = schema.table(&location).await?; assert!(table.is_none()); let store = ctx @@ -289,7 +289,7 @@ mod tests { let (ctx, schema) = setup_context(); - let table = schema.table(&location).await.ok(); + let table = schema.table(&location).await?; assert!(table.is_none()); let store = ctx diff --git a/datafusion/sqllogictest/test_files/array_query.slt b/datafusion/sqllogictest/test_files/array_query.slt index 24c99fc849b6..227f874f7765 100644 --- a/datafusion/sqllogictest/test_files/array_query.slt +++ b/datafusion/sqllogictest/test_files/array_query.slt @@ -19,7 +19,7 @@ # Make a table with multiple input partitions statement ok -CREATE TABLE data AS +CREATE TABLE test_data AS SELECT * FROM (VALUES ([1,2,3], [4,5], 1) ) @@ -31,7 +31,7 @@ CREATE TABLE data AS ; query ??I rowsort -SELECT * FROM data; +SELECT * FROM test_data; ---- [1, 2, 3] NULL 1 [1, 2, 3] [4, 5] 1 @@ -42,47 +42,47 @@ SELECT * FROM data; ########### query error DataFusion error: Arrow error: Invalid argument error: Invalid comparison operation: List\(Field \{ name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: \{\} \}\) == List\(Field \{ name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: \{\} \}\) -SELECT * FROM data WHERE column1 = [1,2,3]; +SELECT * FROM test_data WHERE column1 = [1,2,3]; query error DataFusion error: Arrow error: Invalid argument error: Invalid comparison operation: List\(Field \{ name: 
"item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: \{\} \}\) == List\(Field \{ name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: \{\} \}\) -SELECT * FROM data WHERE column1 = column2 +SELECT * FROM test_data WHERE column1 = column2 query error DataFusion error: Arrow error: Invalid argument error: Invalid comparison operation: List\(Field \{ name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: \{\} \}\) != List\(Field \{ name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: \{\} \}\) -SELECT * FROM data WHERE column1 != [1,2,3]; +SELECT * FROM test_data WHERE column1 != [1,2,3]; query error DataFusion error: Arrow error: Invalid argument error: Invalid comparison operation: List\(Field \{ name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: \{\} \}\) != List\(Field \{ name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: \{\} \}\) -SELECT * FROM data WHERE column1 != column2 +SELECT * FROM test_data WHERE column1 != column2 ########### # Aggregates ########### query error Internal error: Min/Max accumulator not implemented for type List -SELECT min(column1) FROM data; +SELECT min(column1) FROM test_data; query error Internal error: Min/Max accumulator not implemented for type List -SELECT max(column1) FROM data; +SELECT max(column1) FROM test_data; query I -SELECT count(column1) FROM data; +SELECT count(column1) FROM test_data; ---- 3 # note single count distincts are rewritten to use a group by query I -SELECT count(distinct column1) FROM data; +SELECT count(distinct column1) FROM test_data; ---- 2 query I -SELECT count(distinct column2) FROM data; +SELECT count(distinct column2) FROM test_data; ---- 2 # note multiple count distincts are not rewritten query II -SELECT count(distinct column1), count(distinct column2) FROM 
data; +SELECT count(distinct column1), count(distinct column2) FROM test_data; ---- 2 2 @@ -93,24 +93,24 @@ SELECT count(distinct column1), count(distinct column2) FROM data; query I -SELECT count(column1) FROM data GROUP BY column3; +SELECT count(column1) FROM test_data GROUP BY column3; ---- 3 # note single count distincts are rewritten to use a group by query I -SELECT count(distinct column1) FROM data GROUP BY column3; +SELECT count(distinct column1) FROM test_data GROUP BY column3; ---- 2 query I -SELECT count(distinct column2) FROM data GROUP BY column3; +SELECT count(distinct column2) FROM test_data GROUP BY column3; ---- 2 # note multiple count distincts are not rewritten query II -SELECT count(distinct column1), count(distinct column2) FROM data GROUP BY column3; +SELECT count(distinct column1), count(distinct column2) FROM test_data GROUP BY column3; ---- 2 2 @@ -120,21 +120,21 @@ SELECT count(distinct column1), count(distinct column2) FROM data GROUP BY colum ########### query ??I -SELECT * FROM data ORDER BY column2; +SELECT * FROM test_data ORDER BY column2; ---- [2, 3] [2, 3] 1 [1, 2, 3] [4, 5] 1 [1, 2, 3] NULL 1 query ??I -SELECT * FROM data ORDER BY column2 DESC; +SELECT * FROM test_data ORDER BY column2 DESC; ---- [1, 2, 3] NULL 1 [1, 2, 3] [4, 5] 1 [2, 3] [2, 3] 1 query ??I -SELECT * FROM data ORDER BY column2 DESC NULLS LAST; +SELECT * FROM test_data ORDER BY column2 DESC NULLS LAST; ---- [1, 2, 3] [4, 5] 1 [2, 3] [2, 3] 1 @@ -142,14 +142,14 @@ SELECT * FROM data ORDER BY column2 DESC NULLS LAST; # multi column query ??I -SELECT * FROM data ORDER BY column1, column2; +SELECT * FROM test_data ORDER BY column1, column2; ---- [1, 2, 3] [4, 5] 1 [1, 2, 3] NULL 1 [2, 3] [2, 3] 1 query ??I -SELECT * FROM data ORDER BY column1, column3, column2; +SELECT * FROM test_data ORDER BY column1, column3, column2; ---- [1, 2, 3] [4, 5] 1 [1, 2, 3] NULL 1 @@ -157,4 +157,4 @@ SELECT * FROM data ORDER BY column1, column3, column2; statement ok -drop table data 
+drop table test_data From 9fa01aa8b46ed00278c94c28223fad6ed4838c30 Mon Sep 17 00:00:00 2001 From: Jia-Xuan Liu Date: Sat, 22 Jun 2024 11:50:30 +0800 Subject: [PATCH 11/53] fix sqllogictests --- datafusion/sqllogictest/test_files/describe.slt | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/datafusion/sqllogictest/test_files/describe.slt b/datafusion/sqllogictest/test_files/describe.slt index a15c3a109cab..a37bdeaeabbc 100644 --- a/datafusion/sqllogictest/test_files/describe.slt +++ b/datafusion/sqllogictest/test_files/describe.slt @@ -60,8 +60,12 @@ DROP TABLE aggregate_simple; # Describe file (currently we can only describe file in datafusion-cli, fix this after issue (#4850) has been done) ########## -statement error Error during planning: table 'datafusion.public.../core/tests/data/aggregate_simple.csv' not found +query TTT DESCRIBE '../core/tests/data/aggregate_simple.csv'; +---- +column_1 Utf8 YES +column_2 Utf8 YES +column_3 Utf8 YES ########## # Describe command From 2ab3639942f1e729b84de89cb2f6e2963312924c Mon Sep 17 00:00:00 2001 From: Jia-Xuan Liu Date: Sat, 22 Jun 2024 12:12:35 +0800 Subject: [PATCH 12/53] ignore dirs for windows test --- datafusion/core/src/catalog/dynamic_file_schema.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/datafusion/core/src/catalog/dynamic_file_schema.rs b/datafusion/core/src/catalog/dynamic_file_schema.rs index ae971f90b467..06b2459d4bbb 100644 --- a/datafusion/core/src/catalog/dynamic_file_schema.rs +++ b/datafusion/core/src/catalog/dynamic_file_schema.rs @@ -154,6 +154,7 @@ impl Default for StateStore { #[cfg(test)] mod tests { use crate::catalog::dynamic_file_schema::substitute_tilde; + #[cfg(not(target_os = "windows"))] use dirs::home_dir; #[cfg(not(target_os = "windows"))] From a8ee733cde962ef6a3cb1681dc89f4af25a85048 Mon Sep 17 00:00:00 2001 From: Jia-Xuan Liu Date: Sat, 22 Jun 2024 12:27:11 +0800 Subject: [PATCH 13/53] enhance the test for every file format --- 
.../sqllogictest/test_files/arrow_files.slt | 15 +++++++++++ datafusion/sqllogictest/test_files/avro.slt | 25 +++++++++++++++++++ .../sqllogictest/test_files/csv_files.slt | 17 +++++++++++++ datafusion/sqllogictest/test_files/json.slt | 16 ++++++++++++ .../sqllogictest/test_files/parquet.slt | 12 +++++++++ 5 files changed, 85 insertions(+) diff --git a/datafusion/sqllogictest/test_files/arrow_files.slt b/datafusion/sqllogictest/test_files/arrow_files.slt index 8cf3550fdb25..b9acb6bdc487 100644 --- a/datafusion/sqllogictest/test_files/arrow_files.slt +++ b/datafusion/sqllogictest/test_files/arrow_files.slt @@ -43,6 +43,14 @@ SELECT * FROM arrow_simple 3 baz false 4 NULL true +query ITB +SELECT * FROM '../core/tests/data/example.arrow'; +---- +1 foo true +2 bar NULL +3 baz false +4 NULL true + # ARROW partitioned table statement ok CREATE EXTERNAL TABLE arrow_partitioned ( @@ -64,6 +72,13 @@ SELECT * FROM arrow_partitioned ORDER BY f0; 3 baz true 456 4 NULL NULL 456 +# dynamic select arrow file in the folder +query ITB +SELECT * FROM '../core/tests/data/partitioned_table_arrow/part=123' ORDER BY f0; +---- +1 foo true +2 bar false + # select all fields query IITB SELECT part, f0, f1, f2 FROM arrow_partitioned ORDER BY f0; diff --git a/datafusion/sqllogictest/test_files/avro.slt b/datafusion/sqllogictest/test_files/avro.slt index fced1924ced9..55b2925031e6 100644 --- a/datafusion/sqllogictest/test_files/avro.slt +++ b/datafusion/sqllogictest/test_files/avro.slt @@ -136,6 +136,18 @@ SELECT id, CAST(string_col AS varchar) FROM alltypes_plain 0 0 1 1 +query IT +SELECT id, CAST(string_col AS varchar) FROM '../../testing/data/avro/alltypes_plain.avro' +---- +4 0 +5 1 +6 0 +7 1 +2 0 +3 1 +0 0 +1 1 + # test avro query with snappy query IT SELECT id, CAST(string_col AS varchar) FROM alltypes_plain_snappy @@ -149,6 +161,19 @@ SELECT id, CAST(string_col AS varchar) FROM alltypes_plain_snappy 0 0 1 1 +# dynamic query snappy avro file +query IT +SELECT id, CAST(string_col AS 
varchar) FROM '../../testing/data/avro/alltypes_plain.snappy.avro' +---- +4 0 +5 1 +6 0 +7 1 +2 0 +3 1 +0 0 +1 1 + # test avro query with bzip2 query IT SELECT id, CAST(string_col AS varchar) FROM alltypes_plain_bzip2 diff --git a/datafusion/sqllogictest/test_files/csv_files.slt b/datafusion/sqllogictest/test_files/csv_files.slt index 8902b3eebf24..91a7ed62013f 100644 --- a/datafusion/sqllogictest/test_files/csv_files.slt +++ b/datafusion/sqllogictest/test_files/csv_files.slt @@ -50,6 +50,23 @@ id7 value7 id8 value8 id9 value9 +# query the csv file dynamically with the config of current session +query TT +select * from '../core/tests/data/quote.csv'; +---- +c1 c2 +~id0~ ~value0~ +~id1~ ~value1~ +~id2~ ~value2~ +~id3~ ~value3~ +~id4~ ~value4~ +~id5~ ~value5~ +~id6~ ~value6~ +~id7~ ~value7~ +~id8~ ~value8~ +~id9~ ~value9~ + + query TT select * from csv_with_escape; ---- diff --git a/datafusion/sqllogictest/test_files/json.slt b/datafusion/sqllogictest/test_files/json.slt index 5d3c23d5130b..c6d6a7fa197d 100644 --- a/datafusion/sqllogictest/test_files/json.slt +++ b/datafusion/sqllogictest/test_files/json.slt @@ -45,6 +45,22 @@ SELECT a, b FROM json_test 5 -3.5 7 -3.5 +query IR rowsort +SELECT a, b FROM '../core/tests/data/2.json' +---- +-10 -3.5 +1 -3.5 +1 0.6 +1 0.6 +1 2 +1 2 +1 2 +1 2 +100000000000000 0.6 +2 0.6 +5 -3.5 +7 -3.5 + query TT EXPLAIN SELECT count(*) from json_test ---- diff --git a/datafusion/sqllogictest/test_files/parquet.slt b/datafusion/sqllogictest/test_files/parquet.slt index e70f800bde74..3477bff4ae49 100644 --- a/datafusion/sqllogictest/test_files/parquet.slt +++ b/datafusion/sqllogictest/test_files/parquet.slt @@ -202,6 +202,18 @@ SELECT id, CAST(string_col AS varchar) FROM alltypes_plain 0 0 1 1 +query IT +SELECT id, CAST(string_col AS varchar) FROM '../../parquet-testing/data/alltypes_plain.parquet'; +---- +4 0 +5 1 +6 0 +7 1 +2 0 +3 1 +0 0 +1 1 + # Clean up statement ok DROP TABLE alltypes_plain; From 
7faab9f4176d21200eec7176701b831ab98a50fa Mon Sep 17 00:00:00 2001 From: Jia-Xuan Liu Date: Sat, 22 Jun 2024 12:41:49 +0800 Subject: [PATCH 14/53] disable the test for windows --- datafusion/core/src/catalog/dynamic_file_schema.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/datafusion/core/src/catalog/dynamic_file_schema.rs b/datafusion/core/src/catalog/dynamic_file_schema.rs index 06b2459d4bbb..4f6bf5bcf282 100644 --- a/datafusion/core/src/catalog/dynamic_file_schema.rs +++ b/datafusion/core/src/catalog/dynamic_file_schema.rs @@ -151,13 +151,12 @@ impl Default for StateStore { } } +#[cfg(not(target_os = "windows"))] #[cfg(test)] mod tests { use crate::catalog::dynamic_file_schema::substitute_tilde; - #[cfg(not(target_os = "windows"))] use dirs::home_dir; - #[cfg(not(target_os = "windows"))] #[test] fn test_substitute_tilde() { use std::env; From e1f3908e7787bd5798b9004c3856fca6b8b93402 Mon Sep 17 00:00:00 2001 From: Jia-Xuan Liu Date: Mon, 1 Jul 2024 22:48:16 +0800 Subject: [PATCH 15/53] make dynamic file query configurable --- .../core/src/catalog/dynamic_file_schema.rs | 98 +++++++++---------- datafusion/core/src/execution/context/mod.rs | 49 ++++++---- .../core/src/execution/session_state.rs | 35 ++++++- 3 files changed, 107 insertions(+), 75 deletions(-) diff --git a/datafusion/core/src/catalog/dynamic_file_schema.rs b/datafusion/core/src/catalog/dynamic_file_schema.rs index 4f6bf5bcf282..60d8e72f9dce 100644 --- a/datafusion/core/src/catalog/dynamic_file_schema.rs +++ b/datafusion/core/src/catalog/dynamic_file_schema.rs @@ -18,11 +18,10 @@ //! 
dynamic_file_schema contains a SchemaProvider that creates tables from file paths use std::any::Any; -use std::sync::{Arc, Weak}; +use std::sync::Arc; use async_trait::async_trait; use dirs::home_dir; -use parking_lot::{Mutex, RwLock}; use datafusion_common::plan_datafusion_err; @@ -30,7 +29,7 @@ use crate::catalog::schema::SchemaProvider; use crate::datasource::listing::{ListingTable, ListingTableConfig, ListingTableUrl}; use crate::datasource::TableProvider; use crate::error::Result; -use crate::execution::context::SessionState; +use crate::execution::session_state::StateStore; /// Implements the [DynamicFileSchemaProvider] that can create tables provider from the file path. /// @@ -38,21 +37,16 @@ use crate::execution::context::SessionState; /// isn't exist in the inner schema provider. The required object store must be registered in the session context. pub struct DynamicFileSchemaProvider { inner: Arc, - state_store: StateStore, + factory: Arc, } impl DynamicFileSchemaProvider { /// Create a new [DynamicFileSchemaProvider] with the given inner schema provider. - pub fn new(inner: Arc) -> Self { - Self { - inner, - state_store: StateStore::new(), - } - } - - /// register the state store to the schema provider. - pub fn with_state(&self, state: Weak>) { - self.state_store.with_state(state); + pub fn new( + inner: Arc, + factory: Arc, + ) -> Self { + Self { inner, factory } } } @@ -80,26 +74,7 @@ impl SchemaProvider for DynamicFileSchemaProvider { } let optimized_url = substitute_tilde(name.to_owned()); - let Ok(table_url) = ListingTableUrl::parse(optimized_url.as_str()) else { - return Ok(None); - }; - - let state = &self - .state_store - .get_state() - .upgrade() - .ok_or_else(|| plan_datafusion_err!("locking error"))? 
- .read() - .clone(); - if let Ok(cfg) = ListingTableConfig::new(table_url.clone()) - .infer(state) - .await - { - ListingTable::try_new(cfg) - .map(|table| Some(Arc::new(table) as Arc)) - } else { - Ok(None) - } + self.factory.try_new(optimized_url.as_str()).await } fn deregister_table(&self, name: &str) -> Result>> { @@ -123,31 +98,48 @@ pub fn substitute_tilde(cur: String) -> String { cur } -/// The state store that stores the reference of the runtime session state. -pub(crate) struct StateStore { - state: Arc>>>>, +/// [UrlTableFactory] is a factory that can create a table provider from the given url. +#[async_trait] +pub trait UrlTableFactory: Sync + Send { + /// create a new table provider from the provided url + async fn try_new(&self, url: &str) -> Result>>; } -impl StateStore { - pub fn new() -> Self { - Self { - state: Arc::new(Mutex::new(None)), - } - } - - pub fn with_state(&self, state: Weak>) { - let mut lock = self.state.lock(); - *lock = Some(state); - } +/// [DynamicListTableFactory] is a factory that can create a [ListingTable] from the given url. +#[derive(Default)] +pub struct DynamicListTableFactory { + state_store: Arc, +} - pub fn get_state(&self) -> Weak> { - self.state.lock().clone().unwrap() +impl DynamicListTableFactory { + pub fn new(state_store: Arc) -> Self { + Self { state_store } } } -impl Default for StateStore { - fn default() -> Self { - Self::new() +#[async_trait] +impl UrlTableFactory for DynamicListTableFactory { + async fn try_new(&self, url: &str) -> Result>> { + let Ok(table_url) = ListingTableUrl::parse(url) else { + return Ok(None); + }; + + let state = &self + .state_store + .get_state() + .upgrade() + .ok_or_else(|| plan_datafusion_err!("locking error"))? 
+ .read() + .clone(); + if let Ok(cfg) = ListingTableConfig::new(table_url.clone()) + .infer(state) + .await + { + ListingTable::try_new(cfg) + .map(|table| Some(Arc::new(table) as Arc)) + } else { + Ok(None) + } } } diff --git a/datafusion/core/src/execution/context/mod.rs b/datafusion/core/src/execution/context/mod.rs index 10ba96a660cd..a7ea08b73e1f 100644 --- a/datafusion/core/src/execution/context/mod.rs +++ b/datafusion/core/src/execution/context/mod.rs @@ -52,7 +52,7 @@ use arrow::record_batch::RecordBatch; use arrow_schema::Schema; use datafusion_common::{ config::{ConfigExtension, TableOptions}, - exec_err, not_impl_err, plan_err, + config_err, exec_err, not_impl_err, plan_err, tree_node::{TreeNodeRecursion, TreeNodeVisitor}, DFSchema, SchemaReference, TableReference, }; @@ -72,7 +72,11 @@ use object_store::ObjectStore; use parking_lot::RwLock; use url::Url; -use crate::catalog::dynamic_file_schema::DynamicFileSchemaProvider; +use crate::catalog::dynamic_file_schema::{ + DynamicFileSchemaProvider, DynamicListTableFactory, +}; +use crate::catalog::schema::SchemaProvider; +use crate::execution::session_state::StateStore; pub use datafusion_execution::config::SessionConfig; pub use datafusion_execution::TaskContext; pub use datafusion_expr::execution_props::ExecutionProps; @@ -306,28 +310,37 @@ impl SessionContext { /// Creates a new `SessionContext` using the provided [`SessionState`] pub fn new_with_state(state: SessionState) -> Self { - let state_ref = Arc::new(RwLock::new(state.clone())); + Self { + session_id: state.session_id().to_string(), + session_start_time: Utc::now(), + state: Arc::new(RwLock::new(state.clone())), + } + } - if let Ok(provider) = state + pub fn enable_url_table(&self) -> Result>> { + let state_ref = self.state(); + let catalog_name = state_ref.config_options().catalog.default_catalog.as_str(); + let schema_name = state_ref.config_options().catalog.default_schema.as_str(); + if let Ok(provider) = state_ref // provide a fake table 
reference to get the default schema provider. .schema_for_ref(TableReference::full( - state.config_options().catalog.default_catalog.as_str(), - state.config_options().catalog.default_schema.as_str(), + catalog_name, + schema_name, UNNAMED_TABLE, )) { - if let Some(provider) = provider - .as_any() - .downcast_ref::() - { - provider.with_state(Arc::downgrade(&state_ref)); - } - } - - Self { - session_id: state_ref.clone().read().session_id().to_string(), - session_start_time: Utc::now(), - state: state_ref, + let state_store = Arc::new(StateStore::new()); + state_store.with_state(self.state_weak_ref()); + let factory = Arc::new(DynamicListTableFactory::new(state_store)); + let new_provider = + Arc::new(DynamicFileSchemaProvider::new(provider, factory)); + state_ref + .catalog_list() + .catalog(catalog_name) + .unwrap() + .register_schema(schema_name, new_provider) + } else { + config_err!("default catalog and schema are required for url table") } } diff --git a/datafusion/core/src/execution/session_state.rs b/datafusion/core/src/execution/session_state.rs index 614bac4acfb7..29c43d4c1faa 100644 --- a/datafusion/core/src/execution/session_state.rs +++ b/datafusion/core/src/execution/session_state.rs @@ -17,7 +17,6 @@ //! 
[`SessionState`]: information required to run queries in a session -use crate::catalog::dynamic_file_schema::DynamicFileSchemaProvider; use crate::catalog::information_schema::{InformationSchemaProvider, INFORMATION_SCHEMA}; use crate::catalog::listing_schema::ListingSchemaProvider; use crate::catalog::schema::{MemorySchemaProvider, SchemaProvider}; @@ -69,11 +68,12 @@ use datafusion_physical_expr_common::physical_expr::PhysicalExpr; use datafusion_physical_plan::ExecutionPlan; use datafusion_sql::parser::{DFParser, Statement}; use datafusion_sql::planner::{ContextProvider, ParserOptions, SqlToRel}; +use parking_lot::{Mutex, RwLock}; use sqlparser::dialect::dialect_from_str; use std::collections::hash_map::Entry; use std::collections::{HashMap, HashSet}; use std::fmt::Debug; -use std::sync::Arc; +use std::sync::{Arc, Weak}; use url::Url; use uuid::Uuid; @@ -197,8 +197,7 @@ impl SessionState { if config.create_default_catalog_and_schema() { let default_catalog = MemoryCatalogProvider::new(); - let schema = - DynamicFileSchemaProvider::new(Arc::new(MemorySchemaProvider::new())); + let schema = MemorySchemaProvider::new(); default_catalog .register_schema( @@ -1092,3 +1091,31 @@ impl<'a> SimplifyInfo for SessionSimplifyProvider<'a> { expr.get_type(self.df_schema) } } + +/// The state store that stores the reference of the runtime session state. 
+pub(crate) struct StateStore { + state: Arc>>>>, +} + +impl StateStore { + pub fn new() -> Self { + Self { + state: Arc::new(Mutex::new(None)), + } + } + + pub fn with_state(&self, state: Weak>) { + let mut lock = self.state.lock(); + *lock = Some(state); + } + + pub fn get_state(&self) -> Weak> { + self.state.lock().clone().unwrap() + } +} + +impl Default for StateStore { + fn default() -> Self { + Self::new() + } +} From cf73ba2a0785d96913051319e2263e689c90d2a2 Mon Sep 17 00:00:00 2001 From: Jia-Xuan Liu Date: Mon, 1 Jul 2024 22:50:14 +0800 Subject: [PATCH 16/53] revert array_query.slt --- .../sqllogictest/test_files/array_query.slt | 44 +++++++++---------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/datafusion/sqllogictest/test_files/array_query.slt b/datafusion/sqllogictest/test_files/array_query.slt index 227f874f7765..265618669091 100644 --- a/datafusion/sqllogictest/test_files/array_query.slt +++ b/datafusion/sqllogictest/test_files/array_query.slt @@ -19,7 +19,7 @@ # Make a table with multiple input partitions statement ok -CREATE TABLE test_data AS +CREATE TABLE data AS SELECT * FROM (VALUES ([1,2,3], [4,5], 1) ) @@ -31,7 +31,7 @@ CREATE TABLE test_data AS ; query ??I rowsort -SELECT * FROM test_data; +SELECT * FROM date; ---- [1, 2, 3] NULL 1 [1, 2, 3] [4, 5] 1 @@ -42,47 +42,47 @@ SELECT * FROM test_data; ########### query error DataFusion error: Arrow error: Invalid argument error: Invalid comparison operation: List\(Field \{ name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: \{\} \}\) == List\(Field \{ name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: \{\} \}\) -SELECT * FROM test_data WHERE column1 = [1,2,3]; +SELECT * FROM date WHERE column1 = [1,2,3]; query error DataFusion error: Arrow error: Invalid argument error: Invalid comparison operation: List\(Field \{ name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, 
metadata: \{\} \}\) == List\(Field \{ name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: \{\} \}\) -SELECT * FROM test_data WHERE column1 = column2 +SELECT * FROM date WHERE column1 = column2 query error DataFusion error: Arrow error: Invalid argument error: Invalid comparison operation: List\(Field \{ name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: \{\} \}\) != List\(Field \{ name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: \{\} \}\) -SELECT * FROM test_data WHERE column1 != [1,2,3]; +SELECT * FROM date WHERE column1 != [1,2,3]; query error DataFusion error: Arrow error: Invalid argument error: Invalid comparison operation: List\(Field \{ name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: \{\} \}\) != List\(Field \{ name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: \{\} \}\) -SELECT * FROM test_data WHERE column1 != column2 +SELECT * FROM date WHERE column1 != column2 ########### # Aggregates ########### query error Internal error: Min/Max accumulator not implemented for type List -SELECT min(column1) FROM test_data; +SELECT min(column1) FROM date; query error Internal error: Min/Max accumulator not implemented for type List -SELECT max(column1) FROM test_data; +SELECT max(column1) FROM date; query I -SELECT count(column1) FROM test_data; +SELECT count(column1) FROM date; ---- 3 # note single count distincts are rewritten to use a group by query I -SELECT count(distinct column1) FROM test_data; +SELECT count(distinct column1) FROM date; ---- 2 query I -SELECT count(distinct column2) FROM test_data; +SELECT count(distinct column2) FROM date; ---- 2 # note multiple count distincts are not rewritten query II -SELECT count(distinct column1), count(distinct column2) FROM test_data; +SELECT count(distinct column1), count(distinct column2) FROM date; 
---- 2 2 @@ -93,24 +93,24 @@ SELECT count(distinct column1), count(distinct column2) FROM test_data; query I -SELECT count(column1) FROM test_data GROUP BY column3; +SELECT count(column1) FROM date GROUP BY column3; ---- 3 # note single count distincts are rewritten to use a group by query I -SELECT count(distinct column1) FROM test_data GROUP BY column3; +SELECT count(distinct column1) FROM date GROUP BY column3; ---- 2 query I -SELECT count(distinct column2) FROM test_data GROUP BY column3; +SELECT count(distinct column2) FROM date GROUP BY column3; ---- 2 # note multiple count distincts are not rewritten query II -SELECT count(distinct column1), count(distinct column2) FROM test_data GROUP BY column3; +SELECT count(distinct column1), count(distinct column2) FROM date GROUP BY column3; ---- 2 2 @@ -120,21 +120,21 @@ SELECT count(distinct column1), count(distinct column2) FROM test_data GROUP BY ########### query ??I -SELECT * FROM test_data ORDER BY column2; +SELECT * FROM date ORDER BY column2; ---- [2, 3] [2, 3] 1 [1, 2, 3] [4, 5] 1 [1, 2, 3] NULL 1 query ??I -SELECT * FROM test_data ORDER BY column2 DESC; +SELECT * FROM date ORDER BY column2 DESC; ---- [1, 2, 3] NULL 1 [1, 2, 3] [4, 5] 1 [2, 3] [2, 3] 1 query ??I -SELECT * FROM test_data ORDER BY column2 DESC NULLS LAST; +SELECT * FROM date ORDER BY column2 DESC NULLS LAST; ---- [1, 2, 3] [4, 5] 1 [2, 3] [2, 3] 1 @@ -142,14 +142,14 @@ SELECT * FROM test_data ORDER BY column2 DESC NULLS LAST; # multi column query ??I -SELECT * FROM test_data ORDER BY column1, column2; +SELECT * FROM date ORDER BY column1, column2; ---- [1, 2, 3] [4, 5] 1 [1, 2, 3] NULL 1 [2, 3] [2, 3] 1 query ??I -SELECT * FROM test_data ORDER BY column1, column3, column2; +SELECT * FROM date ORDER BY column1, column3, column2; ---- [1, 2, 3] [4, 5] 1 [1, 2, 3] NULL 1 @@ -157,4 +157,4 @@ SELECT * FROM test_data ORDER BY column1, column3, column2; statement ok -drop table test_data +drop table date From c641e6bed6778d904bf8a0bfeee21527fc1078cb 
Mon Sep 17 00:00:00 2001 From: Jia-Xuan Liu Date: Mon, 1 Jul 2024 23:40:41 +0800 Subject: [PATCH 17/53] modified the test and add example --- datafusion-examples/examples/csv_sql.rs | 1 + .../external_dependency/query-aws-s3.rs | 1 + .../core/src/catalog/dynamic_file_schema.rs | 1 + datafusion/core/src/execution/context/mod.rs | 31 ++++++++++++++ .../core/src/execution/session_state.rs | 5 ++- datafusion/sqllogictest/src/test_context.rs | 5 +++ .../sqllogictest/test_files/array_query.slt | 42 +++++++++---------- .../sqllogictest/test_files/describe.slt | 2 +- 8 files changed, 65 insertions(+), 23 deletions(-) diff --git a/datafusion-examples/examples/csv_sql.rs b/datafusion-examples/examples/csv_sql.rs index f12a8c938b07..f35656835f4c 100644 --- a/datafusion-examples/examples/csv_sql.rs +++ b/datafusion-examples/examples/csv_sql.rs @@ -46,6 +46,7 @@ async fn main() -> Result<()> { // print the results df.show().await?; + ctx.enable_url_table()?; // query the file by the path dynamically. let df = ctx .sql( diff --git a/datafusion-examples/examples/external_dependency/query-aws-s3.rs b/datafusion-examples/examples/external_dependency/query-aws-s3.rs index 128d04df213e..11c517d3b3b6 100644 --- a/datafusion-examples/examples/external_dependency/query-aws-s3.rs +++ b/datafusion-examples/examples/external_dependency/query-aws-s3.rs @@ -64,6 +64,7 @@ async fn main() -> Result<()> { df.show().await?; // dynamic query by the file path + ctx.enable_url_table()?; let df = ctx .sql(format!(r#"SELECT * FROM '{}' LIMIT 10"#, &path).as_str()) .await?; diff --git a/datafusion/core/src/catalog/dynamic_file_schema.rs b/datafusion/core/src/catalog/dynamic_file_schema.rs index 60d8e72f9dce..08cd404241c3 100644 --- a/datafusion/core/src/catalog/dynamic_file_schema.rs +++ b/datafusion/core/src/catalog/dynamic_file_schema.rs @@ -112,6 +112,7 @@ pub struct DynamicListTableFactory { } impl DynamicListTableFactory { + /// Create a new [DynamicListTableFactory] with the given state store. 
pub fn new(state_store: Arc) -> Self { Self { state_store } } diff --git a/datafusion/core/src/execution/context/mod.rs b/datafusion/core/src/execution/context/mod.rs index a7ea08b73e1f..5f1b4dcbd5ff 100644 --- a/datafusion/core/src/execution/context/mod.rs +++ b/datafusion/core/src/execution/context/mod.rs @@ -317,6 +317,37 @@ impl SessionContext { } } + /// Enable the dynamic file query for the current session. + /// See [DynamicFileSchemaProvider] for more details + /// + /// # Example: query the url table + /// + /// ``` + /// use datafusion::prelude::*; + /// # use datafusion::{error::Result, assert_batches_eq}; + /// # #[tokio::main] + /// # async fn main() -> Result<()> { + /// let cfg = SessionConfig::new().set_str("datafusion.catalog.has_header", "true"); + /// let ctx = SessionContext::new_with_config(cfg); + /// ctx.enable_url_table().ok(); + /// let results = ctx + /// .sql("SELECT a, MIN(b) FROM 'tests/data/example.csv' as example GROUP BY a LIMIT 100") + /// .await? + /// .collect() + /// .await?; + /// assert_batches_eq!( + /// &[ + /// "+---+----------------+", + /// "| a | MIN(example.b) |", + /// "+---+----------------+", + /// "| 1 | 2 |", + /// "+---+----------------+", + /// ], + /// &results + /// ); + /// # Ok(()) + /// # } + /// ``` pub fn enable_url_table(&self) -> Result>> { let state_ref = self.state(); let catalog_name = state_ref.config_options().catalog.default_catalog.as_str(); diff --git a/datafusion/core/src/execution/session_state.rs b/datafusion/core/src/execution/session_state.rs index 29c43d4c1faa..cdf0716621ac 100644 --- a/datafusion/core/src/execution/session_state.rs +++ b/datafusion/core/src/execution/session_state.rs @@ -1093,22 +1093,25 @@ impl<'a> SimplifyInfo for SessionSimplifyProvider<'a> { } /// The state store that stores the reference of the runtime session state. 
-pub(crate) struct StateStore { +pub struct StateStore { state: Arc>>>>, } impl StateStore { + /// Create a new [StateStore] pub fn new() -> Self { Self { state: Arc::new(Mutex::new(None)), } } + /// Set the session state of the store pub fn with_state(&self, state: Weak>) { let mut lock = self.state.lock(); *lock = Some(state); } + /// Get the current session state of the store pub fn get_state(&self) -> Weak> { self.state.lock().clone().unwrap() } diff --git a/datafusion/sqllogictest/src/test_context.rs b/datafusion/sqllogictest/src/test_context.rs index dd27727e3ad5..505026b2dfde 100644 --- a/datafusion/sqllogictest/src/test_context.rs +++ b/datafusion/sqllogictest/src/test_context.rs @@ -90,6 +90,7 @@ impl TestContext { { info!("Registering avro tables"); register_avro_tables(&mut test_ctx).await; + test_ctx.ctx.enable_url_table().ok(); } #[cfg(not(feature = "avro"))] { @@ -97,6 +98,10 @@ impl TestContext { return None; } } + "describe.slt" | "arrow_files.slt" | "csv_files.slt" | "json.slt" + | "parquet.slt" => { + test_ctx.ctx.enable_url_table().ok(); + } "joins.slt" => { info!("Registering partition table tables"); let example_udf = create_example_udf(); diff --git a/datafusion/sqllogictest/test_files/array_query.slt b/datafusion/sqllogictest/test_files/array_query.slt index 265618669091..24c99fc849b6 100644 --- a/datafusion/sqllogictest/test_files/array_query.slt +++ b/datafusion/sqllogictest/test_files/array_query.slt @@ -31,7 +31,7 @@ CREATE TABLE data AS ; query ??I rowsort -SELECT * FROM date; +SELECT * FROM data; ---- [1, 2, 3] NULL 1 [1, 2, 3] [4, 5] 1 @@ -42,47 +42,47 @@ SELECT * FROM date; ########### query error DataFusion error: Arrow error: Invalid argument error: Invalid comparison operation: List\(Field \{ name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: \{\} \}\) == List\(Field \{ name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: \{\} \}\) -SELECT * FROM 
date WHERE column1 = [1,2,3]; +SELECT * FROM data WHERE column1 = [1,2,3]; query error DataFusion error: Arrow error: Invalid argument error: Invalid comparison operation: List\(Field \{ name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: \{\} \}\) == List\(Field \{ name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: \{\} \}\) -SELECT * FROM date WHERE column1 = column2 +SELECT * FROM data WHERE column1 = column2 query error DataFusion error: Arrow error: Invalid argument error: Invalid comparison operation: List\(Field \{ name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: \{\} \}\) != List\(Field \{ name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: \{\} \}\) -SELECT * FROM date WHERE column1 != [1,2,3]; +SELECT * FROM data WHERE column1 != [1,2,3]; query error DataFusion error: Arrow error: Invalid argument error: Invalid comparison operation: List\(Field \{ name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: \{\} \}\) != List\(Field \{ name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: \{\} \}\) -SELECT * FROM date WHERE column1 != column2 +SELECT * FROM data WHERE column1 != column2 ########### # Aggregates ########### query error Internal error: Min/Max accumulator not implemented for type List -SELECT min(column1) FROM date; +SELECT min(column1) FROM data; query error Internal error: Min/Max accumulator not implemented for type List -SELECT max(column1) FROM date; +SELECT max(column1) FROM data; query I -SELECT count(column1) FROM date; +SELECT count(column1) FROM data; ---- 3 # note single count distincts are rewritten to use a group by query I -SELECT count(distinct column1) FROM date; +SELECT count(distinct column1) FROM data; ---- 2 query I -SELECT count(distinct column2) FROM date; +SELECT count(distinct 
column2) FROM data; ---- 2 # note multiple count distincts are not rewritten query II -SELECT count(distinct column1), count(distinct column2) FROM date; +SELECT count(distinct column1), count(distinct column2) FROM data; ---- 2 2 @@ -93,24 +93,24 @@ SELECT count(distinct column1), count(distinct column2) FROM date; query I -SELECT count(column1) FROM date GROUP BY column3; +SELECT count(column1) FROM data GROUP BY column3; ---- 3 # note single count distincts are rewritten to use a group by query I -SELECT count(distinct column1) FROM date GROUP BY column3; +SELECT count(distinct column1) FROM data GROUP BY column3; ---- 2 query I -SELECT count(distinct column2) FROM date GROUP BY column3; +SELECT count(distinct column2) FROM data GROUP BY column3; ---- 2 # note multiple count distincts are not rewritten query II -SELECT count(distinct column1), count(distinct column2) FROM date GROUP BY column3; +SELECT count(distinct column1), count(distinct column2) FROM data GROUP BY column3; ---- 2 2 @@ -120,21 +120,21 @@ SELECT count(distinct column1), count(distinct column2) FROM date GROUP BY colum ########### query ??I -SELECT * FROM date ORDER BY column2; +SELECT * FROM data ORDER BY column2; ---- [2, 3] [2, 3] 1 [1, 2, 3] [4, 5] 1 [1, 2, 3] NULL 1 query ??I -SELECT * FROM date ORDER BY column2 DESC; +SELECT * FROM data ORDER BY column2 DESC; ---- [1, 2, 3] NULL 1 [1, 2, 3] [4, 5] 1 [2, 3] [2, 3] 1 query ??I -SELECT * FROM date ORDER BY column2 DESC NULLS LAST; +SELECT * FROM data ORDER BY column2 DESC NULLS LAST; ---- [1, 2, 3] [4, 5] 1 [2, 3] [2, 3] 1 @@ -142,14 +142,14 @@ SELECT * FROM date ORDER BY column2 DESC NULLS LAST; # multi column query ??I -SELECT * FROM date ORDER BY column1, column2; +SELECT * FROM data ORDER BY column1, column2; ---- [1, 2, 3] [4, 5] 1 [1, 2, 3] NULL 1 [2, 3] [2, 3] 1 query ??I -SELECT * FROM date ORDER BY column1, column3, column2; +SELECT * FROM data ORDER BY column1, column3, column2; ---- [1, 2, 3] [4, 5] 1 [1, 2, 3] NULL 1 @@ -157,4 
+157,4 @@ SELECT * FROM date ORDER BY column1, column3, column2; statement ok -drop table date +drop table data diff --git a/datafusion/sqllogictest/test_files/describe.slt b/datafusion/sqllogictest/test_files/describe.slt index a37bdeaeabbc..edbc40ebca01 100644 --- a/datafusion/sqllogictest/test_files/describe.slt +++ b/datafusion/sqllogictest/test_files/describe.slt @@ -57,7 +57,7 @@ statement ok DROP TABLE aggregate_simple; ########## -# Describe file (currently we can only describe file in datafusion-cli, fix this after issue (#4850) has been done) +# Describe file ########## query TTT From 080626354931082f2c91d0fb6313266c22e27e87 Mon Sep 17 00:00:00 2001 From: Jia-Xuan Liu Date: Tue, 2 Jul 2024 00:08:03 +0800 Subject: [PATCH 18/53] make dirs be optional --- datafusion/core/Cargo.toml | 4 +++- datafusion/core/src/catalog/dynamic_file_schema.rs | 10 +++++++++- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/datafusion/core/Cargo.toml b/datafusion/core/Cargo.toml index 1574f40ff92d..3e1311cca96d 100644 --- a/datafusion/core/Cargo.toml +++ b/datafusion/core/Cargo.toml @@ -47,6 +47,7 @@ backtrace = ["datafusion-common/backtrace"] compression = ["xz2", "bzip2", "flate2", "zstd", "async-compression", "tokio-util"] crypto_expressions = ["datafusion-functions/crypto_expressions"] datetime_expressions = ["datafusion-functions/datetime_expressions"] +dirs = ["dep:dirs"] default = [ "array_expressions", "crypto_expressions", @@ -57,6 +58,7 @@ default = [ "unicode_expressions", "compression", "parquet", + "dirs", ] encoding_expressions = ["datafusion-functions/encoding_expressions"] # Used for testing ONLY: causes all values to hash to the same value (test for collisions) @@ -108,7 +110,7 @@ datafusion-physical-expr = { workspace = true } datafusion-physical-expr-common = { workspace = true } datafusion-physical-plan = { workspace = true } datafusion-sql = { workspace = true } -dirs = "4.0.0" +dirs = { version = "4.0.0", optional = true } flate2 = { version 
= "1.0.24", optional = true } futures = { workspace = true } glob = "0.3.0" diff --git a/datafusion/core/src/catalog/dynamic_file_schema.rs b/datafusion/core/src/catalog/dynamic_file_schema.rs index 08cd404241c3..c60025966b20 100644 --- a/datafusion/core/src/catalog/dynamic_file_schema.rs +++ b/datafusion/core/src/catalog/dynamic_file_schema.rs @@ -21,6 +21,7 @@ use std::any::Any; use std::sync::Arc; use async_trait::async_trait; +#[cfg(feature = "dirs")] use dirs::home_dir; use datafusion_common::plan_datafusion_err; @@ -87,6 +88,7 @@ impl SchemaProvider for DynamicFileSchemaProvider { } /// Substitute the tilde character in the file path with the user home directory. +#[cfg(feature = "dirs")] pub fn substitute_tilde(cur: String) -> String { if let Some(usr_dir_path) = home_dir() { if let Some(usr_dir) = usr_dir_path.to_str() { @@ -98,6 +100,12 @@ pub fn substitute_tilde(cur: String) -> String { cur } +/// Do nothing if the feature "dirs" is disabled. +#[cfg(not(feature = "dirs"))] +pub fn substitute_tilde(cur: String) -> String { + cur +} + /// [UrlTableFactory] is a factory that can create a table provider from the given url. 
#[async_trait] pub trait UrlTableFactory: Sync + Send { @@ -144,7 +152,7 @@ impl UrlTableFactory for DynamicListTableFactory { } } -#[cfg(not(target_os = "windows"))] +#[cfg(all(not(target_os = "windows"), not(feature = "dirs")))] #[cfg(test)] mod tests { use crate::catalog::dynamic_file_schema::substitute_tilde; From f4d24e60660dd98669aaf708cfbade3a207acd1c Mon Sep 17 00:00:00 2001 From: Jia-Xuan Liu Date: Tue, 2 Jul 2024 00:30:21 +0800 Subject: [PATCH 19/53] enable dynamic file query in cli --- datafusion-cli/src/main.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/datafusion-cli/src/main.rs b/datafusion-cli/src/main.rs index f469fda4f960..5553bbb4ac8c 100644 --- a/datafusion-cli/src/main.rs +++ b/datafusion-cli/src/main.rs @@ -178,7 +178,9 @@ async fn main_inner() -> Result<()> { let mut ctx = SessionContext::new_with_config_rt(session_config.clone(), Arc::new(runtime_env)); ctx.refresh_catalogs().await?; - // install dynamic catalog provider that knows how to open files + // enable dynamic file query + ctx.enable_url_table()?; + // install dynamic catalog provider that can register required object stores ctx.register_catalog_list(Arc::new(DynamicFileCatalog::new( ctx.state().catalog_list(), ctx.state_weak_ref(), From 9964150c56460bb80bc5e8effcf63f857bced4a6 Mon Sep 17 00:00:00 2001 From: Jia-Xuan Liu Date: Tue, 2 Jul 2024 00:35:15 +0800 Subject: [PATCH 20/53] cargo fmt --- datafusion/core/src/execution/session_state.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/core/src/execution/session_state.rs b/datafusion/core/src/execution/session_state.rs index 0519011c9699..a1f1e89e663f 100644 --- a/datafusion/core/src/execution/session_state.rs +++ b/datafusion/core/src/execution/session_state.rs @@ -75,8 +75,8 @@ use datafusion_physical_expr::create_physical_expr; use datafusion_physical_expr_common::physical_expr::PhysicalExpr; use datafusion_physical_plan::ExecutionPlan; use datafusion_sql::parser::{DFParser, 
Statement}; -use parking_lot::{Mutex, RwLock}; use datafusion_sql::planner::{ContextProvider, ParserOptions, PlannerContext, SqlToRel}; +use parking_lot::{Mutex, RwLock}; use sqlparser::ast::Expr as SQLExpr; use sqlparser::dialect::dialect_from_str; use std::collections::hash_map::Entry; From da1e5d39ca9f42373ed4afe029ff3abf610f4460 Mon Sep 17 00:00:00 2001 From: Jia-Xuan Liu Date: Tue, 2 Jul 2024 00:49:16 +0800 Subject: [PATCH 21/53] modified example --- datafusion/core/src/execution/context/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/core/src/execution/context/mod.rs b/datafusion/core/src/execution/context/mod.rs index 16eca5afcab3..63bcecd4a94f 100644 --- a/datafusion/core/src/execution/context/mod.rs +++ b/datafusion/core/src/execution/context/mod.rs @@ -330,7 +330,7 @@ impl SessionContext { /// # async fn main() -> Result<()> { /// let cfg = SessionConfig::new().set_str("datafusion.catalog.has_header", "true"); /// let ctx = SessionContext::new_with_config(cfg); - /// ctx.enable_url_table().ok(); + /// ctx.enable_url_table()?; /// let results = ctx /// .sql("SELECT a, MIN(b) FROM 'tests/data/example.csv' as example GROUP BY a LIMIT 100") /// .await? 
From ed670feaef413b7a5b1ba4b56debac37073d966b Mon Sep 17 00:00:00 2001 From: Jia-Xuan Liu Date: Tue, 2 Jul 2024 00:52:47 +0800 Subject: [PATCH 22/53] fix test --- datafusion/core/src/execution/context/mod.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/datafusion/core/src/execution/context/mod.rs b/datafusion/core/src/execution/context/mod.rs index 63bcecd4a94f..18069eeb34e0 100644 --- a/datafusion/core/src/execution/context/mod.rs +++ b/datafusion/core/src/execution/context/mod.rs @@ -1792,6 +1792,7 @@ mod tests { let cfg = SessionConfig::new().set_str("datafusion.catalog.has_header", "true"); let session_state = SessionState::new_with_config_rt(cfg, runtime); let ctx = SessionContext::new_with_state(session_state); + ctx.enable_url_table()?; let result = plan_and_collect( &ctx, From fb8b9e012898ae290bc00e754244923aaf040f2a Mon Sep 17 00:00:00 2001 From: Jia-Xuan Liu Date: Tue, 13 Aug 2024 20:47:24 +0800 Subject: [PATCH 23/53] fix merge conflict --- .../core/src/execution/session_state.rs | 105 ------------------ 1 file changed, 105 deletions(-) diff --git a/datafusion/core/src/execution/session_state.rs b/datafusion/core/src/execution/session_state.rs index 014c4308fa69..caa15c06bc59 100644 --- a/datafusion/core/src/execution/session_state.rs +++ b/datafusion/core/src/execution/session_state.rs @@ -279,117 +279,12 @@ impl SessionState { runtime: Arc, catalog_list: Arc, ) -> Self { -<<<<<<< HEAD - let session_id = Uuid::new_v4().to_string(); - - // Create table_factories for all default formats - let mut table_factories: HashMap> = - HashMap::new(); - #[cfg(feature = "parquet")] - table_factories.insert("PARQUET".into(), Arc::new(DefaultTableFactory::new())); - table_factories.insert("CSV".into(), Arc::new(DefaultTableFactory::new())); - table_factories.insert("JSON".into(), Arc::new(DefaultTableFactory::new())); - table_factories.insert("NDJSON".into(), Arc::new(DefaultTableFactory::new())); - table_factories.insert("AVRO".into(), 
Arc::new(DefaultTableFactory::new())); - table_factories.insert("ARROW".into(), Arc::new(DefaultTableFactory::new())); - - if config.create_default_catalog_and_schema() { - let default_catalog = MemoryCatalogProvider::new(); - let schema = MemorySchemaProvider::new(); - - default_catalog - .register_schema( - &config.options().catalog.default_schema, - Arc::new(schema), - ) - .expect("memory catalog provider can register schema"); - - Self::register_default_schema( - &config, - &table_factories, - &runtime, - &default_catalog, - ); - - catalog_list.register_catalog( - config.options().catalog.default_catalog.clone(), - Arc::new(default_catalog), - ); - } - - let mut new_self = SessionState { - session_id, - analyzer: Analyzer::new(), - optimizer: Optimizer::new(), - physical_optimizers: PhysicalOptimizer::new(), - query_planner: Arc::new(DefaultQueryPlanner {}), - catalog_list, - table_functions: HashMap::new(), - scalar_functions: HashMap::new(), - aggregate_functions: HashMap::new(), - window_functions: HashMap::new(), - serializer_registry: Arc::new(EmptySerializerRegistry), - file_formats: HashMap::new(), - table_options: TableOptions::default_from_session_config(config.options()), - config, - execution_props: ExecutionProps::new(), - runtime_env: runtime, - table_factories, - function_factory: None, - }; - - #[cfg(feature = "parquet")] - if let Err(e) = - new_self.register_file_format(Arc::new(ParquetFormatFactory::new()), false) - { - log::info!("Unable to register default ParquetFormat: {e}") - }; - - if let Err(e) = - new_self.register_file_format(Arc::new(JsonFormatFactory::new()), false) - { - log::info!("Unable to register default JsonFormat: {e}") - }; - - if let Err(e) = - new_self.register_file_format(Arc::new(CsvFormatFactory::new()), false) - { - log::info!("Unable to register default CsvFormat: {e}") - }; - - if let Err(e) = - new_self.register_file_format(Arc::new(ArrowFormatFactory::new()), false) - { - log::info!("Unable to register default 
ArrowFormat: {e}") - }; - - if let Err(e) = - new_self.register_file_format(Arc::new(AvroFormatFactory::new()), false) - { - log::info!("Unable to register default AvroFormat: {e}") - }; - - // register built in functions - functions::register_all(&mut new_self) - .expect("can not register built in functions"); - - // register crate of array expressions (if enabled) - #[cfg(feature = "array_expressions")] - functions_array::register_all(&mut new_self) - .expect("can not register array expressions"); - - functions_aggregate::register_all(&mut new_self) - .expect("can not register aggregate functions"); - - new_self -======= SessionStateBuilder::new() .with_config(config) .with_runtime_env(runtime) .with_catalog_list(catalog_list) .with_default_features() .build() ->>>>>>> main } /// Returns new [`SessionState`] using the provided From fa73ae7bc2eef97624a0b66cb2a4e6a4101e6013 Mon Sep 17 00:00:00 2001 From: Jia-Xuan Liu Date: Wed, 14 Aug 2024 18:33:26 +0800 Subject: [PATCH 24/53] tmp --- datafusion/catalog/src/dynamic_file/catalog.rs | 0 datafusion/catalog/src/dynamic_file/mod.rs | 1 + 2 files changed, 1 insertion(+) create mode 100644 datafusion/catalog/src/dynamic_file/catalog.rs create mode 100644 datafusion/catalog/src/dynamic_file/mod.rs diff --git a/datafusion/catalog/src/dynamic_file/catalog.rs b/datafusion/catalog/src/dynamic_file/catalog.rs new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/datafusion/catalog/src/dynamic_file/mod.rs b/datafusion/catalog/src/dynamic_file/mod.rs new file mode 100644 index 000000000000..c2748cf5607a --- /dev/null +++ b/datafusion/catalog/src/dynamic_file/mod.rs @@ -0,0 +1 @@ +mod catalog; \ No newline at end of file From 04cc155277359cd5fb6797d324ffd5057106f4aa Mon Sep 17 00:00:00 2001 From: Jia-Xuan Liu Date: Wed, 14 Aug 2024 21:33:01 +0800 Subject: [PATCH 25/53] tmp --- datafusion/catalog/src/lib.rs | 1 + ...dynamic_file_schema.rs => dynamic_file.rs} | 85 ++++++++++++++++++- 
datafusion/core/src/catalog_common/mod.rs | 1 + datafusion/core/src/execution/context/mod.rs | 10 +++ 4 files changed, 94 insertions(+), 3 deletions(-) rename datafusion/core/src/catalog_common/{dynamic_file_schema.rs => dynamic_file.rs} (75%) diff --git a/datafusion/catalog/src/lib.rs b/datafusion/catalog/src/lib.rs index fe76b5dc9c64..bf4ced94dc72 100644 --- a/datafusion/catalog/src/lib.rs +++ b/datafusion/catalog/src/lib.rs @@ -19,6 +19,7 @@ mod catalog; mod schema; mod session; mod table; +mod dynamic_file; pub use catalog::*; pub use schema::*; diff --git a/datafusion/core/src/catalog_common/dynamic_file_schema.rs b/datafusion/core/src/catalog_common/dynamic_file.rs similarity index 75% rename from datafusion/core/src/catalog_common/dynamic_file_schema.rs rename to datafusion/core/src/catalog_common/dynamic_file.rs index c60025966b20..4398419e0b54 100644 --- a/datafusion/core/src/catalog_common/dynamic_file_schema.rs +++ b/datafusion/core/src/catalog_common/dynamic_file.rs @@ -18,20 +18,99 @@ //! 
dynamic_file_schema contains a SchemaProvider that creates tables from file paths use std::any::Any; -use std::sync::Arc; +use std::sync::{Arc, Weak}; use async_trait::async_trait; #[cfg(feature = "dirs")] use dirs::home_dir; - +use parking_lot::RwLock; +use datafusion_catalog::{CatalogProvider, CatalogProviderList, SchemaProvider}; use datafusion_common::plan_datafusion_err; -use crate::catalog::schema::SchemaProvider; use crate::datasource::listing::{ListingTable, ListingTableConfig, ListingTableUrl}; use crate::datasource::TableProvider; use crate::error::Result; +use crate::execution::session_state::SessionState; use crate::execution::session_state::StateStore; +pub struct DynamicFileCatalog { + inner: Arc, + state_store: Arc, +} + +impl DynamicFileCatalog { + pub fn new( + inner: Arc, + state_store: Arc, + ) -> Self { + Self { inner, state_store } + } +} + +impl CatalogProviderList for DynamicFileCatalog { + fn as_any(&self) -> &dyn Any { + self + } + + fn register_catalog( + &self, + name: String, + catalog: Arc, + ) -> Option> { + self.inner.register_catalog(name, catalog) + } + + fn catalog_names(&self) -> Vec { + self.inner.catalog_names() + } + + fn catalog(&self, name: &str) -> Option> { + self.inner + .catalog(name) + .map(|catalog| + Arc::new(DynamicFileCatalogProvider::new(self.state_store.get_state())) as _) + } +} + + +/// Wraps another catalog provider +struct DynamicFileCatalogProvider { + state: Weak>, +} + +impl DynamicFileCatalogProvider { + pub fn new(state: Weak>) -> Self { + Self { + state: state.clone(), + } + } +} + +impl CatalogProvider for DynamicFileCatalogProvider { + fn as_any(&self) -> &dyn Any { + self + } + + fn schema_names(&self) -> Vec { + vec![] + } + + fn schema(&self, _: &str) -> Option> { + Some(Arc::new(DynamicFileSchemaProvider::new( + self.state.clone(), + Arc::new(DynamicListTableFactory::default()), + ))) + } + + fn register_schema( + &self, + _name: &str, + _schema: Arc, + ) -> Result>> { + 
unimplemented!("register_schema is not supported for DynamicFileCatalogProvider") + } +} + /// Implements the [DynamicFileSchemaProvider] that can create tables provider from the file path. /// /// The provider will try to create a table provider from the file path if the table provider diff --git a/datafusion/core/src/catalog_common/mod.rs b/datafusion/core/src/catalog_common/mod.rs index b8414378862e..f2422331f391 100644 --- a/datafusion/core/src/catalog_common/mod.rs +++ b/datafusion/core/src/catalog_common/mod.rs @@ -25,6 +25,7 @@ pub mod information_schema; pub mod listing_schema; pub mod memory; +pub mod dynamic_file; pub use crate::catalog::{CatalogProvider, CatalogProviderList, SchemaProvider}; pub use memory::{ diff --git a/datafusion/core/src/execution/context/mod.rs b/datafusion/core/src/execution/context/mod.rs index 139e941ad7d1..56d1b0866cf1 100644 --- a/datafusion/core/src/execution/context/mod.rs +++ b/datafusion/core/src/execution/context/mod.rs @@ -306,6 +306,8 @@ impl SessionContext { Self::new_with_state(state) } + pub fn new_ + /// Creates a new `SessionContext` using the provided /// [`SessionConfig`] and a [`RuntimeEnv`]. 
#[deprecated(since = "32.0.0", note = "Use SessionState::new_with_config_rt")] @@ -380,6 +382,14 @@ impl SessionContext { } } + pub fn enable_url_table_1(&self) -> Result { + let state_ref = self.state(); + let builder = SessionStateBuilder::new_from_existing(self.state()); + let inner = state_ref.catalog_list(); + let state_store = Arc::new(StateStore::new()); + + } + /// Creates a new `SessionContext` using the provided [`SessionState`] #[deprecated(since = "32.0.0", note = "Use SessionContext::new_with_state")] pub fn with_state(state: SessionState) -> Self { From 1ede35ebfae13af234b813a852ebde9e0461ac1e Mon Sep 17 00:00:00 2001 From: Jia-Xuan Liu Date: Thu, 15 Aug 2024 01:27:42 +0800 Subject: [PATCH 26/53] tmp --- .../core/src/catalog_common/dynamic_file.rs | 43 ++++++--------- datafusion/core/src/execution/context/mod.rs | 53 +++++-------------- .../core/src/execution/session_state.rs | 1 - 3 files changed, 29 insertions(+), 68 deletions(-) diff --git a/datafusion/core/src/catalog_common/dynamic_file.rs b/datafusion/core/src/catalog_common/dynamic_file.rs index 4398419e0b54..d02658467ee8 100644 --- a/datafusion/core/src/catalog_common/dynamic_file.rs +++ b/datafusion/core/src/catalog_common/dynamic_file.rs @@ -18,19 +18,17 @@ //! 
dynamic_file_schema contains a SchemaProvider that creates tables from file paths use std::any::Any; -use std::sync::{Arc, Weak}; +use std::sync::Arc; use async_trait::async_trait; #[cfg(feature = "dirs")] use dirs::home_dir; -use parking_lot::RwLock; use datafusion_catalog::{CatalogProvider, CatalogProviderList, SchemaProvider}; use datafusion_common::plan_datafusion_err; use crate::datasource::listing::{ListingTable, ListingTableConfig, ListingTableUrl}; use crate::datasource::TableProvider; use crate::error::Result; -use crate::execution::session_state::SessionState; use crate::execution::session_state::StateStore; pub struct DynamicFileCatalog { @@ -65,23 +63,21 @@ impl CatalogProviderList for DynamicFileCatalog { } fn catalog(&self, name: &str) -> Option> { - self.inner - .catalog(name) - .map(|catalog| - Arc::new(DynamicFileCatalogProvider::new(self.state_store.get_state())) as _) + Some(self.inner.catalog(name) + .unwrap_or(Arc::new(DynamicFileCatalogProvider::new(Arc::clone(&self.state_store))) as _)) } } /// Wraps another catalog provider struct DynamicFileCatalogProvider { - state: Weak>, + state_store: Arc, } impl DynamicFileCatalogProvider { - pub fn new(state: Weak>) -> Self { + pub fn new(state_store: Arc) -> Self { Self { - state: state.clone(), + state_store, } } } @@ -97,8 +93,7 @@ impl CatalogProvider for DynamicFileCatalogProvider { fn schema(&self, _: &str) -> Option> { Some(Arc::new(DynamicFileSchemaProvider::new( - self.state.clone(), - Arc::new(DynamicListTableFactory::default()), + Arc::new(DynamicListTableFactory::new(Arc::clone(&self.state_store))), ))) } @@ -116,17 +111,15 @@ impl CatalogProvider for DynamicFileCatalogProvider { /// The provider will try to create a table provider from the file path if the table provider /// isn't exist in the inner schema provider. The required object store must be registered in the session context. 
pub struct DynamicFileSchemaProvider { - inner: Arc, factory: Arc, } impl DynamicFileSchemaProvider { /// Create a new [DynamicFileSchemaProvider] with the given inner schema provider. pub fn new( - inner: Arc, factory: Arc, ) -> Self { - Self { inner, factory } + Self { factory } } } @@ -137,32 +130,28 @@ impl SchemaProvider for DynamicFileSchemaProvider { } fn table_names(&self) -> Vec { - self.inner.table_names() + unimplemented!("table_names is not supported for DynamicFileSchemaProvider") } fn register_table( &self, - name: String, - table: Arc, + _name: String, + _table: Arc, ) -> Result>> { - self.inner.register_table(name, table) + unimplemented!("register_table is not supported for DynamicFileSchemaProvider") } async fn table(&self, name: &str) -> Result>> { - if let Ok(Some(inner_table)) = self.inner.table(name).await { - return Ok(Some(inner_table)); - } - let optimized_url = substitute_tilde(name.to_owned()); self.factory.try_new(optimized_url.as_str()).await } - fn deregister_table(&self, name: &str) -> Result>> { - self.inner.deregister_table(name) + fn deregister_table(&self, _name: &str) -> Result>> { + unimplemented!("deregister_table is not supported for DynamicFileSchemaProvider") } - fn table_exist(&self, name: &str) -> bool { - self.inner.table_exist(name) + fn table_exist(&self, _name: &str) -> bool { + unimplemented!("table_exist is not supported for DynamicFileSchemaProvider") } } diff --git a/datafusion/core/src/execution/context/mod.rs b/datafusion/core/src/execution/context/mod.rs index 56d1b0866cf1..74426d42cab3 100644 --- a/datafusion/core/src/execution/context/mod.rs +++ b/datafusion/core/src/execution/context/mod.rs @@ -54,7 +54,7 @@ use arrow::record_batch::RecordBatch; use arrow_schema::Schema; use datafusion_common::{ config::{ConfigExtension, TableOptions}, - config_err, exec_err, not_impl_err, plan_datafusion_err, plan_err, + exec_err, not_impl_err, plan_datafusion_err, plan_err, tree_node::{TreeNodeRecursion, TreeNodeVisitor}, 
DFSchema, SchemaReference, TableReference, }; @@ -74,12 +74,12 @@ use chrono::{DateTime, Utc}; use object_store::ObjectStore; use parking_lot::RwLock; use url::Url; -use datafusion_catalog::SchemaProvider; use crate::execution::session_state::{SessionStateBuilder, StateStore}; pub use datafusion_execution::config::SessionConfig; pub use datafusion_execution::TaskContext; pub use datafusion_expr::execution_props::ExecutionProps; use datafusion_optimizer::{AnalyzerRule, OptimizerRule}; +use crate::catalog_common::dynamic_file::DynamicFileCatalog; mod avro; mod csv; @@ -306,8 +306,6 @@ impl SessionContext { Self::new_with_state(state) } - pub fn new_ - /// Creates a new `SessionContext` using the provided /// [`SessionConfig`] and a [`RuntimeEnv`]. #[deprecated(since = "32.0.0", note = "Use SessionState::new_with_config_rt")] @@ -330,13 +328,12 @@ impl SessionContext { /// # Example: query the url table /// /// ``` - /// use datafusion::prelude::*; + /// # use datafusion::prelude::*; /// # use datafusion::{error::Result, assert_batches_eq}; /// # #[tokio::main] /// # async fn main() -> Result<()> { /// let cfg = SessionConfig::new().set_str("datafusion.catalog.has_header", "true"); - /// let ctx = SessionContext::new_with_config(cfg); - /// ctx.enable_url_table()?; + /// let ctx = SessionContext::new_with_config(cfg).enable_url_table()?; /// let results = ctx /// .sql("SELECT a, MIN(b) FROM 'tests/data/example.csv' as example GROUP BY a LIMIT 100") /// .await? @@ -355,39 +352,16 @@ impl SessionContext { /// # Ok(()) /// # } /// ``` - pub fn enable_url_table(&self) -> Result>> { + pub fn enable_url_table(&self) -> Self { let state_ref = self.state(); - let catalog_name = state_ref.config_options().catalog.default_catalog.as_str(); - let schema_name = state_ref.config_options().catalog.default_schema.as_str(); - if let Ok(provider) = state_ref - // provide a fake table reference to get the default schema provider. 
- .schema_for_ref(TableReference::full( - catalog_name, - schema_name, - UNNAMED_TABLE, - )) - { - let state_store = Arc::new(StateStore::new()); - state_store.with_state(self.state_weak_ref()); - let factory = Arc::new(DynamicListTableFactory::new(state_store)); - let new_provider = - Arc::new(DynamicFileSchemaProvider::new(provider, factory)); - state_ref - .catalog_list() - .catalog(catalog_name) - .unwrap() - .register_schema(schema_name, new_provider) - } else { - config_err!("default catalog and schema are required for url table") - } - } - - pub fn enable_url_table_1(&self) -> Result { - let state_ref = self.state(); - let builder = SessionStateBuilder::new_from_existing(self.state()); - let inner = state_ref.catalog_list(); let state_store = Arc::new(StateStore::new()); - + let catalog_list = Arc::new(DynamicFileCatalog::new(Arc::clone(state_ref.catalog_list()), Arc::clone(&state_store))); + let new_state = SessionStateBuilder::new_from_existing(self.state()) + .with_catalog_list(catalog_list) + .build(); + let ctx = SessionContext::new_with_state(new_state); + state_store.with_state(ctx.state_weak_ref()); + ctx } /// Creates a new `SessionContext` using the provided [`SessionState`] @@ -1830,8 +1804,7 @@ mod tests { let runtime = Arc::new(RuntimeEnv::new(rt_cfg).unwrap()); let cfg = SessionConfig::new().set_str("datafusion.catalog.has_header", "true"); let session_state = SessionState::new_with_config_rt(cfg, runtime); - let ctx = SessionContext::new_with_state(session_state); - ctx.enable_url_table()?; + let ctx = SessionContext::new_with_state(session_state).enable_url_table(); let result = plan_and_collect( &ctx, diff --git a/datafusion/core/src/execution/session_state.rs b/datafusion/core/src/execution/session_state.rs index caa15c06bc59..193d3850b6c0 100644 --- a/datafusion/core/src/execution/session_state.rs +++ b/datafusion/core/src/execution/session_state.rs @@ -76,7 +76,6 @@ use std::collections::hash_map::Entry; use std::collections::{HashMap, 
HashSet}; use std::fmt::Debug; use std::sync::{Arc, Weak}; -use url::Url; use uuid::Uuid; /// `SessionState` contains all the necessary state to plan and execute queries, From 51b1d412b8eb6107d0a0371ba31f16fc2b6b4bfe Mon Sep 17 00:00:00 2001 From: Jia-Xuan Liu Date: Thu, 15 Aug 2024 23:22:53 +0800 Subject: [PATCH 27/53] fix the catalog and schema --- .../core/src/catalog_common/dynamic_file.rs | 43 +++++++++++-------- datafusion/core/src/execution/context/mod.rs | 6 +-- 2 files changed, 25 insertions(+), 24 deletions(-) diff --git a/datafusion/core/src/catalog_common/dynamic_file.rs b/datafusion/core/src/catalog_common/dynamic_file.rs index d02658467ee8..13bbeac9fab9 100644 --- a/datafusion/core/src/catalog_common/dynamic_file.rs +++ b/datafusion/core/src/catalog_common/dynamic_file.rs @@ -63,20 +63,22 @@ impl CatalogProviderList for DynamicFileCatalog { } fn catalog(&self, name: &str) -> Option> { - Some(self.inner.catalog(name) - .unwrap_or(Arc::new(DynamicFileCatalogProvider::new(Arc::clone(&self.state_store))) as _)) + self.inner.catalog(name) + .map(|catalog| Arc::new(DynamicFileCatalogProvider::new(catalog, Arc::clone(&self.state_store))) as _) } } /// Wraps another catalog provider struct DynamicFileCatalogProvider { + inner: Arc, state_store: Arc, } impl DynamicFileCatalogProvider { - pub fn new(state_store: Arc) -> Self { + pub fn new(inner: Arc, state_store: Arc) -> Self { Self { + inner, state_store, } } @@ -88,21 +90,22 @@ impl CatalogProvider for DynamicFileCatalogProvider { } fn schema_names(&self) -> Vec { - vec![] + self.schema_names() } - fn schema(&self, _: &str) -> Option> { - Some(Arc::new(DynamicFileSchemaProvider::new( + fn schema(&self, name: &str) -> Option> { + self.inner.schema(name).map(|schema| Arc::new(DynamicFileSchemaProvider::new( + schema, Arc::new(DynamicListTableFactory::new(Arc::clone(&self.state_store))), - ))) + )) as _) } fn register_schema( &self, - _name: &str, - _schema: Arc, + name: &str, + schema: Arc, ) -> Result>> { - 
unimplemented!("register_schema is not supported for DynamicFileCatalogProvider") + self.inner.register_schema(name, schema) } } @@ -111,15 +114,17 @@ impl CatalogProvider for DynamicFileCatalogProvider { /// The provider will try to create a table provider from the file path if the table provider /// isn't exist in the inner schema provider. The required object store must be registered in the session context. pub struct DynamicFileSchemaProvider { + inner: Arc, factory: Arc, } impl DynamicFileSchemaProvider { /// Create a new [DynamicFileSchemaProvider] with the given inner schema provider. pub fn new( + inner: Arc, factory: Arc, ) -> Self { - Self { factory } + Self { inner, factory } } } @@ -130,15 +135,15 @@ impl SchemaProvider for DynamicFileSchemaProvider { } fn table_names(&self) -> Vec { - unimplemented!("table_names is not supported for DynamicFileSchemaProvider") + self.inner.table_names() } fn register_table( &self, - _name: String, - _table: Arc, + name: String, + table: Arc, ) -> Result>> { - unimplemented!("register_table is not supported for DynamicFileSchemaProvider") + self.inner.register_table(name, table) } async fn table(&self, name: &str) -> Result>> { @@ -146,12 +151,12 @@ impl SchemaProvider for DynamicFileSchemaProvider { self.factory.try_new(optimized_url.as_str()).await } - fn deregister_table(&self, _name: &str) -> Result>> { - unimplemented!("deregister_table is not supported for DynamicFileSchemaProvider") + fn deregister_table(&self, name: &str) -> Result>> { + self.deregister_table(name) } - fn table_exist(&self, _name: &str) -> bool { - unimplemented!("table_exist is not supported for DynamicFileSchemaProvider") + fn table_exist(&self, name: &str) -> bool { + self.inner.table_exist(name) } } diff --git a/datafusion/core/src/execution/context/mod.rs b/datafusion/core/src/execution/context/mod.rs index 74426d42cab3..ffa7d8625214 100644 --- a/datafusion/core/src/execution/context/mod.rs +++ b/datafusion/core/src/execution/context/mod.rs 
@@ -1799,13 +1799,9 @@ mod tests { let path = PathBuf::from(env!("CARGO_MANIFEST_DIR")); let path = path.join("tests/tpch-csv/customer.csv"); let url = format!("file://{}", path.display()); - - let rt_cfg = RuntimeConfig::new(); - let runtime = Arc::new(RuntimeEnv::new(rt_cfg).unwrap()); let cfg = SessionConfig::new().set_str("datafusion.catalog.has_header", "true"); - let session_state = SessionState::new_with_config_rt(cfg, runtime); + let session_state = SessionStateBuilder::new().with_config(cfg).build(); let ctx = SessionContext::new_with_state(session_state).enable_url_table(); - let result = plan_and_collect( &ctx, format!("select c_name from '{}' limit 3;", &url).as_str(), From 75b0b849b9e10e9f207c22a932da4e77f29b04e3 Mon Sep 17 00:00:00 2001 From: Jia-Xuan Liu Date: Fri, 16 Aug 2024 01:51:14 +0800 Subject: [PATCH 28/53] move dynamic file catalog to datafusion-catalog --- datafusion-examples/examples/dataframe.rs | 6 + datafusion/catalog/Cargo.toml | 5 + .../catalog/src/dynamic_file/catalog.rs | 241 ++++++++++++++++ datafusion/catalog/src/dynamic_file/mod.rs | 2 +- datafusion/catalog/src/lib.rs | 3 +- datafusion/catalog/src/session.rs | 34 ++- datafusion/core/Cargo.toml | 4 +- .../core/src/catalog_common/dynamic_file.rs | 270 ------------------ datafusion/core/src/catalog_common/mod.rs | 1 - .../core/src/datasource/dynamic_file.rs | 74 +++++ datafusion/core/src/datasource/mod.rs | 1 + datafusion/core/src/execution/context/mod.rs | 26 +- .../core/src/execution/session_state.rs | 34 +-- datafusion/sqllogictest/src/test_context.rs | 5 +- 14 files changed, 385 insertions(+), 321 deletions(-) delete mode 100644 datafusion/core/src/catalog_common/dynamic_file.rs create mode 100644 datafusion/core/src/datasource/dynamic_file.rs diff --git a/datafusion-examples/examples/dataframe.rs b/datafusion-examples/examples/dataframe.rs index ea01c53b1c62..d7e0068ef88f 100644 --- a/datafusion-examples/examples/dataframe.rs +++ b/datafusion-examples/examples/dataframe.rs @@ 
-64,6 +64,12 @@ async fn main() -> Result<()> { .await?; parquet_df.describe().await.unwrap().show().await?; + let dyn_ctx = ctx.enable_url_table(); + let df = dyn_ctx + .sql(&format!("SELECT * FROM '{}'", file_path.to_str().unwrap())) + .await?; + df.show().await?; + Ok(()) } diff --git a/datafusion/catalog/Cargo.toml b/datafusion/catalog/Cargo.toml index ff28d8e0c64a..402dbe789508 100644 --- a/datafusion/catalog/Cargo.toml +++ b/datafusion/catalog/Cargo.toml @@ -27,6 +27,9 @@ repository.workspace = true rust-version.workspace = true version.workspace = true +[features] +home_dir= ["dep:dirs"] + [dependencies] arrow-schema = { workspace = true } async-trait = "0.1.41" @@ -34,6 +37,8 @@ datafusion-common = { workspace = true } datafusion-execution = { workspace = true } datafusion-expr = { workspace = true } datafusion-physical-plan = { workspace = true } +dirs = { version = "4.0.0", optional = true } +parking_lot = { workspace = true } [lints] workspace = true diff --git a/datafusion/catalog/src/dynamic_file/catalog.rs b/datafusion/catalog/src/dynamic_file/catalog.rs index e69de29bb2d1..0a9cfdd13f1d 100644 --- a/datafusion/catalog/src/dynamic_file/catalog.rs +++ b/datafusion/catalog/src/dynamic_file/catalog.rs @@ -0,0 +1,241 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +//! dynamic_file_schema contains a SchemaProvider that creates tables from file paths + +use crate::{CatalogProvider, CatalogProviderList, SchemaProvider, TableProvider}; +use async_trait::async_trait; +#[cfg(feature = "home_dir")] +use dirs::home_dir; +use std::any::Any; +use std::sync::Arc; + +pub struct DynamicFileCatalog { + inner: Arc, + factory: Arc, +} + +impl DynamicFileCatalog { + pub fn new( + inner: Arc, + factory: Arc, + ) -> Self { + Self { inner, factory } + } +} + +impl CatalogProviderList for DynamicFileCatalog { + fn as_any(&self) -> &dyn Any { + self + } + + fn register_catalog( + &self, + name: String, + catalog: Arc, + ) -> Option> { + self.inner.register_catalog(name, catalog) + } + + fn catalog_names(&self) -> Vec { + self.inner.catalog_names() + } + + fn catalog(&self, name: &str) -> Option> { + self.inner.catalog(name).map(|catalog| { + Arc::new(DynamicFileCatalogProvider::new( + catalog, + Arc::clone(&self.factory), + )) as _ + }) + } +} + +/// Wraps another catalog provider +struct DynamicFileCatalogProvider { + inner: Arc, + factory: Arc, +} + +impl DynamicFileCatalogProvider { + pub fn new( + inner: Arc, + factory: Arc, + ) -> Self { + Self { inner, factory } + } +} + +impl CatalogProvider for DynamicFileCatalogProvider { + fn as_any(&self) -> &dyn Any { + self + } + + fn schema_names(&self) -> Vec { + self.inner.schema_names() + } + + fn schema(&self, name: &str) -> Option> { + self.inner.schema(name).map(|schema| { + Arc::new(DynamicFileSchemaProvider::new( + schema, + Arc::clone(&self.factory), + )) as _ + }) + } + + fn register_schema( + &self, + name: &str, + schema: Arc, + ) -> datafusion_common::Result>> { + self.inner.register_schema(name, schema) + } +} + +/// Implements the [DynamicFileSchemaProvider] that can create tables provider from the file path. 
+/// +/// The provider will try to create a table provider from the file path if the table provider +/// isn't exist in the inner schema provider. The required object store must be registered in the session context. +pub struct DynamicFileSchemaProvider { + inner: Arc, + factory: Arc, +} + +impl DynamicFileSchemaProvider { + /// Create a new [DynamicFileSchemaProvider] with the given inner schema provider. + pub fn new( + inner: Arc, + factory: Arc, + ) -> Self { + Self { inner, factory } + } +} + +#[async_trait] +impl SchemaProvider for DynamicFileSchemaProvider { + fn as_any(&self) -> &dyn Any { + self + } + + fn table_names(&self) -> Vec { + self.inner.table_names() + } + + async fn table( + &self, + name: &str, + ) -> datafusion_common::Result>> { + if let Some(table) = self.inner.table(name).await? { + return Ok(Some(table)); + }; + + self.factory.try_new(name).await + } + + fn register_table( + &self, + name: String, + table: Arc, + ) -> datafusion_common::Result>> { + self.inner.register_table(name, table) + } + + fn deregister_table( + &self, + name: &str, + ) -> datafusion_common::Result>> { + self.inner.deregister_table(name) + } + + fn table_exist(&self, name: &str) -> bool { + self.inner.table_exist(name) + } +} + +/// Substitute the tilde character in the file path with the user home directory. +#[cfg(feature = "home_dir")] +pub fn substitute_tilde(cur: String) -> String { + if let Some(usr_dir_path) = home_dir() { + if let Some(usr_dir) = usr_dir_path.to_str() { + if cur.starts_with('~') && !usr_dir.is_empty() { + return cur.replacen('~', usr_dir, 1); + } + } + } + cur +} + +/// Do nothing if the feature "home_dir" is disabled. +#[cfg(not(feature = "home_dir"))] +pub fn substitute_tilde(cur: String) -> String { + cur +} + +/// [UrlTableFactory] is a factory that can create a table provider from the given url. 
+#[async_trait] +pub trait UrlTableFactory: Sync + Send { + /// create a new table provider from the provided url + async fn try_new( + &self, + url: &str, + ) -> datafusion_common::Result>>; +} + +#[cfg(all(not(target_os = "windows"), not(feature = "home_dir")))] +#[cfg(test)] +mod tests { + use crate::dynamic_file::catalog::substitute_tilde; + use dirs::home_dir; + + #[test] + fn test_substitute_tilde() { + use std::env; + use std::path::MAIN_SEPARATOR; + let original_home = home_dir(); + let test_home_path = if cfg!(windows) { + "C:\\Users\\user" + } else { + "/home/user" + }; + env::set_var( + if cfg!(windows) { "USERPROFILE" } else { "HOME" }, + test_home_path, + ); + let input = "~/Code/datafusion/benchmarks/data/tpch_sf1/part/part-0.parquet"; + let expected = format!( + "{}{}Code{}datafusion{}benchmarks{}data{}tpch_sf1{}part{}part-0.parquet", + test_home_path, + MAIN_SEPARATOR, + MAIN_SEPARATOR, + MAIN_SEPARATOR, + MAIN_SEPARATOR, + MAIN_SEPARATOR, + MAIN_SEPARATOR, + MAIN_SEPARATOR + ); + let actual = substitute_tilde(input.to_string()); + assert_eq!(actual, expected); + match original_home { + Some(home_path) => env::set_var( + if cfg!(windows) { "USERPROFILE" } else { "HOME" }, + home_path.to_str().unwrap(), + ), + None => env::remove_var(if cfg!(windows) { "USERPROFILE" } else { "HOME" }), + } + } +} diff --git a/datafusion/catalog/src/dynamic_file/mod.rs b/datafusion/catalog/src/dynamic_file/mod.rs index c2748cf5607a..41cb7e416f1d 100644 --- a/datafusion/catalog/src/dynamic_file/mod.rs +++ b/datafusion/catalog/src/dynamic_file/mod.rs @@ -1 +1 @@ -mod catalog; \ No newline at end of file +pub(crate) mod catalog; diff --git a/datafusion/catalog/src/lib.rs b/datafusion/catalog/src/lib.rs index bf4ced94dc72..21630f267d2c 100644 --- a/datafusion/catalog/src/lib.rs +++ b/datafusion/catalog/src/lib.rs @@ -16,12 +16,13 @@ // under the License. 
mod catalog; +mod dynamic_file; mod schema; mod session; mod table; -mod dynamic_file; pub use catalog::*; +pub use dynamic_file::catalog::*; pub use schema::*; pub use session::*; pub use table::*; diff --git a/datafusion/catalog/src/session.rs b/datafusion/catalog/src/session.rs index 05d2684ed3e0..e444b00fb7c1 100644 --- a/datafusion/catalog/src/session.rs +++ b/datafusion/catalog/src/session.rs @@ -24,9 +24,10 @@ use datafusion_execution::TaskContext; use datafusion_expr::execution_props::ExecutionProps; use datafusion_expr::{AggregateUDF, Expr, LogicalPlan, ScalarUDF, WindowUDF}; use datafusion_physical_plan::{ExecutionPlan, PhysicalExpr}; +use parking_lot::{Mutex, RwLock}; use std::any::Any; use std::collections::HashMap; -use std::sync::Arc; +use std::sync::{Arc, Weak}; /// Interface for accessing [`SessionState`] from the catalog. /// @@ -136,3 +137,34 @@ impl From<&dyn Session> for TaskContext { ) } } + +/// The state store that stores the reference of the runtime session state. 
+pub struct SessionStore { + session: Arc>>>>, +} + +impl SessionStore { + /// Create a new [SessionStore] + pub fn new() -> Self { + Self { + session: Arc::new(Mutex::new(None)), + } + } + + /// Set the session state of the store + pub fn with_state(&self, state: Weak>) { + let mut lock = self.session.lock(); + *lock = Some(state); + } + + /// Get the current session of the store + pub fn get_session(&self) -> Weak> { + self.session.lock().clone().unwrap() + } +} + +impl Default for SessionStore { + fn default() -> Self { + Self::new() + } +} diff --git a/datafusion/core/Cargo.toml b/datafusion/core/Cargo.toml index d3b60e761eb4..bb2b55e64031 100644 --- a/datafusion/core/Cargo.toml +++ b/datafusion/core/Cargo.toml @@ -49,7 +49,6 @@ backtrace = ["datafusion-common/backtrace"] compression = ["xz2", "bzip2", "flate2", "zstd", "async-compression", "tokio-util"] crypto_expressions = ["datafusion-functions/crypto_expressions"] datetime_expressions = ["datafusion-functions/datetime_expressions"] -dirs = ["dep:dirs"] default = [ "nested_expressions", "crypto_expressions", @@ -60,7 +59,6 @@ default = [ "unicode_expressions", "compression", "parquet", - "dirs", ] encoding_expressions = ["datafusion-functions/encoding_expressions"] # Used for testing ONLY: causes all values to hash to the same value (test for collisions) @@ -79,6 +77,7 @@ unicode_expressions = [ "datafusion-sql/unicode_expressions", "datafusion-functions/unicode_expressions", ] +home_dir = ["datafusion-catalog/home_dir"] [dependencies] ahash = { workspace = true } @@ -115,7 +114,6 @@ datafusion-physical-expr-functions-aggregate = { workspace = true } datafusion-physical-optimizer = { workspace = true } datafusion-physical-plan = { workspace = true } datafusion-sql = { workspace = true } -dirs = { version = "4.0.0", optional = true } flate2 = { version = "1.0.24", optional = true } futures = { workspace = true } glob = "0.3.0" diff --git a/datafusion/core/src/catalog_common/dynamic_file.rs 
b/datafusion/core/src/catalog_common/dynamic_file.rs deleted file mode 100644 index 13bbeac9fab9..000000000000 --- a/datafusion/core/src/catalog_common/dynamic_file.rs +++ /dev/null @@ -1,270 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -//! 
dynamic_file_schema contains a SchemaProvider that creates tables from file paths - -use std::any::Any; -use std::sync::Arc; - -use async_trait::async_trait; -#[cfg(feature = "dirs")] -use dirs::home_dir; -use datafusion_catalog::{CatalogProvider, CatalogProviderList, SchemaProvider}; -use datafusion_common::plan_datafusion_err; - -use crate::datasource::listing::{ListingTable, ListingTableConfig, ListingTableUrl}; -use crate::datasource::TableProvider; -use crate::error::Result; -use crate::execution::session_state::StateStore; - -pub struct DynamicFileCatalog { - inner: Arc, - state_store: Arc, -} - -impl DynamicFileCatalog { - pub fn new( - inner: Arc, - state_store: Arc, - ) -> Self { - Self { inner, state_store } - } -} - -impl CatalogProviderList for DynamicFileCatalog { - fn as_any(&self) -> &dyn Any { - self - } - - fn register_catalog( - &self, - name: String, - catalog: Arc, - ) -> Option> { - self.inner.register_catalog(name, catalog) - } - - fn catalog_names(&self) -> Vec { - self.inner.catalog_names() - } - - fn catalog(&self, name: &str) -> Option> { - self.inner.catalog(name) - .map(|catalog| Arc::new(DynamicFileCatalogProvider::new(catalog, Arc::clone(&self.state_store))) as _) - } -} - - -/// Wraps another catalog provider -struct DynamicFileCatalogProvider { - inner: Arc, - state_store: Arc, -} - -impl DynamicFileCatalogProvider { - pub fn new(inner: Arc, state_store: Arc) -> Self { - Self { - inner, - state_store, - } - } -} - -impl CatalogProvider for DynamicFileCatalogProvider { - fn as_any(&self) -> &dyn Any { - self - } - - fn schema_names(&self) -> Vec { - self.schema_names() - } - - fn schema(&self, name: &str) -> Option> { - self.inner.schema(name).map(|schema| Arc::new(DynamicFileSchemaProvider::new( - schema, - Arc::new(DynamicListTableFactory::new(Arc::clone(&self.state_store))), - )) as _) - } - - fn register_schema( - &self, - name: &str, - schema: Arc, - ) -> Result>> { - self.inner.register_schema(name, schema) - } -} - -/// 
Implements the [DynamicFileSchemaProvider] that can create tables provider from the file path. -/// -/// The provider will try to create a table provider from the file path if the table provider -/// isn't exist in the inner schema provider. The required object store must be registered in the session context. -pub struct DynamicFileSchemaProvider { - inner: Arc, - factory: Arc, -} - -impl DynamicFileSchemaProvider { - /// Create a new [DynamicFileSchemaProvider] with the given inner schema provider. - pub fn new( - inner: Arc, - factory: Arc, - ) -> Self { - Self { inner, factory } - } -} - -#[async_trait] -impl SchemaProvider for DynamicFileSchemaProvider { - fn as_any(&self) -> &dyn Any { - self - } - - fn table_names(&self) -> Vec { - self.inner.table_names() - } - - fn register_table( - &self, - name: String, - table: Arc, - ) -> Result>> { - self.inner.register_table(name, table) - } - - async fn table(&self, name: &str) -> Result>> { - let optimized_url = substitute_tilde(name.to_owned()); - self.factory.try_new(optimized_url.as_str()).await - } - - fn deregister_table(&self, name: &str) -> Result>> { - self.deregister_table(name) - } - - fn table_exist(&self, name: &str) -> bool { - self.inner.table_exist(name) - } -} - -/// Substitute the tilde character in the file path with the user home directory. -#[cfg(feature = "dirs")] -pub fn substitute_tilde(cur: String) -> String { - if let Some(usr_dir_path) = home_dir() { - if let Some(usr_dir) = usr_dir_path.to_str() { - if cur.starts_with('~') && !usr_dir.is_empty() { - return cur.replacen('~', usr_dir, 1); - } - } - } - cur -} - -/// Do nothing if the feature "dirs" is disabled. -#[cfg(not(feature = "dirs"))] -pub fn substitute_tilde(cur: String) -> String { - cur -} - -/// [UrlTableFactory] is a factory that can create a table provider from the given url. 
-#[async_trait] -pub trait UrlTableFactory: Sync + Send { - /// create a new table provider from the provided url - async fn try_new(&self, url: &str) -> Result>>; -} - -/// [DynamicListTableFactory] is a factory that can create a [ListingTable] from the given url. -#[derive(Default)] -pub struct DynamicListTableFactory { - state_store: Arc, -} - -impl DynamicListTableFactory { - /// Create a new [DynamicListTableFactory] with the given state store. - pub fn new(state_store: Arc) -> Self { - Self { state_store } - } -} - -#[async_trait] -impl UrlTableFactory for DynamicListTableFactory { - async fn try_new(&self, url: &str) -> Result>> { - let Ok(table_url) = ListingTableUrl::parse(url) else { - return Ok(None); - }; - - let state = &self - .state_store - .get_state() - .upgrade() - .ok_or_else(|| plan_datafusion_err!("locking error"))? - .read() - .clone(); - if let Ok(cfg) = ListingTableConfig::new(table_url.clone()) - .infer(state) - .await - { - ListingTable::try_new(cfg) - .map(|table| Some(Arc::new(table) as Arc)) - } else { - Ok(None) - } - } -} - -#[cfg(all(not(target_os = "windows"), not(feature = "dirs")))] -#[cfg(test)] -mod tests { - use crate::catalog::dynamic_file_schema::substitute_tilde; - use dirs::home_dir; - - #[test] - fn test_substitute_tilde() { - use std::env; - use std::path::MAIN_SEPARATOR; - let original_home = home_dir(); - let test_home_path = if cfg!(windows) { - "C:\\Users\\user" - } else { - "/home/user" - }; - env::set_var( - if cfg!(windows) { "USERPROFILE" } else { "HOME" }, - test_home_path, - ); - let input = "~/Code/datafusion/benchmarks/data/tpch_sf1/part/part-0.parquet"; - let expected = format!( - "{}{}Code{}datafusion{}benchmarks{}data{}tpch_sf1{}part{}part-0.parquet", - test_home_path, - MAIN_SEPARATOR, - MAIN_SEPARATOR, - MAIN_SEPARATOR, - MAIN_SEPARATOR, - MAIN_SEPARATOR, - MAIN_SEPARATOR, - MAIN_SEPARATOR - ); - let actual = substitute_tilde(input.to_string()); - assert_eq!(actual, expected); - match original_home { - 
Some(home_path) => env::set_var( - if cfg!(windows) { "USERPROFILE" } else { "HOME" }, - home_path.to_str().unwrap(), - ), - None => env::remove_var(if cfg!(windows) { "USERPROFILE" } else { "HOME" }), - } - } -} diff --git a/datafusion/core/src/catalog_common/mod.rs b/datafusion/core/src/catalog_common/mod.rs index f2422331f391..b8414378862e 100644 --- a/datafusion/core/src/catalog_common/mod.rs +++ b/datafusion/core/src/catalog_common/mod.rs @@ -25,7 +25,6 @@ pub mod information_schema; pub mod listing_schema; pub mod memory; -pub mod dynamic_file; pub use crate::catalog::{CatalogProvider, CatalogProviderList, SchemaProvider}; pub use memory::{ diff --git a/datafusion/core/src/datasource/dynamic_file.rs b/datafusion/core/src/datasource/dynamic_file.rs new file mode 100644 index 000000000000..999c8e58b65e --- /dev/null +++ b/datafusion/core/src/datasource/dynamic_file.rs @@ -0,0 +1,74 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! 
dynamic_file_schema contains a SchemaProvider that creates tables from file paths + +use std::sync::Arc; + +use async_trait::async_trait; +use datafusion_catalog::{SessionStore, UrlTableFactory}; +use datafusion_common::plan_datafusion_err; + +use crate::datasource::listing::{ListingTable, ListingTableConfig, ListingTableUrl}; +use crate::datasource::TableProvider; +use crate::error::Result; +use crate::execution::context::SessionState; + +/// [DynamicListTableFactory] is a factory that can create a [ListingTable] from the given url. +#[derive(Default)] +pub struct DynamicListTableFactory { + session_store: Arc, +} + +impl DynamicListTableFactory { + /// Create a new [DynamicListTableFactory] with the given state store. + pub fn new(session_store: Arc) -> Self { + Self { session_store } + } + + fn session_store(&self) -> Arc { + Arc::clone(&self.session_store) + } +} + +#[async_trait] +impl UrlTableFactory for DynamicListTableFactory { + async fn try_new(&self, url: &str) -> Result>> { + let Ok(table_url) = ListingTableUrl::parse(url) else { + return Ok(None); + }; + + let state = &self + .session_store() + .get_session() + .upgrade() + .and_then(|session| { + session + .read() + .as_any() + .downcast_ref::() + .cloned() + }) + .ok_or_else(|| plan_datafusion_err!("get current SessionStore error"))?; + + let cfg = ListingTableConfig::new(table_url.clone()) + .infer(state) + .await?; + ListingTable::try_new(cfg) + .map(|table| Some(Arc::new(table) as Arc)) + } +} diff --git a/datafusion/core/src/datasource/mod.rs b/datafusion/core/src/datasource/mod.rs index 1c9924735735..0c7983d1dc40 100644 --- a/datafusion/core/src/datasource/mod.rs +++ b/datafusion/core/src/datasource/mod.rs @@ -22,6 +22,7 @@ pub mod avro_to_arrow; pub mod cte_worktable; pub mod default_table_source; +pub mod dynamic_file; pub mod empty; pub mod file_format; pub mod function; diff --git a/datafusion/core/src/execution/context/mod.rs b/datafusion/core/src/execution/context/mod.rs index 
ffa7d8625214..3269a9167ec5 100644 --- a/datafusion/core/src/execution/context/mod.rs +++ b/datafusion/core/src/execution/context/mod.rs @@ -69,17 +69,18 @@ use datafusion_expr::{ // backwards compatibility pub use crate::execution::session_state::SessionState; +use crate::datasource::dynamic_file::DynamicListTableFactory; +use crate::execution::session_state::SessionStateBuilder; use async_trait::async_trait; use chrono::{DateTime, Utc}; -use object_store::ObjectStore; -use parking_lot::RwLock; -use url::Url; -use crate::execution::session_state::{SessionStateBuilder, StateStore}; +use datafusion_catalog::{DynamicFileCatalog, SessionStore}; pub use datafusion_execution::config::SessionConfig; pub use datafusion_execution::TaskContext; pub use datafusion_expr::execution_props::ExecutionProps; use datafusion_optimizer::{AnalyzerRule, OptimizerRule}; -use crate::catalog_common::dynamic_file::DynamicFileCatalog; +use object_store::ObjectStore; +use parking_lot::RwLock; +use url::Url; mod avro; mod csv; @@ -354,13 +355,17 @@ impl SessionContext { /// ``` pub fn enable_url_table(&self) -> Self { let state_ref = self.state(); - let state_store = Arc::new(StateStore::new()); - let catalog_list = Arc::new(DynamicFileCatalog::new(Arc::clone(state_ref.catalog_list()), Arc::clone(&state_store))); + let session_store = Arc::new(SessionStore::new()); + let factory = DynamicListTableFactory::new(Arc::clone(&session_store)); + let catalog_list = Arc::new(DynamicFileCatalog::new( + Arc::clone(state_ref.catalog_list()), + Arc::new(factory), + )); let new_state = SessionStateBuilder::new_from_existing(self.state()) .with_catalog_list(catalog_list) .build(); let ctx = SessionContext::new_with_state(new_state); - state_store.with_state(ctx.state_weak_ref()); + session_store.with_state(ctx.state_weak_ref()); ctx } @@ -1800,7 +1805,10 @@ mod tests { let path = path.join("tests/tpch-csv/customer.csv"); let url = format!("file://{}", path.display()); let cfg = 
SessionConfig::new().set_str("datafusion.catalog.has_header", "true"); - let session_state = SessionStateBuilder::new().with_config(cfg).build(); + let session_state = SessionStateBuilder::new() + .with_default_features() + .with_config(cfg) + .build(); let ctx = SessionContext::new_with_state(session_state).enable_url_table(); let result = plan_and_collect( &ctx, diff --git a/datafusion/core/src/execution/session_state.rs b/datafusion/core/src/execution/session_state.rs index 193d3850b6c0..0a057d6f1417 100644 --- a/datafusion/core/src/execution/session_state.rs +++ b/datafusion/core/src/execution/session_state.rs @@ -66,7 +66,6 @@ use datafusion_physical_optimizer::PhysicalOptimizerRule; use datafusion_physical_plan::ExecutionPlan; use datafusion_sql::parser::{DFParser, Statement}; use datafusion_sql::planner::{ContextProvider, ParserOptions, PlannerContext, SqlToRel}; -use parking_lot::{Mutex, RwLock}; use itertools::Itertools; use log::{debug, info}; use sqlparser::ast::Expr as SQLExpr; @@ -75,7 +74,7 @@ use std::any::Any; use std::collections::hash_map::Entry; use std::collections::{HashMap, HashSet}; use std::fmt::Debug; -use std::sync::{Arc, Weak}; +use std::sync::Arc; use uuid::Uuid; /// `SessionState` contains all the necessary state to plan and execute queries, @@ -1800,37 +1799,6 @@ impl<'a> SimplifyInfo for SessionSimplifyProvider<'a> { } } -/// The state store that stores the reference of the runtime session state. 
-pub struct StateStore { - state: Arc>>>>, -} - -impl StateStore { - /// Create a new [StateStore] - pub fn new() -> Self { - Self { - state: Arc::new(Mutex::new(None)), - } - } - - /// Set the session state of the store - pub fn with_state(&self, state: Weak>) { - let mut lock = self.state.lock(); - *lock = Some(state); - } - - /// Get the current session state of the store - pub fn get_state(&self) -> Weak> { - self.state.lock().clone().unwrap() - } -} - -impl Default for StateStore { - fn default() -> Self { - Self::new() - } -} - #[cfg(test)] mod tests { use std::collections::HashMap; diff --git a/datafusion/sqllogictest/src/test_context.rs b/datafusion/sqllogictest/src/test_context.rs index 46af4b0bff86..1eb89b270a4f 100644 --- a/datafusion/sqllogictest/src/test_context.rs +++ b/datafusion/sqllogictest/src/test_context.rs @@ -91,7 +91,7 @@ impl TestContext { { info!("Registering avro tables"); register_avro_tables(&mut test_ctx).await; - test_ctx.ctx.enable_url_table().ok(); + test_ctx.ctx = test_ctx.ctx.enable_url_table(); } #[cfg(not(feature = "avro"))] { @@ -101,7 +101,8 @@ impl TestContext { } "describe.slt" | "arrow_files.slt" | "csv_files.slt" | "json.slt" | "parquet.slt" => { - test_ctx.ctx.enable_url_table().ok(); + // TODO: duplicate enable url table + test_ctx.ctx = test_ctx.ctx.enable_url_table(); } "joins.slt" => { info!("Registering partition table tables"); From 3e8d094c93f245ee2448f8a72a8d462691e0ea7f Mon Sep 17 00:00:00 2001 From: Jia-Xuan Liu Date: Fri, 16 Aug 2024 01:52:43 +0800 Subject: [PATCH 29/53] add copyright --- datafusion/catalog/src/dynamic_file/mod.rs | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/datafusion/catalog/src/dynamic_file/mod.rs b/datafusion/catalog/src/dynamic_file/mod.rs index 41cb7e416f1d..59142333dd54 100644 --- a/datafusion/catalog/src/dynamic_file/mod.rs +++ b/datafusion/catalog/src/dynamic_file/mod.rs @@ -1 +1,18 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more 
contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + pub(crate) mod catalog; From 4eb8ca572cbe6a501ca41971517ef3fc0df1012d Mon Sep 17 00:00:00 2001 From: Jia-Xuan Liu Date: Fri, 16 Aug 2024 22:02:24 +0800 Subject: [PATCH 30/53] fix tests --- .../examples/external_dependency/query-aws-s3.rs | 2 +- datafusion/catalog/src/dynamic_file/catalog.rs | 2 +- datafusion/core/src/datasource/dynamic_file.rs | 11 +++++++---- datafusion/core/src/execution/context/mod.rs | 2 +- datafusion/sqllogictest/src/test_context.rs | 1 - 5 files changed, 10 insertions(+), 8 deletions(-) diff --git a/datafusion-examples/examples/external_dependency/query-aws-s3.rs b/datafusion-examples/examples/external_dependency/query-aws-s3.rs index 11c517d3b3b6..9c4d76703c9c 100644 --- a/datafusion-examples/examples/external_dependency/query-aws-s3.rs +++ b/datafusion-examples/examples/external_dependency/query-aws-s3.rs @@ -64,7 +64,7 @@ async fn main() -> Result<()> { df.show().await?; // dynamic query by the file path - ctx.enable_url_table()?; + ctx.enable_url_table(); let df = ctx .sql(format!(r#"SELECT * FROM '{}' LIMIT 10"#, &path).as_str()) .await?; diff --git a/datafusion/catalog/src/dynamic_file/catalog.rs b/datafusion/catalog/src/dynamic_file/catalog.rs index 0a9cfdd13f1d..44f9a5fd5d03 100644 --- 
a/datafusion/catalog/src/dynamic_file/catalog.rs +++ b/datafusion/catalog/src/dynamic_file/catalog.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -//! dynamic_file_schema contains a SchemaProvider that creates tables from file paths +//! dynamic_file contains a SchemaProvider that creates tables from file paths use crate::{CatalogProvider, CatalogProviderList, SchemaProvider, TableProvider}; use async_trait::async_trait; diff --git a/datafusion/core/src/datasource/dynamic_file.rs b/datafusion/core/src/datasource/dynamic_file.rs index 999c8e58b65e..154a24e84cba 100644 --- a/datafusion/core/src/datasource/dynamic_file.rs +++ b/datafusion/core/src/datasource/dynamic_file.rs @@ -65,10 +65,13 @@ impl UrlTableFactory for DynamicListTableFactory { }) .ok_or_else(|| plan_datafusion_err!("get current SessionStore error"))?; - let cfg = ListingTableConfig::new(table_url.clone()) + match ListingTableConfig::new(table_url.clone()) .infer(state) - .await?; - ListingTable::try_new(cfg) - .map(|table| Some(Arc::new(table) as Arc)) + .await + { + Ok(cfg) => ListingTable::try_new(cfg) + .map(|table| Some(Arc::new(table) as Arc)), + Err(_) => Ok(None), + } } } diff --git a/datafusion/core/src/execution/context/mod.rs b/datafusion/core/src/execution/context/mod.rs index 3269a9167ec5..ce3d317a986a 100644 --- a/datafusion/core/src/execution/context/mod.rs +++ b/datafusion/core/src/execution/context/mod.rs @@ -334,7 +334,7 @@ impl SessionContext { /// # #[tokio::main] /// # async fn main() -> Result<()> { /// let cfg = SessionConfig::new().set_str("datafusion.catalog.has_header", "true"); - /// let ctx = SessionContext::new_with_config(cfg).enable_url_table()?; + /// let ctx = SessionContext::new_with_config(cfg).enable_url_table(); /// let results = ctx /// .sql("SELECT a, MIN(b) FROM 'tests/data/example.csv' as example GROUP BY a LIMIT 100") /// .await? 
diff --git a/datafusion/sqllogictest/src/test_context.rs b/datafusion/sqllogictest/src/test_context.rs index 1eb89b270a4f..4e4b83216876 100644 --- a/datafusion/sqllogictest/src/test_context.rs +++ b/datafusion/sqllogictest/src/test_context.rs @@ -101,7 +101,6 @@ impl TestContext { } "describe.slt" | "arrow_files.slt" | "csv_files.slt" | "json.slt" | "parquet.slt" => { - // TODO: duplicate enable url table test_ctx.ctx = test_ctx.ctx.enable_url_table(); } "joins.slt" => { From 991340540391df538e22b77fb6b0aa33ad3d0ffe Mon Sep 17 00:00:00 2001 From: Jia-Xuan Liu Date: Fri, 16 Aug 2024 22:19:36 +0800 Subject: [PATCH 31/53] rename catalog in cli and update lock --- datafusion-cli/Cargo.lock | 142 ++++++++++++++++++---------------- datafusion-cli/src/catalog.rs | 46 ++++++----- datafusion-cli/src/main.rs | 10 +-- 3 files changed, 103 insertions(+), 95 deletions(-) diff --git a/datafusion-cli/Cargo.lock b/datafusion-cli/Cargo.lock index 973cfc1201f2..0170ee8dfb33 100644 --- a/datafusion-cli/Cargo.lock +++ b/datafusion-cli/Cargo.lock @@ -272,7 +272,7 @@ dependencies = [ "arrow-schema", "chrono", "half", - "indexmap 2.3.0", + "indexmap 2.4.0", "lexical-core", "num", "serde", @@ -347,13 +347,14 @@ dependencies = [ [[package]] name = "assert_cmd" -version = "2.0.15" +version = "2.0.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bc65048dd435533bb1baf2ed9956b9a278fbfdcf90301b39ee117f06c0199d37" +checksum = "dc1835b7f27878de8525dc71410b5a31cdcc5f230aed5ba5df968e09c201b23d" dependencies = [ "anstyle", "bstr", "doc-comment", + "libc", "predicates", "predicates-core", "predicates-tree", @@ -386,7 +387,7 @@ checksum = "6e0c28dcc82d7c8ead5cb13beb15405b57b8546e93215673ff8ca0349a028107" dependencies = [ "proc-macro2", "quote", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -874,12 +875,13 @@ dependencies = [ [[package]] name = "cc" -version = "1.1.8" +version = "1.1.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"504bdec147f2cc13c8b57ed9401fd8a147cc66b67ad5cb241394244f2c947549" +checksum = "72db2f7947ecee9b03b510377e8bb9077afa27176fdbff55c51027e976fdcc48" dependencies = [ "jobserver", "libc", + "shlex", ] [[package]] @@ -1022,9 +1024,9 @@ dependencies = [ [[package]] name = "core-foundation-sys" -version = "0.8.6" +version = "0.8.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "06ea2b9bc92be3c2baa9334a323ebca2d6f074ff852cd1d7b11064035cd3868f" +checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b" [[package]] name = "core2" @@ -1037,9 +1039,9 @@ dependencies = [ [[package]] name = "cpufeatures" -version = "0.2.12" +version = "0.2.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "53fe5e26ff1b7aef8bca9c6080520cfb8d9333c7568e1829cef191a9723e5504" +checksum = "51e852e6dc9a5bed1fae92dd2375037bf2b768725bf3be87811edee3249d09ad" dependencies = [ "libc", ] @@ -1103,7 +1105,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "edb49164822f3ee45b17acd4a208cfc1251410cf0cad9a833234c9890774dd9f" dependencies = [ "quote", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -1157,13 +1159,12 @@ dependencies = [ "datafusion-physical-optimizer", "datafusion-physical-plan", "datafusion-sql", - "dirs", "flate2", "futures", "glob", "half", "hashbrown 0.14.5", - "indexmap 2.3.0", + "indexmap 2.4.0", "itertools 0.12.1", "log", "num-traits", @@ -1194,6 +1195,8 @@ dependencies = [ "datafusion-execution", "datafusion-expr", "datafusion-physical-plan", + "dirs", + "parking_lot", ] [[package]] @@ -1385,7 +1388,7 @@ dependencies = [ "datafusion-expr", "datafusion-physical-expr", "hashbrown 0.14.5", - "indexmap 2.3.0", + "indexmap 2.4.0", "itertools 0.12.1", "log", "paste", @@ -1414,7 +1417,7 @@ dependencies = [ "half", "hashbrown 0.14.5", "hex", - "indexmap 2.3.0", + "indexmap 2.4.0", "itertools 0.12.1", "log", "paste", @@ -1482,7 +1485,7 @@ dependencies = [ "futures", "half", "hashbrown 
0.14.5", - "indexmap 2.3.0", + "indexmap 2.4.0", "itertools 0.12.1", "log", "once_cell", @@ -1763,7 +1766,7 @@ checksum = "87750cf4b7a4c0625b1529e4c543c2182106e4dedc60a2a6455e00d212c489ac" dependencies = [ "proc-macro2", "quote", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -1847,7 +1850,7 @@ dependencies = [ "futures-sink", "futures-util", "http 0.2.12", - "indexmap 2.3.0", + "indexmap 2.4.0", "slab", "tokio", "tokio-util", @@ -1866,7 +1869,7 @@ dependencies = [ "futures-core", "futures-sink", "http 1.1.0", - "indexmap 2.3.0", + "indexmap 2.4.0", "slab", "tokio", "tokio-util", @@ -2158,9 +2161,9 @@ dependencies = [ [[package]] name = "indexmap" -version = "2.3.0" +version = "2.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "de3fc2e30ba82dd1b3911c8de1ffc143c74a914a14e99514d7637e3099df5ea0" +checksum = "93ead53efc7ea8ed3cfb0c79fc8023fbb782a5432b52830b6518941cebe6505c" dependencies = [ "equivalent", "hashbrown 0.14.5", @@ -2225,9 +2228,9 @@ dependencies = [ [[package]] name = "js-sys" -version = "0.3.69" +version = "0.3.70" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "29c15563dc2726973df627357ce0c9ddddbea194836909d655df6a75d2cf296d" +checksum = "1868808506b929d7b0cfa8f75951347aa71bb21144b7791bae35d9bccfcfe37a" dependencies = [ "wasm-bindgen", ] @@ -2304,9 +2307,9 @@ dependencies = [ [[package]] name = "libc" -version = "0.2.155" +version = "0.2.156" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "97b3888a4aecf77e811145cadf6eef5901f4782c53886191b2f693f24761847c" +checksum = "a5f43f184355eefb8d17fc948dbecf6c13be3c141f20d834ae842193a448c72a" [[package]] name = "libflate" @@ -2442,9 +2445,9 @@ dependencies = [ [[package]] name = "mio" -version = "1.0.1" +version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4569e456d394deccd22ce1c1913e6ea0e54519f577285001215d33557431afe4" +checksum = 
"80e04d1dcff3aae0704555fe5fee3bcfaf3d1fdf8a7e521d5b9d2b42acb52cec" dependencies = [ "hermit-abi 0.3.9", "libc", @@ -2728,7 +2731,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b4c5cc86750666a3ed20bdaf5ca2a0344f9c67674cae0515bec2da16fbaa47db" dependencies = [ "fixedbitset", - "indexmap 2.3.0", + "indexmap 2.4.0", ] [[package]] @@ -2786,7 +2789,7 @@ checksum = "2f38a4412a78282e09a2cf38d195ea5420d15ba0602cb375210efbc877243965" dependencies = [ "proc-macro2", "quote", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -3387,29 +3390,29 @@ checksum = "a3f0bf26fd526d2a95683cd0f87bf103b8539e2ca1ef48ce002d67aad59aa0b4" [[package]] name = "serde" -version = "1.0.205" +version = "1.0.208" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e33aedb1a7135da52b7c21791455563facbbcc43d0f0f66165b42c21b3dfb150" +checksum = "cff085d2cb684faa248efb494c39b68e522822ac0de72ccf08109abde717cfb2" dependencies = [ "serde_derive", ] [[package]] name = "serde_derive" -version = "1.0.205" +version = "1.0.208" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "692d6f5ac90220161d6774db30c662202721e64aed9058d2c394f451261420c1" +checksum = "24008e81ff7613ed8e5ba0cfaf24e2c2f1e5b8a0495711e44fcd4882fca62bcf" dependencies = [ "proc-macro2", "quote", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] name = "serde_json" -version = "1.0.122" +version = "1.0.125" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "784b6203951c57ff748476b126ccb5e8e2959a5c19e5c617ab1956be3dbc68da" +checksum = "83c8e735a073ccf5be70aa8066aa984eaf2fa000db6c8d0100ae605b366d31ed" dependencies = [ "itoa", "memchr", @@ -3440,6 +3443,12 @@ dependencies = [ "digest", ] +[[package]] +name = "shlex" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" + [[package]] name = "signal-hook-registry" version = "1.4.2" @@ 
-3538,7 +3547,7 @@ checksum = "01b2e185515564f15375f593fb966b5718bc624ba77fe49fa4616ad619690554" dependencies = [ "proc-macro2", "quote", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -3584,7 +3593,7 @@ dependencies = [ "proc-macro2", "quote", "rustversion", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -3597,7 +3606,7 @@ dependencies = [ "proc-macro2", "quote", "rustversion", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -3619,9 +3628,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.72" +version = "2.0.74" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dc4b9b9bf2add8093d3f2c0204471e951b2285580335de42f9d2534f3ae7a8af" +checksum = "1fceb41e3d546d0bd83421d3409b1460cc7444cd389341a4c880fe7a042cb3d7" dependencies = [ "proc-macro2", "quote", @@ -3685,7 +3694,7 @@ checksum = "a4558b58466b9ad7ca0f102865eccc95938dca1a74a856f2b57b6629050da261" dependencies = [ "proc-macro2", "quote", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -3779,7 +3788,7 @@ checksum = "693d596312e88961bc67d7f1f97af8a70227d9f90c31bba5806eec004978d752" dependencies = [ "proc-macro2", "quote", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -3846,15 +3855,15 @@ dependencies = [ [[package]] name = "tower-layer" -version = "0.3.2" +version = "0.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c20c8dbed6283a09604c3e69b4b7eeb54e298b8a600d4d5ecb5ad39de609f1d0" +checksum = "121c2a6cda46980bb0fcd1647ffaf6cd3fc79a013de288782836f6df9c48780e" [[package]] name = "tower-service" -version = "0.3.2" +version = "0.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b6bc1c9ce2b5135ac7f93c72918fc37feb872bdc6a5533a8b85eb4b86bfdae52" +checksum = "8df9b6e13f2d32c91b9bd719c00d1958837bc7dec474d94952798cc8e69eeec3" [[package]] name = "tracing" @@ -3876,7 +3885,7 @@ checksum = "34704c8d6ebcbc939824180af020566b01a7c01f80641264eba0999f6c2b6be7" dependencies = [ "proc-macro2", "quote", - "syn 2.0.72", + "syn 
2.0.74", ] [[package]] @@ -3921,7 +3930,7 @@ checksum = "f03ca4cb38206e2bef0700092660bb74d696f808514dae47fa1467cbfe26e96e" dependencies = [ "proc-macro2", "quote", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] @@ -4056,34 +4065,35 @@ checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" [[package]] name = "wasm-bindgen" -version = "0.2.92" +version = "0.2.93" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4be2531df63900aeb2bca0daaaddec08491ee64ceecbee5076636a3b026795a8" +checksum = "a82edfc16a6c469f5f44dc7b571814045d60404b55a0ee849f9bcfa2e63dd9b5" dependencies = [ "cfg-if", + "once_cell", "wasm-bindgen-macro", ] [[package]] name = "wasm-bindgen-backend" -version = "0.2.92" +version = "0.2.93" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "614d787b966d3989fa7bb98a654e369c762374fd3213d212cfc0251257e747da" +checksum = "9de396da306523044d3302746f1208fa71d7532227f15e347e2d93e4145dd77b" dependencies = [ "bumpalo", "log", "once_cell", "proc-macro2", "quote", - "syn 2.0.72", + "syn 2.0.74", "wasm-bindgen-shared", ] [[package]] name = "wasm-bindgen-futures" -version = "0.4.42" +version = "0.4.43" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "76bc14366121efc8dbb487ab05bcc9d346b3b5ec0eaa76e46594cabbe51762c0" +checksum = "61e9300f63a621e96ed275155c108eb6f843b6a26d053f122ab69724559dc8ed" dependencies = [ "cfg-if", "js-sys", @@ -4093,9 +4103,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro" -version = "0.2.92" +version = "0.2.93" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a1f8823de937b71b9460c0c34e25f3da88250760bec0ebac694b49997550d726" +checksum = "585c4c91a46b072c92e908d99cb1dcdf95c5218eeb6f3bf1efa991ee7a68cccf" dependencies = [ "quote", "wasm-bindgen-macro-support", @@ -4103,22 +4113,22 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.92" +version = "0.2.93" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "e94f17b526d0a461a191c78ea52bbce64071ed5c04c9ffe424dcb38f74171bb7" +checksum = "afc340c74d9005395cf9dd098506f7f44e38f2b4a21c6aaacf9a105ea5e1e836" dependencies = [ "proc-macro2", "quote", - "syn 2.0.72", + "syn 2.0.74", "wasm-bindgen-backend", "wasm-bindgen-shared", ] [[package]] name = "wasm-bindgen-shared" -version = "0.2.92" +version = "0.2.93" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "af190c94f2773fdb3729c55b007a722abb5384da03bc0986df4c289bf5567e96" +checksum = "c62a0a307cb4a311d3a07867860911ca130c3494e8c2719593806c08bc5d0484" [[package]] name = "wasm-streams" @@ -4135,9 +4145,9 @@ dependencies = [ [[package]] name = "web-sys" -version = "0.3.69" +version = "0.3.70" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "77afa9a11836342370f4817622a2f0f418b134426d91a82dfb48f532d2ec13ef" +checksum = "26fdeaafd9bd129f65e7c031593c24d62186301e0c72c8978fa1678be7d532c0" dependencies = [ "js-sys", "wasm-bindgen", @@ -4384,7 +4394,7 @@ checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e" dependencies = [ "proc-macro2", "quote", - "syn 2.0.72", + "syn 2.0.74", ] [[package]] diff --git a/datafusion-cli/src/catalog.rs b/datafusion-cli/src/catalog.rs index 9d1f1b6a75e3..b74a5fd0f1ee 100644 --- a/datafusion-cli/src/catalog.rs +++ b/datafusion-cli/src/catalog.rs @@ -20,7 +20,9 @@ use std::sync::{Arc, Weak}; use crate::object_storage::{get_object_store, AwsOptions, GcpOptions}; -use datafusion::catalog::{CatalogProvider, CatalogProviderList, SchemaProvider}; +use datafusion::catalog::{ + substitute_tilde, CatalogProvider, CatalogProviderList, SchemaProvider, +}; use datafusion::common::plan_datafusion_err; use datafusion::datasource::listing::ListingTableUrl; @@ -30,19 +32,15 @@ use datafusion::execution::context::SessionState; use datafusion::execution::session_state::SessionStateBuilder; use async_trait::async_trait; -use 
dirs::home_dir; use parking_lot::RwLock; -use crate::object_storage::{get_object_store, AwsOptions, GcpOptions}; - -/// Wraps another catalog, automatically creating table providers -/// for local files if needed -pub struct DynamicFileCatalog { +/// Wraps another catalog, automatically register require object stores for the file locations +pub struct DynamicObjectStoreCatalog { inner: Arc, state: Weak>, } -impl DynamicFileCatalog { +impl DynamicObjectStoreCatalog { pub fn new( inner: Arc, state: Weak>, @@ -51,7 +49,7 @@ impl DynamicFileCatalog { } } -impl CatalogProviderList for DynamicFileCatalog { +impl CatalogProviderList for DynamicObjectStoreCatalog { fn as_any(&self) -> &dyn Any { self } @@ -70,19 +68,19 @@ impl CatalogProviderList for DynamicFileCatalog { fn catalog(&self, name: &str) -> Option> { let state = self.state.clone(); - self.inner - .catalog(name) - .map(|catalog| Arc::new(DynamicFileCatalogProvider::new(catalog, state)) as _) + self.inner.catalog(name).map(|catalog| { + Arc::new(DynamicObjectStoreCatalogProvider::new(catalog, state)) as _ + }) } } /// Wraps another catalog provider -struct DynamicFileCatalogProvider { +struct DynamicObjectStoreCatalogProvider { inner: Arc, state: Weak>, } -impl DynamicFileCatalogProvider { +impl DynamicObjectStoreCatalogProvider { pub fn new( inner: Arc, state: Weak>, @@ -91,7 +89,7 @@ impl DynamicFileCatalogProvider { } } -impl CatalogProvider for DynamicFileCatalogProvider { +impl CatalogProvider for DynamicObjectStoreCatalogProvider { fn as_any(&self) -> &dyn Any { self } @@ -102,9 +100,9 @@ impl CatalogProvider for DynamicFileCatalogProvider { fn schema(&self, name: &str) -> Option> { let state = self.state.clone(); - self.inner - .schema(name) - .map(|schema| Arc::new(DynamicFileSchemaProvider::new(schema, state)) as _) + self.inner.schema(name).map(|schema| { + Arc::new(DynamicObjectStoreSchemaProvider::new(schema, state)) as _ + }) } fn register_schema( @@ -116,14 +114,14 @@ impl CatalogProvider for 
DynamicFileCatalogProvider { } } -/// Wraps another schema provider. [DynamicFileSchemaProvider] is responsible for registering the required +/// Wraps another schema provider. [DynamicObjectStoreSchemaProvider] is responsible for registering the required /// object stores for the file locations. -struct DynamicFileSchemaProvider { +struct DynamicObjectStoreSchemaProvider { inner: Arc, state: Weak>, } -impl DynamicFileSchemaProvider { +impl DynamicObjectStoreSchemaProvider { pub fn new( inner: Arc, state: Weak>, @@ -133,7 +131,7 @@ impl DynamicFileSchemaProvider { } #[async_trait] -impl SchemaProvider for DynamicFileSchemaProvider { +impl SchemaProvider for DynamicObjectStoreSchemaProvider { fn as_any(&self) -> &dyn Any { self } @@ -229,12 +227,12 @@ mod tests { fn setup_context() -> (SessionContext, Arc) { let ctx = SessionContext::new(); - ctx.register_catalog_list(Arc::new(DynamicFileCatalog::new( + ctx.register_catalog_list(Arc::new(DynamicObjectStoreCatalog::new( ctx.state().catalog_list().clone(), ctx.state_weak_ref(), ))); - let provider = &DynamicFileCatalog::new( + let provider = &DynamicObjectStoreCatalog::new( ctx.state().catalog_list().clone(), ctx.state_weak_ref(), ) as &dyn CatalogProviderList; diff --git a/datafusion-cli/src/main.rs b/datafusion-cli/src/main.rs index 29f407066ab1..380c4228db09 100644 --- a/datafusion-cli/src/main.rs +++ b/datafusion-cli/src/main.rs @@ -26,7 +26,7 @@ use datafusion::execution::context::SessionConfig; use datafusion::execution::memory_pool::{FairSpillPool, GreedyMemoryPool}; use datafusion::execution::runtime_env::{RuntimeConfig, RuntimeEnv}; use datafusion::prelude::SessionContext; -use datafusion_cli::catalog::DynamicFileCatalog; +use datafusion_cli::catalog::DynamicObjectStoreCatalog; use datafusion_cli::functions::ParquetMetadataFunc; use datafusion_cli::{ exec, @@ -175,13 +175,13 @@ async fn main_inner() -> Result<()> { let runtime_env = create_runtime_env(rt_config.clone())?; + // enable dynamic file query let 
ctx = - SessionContext::new_with_config_rt(session_config.clone(), Arc::new(runtime_env)); + SessionContext::new_with_config_rt(session_config.clone(), Arc::new(runtime_env)) + .enable_url_table(); ctx.refresh_catalogs().await?; - // enable dynamic file query - ctx.enable_url_table()?; // install dynamic catalog provider that can register required object stores - ctx.register_catalog_list(Arc::new(DynamicFileCatalog::new( + ctx.register_catalog_list(Arc::new(DynamicObjectStoreCatalog::new( ctx.state().catalog_list().clone(), ctx.state_weak_ref(), ))); From 5d861b897b72307f2f832140c7d18fedad131d00 Mon Sep 17 00:00:00 2001 From: Jia-Xuan Liu Date: Fri, 16 Aug 2024 22:19:52 +0800 Subject: [PATCH 32/53] enable home_dir feature --- datafusion-cli/Cargo.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/datafusion-cli/Cargo.toml b/datafusion-cli/Cargo.toml index cbd9ffd0feba..a83b7c231dc7 100644 --- a/datafusion-cli/Cargo.toml +++ b/datafusion-cli/Cargo.toml @@ -44,6 +44,7 @@ datafusion = { path = "../datafusion/core", version = "41.0.0", features = [ "regex_expressions", "unicode_expressions", "compression", + "home_dir" ] } dirs = "4.0.0" env_logger = "0.9" From 16be2e7dcfa4f0557201c889c2302f2c53e73ad0 Mon Sep 17 00:00:00 2001 From: Jia-Xuan Liu Date: Fri, 16 Aug 2024 22:29:43 +0800 Subject: [PATCH 33/53] update lock --- datafusion-cli/Cargo.lock | 1 - 1 file changed, 1 deletion(-) diff --git a/datafusion-cli/Cargo.lock b/datafusion-cli/Cargo.lock index 5aa03a0ac490..da7165703f6f 100644 --- a/datafusion-cli/Cargo.lock +++ b/datafusion-cli/Cargo.lock @@ -4387,7 +4387,6 @@ dependencies = [ "proc-macro2", "quote", "syn 2.0.72", -checksum = "e9e8aabfac534be767c909e0690571677d49f41bd8465ae876fe043d52ba5292" ] [[package]] From db90c28c4cab7ba54d434e6753272c32f3141922 Mon Sep 17 00:00:00 2001 From: Jia-Xuan Liu Date: Fri, 16 Aug 2024 22:48:07 +0800 Subject: [PATCH 34/53] fix compile --- datafusion/catalog/src/dynamic_file/catalog.rs | 3 ++- 1 file changed, 2 
insertions(+), 1 deletion(-) diff --git a/datafusion/catalog/src/dynamic_file/catalog.rs b/datafusion/catalog/src/dynamic_file/catalog.rs index 44f9a5fd5d03..628bfde4170a 100644 --- a/datafusion/catalog/src/dynamic_file/catalog.rs +++ b/datafusion/catalog/src/dynamic_file/catalog.rs @@ -196,10 +196,11 @@ pub trait UrlTableFactory: Sync + Send { ) -> datafusion_common::Result>>; } -#[cfg(all(not(target_os = "windows"), not(feature = "home_dir")))] +#[cfg(all(not(target_os = "windows"), feature = "home_dir"))] #[cfg(test)] mod tests { use crate::dynamic_file::catalog::substitute_tilde; + #[cfg(feature = "home_dir")] use dirs::home_dir; #[test] From 9353123e3ff7c807d3c4c0e6c3e7f662c231716b Mon Sep 17 00:00:00 2001 From: Jia-Xuan Liu Date: Sat, 17 Aug 2024 00:32:22 +0800 Subject: [PATCH 35/53] fix clippy --- datafusion/catalog/src/session.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/datafusion/catalog/src/session.rs b/datafusion/catalog/src/session.rs index e444b00fb7c1..61d9c2d8a71e 100644 --- a/datafusion/catalog/src/session.rs +++ b/datafusion/catalog/src/session.rs @@ -137,10 +137,10 @@ impl From<&dyn Session> for TaskContext { ) } } - +type SessionRefLock = Arc>>>>; /// The state store that stores the reference of the runtime session state. 
pub struct SessionStore { - session: Arc>>>>, + session: SessionRefLock, } impl SessionStore { From daa7ed8aa0258d3f4ac60af1b0d21eb86f9dcd8b Mon Sep 17 00:00:00 2001 From: Jia-Xuan Liu Date: Sat, 17 Aug 2024 00:32:57 +0800 Subject: [PATCH 36/53] fmt toml --- datafusion-cli/Cargo.toml | 2 +- datafusion/catalog/Cargo.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/datafusion-cli/Cargo.toml b/datafusion-cli/Cargo.toml index a83b7c231dc7..f47face1aaaa 100644 --- a/datafusion-cli/Cargo.toml +++ b/datafusion-cli/Cargo.toml @@ -44,7 +44,7 @@ datafusion = { path = "../datafusion/core", version = "41.0.0", features = [ "regex_expressions", "unicode_expressions", "compression", - "home_dir" + "home_dir", ] } dirs = "4.0.0" env_logger = "0.9" diff --git a/datafusion/catalog/Cargo.toml b/datafusion/catalog/Cargo.toml index 402dbe789508..eb0041021c40 100644 --- a/datafusion/catalog/Cargo.toml +++ b/datafusion/catalog/Cargo.toml @@ -28,7 +28,7 @@ rust-version.workspace = true version.workspace = true [features] -home_dir= ["dep:dirs"] +home_dir = ["dep:dirs"] [dependencies] arrow-schema = { workspace = true } From e4a21740e01c4eee81f96cdab23e36df81ff4715 Mon Sep 17 00:00:00 2001 From: Jia-Xuan Liu Date: Sat, 17 Aug 2024 00:43:45 +0800 Subject: [PATCH 37/53] fix doc test and add more doc --- datafusion/catalog/src/dynamic_file/catalog.rs | 10 +++++++++- datafusion/core/src/datasource/dynamic_file.rs | 4 +++- datafusion/core/src/execution/context/mod.rs | 4 ++-- 3 files changed, 14 insertions(+), 4 deletions(-) diff --git a/datafusion/catalog/src/dynamic_file/catalog.rs b/datafusion/catalog/src/dynamic_file/catalog.rs index 628bfde4170a..97853eb0bc23 100644 --- a/datafusion/catalog/src/dynamic_file/catalog.rs +++ b/datafusion/catalog/src/dynamic_file/catalog.rs @@ -15,7 +15,8 @@ // specific language governing permissions and limitations // under the License. -//! dynamic_file contains a SchemaProvider that creates tables from file paths +//! 
dynamic_file contains [`DynamicFileCatalog`] that creates tables from file paths +//! if the wrapped [`CatalogProviderList`] doesn't have the table provider. use crate::{CatalogProvider, CatalogProviderList, SchemaProvider, TableProvider}; use async_trait::async_trait; @@ -24,8 +25,11 @@ use dirs::home_dir; use std::any::Any; use std::sync::Arc; +/// Wrap another catalog provider list pub struct DynamicFileCatalog { + /// The inner catalog provider list inner: Arc, + /// The factory that can create a table provider from the file path factory: Arc, } @@ -67,7 +71,9 @@ impl CatalogProviderList for DynamicFileCatalog { /// Wraps another catalog provider struct DynamicFileCatalogProvider { + /// The inner catalog provider inner: Arc, + /// The factory that can create a table provider from the file path factory: Arc, } @@ -112,7 +118,9 @@ impl CatalogProvider for DynamicFileCatalogProvider { /// The provider will try to create a table provider from the file path if the table provider /// isn't exist in the inner schema provider. The required object store must be registered in the session context. pub struct DynamicFileSchemaProvider { + /// The inner schema provider inner: Arc, + /// The factory that can create a table provider from the file path factory: Arc, } diff --git a/datafusion/core/src/datasource/dynamic_file.rs b/datafusion/core/src/datasource/dynamic_file.rs index 154a24e84cba..acee2bd3d000 100644 --- a/datafusion/core/src/datasource/dynamic_file.rs +++ b/datafusion/core/src/datasource/dynamic_file.rs @@ -15,7 +15,8 @@ // specific language governing permissions and limitations // under the License. -//! dynamic_file_schema contains a SchemaProvider that creates tables from file paths +//! dynamic_file_schema contains an [`UrlTableFactory`] implementation that +//! can create a [`ListingTable`] from the given url. 
use std::sync::Arc; @@ -31,6 +32,7 @@ use crate::execution::context::SessionState; /// [DynamicListTableFactory] is a factory that can create a [ListingTable] from the given url. #[derive(Default)] pub struct DynamicListTableFactory { + /// The session store that contains the current session. session_store: Arc, } diff --git a/datafusion/core/src/execution/context/mod.rs b/datafusion/core/src/execution/context/mod.rs index 2083f8a03de3..1249c7bf901a 100644 --- a/datafusion/core/src/execution/context/mod.rs +++ b/datafusion/core/src/execution/context/mod.rs @@ -324,7 +324,7 @@ impl SessionContext { } /// Enable the dynamic file query for the current session. - /// See [DynamicFileSchemaProvider] for more details + /// See [DynamicFileCatalog] for more details /// /// # Example: query the url table /// @@ -343,7 +343,7 @@ impl SessionContext { /// assert_batches_eq!( /// &[ /// "+---+----------------+", - /// "| a | MIN(example.b) |", + /// "| a | min(example.b) |", /// "+---+----------------+", /// "| 1 | 2 |", /// "+---+----------------+", From 506d1d6c9b91ebad0782184c39b7526cc638351d Mon Sep 17 00:00:00 2001 From: Jia-Xuan Liu Date: Sat, 17 Aug 2024 01:08:03 +0800 Subject: [PATCH 38/53] fix clippy --- datafusion-cli/src/catalog.rs | 2 -- 1 file changed, 2 deletions(-) diff --git a/datafusion-cli/src/catalog.rs b/datafusion-cli/src/catalog.rs index b74a5fd0f1ee..d59dad7dfa9e 100644 --- a/datafusion-cli/src/catalog.rs +++ b/datafusion-cli/src/catalog.rs @@ -223,8 +223,6 @@ mod tests { use datafusion::catalog::SchemaProvider; use datafusion::prelude::SessionContext; - use super::*; - fn setup_context() -> (SessionContext, Arc) { let ctx = SessionContext::new(); ctx.register_catalog_list(Arc::new(DynamicObjectStoreCatalog::new( From 72ce464c1ebbf8712d49ddbcee77c59797a3fce5 Mon Sep 17 00:00:00 2001 From: Jia-Xuan Liu Date: Sat, 17 Aug 2024 01:28:21 +0800 Subject: [PATCH 39/53] add home_dir feature doc --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git 
a/README.md b/README.md index b1d38b61109f..16dd348774ec 100644 --- a/README.md +++ b/README.md @@ -91,6 +91,7 @@ Optional features: - `backtrace`: include backtrace information in error messages - `pyarrow`: conversions between PyArrow and DataFusion types - `serde`: enable arrow-schema's `serde` feature +- `home_dir` : enable support for substituting the tilde character in the file path with the user home directory for the URL table [apache avro]: https://avro.apache.org/ [apache parquet]: https://parquet.apache.org/ From fb1b6ce0ff3a7ba65c78a2d127051cf343ca7d52 Mon Sep 17 00:00:00 2001 From: Jia-Xuan Liu Date: Sat, 17 Aug 2024 01:44:41 +0800 Subject: [PATCH 40/53] rollback the unused changed --- datafusion/core/src/execution/context/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/core/src/execution/context/mod.rs b/datafusion/core/src/execution/context/mod.rs index 1249c7bf901a..1fb855cc1ac5 100644 --- a/datafusion/core/src/execution/context/mod.rs +++ b/datafusion/core/src/execution/context/mod.rs @@ -319,7 +319,7 @@ impl SessionContext { Self { session_id: state.session_id().to_string(), session_start_time: Utc::now(), - state: Arc::new(RwLock::new(state.clone())), + state: Arc::new(RwLock::new(state)), } } From f062fecc610da78c71aebd5d57c94fd32c4033f7 Mon Sep 17 00:00:00 2001 From: Jia-Xuan Liu Date: Sun, 18 Aug 2024 15:15:31 +0800 Subject: [PATCH 41/53] update lock --- datafusion-cli/Cargo.lock | 48 ++++++++++++++++++++------------------- 1 file changed, 25 insertions(+), 23 deletions(-) diff --git a/datafusion-cli/Cargo.lock b/datafusion-cli/Cargo.lock index 52e4a000355d..6ed938af354a 100644 --- a/datafusion-cli/Cargo.lock +++ b/datafusion-cli/Cargo.lock @@ -124,9 +124,9 @@ checksum = "9d151e35f61089500b617991b791fc8bfd237ae50cd5950803758a179b41e67a" [[package]] name = "arrayvec" -version = "0.7.4" +version = "0.7.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"96d30a06541fbafbc7f82ed10c06164cfbd2c401138f6addd8404629c4b16711" +checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" [[package]] name = "arrow" @@ -387,7 +387,7 @@ checksum = "6e0c28dcc82d7c8ead5cb13beb15405b57b8546e93215673ff8ca0349a028107" dependencies = [ "proc-macro2", "quote", - "syn 2.0.74", + "syn 2.0.75", ] [[package]] @@ -1105,7 +1105,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "edb49164822f3ee45b17acd4a208cfc1251410cf0cad9a833234c9890774dd9f" dependencies = [ "quote", - "syn 2.0.74", + "syn 2.0.75", ] [[package]] @@ -1196,6 +1196,8 @@ dependencies = [ "datafusion-execution", "datafusion-expr", "datafusion-physical-plan", + "dirs", + "parking_lot", ] [[package]] @@ -1777,7 +1779,7 @@ checksum = "87750cf4b7a4c0625b1529e4c543c2182106e4dedc60a2a6455e00d212c489ac" dependencies = [ "proc-macro2", "quote", - "syn 2.0.74", + "syn 2.0.75", ] [[package]] @@ -2318,9 +2320,9 @@ dependencies = [ [[package]] name = "libc" -version = "0.2.156" +version = "0.2.157" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a5f43f184355eefb8d17fc948dbecf6c13be3c141f20d834ae842193a448c72a" +checksum = "374af5f94e54fa97cf75e945cce8a6b201e88a1a07e688b47dfd2a59c66dbd86" [[package]] name = "libflate" @@ -2800,7 +2802,7 @@ checksum = "2f38a4412a78282e09a2cf38d195ea5420d15ba0602cb375210efbc877243965" dependencies = [ "proc-macro2", "quote", - "syn 2.0.74", + "syn 2.0.75", ] [[package]] @@ -3416,7 +3418,7 @@ checksum = "24008e81ff7613ed8e5ba0cfaf24e2c2f1e5b8a0495711e44fcd4882fca62bcf" dependencies = [ "proc-macro2", "quote", - "syn 2.0.74", + "syn 2.0.75", ] [[package]] @@ -3558,7 +3560,7 @@ checksum = "01b2e185515564f15375f593fb966b5718bc624ba77fe49fa4616ad619690554" dependencies = [ "proc-macro2", "quote", - "syn 2.0.74", + "syn 2.0.75", ] [[package]] @@ -3604,7 +3606,7 @@ dependencies = [ "proc-macro2", "quote", "rustversion", - "syn 2.0.74", + "syn 2.0.75", ] [[package]] @@ -3617,7 +3619,7 @@ 
dependencies = [ "proc-macro2", "quote", "rustversion", - "syn 2.0.74", + "syn 2.0.75", ] [[package]] @@ -3639,9 +3641,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.74" +version = "2.0.75" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1fceb41e3d546d0bd83421d3409b1460cc7444cd389341a4c880fe7a042cb3d7" +checksum = "f6af063034fc1935ede7be0122941bafa9bacb949334d090b77ca98b5817c7d9" dependencies = [ "proc-macro2", "quote", @@ -3705,7 +3707,7 @@ checksum = "a4558b58466b9ad7ca0f102865eccc95938dca1a74a856f2b57b6629050da261" dependencies = [ "proc-macro2", "quote", - "syn 2.0.74", + "syn 2.0.75", ] [[package]] @@ -3775,9 +3777,9 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" [[package]] name = "tokio" -version = "1.39.2" +version = "1.39.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "daa4fb1bc778bd6f04cbfc4bb2d06a7396a8f299dc33ea1900cedaa316f467b1" +checksum = "9babc99b9923bfa4804bd74722ff02c0381021eafa4db9949217e3be8e84fff5" dependencies = [ "backtrace", "bytes", @@ -3799,7 +3801,7 @@ checksum = "693d596312e88961bc67d7f1f97af8a70227d9f90c31bba5806eec004978d752" dependencies = [ "proc-macro2", "quote", - "syn 2.0.74", + "syn 2.0.75", ] [[package]] @@ -3896,7 +3898,7 @@ checksum = "34704c8d6ebcbc939824180af020566b01a7c01f80641264eba0999f6c2b6be7" dependencies = [ "proc-macro2", "quote", - "syn 2.0.74", + "syn 2.0.75", ] [[package]] @@ -3941,7 +3943,7 @@ checksum = "f03ca4cb38206e2bef0700092660bb74d696f808514dae47fa1467cbfe26e96e" dependencies = [ "proc-macro2", "quote", - "syn 2.0.74", + "syn 2.0.75", ] [[package]] @@ -4096,7 +4098,7 @@ dependencies = [ "once_cell", "proc-macro2", "quote", - "syn 2.0.74", + "syn 2.0.75", "wasm-bindgen-shared", ] @@ -4130,7 +4132,7 @@ checksum = "afc340c74d9005395cf9dd098506f7f44e38f2b4a21c6aaacf9a105ea5e1e836" dependencies = [ "proc-macro2", "quote", - "syn 2.0.74", + "syn 2.0.75", "wasm-bindgen-backend", "wasm-bindgen-shared", 
] @@ -4405,7 +4407,7 @@ checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e" dependencies = [ "proc-macro2", "quote", - "syn 2.0.74", + "syn 2.0.75", ] [[package]] From b1baa8415b2d2912bb1ac2da4d3eca6c0a88dac7 Mon Sep 17 00:00:00 2001 From: Jia-Xuan Liu Date: Sun, 18 Aug 2024 15:32:16 +0800 Subject: [PATCH 42/53] fix sqllogictest --- datafusion/sqllogictest/test_files/csv_files.slt | 1 - datafusion/sqllogictest/test_files/describe.slt | 6 +++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/datafusion/sqllogictest/test_files/csv_files.slt b/datafusion/sqllogictest/test_files/csv_files.slt index a0ddc1a883e8..2b2e3cd6c244 100644 --- a/datafusion/sqllogictest/test_files/csv_files.slt +++ b/datafusion/sqllogictest/test_files/csv_files.slt @@ -54,7 +54,6 @@ id9 value9 query TT select * from '../core/tests/data/quote.csv'; ---- -c1 c2 ~id0~ ~value0~ ~id1~ ~value1~ ~id2~ ~value2~ diff --git a/datafusion/sqllogictest/test_files/describe.slt b/datafusion/sqllogictest/test_files/describe.slt index edbc40ebca01..c1f93dbed654 100644 --- a/datafusion/sqllogictest/test_files/describe.slt +++ b/datafusion/sqllogictest/test_files/describe.slt @@ -63,9 +63,9 @@ DROP TABLE aggregate_simple; query TTT DESCRIBE '../core/tests/data/aggregate_simple.csv'; ---- -column_1 Utf8 YES -column_2 Utf8 YES -column_3 Utf8 YES +c1 Float64 YES +c2 Float64 YES +c3 Boolean YES ########## # Describe command From f0f070b9b98b262a396e528d0ca5d26b704429c1 Mon Sep 17 00:00:00 2001 From: Jia-Xuan Liu Date: Tue, 20 Aug 2024 23:26:53 +0800 Subject: [PATCH 43/53] separate dynamic file test to another slt --- datafusion/sqllogictest/src/test_context.rs | 4 +-- .../sqllogictest/test_files/arrow_files.slt | 15 ----------- datafusion/sqllogictest/test_files/avro.slt | 25 ------------------- .../sqllogictest/test_files/csv_files.slt | 16 ------------ .../sqllogictest/test_files/describe.slt | 8 ++---- datafusion/sqllogictest/test_files/json.slt | 16 ------------ 
.../sqllogictest/test_files/parquet.slt | 12 --------- 7 files changed, 3 insertions(+), 93 deletions(-) diff --git a/datafusion/sqllogictest/src/test_context.rs b/datafusion/sqllogictest/src/test_context.rs index 4e4b83216876..ef2fa863e6b0 100644 --- a/datafusion/sqllogictest/src/test_context.rs +++ b/datafusion/sqllogictest/src/test_context.rs @@ -91,7 +91,6 @@ impl TestContext { { info!("Registering avro tables"); register_avro_tables(&mut test_ctx).await; - test_ctx.ctx = test_ctx.ctx.enable_url_table(); } #[cfg(not(feature = "avro"))] { @@ -99,8 +98,7 @@ impl TestContext { return None; } } - "describe.slt" | "arrow_files.slt" | "csv_files.slt" | "json.slt" - | "parquet.slt" => { + "dynamic_file.slt" => { test_ctx.ctx = test_ctx.ctx.enable_url_table(); } "joins.slt" => { diff --git a/datafusion/sqllogictest/test_files/arrow_files.slt b/datafusion/sqllogictest/test_files/arrow_files.slt index b9acb6bdc487..8cf3550fdb25 100644 --- a/datafusion/sqllogictest/test_files/arrow_files.slt +++ b/datafusion/sqllogictest/test_files/arrow_files.slt @@ -43,14 +43,6 @@ SELECT * FROM arrow_simple 3 baz false 4 NULL true -query ITB -SELECT * FROM '../core/tests/data/example.arrow'; ----- -1 foo true -2 bar NULL -3 baz false -4 NULL true - # ARROW partitioned table statement ok CREATE EXTERNAL TABLE arrow_partitioned ( @@ -72,13 +64,6 @@ SELECT * FROM arrow_partitioned ORDER BY f0; 3 baz true 456 4 NULL NULL 456 -# dynamic select arrow file in the folder -query ITB -SELECT * FROM '../core/tests/data/partitioned_table_arrow/part=123' ORDER BY f0; ----- -1 foo true -2 bar false - # select all fields query IITB SELECT part, f0, f1, f2 FROM arrow_partitioned ORDER BY f0; diff --git a/datafusion/sqllogictest/test_files/avro.slt b/datafusion/sqllogictest/test_files/avro.slt index f21c36f317a6..f8ef81a8ba2b 100644 --- a/datafusion/sqllogictest/test_files/avro.slt +++ b/datafusion/sqllogictest/test_files/avro.slt @@ -136,18 +136,6 @@ SELECT id, CAST(string_col AS varchar) FROM 
alltypes_plain 0 0 1 1 -query IT -SELECT id, CAST(string_col AS varchar) FROM '../../testing/data/avro/alltypes_plain.avro' ----- -4 0 -5 1 -6 0 -7 1 -2 0 -3 1 -0 0 -1 1 - # test avro query with snappy query IT SELECT id, CAST(string_col AS varchar) FROM alltypes_plain_snappy @@ -161,19 +149,6 @@ SELECT id, CAST(string_col AS varchar) FROM alltypes_plain_snappy 0 0 1 1 -# dynamic query snappy avro file -query IT -SELECT id, CAST(string_col AS varchar) FROM '../../testing/data/avro/alltypes_plain.snappy.avro' ----- -4 0 -5 1 -6 0 -7 1 -2 0 -3 1 -0 0 -1 1 - # test avro query with bzip2 query IT SELECT id, CAST(string_col AS varchar) FROM alltypes_plain_bzip2 diff --git a/datafusion/sqllogictest/test_files/csv_files.slt b/datafusion/sqllogictest/test_files/csv_files.slt index 2b2e3cd6c244..7cb21abdba10 100644 --- a/datafusion/sqllogictest/test_files/csv_files.slt +++ b/datafusion/sqllogictest/test_files/csv_files.slt @@ -50,22 +50,6 @@ id7 value7 id8 value8 id9 value9 -# query the csv file dynamically with the config of current session -query TT -select * from '../core/tests/data/quote.csv'; ----- -~id0~ ~value0~ -~id1~ ~value1~ -~id2~ ~value2~ -~id3~ ~value3~ -~id4~ ~value4~ -~id5~ ~value5~ -~id6~ ~value6~ -~id7~ ~value7~ -~id8~ ~value8~ -~id9~ ~value9~ - - query TT select * from csv_with_escape; ---- diff --git a/datafusion/sqllogictest/test_files/describe.slt b/datafusion/sqllogictest/test_files/describe.slt index c1f93dbed654..077e8e6474d1 100644 --- a/datafusion/sqllogictest/test_files/describe.slt +++ b/datafusion/sqllogictest/test_files/describe.slt @@ -57,15 +57,11 @@ statement ok DROP TABLE aggregate_simple; ########## -# Describe file +# Describe file (we can only describe file if the default catalog is `DynamicFileCatalog`) ########## -query TTT +statement error Error during planning: table 'datafusion.public.../core/tests/data/aggregate_simple.csv' not found DESCRIBE '../core/tests/data/aggregate_simple.csv'; ----- -c1 Float64 YES -c2 Float64 YES -c3 
Boolean YES ########## # Describe command diff --git a/datafusion/sqllogictest/test_files/json.slt b/datafusion/sqllogictest/test_files/json.slt index cb4f1c6ad8a4..0b9508310b00 100644 --- a/datafusion/sqllogictest/test_files/json.slt +++ b/datafusion/sqllogictest/test_files/json.slt @@ -45,22 +45,6 @@ SELECT a, b FROM json_test 5 -3.5 7 -3.5 -query IR rowsort -SELECT a, b FROM '../core/tests/data/2.json' ----- --10 -3.5 -1 -3.5 -1 0.6 -1 0.6 -1 2 -1 2 -1 2 -1 2 -100000000000000 0.6 -2 0.6 -5 -3.5 -7 -3.5 - query TT EXPLAIN SELECT count(*) from json_test ---- diff --git a/datafusion/sqllogictest/test_files/parquet.slt b/datafusion/sqllogictest/test_files/parquet.slt index bcf949e01d27..34d4ed6ff284 100644 --- a/datafusion/sqllogictest/test_files/parquet.slt +++ b/datafusion/sqllogictest/test_files/parquet.slt @@ -202,18 +202,6 @@ SELECT id, CAST(string_col AS varchar) FROM alltypes_plain 0 0 1 1 -query IT -SELECT id, CAST(string_col AS varchar) FROM '../../parquet-testing/data/alltypes_plain.parquet'; ----- -4 0 -5 1 -6 0 -7 1 -2 0 -3 1 -0 0 -1 1 - # Clean up statement ok DROP TABLE alltypes_plain; From 6b77b6b3941ba0b05248752f32f7ad7c6055bd5f Mon Sep 17 00:00:00 2001 From: Jia-Xuan Liu Date: Wed, 21 Aug 2024 00:01:22 +0800 Subject: [PATCH 44/53] add test for querying url table but disabled this feature --- datafusion/sqllogictest/test_files/arrow_files.slt | 4 ++++ datafusion/sqllogictest/test_files/csv_files.slt | 4 ++++ datafusion/sqllogictest/test_files/json.slt | 4 ++++ datafusion/sqllogictest/test_files/parquet.slt | 4 ++++ 4 files changed, 16 insertions(+) diff --git a/datafusion/sqllogictest/test_files/arrow_files.slt b/datafusion/sqllogictest/test_files/arrow_files.slt index 8cf3550fdb25..95dabc3f612e 100644 --- a/datafusion/sqllogictest/test_files/arrow_files.slt +++ b/datafusion/sqllogictest/test_files/arrow_files.slt @@ -43,6 +43,10 @@ SELECT * FROM arrow_simple 3 baz false 4 NULL true +# url table is only supported by DynamicFileCatalog +statement 
error DataFusion error: Error during planning: table 'datafusion.public.../core/tests/data/example.arrow' not found +SELECT * FROM '../core/tests/data/example.arrow'; + # ARROW partitioned table statement ok CREATE EXTERNAL TABLE arrow_partitioned ( diff --git a/datafusion/sqllogictest/test_files/csv_files.slt b/datafusion/sqllogictest/test_files/csv_files.slt index 7cb21abdba10..61bdd71ed1e4 100644 --- a/datafusion/sqllogictest/test_files/csv_files.slt +++ b/datafusion/sqllogictest/test_files/csv_files.slt @@ -50,6 +50,10 @@ id7 value7 id8 value8 id9 value9 +# url table is only supported by DynamicFileCatalog +statement error DataFusion error: Error during planning: table 'datafusion.public.../core/tests/data/quote.csv' not found +select * from '../core/tests/data/quote.csv'; + query TT select * from csv_with_escape; ---- diff --git a/datafusion/sqllogictest/test_files/json.slt b/datafusion/sqllogictest/test_files/json.slt index 0b9508310b00..2057f76b3ebc 100644 --- a/datafusion/sqllogictest/test_files/json.slt +++ b/datafusion/sqllogictest/test_files/json.slt @@ -45,6 +45,10 @@ SELECT a, b FROM json_test 5 -3.5 7 -3.5 +# url table is only supported by DynamicFileCatalog +statement error DataFusion error: Error during planning: table 'datafusion.public.../core/tests/data/2.json' not found +SELECT a, b FROM '../core/tests/data/2.json' + query TT EXPLAIN SELECT count(*) from json_test ---- diff --git a/datafusion/sqllogictest/test_files/parquet.slt b/datafusion/sqllogictest/test_files/parquet.slt index 34d4ed6ff284..fccb9448a421 100644 --- a/datafusion/sqllogictest/test_files/parquet.slt +++ b/datafusion/sqllogictest/test_files/parquet.slt @@ -202,6 +202,10 @@ SELECT id, CAST(string_col AS varchar) FROM alltypes_plain 0 0 1 1 +# url table is only supported by DynamicFileCatalog +statement error DataFusion error: Error during planning: table 'datafusion.public.../../parquet-testing/data/alltypes_plain.parquet' not found +SELECT id, CAST(string_col AS varchar) FROM 
'../../parquet-testing/data/alltypes_plain.parquet'; + # Clean up statement ok DROP TABLE alltypes_plain; From 4e51a77d35341c4c2cc310f94f927a33fb12ce16 Mon Sep 17 00:00:00 2001 From: Jia-Xuan Liu Date: Wed, 21 Aug 2024 00:01:49 +0800 Subject: [PATCH 45/53] add dynamic_file.slt --- .../sqllogictest/test_files/dynamic_file.slt | 103 ++++++++++++++++++ 1 file changed, 103 insertions(+) create mode 100644 datafusion/sqllogictest/test_files/dynamic_file.slt diff --git a/datafusion/sqllogictest/test_files/dynamic_file.slt b/datafusion/sqllogictest/test_files/dynamic_file.slt new file mode 100644 index 000000000000..af5e5df4ae96 --- /dev/null +++ b/datafusion/sqllogictest/test_files/dynamic_file.slt @@ -0,0 +1,103 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +# dynamic select arrow file in the folder +query ITB +SELECT * FROM '../core/tests/data/partitioned_table_arrow/part=123' ORDER BY f0; +---- +1 foo true +2 bar false + +# dynamic file query doesn't support partitioned table +statement error DataFusion error: Error during planning: table 'datafusion.public.../core/tests/data/partitioned_table_arrow' not found +SELECT * FROM '../core/tests/data/partitioned_table_arrow' ORDER BY f0; + +# read avro file +query IT +SELECT id, CAST(string_col AS varchar) FROM '../../testing/data/avro/alltypes_plain.avro' +---- +4 0 +5 1 +6 0 +7 1 +2 0 +3 1 +0 0 +1 1 + +# dynamic query snappy avro file +query IT +SELECT id, CAST(string_col AS varchar) FROM '../../testing/data/avro/alltypes_plain.snappy.avro' +---- +4 0 +5 1 +6 0 +7 1 +2 0 +3 1 +0 0 +1 1 + +# query the csv file dynamically with the config of current session +query TT +select * from '../core/tests/data/quote.csv'; +---- +~id0~ ~value0~ +~id1~ ~value1~ +~id2~ ~value2~ +~id3~ ~value3~ +~id4~ ~value4~ +~id5~ ~value5~ +~id6~ ~value6~ +~id7~ ~value7~ +~id8~ ~value8~ +~id9~ ~value9~ + +query TTT +DESCRIBE '../core/tests/data/aggregate_simple.csv'; +---- +c1 Float64 YES +c2 Float64 YES +c3 Boolean YES + +query IR rowsort +SELECT a, b FROM '../core/tests/data/2.json' +---- +-10 -3.5 +1 -3.5 +1 0.6 +1 0.6 +1 2 +1 2 +1 2 +1 2 +100000000000000 0.6 +2 0.6 +5 -3.5 +7 -3.5 + +query IT +SELECT id, CAST(string_col AS varchar) FROM '../../parquet-testing/data/alltypes_plain.parquet'; +---- +4 0 +5 1 +6 0 +7 1 +2 0 +3 1 +0 0 +1 1 From fafc9dc547707ca799c79ff74fe863c380304b87 Mon Sep 17 00:00:00 2001 From: Jia-Xuan Liu Date: Wed, 21 Aug 2024 00:26:37 +0800 Subject: [PATCH 46/53] remove home_dir feature --- README.md | 1 - datafusion-cli/Cargo.lock | 101 ++++++++++++------ datafusion-cli/Cargo.toml | 1 - datafusion-cli/src/catalog.rs | 53 ++++++++- datafusion/catalog/Cargo.toml | 4 - .../catalog/src/dynamic_file/catalog.rs | 66 ------------ datafusion/core/Cargo.toml | 1 - 7 files changed, 
119 insertions(+), 108 deletions(-) diff --git a/README.md b/README.md index 16dd348774ec..b1d38b61109f 100644 --- a/README.md +++ b/README.md @@ -91,7 +91,6 @@ Optional features: - `backtrace`: include backtrace information in error messages - `pyarrow`: conversions between PyArrow and DataFusion types - `serde`: enable arrow-schema's `serde` feature -- `home_dir` : enable support for substituting the tilde character in the file path with the user home directory for the URL table [apache avro]: https://avro.apache.org/ [apache parquet]: https://parquet.apache.org/ diff --git a/datafusion-cli/Cargo.lock b/datafusion-cli/Cargo.lock index bcbe2bd900e7..6c3afd03ad21 100644 --- a/datafusion-cli/Cargo.lock +++ b/datafusion-cli/Cargo.lock @@ -17,6 +17,12 @@ version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" +[[package]] +name = "adler2" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "512761e0bb2578dd7380c6baaa0f4ce03e84f95e960231d1dec8bf4d7d6e2627" + [[package]] name = "adler32" version = "1.2.0" @@ -722,7 +728,7 @@ dependencies = [ "cc", "cfg-if", "libc", - "miniz_oxide", + "miniz_oxide 0.7.4", "object", "rustc-demangle", ] @@ -772,9 +778,9 @@ dependencies = [ [[package]] name = "blake3" -version = "1.5.3" +version = "1.5.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e9ec96fe9a81b5e365f9db71fe00edc4fe4ca2cc7dcb7861f0603012a7caa210" +checksum = "d82033247fd8e890df8f740e407ad4d038debb9eb1f40533fffb32e7d17dc6f7" dependencies = [ "arrayref", "arrayvec", @@ -1196,7 +1202,6 @@ dependencies = [ "datafusion-execution", "datafusion-expr", "datafusion-physical-plan", - "dirs", "parking_lot", ] @@ -1692,12 +1697,12 @@ dependencies = [ [[package]] name = "flate2" -version = "1.0.31" +version = "1.0.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"7f211bbe8e69bbd0cfdea405084f128ae8b4aaa6b0b522fc8f2b009084797920" +checksum = "9c0596c1eac1f9e04ed902702e9878208b336edc9d6fddc8a48387349bab3666" dependencies = [ "crc32fast", - "miniz_oxide", + "miniz_oxide 0.8.0", ] [[package]] @@ -1873,9 +1878,9 @@ dependencies = [ [[package]] name = "h2" -version = "0.4.5" +version = "0.4.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fa82e28a107a8cc405f0839610bdc9b15f1e25ec7d696aa5cf173edbcb1486ab" +checksum = "524e8ac6999421f49a846c2d4411f337e53497d8ec55d67753beffa43c5d9205" dependencies = [ "atomic-waker", "bytes", @@ -2066,7 +2071,7 @@ dependencies = [ "bytes", "futures-channel", "futures-util", - "h2 0.4.5", + "h2 0.4.6", "http 1.1.0", "http-body 1.0.1", "httparse", @@ -2103,7 +2108,7 @@ dependencies = [ "hyper 1.4.1", "hyper-util", "rustls 0.23.12", - "rustls-native-certs 0.7.1", + "rustls-native-certs 0.7.2", "rustls-pki-types", "tokio", "tokio-rustls 0.26.0", @@ -2321,9 +2326,9 @@ dependencies = [ [[package]] name = "libc" -version = "0.2.157" +version = "0.2.158" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "374af5f94e54fa97cf75e945cce8a6b201e88a1a07e688b47dfd2a59c66dbd86" +checksum = "d8adc4bb1803a324070e64a98ae98f38934d91957a99cfb3a43dcbc01bc56439" [[package]] name = "libflate" @@ -2457,6 +2462,15 @@ dependencies = [ "adler", ] +[[package]] +name = "miniz_oxide" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e2d80299ef12ff69b16a84bb182e3b9df68b5a91574d3d4fa6e41b65deec4df1" +dependencies = [ + "adler2", +] + [[package]] name = "mio" version = "1.0.2" @@ -3026,9 +3040,9 @@ dependencies = [ [[package]] name = "redox_users" -version = "0.4.5" +version = "0.4.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bd283d9651eeda4b2a83a43c1c91b266c40fd76ecd39a50a8c630ae69dc72891" +checksum = "ba009ff324d1fc1b900bd1fdb31564febe58a8ccc8a6fdbb93b543d33b13ca43" dependencies = [ 
"getrandom", "libredox", @@ -3072,15 +3086,15 @@ checksum = "7a66a03ae7c801facd77a29370b4faec201768915ac14a721ba36f20bc9c209b" [[package]] name = "reqwest" -version = "0.12.5" +version = "0.12.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c7d6d2a27d57148378eb5e111173f4276ad26340ecc5c49a4a2152167a2d6a37" +checksum = "f8f4955649ef5c38cc7f9e8aa41761d48fb9677197daea9984dc54f56aad5e63" dependencies = [ "base64 0.22.1", "bytes", "futures-core", "futures-util", - "h2 0.4.5", + "h2 0.4.6", "http 1.1.0", "http-body 1.0.1", "http-body-util", @@ -3096,7 +3110,7 @@ dependencies = [ "pin-project-lite", "quinn", "rustls 0.23.12", - "rustls-native-certs 0.7.1", + "rustls-native-certs 0.7.2", "rustls-pemfile 2.1.3", "rustls-pki-types", "serde", @@ -3112,7 +3126,7 @@ dependencies = [ "wasm-bindgen-futures", "wasm-streams", "web-sys", - "winreg", + "windows-registry", ] [[package]] @@ -3251,9 +3265,9 @@ dependencies = [ [[package]] name = "rustls-native-certs" -version = "0.7.1" +version = "0.7.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a88d6d420651b496bdd98684116959239430022a115c1240e6c3993be0b15fba" +checksum = "04182dffc9091a404e0fc069ea5cd60e5b866c3adf881eff99a32d048242dffa" dependencies = [ "openssl-probe", "rustls-pemfile 2.1.3", @@ -3656,6 +3670,9 @@ name = "sync_wrapper" version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a7065abeca94b6a8a577f9bd45aa0867a2238b74e8eb67cf10d492bc39351394" +dependencies = [ + "futures-core", +] [[package]] name = "tempfile" @@ -4217,6 +4234,36 @@ dependencies = [ "windows-targets 0.52.6", ] +[[package]] +name = "windows-registry" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e400001bb720a623c1c69032f8e3e4cf09984deec740f007dd2b03ec864804b0" +dependencies = [ + "windows-result", + "windows-strings", + "windows-targets 0.52.6", +] + +[[package]] +name = "windows-result" +version = 
"0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d1043d8214f791817bab27572aaa8af63732e11bf84aa21a45a78d6c317ae0e" +dependencies = [ + "windows-targets 0.52.6", +] + +[[package]] +name = "windows-strings" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4cd9b125c486025df0eabcb585e62173c6c9eddcec5d117d3b6e8c30e2ee4d10" +dependencies = [ + "windows-result", + "windows-targets 0.52.6", +] + [[package]] name = "windows-sys" version = "0.48.0" @@ -4365,16 +4412,6 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" -[[package]] -name = "winreg" -version = "0.52.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a277a57398d4bfa075df44f501a17cfdf8542d224f0d36095a2adc7aee4ef0a5" -dependencies = [ - "cfg-if", - "windows-sys 0.48.0", -] - [[package]] name = "xmlparser" version = "0.13.6" diff --git a/datafusion-cli/Cargo.toml b/datafusion-cli/Cargo.toml index 52a365becb64..252d056e8b83 100644 --- a/datafusion-cli/Cargo.toml +++ b/datafusion-cli/Cargo.toml @@ -44,7 +44,6 @@ datafusion = { path = "../datafusion/core", version = "41.0.0", features = [ "regex_expressions", "unicode_expressions", "compression", - "home_dir", ] } dirs = "4.0.0" env_logger = "0.9" diff --git a/datafusion-cli/src/catalog.rs b/datafusion-cli/src/catalog.rs index d59dad7dfa9e..9b9afc1c2420 100644 --- a/datafusion-cli/src/catalog.rs +++ b/datafusion-cli/src/catalog.rs @@ -20,9 +20,7 @@ use std::sync::{Arc, Weak}; use crate::object_storage::{get_object_store, AwsOptions, GcpOptions}; -use datafusion::catalog::{ - substitute_tilde, CatalogProvider, CatalogProviderList, SchemaProvider, -}; +use datafusion::catalog::{CatalogProvider, CatalogProviderList, SchemaProvider}; use datafusion::common::plan_datafusion_err; use datafusion::datasource::listing::ListingTableUrl; @@ -32,6 
+30,7 @@ use datafusion::execution::context::SessionState; use datafusion::execution::session_state::SessionStateBuilder; use async_trait::async_trait; +use dirs::home_dir; use parking_lot::RwLock; /// Wraps another catalog, automatically register require object stores for the file locations @@ -215,6 +214,16 @@ impl SchemaProvider for DynamicObjectStoreSchemaProvider { } } +pub fn substitute_tilde(cur: String) -> String { + if let Some(usr_dir_path) = home_dir() { + if let Some(usr_dir) = usr_dir_path.to_str() { + if cur.starts_with('~') && !usr_dir.is_empty() { + return cur.replacen('~', usr_dir, 1); + } + } + } + cur +} #[cfg(test)] mod tests { @@ -321,4 +330,42 @@ mod tests { assert!(schema.table(location).await.is_err()); } + + #[cfg(not(target_os = "windows"))] + #[test] + fn test_substitute_tilde() { + use std::env; + use std::path::MAIN_SEPARATOR; + let original_home = home_dir(); + let test_home_path = if cfg!(windows) { + "C:\\Users\\user" + } else { + "/home/user" + }; + env::set_var( + if cfg!(windows) { "USERPROFILE" } else { "HOME" }, + test_home_path, + ); + let input = "~/Code/datafusion/benchmarks/data/tpch_sf1/part/part-0.parquet"; + let expected = format!( + "{}{}Code{}datafusion{}benchmarks{}data{}tpch_sf1{}part{}part-0.parquet", + test_home_path, + MAIN_SEPARATOR, + MAIN_SEPARATOR, + MAIN_SEPARATOR, + MAIN_SEPARATOR, + MAIN_SEPARATOR, + MAIN_SEPARATOR, + MAIN_SEPARATOR + ); + let actual = substitute_tilde(input.to_string()); + assert_eq!(actual, expected); + match original_home { + Some(home_path) => env::set_var( + if cfg!(windows) { "USERPROFILE" } else { "HOME" }, + home_path.to_str().unwrap(), + ), + None => env::remove_var(if cfg!(windows) { "USERPROFILE" } else { "HOME" }), + } + } } diff --git a/datafusion/catalog/Cargo.toml b/datafusion/catalog/Cargo.toml index 26de3fa48661..f9801352087d 100644 --- a/datafusion/catalog/Cargo.toml +++ b/datafusion/catalog/Cargo.toml @@ -27,9 +27,6 @@ repository.workspace = true rust-version.workspace = 
true version.workspace = true -[features] -home_dir = ["dep:dirs"] - [dependencies] arrow-schema = { workspace = true } async-trait = { workspace = true } @@ -37,7 +34,6 @@ datafusion-common = { workspace = true } datafusion-execution = { workspace = true } datafusion-expr = { workspace = true } datafusion-physical-plan = { workspace = true } -dirs = { version = "4.0.0", optional = true } parking_lot = { workspace = true } [lints] diff --git a/datafusion/catalog/src/dynamic_file/catalog.rs b/datafusion/catalog/src/dynamic_file/catalog.rs index 97853eb0bc23..4c4400540e52 100644 --- a/datafusion/catalog/src/dynamic_file/catalog.rs +++ b/datafusion/catalog/src/dynamic_file/catalog.rs @@ -20,8 +20,6 @@ use crate::{CatalogProvider, CatalogProviderList, SchemaProvider, TableProvider}; use async_trait::async_trait; -#[cfg(feature = "home_dir")] -use dirs::home_dir; use std::any::Any; use std::sync::Arc; @@ -175,25 +173,6 @@ impl SchemaProvider for DynamicFileSchemaProvider { } } -/// Substitute the tilde character in the file path with the user home directory. -#[cfg(feature = "home_dir")] -pub fn substitute_tilde(cur: String) -> String { - if let Some(usr_dir_path) = home_dir() { - if let Some(usr_dir) = usr_dir_path.to_str() { - if cur.starts_with('~') && !usr_dir.is_empty() { - return cur.replacen('~', usr_dir, 1); - } - } - } - cur -} - -/// Do nothing if the feature "home_dir" is disabled. -#[cfg(not(feature = "home_dir"))] -pub fn substitute_tilde(cur: String) -> String { - cur -} - /// [UrlTableFactory] is a factory that can create a table provider from the given url. 
#[async_trait] pub trait UrlTableFactory: Sync + Send { @@ -203,48 +182,3 @@ pub trait UrlTableFactory: Sync + Send { url: &str, ) -> datafusion_common::Result>>; } - -#[cfg(all(not(target_os = "windows"), feature = "home_dir"))] -#[cfg(test)] -mod tests { - use crate::dynamic_file::catalog::substitute_tilde; - #[cfg(feature = "home_dir")] - use dirs::home_dir; - - #[test] - fn test_substitute_tilde() { - use std::env; - use std::path::MAIN_SEPARATOR; - let original_home = home_dir(); - let test_home_path = if cfg!(windows) { - "C:\\Users\\user" - } else { - "/home/user" - }; - env::set_var( - if cfg!(windows) { "USERPROFILE" } else { "HOME" }, - test_home_path, - ); - let input = "~/Code/datafusion/benchmarks/data/tpch_sf1/part/part-0.parquet"; - let expected = format!( - "{}{}Code{}datafusion{}benchmarks{}data{}tpch_sf1{}part{}part-0.parquet", - test_home_path, - MAIN_SEPARATOR, - MAIN_SEPARATOR, - MAIN_SEPARATOR, - MAIN_SEPARATOR, - MAIN_SEPARATOR, - MAIN_SEPARATOR, - MAIN_SEPARATOR - ); - let actual = substitute_tilde(input.to_string()); - assert_eq!(actual, expected); - match original_home { - Some(home_path) => env::set_var( - if cfg!(windows) { "USERPROFILE" } else { "HOME" }, - home_path.to_str().unwrap(), - ), - None => env::remove_var(if cfg!(windows) { "USERPROFILE" } else { "HOME" }), - } - } -} diff --git a/datafusion/core/Cargo.toml b/datafusion/core/Cargo.toml index 7229dc04def0..adbba3eb31d6 100644 --- a/datafusion/core/Cargo.toml +++ b/datafusion/core/Cargo.toml @@ -77,7 +77,6 @@ unicode_expressions = [ "datafusion-sql/unicode_expressions", "datafusion-functions/unicode_expressions", ] -home_dir = ["datafusion-catalog/home_dir"] [dependencies] ahash = { workspace = true } From f7b4b8cdd8cad1c6d0a76952e57fb7dbd6c4de19 Mon Sep 17 00:00:00 2001 From: Jia-Xuan Liu Date: Wed, 4 Sep 2024 22:18:54 +0800 Subject: [PATCH 47/53] update cli lock --- datafusion-cli/Cargo.lock | 161 +++++++++++++++++++++----------------- 1 file changed, 88 insertions(+), 73 
deletions(-) diff --git a/datafusion-cli/Cargo.lock b/datafusion-cli/Cargo.lock index ddc6242977d3..a0b756ac2154 100644 --- a/datafusion-cli/Cargo.lock +++ b/datafusion-cli/Cargo.lock @@ -430,13 +430,13 @@ dependencies = [ [[package]] name = "async-trait" -version = "0.1.81" +version = "0.1.82" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6e0c28dcc82d7c8ead5cb13beb15405b57b8546e93215673ff8ca0349a028107" +checksum = "a27b8a3a6e1a44fa4c8baf1f653e4172e81486d4941f2237e20dc2d0cf4ddff1" dependencies = [ "proc-macro2", "quote", - "syn 2.0.75", + "syn 2.0.77", ] [[package]] @@ -503,9 +503,9 @@ dependencies = [ [[package]] name = "aws-credential-types" -version = "1.2.0" +version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e16838e6c9e12125face1c1eff1343c75e3ff540de98ff7ebd61874a89bcfeb9" +checksum = "60e8f6b615cb5fc60a98132268508ad104310f0cfb25a1c22eee76efdf9154da" dependencies = [ "aws-smithy-async", "aws-smithy-runtime-api", @@ -515,14 +515,15 @@ dependencies = [ [[package]] name = "aws-runtime" -version = "1.4.0" +version = "1.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f42c2d4218de4dcd890a109461e2f799a1a2ba3bcd2cde9af88360f5df9266c6" +checksum = "2424565416eef55906f9f8cece2072b6b6a76075e3ff81483ebe938a89a4c05f" dependencies = [ "aws-credential-types", "aws-sigv4", "aws-smithy-async", "aws-smithy-http", + "aws-smithy-runtime", "aws-smithy-runtime-api", "aws-smithy-types", "aws-types", @@ -539,9 +540,9 @@ dependencies = [ [[package]] name = "aws-sdk-sso" -version = "1.39.0" +version = "1.41.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "11822090cf501c316c6f75711d77b96fba30658e3867a7762e5e2f5d32d31e81" +checksum = "af0a3f676cba2c079c9563acc9233998c8951cdbe38629a0bef3c8c1b02f3658" dependencies = [ "aws-credential-types", "aws-runtime", @@ -561,9 +562,9 @@ dependencies = [ [[package]] name = "aws-sdk-ssooidc" -version = 
"1.40.0" +version = "1.42.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "78a2a06ff89176123945d1bbe865603c4d7101bea216a550bb4d2e4e9ba74d74" +checksum = "c91b6a04495547162cf52b075e3c15a17ab6608bf9c5785d3e5a5509b3f09f5c" dependencies = [ "aws-credential-types", "aws-runtime", @@ -583,9 +584,9 @@ dependencies = [ [[package]] name = "aws-sdk-sts" -version = "1.39.0" +version = "1.41.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a20a91795850826a6f456f4a48eff1dfa59a0e69bdbf5b8c50518fd372106574" +checksum = "99c56bcd6a56cab7933980a54148b476a5a69a7694e3874d9aa2a566f150447d" dependencies = [ "aws-credential-types", "aws-runtime", @@ -930,9 +931,9 @@ dependencies = [ [[package]] name = "cc" -version = "1.1.13" +version = "1.1.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "72db2f7947ecee9b03b510377e8bb9077afa27176fdbff55c51027e976fdcc48" +checksum = "e9d013ecb737093c0e86b151a7b837993cf9ec6c502946cfb44bedc392421e0b" dependencies = [ "jobserver", "libc", @@ -1011,7 +1012,7 @@ dependencies = [ "heck 0.5.0", "proc-macro2", "quote", - "syn 2.0.75", + "syn 2.0.77", ] [[package]] @@ -1070,9 +1071,9 @@ dependencies = [ [[package]] name = "constant_time_eq" -version = "0.3.0" +version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f7144d30dcf0fafbce74250a3963025d8d52177934239851c917d29f1df280c2" +checksum = "7c74b8349d32d297c9134b8c88677813a227df8f779daa29bfc29c183fe3dca6" [[package]] name = "core-foundation" @@ -1167,7 +1168,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "edb49164822f3ee45b17acd4a208cfc1251410cf0cad9a833234c9890774dd9f" dependencies = [ "quote", - "syn 2.0.75", + "syn 2.0.77", ] [[package]] @@ -1257,6 +1258,7 @@ dependencies = [ "datafusion-execution", "datafusion-expr", "datafusion-physical-plan", + "parking_lot", ] [[package]] @@ -1697,9 +1699,9 @@ dependencies = [ [[package]] name = 
"fastrand" -version = "2.1.0" +version = "2.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9fc0510504f03c51ada170672ac806f1f105a88aa97a5281117e1ddc3368e51a" +checksum = "e8c02a5121d4ea3eb16a80748c74f5549a5665e4c21333c6098f283870fbdea6" [[package]] name = "fd-lock" @@ -1730,9 +1732,9 @@ dependencies = [ [[package]] name = "flate2" -version = "1.0.32" +version = "1.0.33" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9c0596c1eac1f9e04ed902702e9878208b336edc9d6fddc8a48387349bab3666" +checksum = "324a1be68054ef05ad64b861cc9eaf1d623d2d8cb25b4bf2cb9cdd902b4bf253" dependencies = [ "crc32fast", "miniz_oxide 0.8.0", @@ -1818,7 +1820,7 @@ checksum = "87750cf4b7a4c0625b1529e4c543c2182106e4dedc60a2a6455e00d212c489ac" dependencies = [ "proc-macro2", "quote", - "syn 2.0.75", + "syn 2.0.77", ] [[package]] @@ -2127,16 +2129,16 @@ dependencies = [ [[package]] name = "hyper-rustls" -version = "0.27.2" +version = "0.27.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5ee4be2c948921a1a5320b629c4193916ed787a7f7f293fd3f7f5a6c9de74155" +checksum = "08afdbb5c31130e3034af566421053ab03787c640246a446327f550d11bcb333" dependencies = [ "futures-util", "http 1.1.0", "hyper 1.4.1", "hyper-util", "rustls 0.23.12", - "rustls-native-certs 0.7.2", + "rustls-native-certs 0.8.0", "rustls-pki-types", "tokio", "tokio-rustls 0.26.0", @@ -2198,9 +2200,9 @@ dependencies = [ [[package]] name = "indexmap" -version = "2.4.0" +version = "2.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "93ead53efc7ea8ed3cfb0c79fc8023fbb782a5432b52830b6518941cebe6505c" +checksum = "68b900aa2f7301e21c36462b170ee99994de34dff39a4a6a528e80e7376d07e5" dependencies = [ "equivalent", "hashbrown", @@ -2616,9 +2618,9 @@ dependencies = [ [[package]] name = "object" -version = "0.36.3" +version = "0.36.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"27b64972346851a39438c60b341ebc01bba47464ae329e55cf343eb93964efd9" +checksum = "084f1a5821ac4c651660a94a7153d27ac9d8a53736203f58b31945ded098070a" dependencies = [ "memchr", ] @@ -2826,7 +2828,7 @@ checksum = "2f38a4412a78282e09a2cf38d195ea5420d15ba0602cb375210efbc877243965" dependencies = [ "proc-macro2", "quote", - "syn 2.0.75", + "syn 2.0.77", ] [[package]] @@ -2919,9 +2921,9 @@ dependencies = [ [[package]] name = "quinn" -version = "0.11.3" +version = "0.11.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b22d8e7369034b9a7132bc2008cac12f2013c8132b45e0554e6e20e2617f2156" +checksum = "8c7c5fdde3cdae7203427dc4f0a68fe0ed09833edc525a03456b153b79828684" dependencies = [ "bytes", "pin-project-lite", @@ -2937,9 +2939,9 @@ dependencies = [ [[package]] name = "quinn-proto" -version = "0.11.6" +version = "0.11.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ba92fb39ec7ad06ca2582c0ca834dfeadcaf06ddfc8e635c80aa7e1c05315fdd" +checksum = "fadfaed2cd7f389d0161bb73eeb07b7b78f8691047a6f3e73caaeae55310a4a6" dependencies = [ "bytes", "rand", @@ -2954,22 +2956,22 @@ dependencies = [ [[package]] name = "quinn-udp" -version = "0.5.4" +version = "0.5.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8bffec3605b73c6f1754535084a85229fa8a30f86014e6c81aeec4abb68b0285" +checksum = "4fe68c2e9e1a1234e218683dbdf9f9dfcb094113c5ac2b938dfcb9bab4c4140b" dependencies = [ "libc", "once_cell", "socket2", "tracing", - "windows-sys 0.52.0", + "windows-sys 0.59.0", ] [[package]] name = "quote" -version = "1.0.36" +version = "1.0.37" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0fa76aaf39101c457836aec0ce2316dbdc3ab723cdda1c6bd4e6ad4208acaca7" +checksum = "b5b9d34b8991d19d98081b46eacdd8eb58c6f2b201139f7c5f643cc155a633af" dependencies = [ "proc-macro2", ] @@ -3084,7 +3086,7 @@ dependencies = [ "http-body 1.0.1", "http-body-util", "hyper 1.4.1", - "hyper-rustls 0.27.2", + 
"hyper-rustls 0.27.3", "hyper-util", "ipnet", "js-sys", @@ -3095,7 +3097,7 @@ dependencies = [ "pin-project-lite", "quinn", "rustls 0.23.12", - "rustls-native-certs 0.7.2", + "rustls-native-certs 0.7.3", "rustls-pemfile 2.1.3", "rustls-pki-types", "serde", @@ -3175,18 +3177,18 @@ checksum = "583034fd73374156e66797ed8e5b0d5690409c9226b22d87cb7f19821c05d152" [[package]] name = "rustc_version" -version = "0.4.0" +version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bfa0f585226d2e68097d4f95d113b15b83a82e819ab25717ec0590d9584ef366" +checksum = "cfcb3a22ef46e85b45de6ee7e79d063319ebb6594faafcf1c225ea92ab6e9b92" dependencies = [ "semver", ] [[package]] name = "rustix" -version = "0.38.34" +version = "0.38.35" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "70dc5ec042f7a43c4a73241207cecc9873a06d45debb38b329f8541d85c2730f" +checksum = "a85d50532239da68e9addb745ba38ff4612a242c1c7ceea689c4bc7c2f43c36f" dependencies = [ "bitflags 2.6.0", "errno", @@ -3216,7 +3218,7 @@ dependencies = [ "once_cell", "ring", "rustls-pki-types", - "rustls-webpki 0.102.6", + "rustls-webpki 0.102.7", "subtle", "zeroize", ] @@ -3235,9 +3237,22 @@ dependencies = [ [[package]] name = "rustls-native-certs" -version = "0.7.2" +version = "0.7.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "04182dffc9091a404e0fc069ea5cd60e5b866c3adf881eff99a32d048242dffa" +checksum = "e5bfb394eeed242e909609f56089eecfe5fda225042e8b171791b9c95f5931e5" +dependencies = [ + "openssl-probe", + "rustls-pemfile 2.1.3", + "rustls-pki-types", + "schannel", + "security-framework", +] + +[[package]] +name = "rustls-native-certs" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fcaf18a4f2be7326cd874a5fa579fae794320a0f388d365dca7e480e55f83f8a" dependencies = [ "openssl-probe", "rustls-pemfile 2.1.3", @@ -3283,9 +3298,9 @@ dependencies = [ [[package]] name = "rustls-webpki" -version = 
"0.102.6" +version = "0.102.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8e6b52d4fda176fd835fdc55a835d4a89b8499cad995885a21149d5ad62f852e" +checksum = "84678086bd54edf2b415183ed7a94d0efb049f1b646a33e22a36f3794be6ae56" dependencies = [ "ring", "rustls-pki-types", @@ -3398,29 +3413,29 @@ checksum = "a3f0bf26fd526d2a95683cd0f87bf103b8539e2ca1ef48ce002d67aad59aa0b4" [[package]] name = "serde" -version = "1.0.208" +version = "1.0.209" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cff085d2cb684faa248efb494c39b68e522822ac0de72ccf08109abde717cfb2" +checksum = "99fce0ffe7310761ca6bf9faf5115afbc19688edd00171d81b1bb1b116c63e09" dependencies = [ "serde_derive", ] [[package]] name = "serde_derive" -version = "1.0.208" +version = "1.0.209" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "24008e81ff7613ed8e5ba0cfaf24e2c2f1e5b8a0495711e44fcd4882fca62bcf" +checksum = "a5831b979fd7b5439637af1752d535ff49f4860c0f341d1baeb6faf0f4242170" dependencies = [ "proc-macro2", "quote", - "syn 2.0.75", + "syn 2.0.77", ] [[package]] name = "serde_json" -version = "1.0.125" +version = "1.0.127" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "83c8e735a073ccf5be70aa8066aa984eaf2fa000db6c8d0100ae605b366d31ed" +checksum = "8043c06d9f82bd7271361ed64f415fe5e12a77fdb52e573e7f06a516dea329ad" dependencies = [ "itoa", "memchr", @@ -3549,7 +3564,7 @@ checksum = "01b2e185515564f15375f593fb966b5718bc624ba77fe49fa4616ad619690554" dependencies = [ "proc-macro2", "quote", - "syn 2.0.75", + "syn 2.0.77", ] [[package]] @@ -3595,7 +3610,7 @@ dependencies = [ "proc-macro2", "quote", "rustversion", - "syn 2.0.75", + "syn 2.0.77", ] [[package]] @@ -3608,7 +3623,7 @@ dependencies = [ "proc-macro2", "quote", "rustversion", - "syn 2.0.75", + "syn 2.0.77", ] [[package]] @@ -3630,9 +3645,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.75" +version = "2.0.77" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "f6af063034fc1935ede7be0122941bafa9bacb949334d090b77ca98b5817c7d9" +checksum = "9f35bcdf61fd8e7be6caf75f429fdca8beb3ed76584befb503b1569faee373ed" dependencies = [ "proc-macro2", "quote", @@ -3693,7 +3708,7 @@ checksum = "a4558b58466b9ad7ca0f102865eccc95938dca1a74a856f2b57b6629050da261" dependencies = [ "proc-macro2", "quote", - "syn 2.0.75", + "syn 2.0.77", ] [[package]] @@ -3763,9 +3778,9 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" [[package]] name = "tokio" -version = "1.39.3" +version = "1.40.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9babc99b9923bfa4804bd74722ff02c0381021eafa4db9949217e3be8e84fff5" +checksum = "e2b070231665d27ad9ec9b8df639893f46727666c6767db40317fbe920a5d998" dependencies = [ "backtrace", "bytes", @@ -3787,7 +3802,7 @@ checksum = "693d596312e88961bc67d7f1f97af8a70227d9f90c31bba5806eec004978d752" dependencies = [ "proc-macro2", "quote", - "syn 2.0.75", + "syn 2.0.77", ] [[package]] @@ -3870,7 +3885,7 @@ checksum = "34704c8d6ebcbc939824180af020566b01a7c01f80641264eba0999f6c2b6be7" dependencies = [ "proc-macro2", "quote", - "syn 2.0.75", + "syn 2.0.77", ] [[package]] @@ -3915,7 +3930,7 @@ checksum = "f03ca4cb38206e2bef0700092660bb74d696f808514dae47fa1467cbfe26e96e" dependencies = [ "proc-macro2", "quote", - "syn 2.0.75", + "syn 2.0.77", ] [[package]] @@ -4064,7 +4079,7 @@ dependencies = [ "once_cell", "proc-macro2", "quote", - "syn 2.0.75", + "syn 2.0.77", "wasm-bindgen-shared", ] @@ -4098,7 +4113,7 @@ checksum = "afc340c74d9005395cf9dd098506f7f44e38f2b4a21c6aaacf9a105ea5e1e836" dependencies = [ "proc-macro2", "quote", - "syn 2.0.75", + "syn 2.0.77", "wasm-bindgen-backend", "wasm-bindgen-shared", ] @@ -4383,7 +4398,7 @@ checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e" dependencies = [ "proc-macro2", "quote", - "syn 2.0.75", + "syn 2.0.77", ] [[package]] From 
25d0ff635b1a5fb0001ff199445878432f79ee2f Mon Sep 17 00:00:00 2001 From: Jia-Xuan Liu Date: Wed, 4 Sep 2024 23:20:59 +0800 Subject: [PATCH 48/53] fix msrv check --- datafusion-cli/Cargo.toml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/datafusion-cli/Cargo.toml b/datafusion-cli/Cargo.toml index f477bad69a2c..64eb27628c1e 100644 --- a/datafusion-cli/Cargo.toml +++ b/datafusion-cli/Cargo.toml @@ -33,6 +33,8 @@ readme = "README.md" arrow = { version = "52.2.0" } async-trait = "0.1.73" aws-config = "1.5.5" +# Lock the version to pass the MSRV check +aws-sdk-sts = "1.39.0" aws-credential-types = "1.2.0" clap = { version = "4.5.16", features = ["derive", "cargo"] } datafusion = { path = "../datafusion/core", version = "41.0.0", features = [ From a78bd3ce5d12271bf0cdf73f8a0baf6250abc15a Mon Sep 17 00:00:00 2001 From: Jia-Xuan Liu Date: Wed, 4 Sep 2024 23:28:13 +0800 Subject: [PATCH 49/53] fix msrv check --- datafusion-cli/Cargo.lock | 4 ++-- datafusion-cli/Cargo.toml | 2 -- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/datafusion-cli/Cargo.lock b/datafusion-cli/Cargo.lock index a0b756ac2154..4835b127e136 100644 --- a/datafusion-cli/Cargo.lock +++ b/datafusion-cli/Cargo.lock @@ -584,9 +584,9 @@ dependencies = [ [[package]] name = "aws-sdk-sts" -version = "1.41.0" +version = "1.39.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "99c56bcd6a56cab7933980a54148b476a5a69a7694e3874d9aa2a566f150447d" +checksum = "a20a91795850826a6f456f4a48eff1dfa59a0e69bdbf5b8c50518fd372106574" dependencies = [ "aws-credential-types", "aws-runtime", diff --git a/datafusion-cli/Cargo.toml b/datafusion-cli/Cargo.toml index 64eb27628c1e..f477bad69a2c 100644 --- a/datafusion-cli/Cargo.toml +++ b/datafusion-cli/Cargo.toml @@ -33,8 +33,6 @@ readme = "README.md" arrow = { version = "52.2.0" } async-trait = "0.1.73" aws-config = "1.5.5" -# Lock the version to pass the MSRV check -aws-sdk-sts = "1.39.0" aws-credential-types = "1.2.0" clap = { version = 
"4.5.16", features = ["derive", "cargo"] } datafusion = { path = "../datafusion/core", version = "41.0.0", features = [ From edeff334bb6adf46576e609caba50e923f9c3f3b Mon Sep 17 00:00:00 2001 From: Jia-Xuan Liu Date: Wed, 4 Sep 2024 23:41:41 +0800 Subject: [PATCH 50/53] rollback the lock change --- datafusion-cli/Cargo.lock | 156 +++++++++++++++++--------------------- 1 file changed, 71 insertions(+), 85 deletions(-) diff --git a/datafusion-cli/Cargo.lock b/datafusion-cli/Cargo.lock index 4835b127e136..4f7d1d0e09b9 100644 --- a/datafusion-cli/Cargo.lock +++ b/datafusion-cli/Cargo.lock @@ -430,13 +430,13 @@ dependencies = [ [[package]] name = "async-trait" -version = "0.1.82" +version = "0.1.81" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a27b8a3a6e1a44fa4c8baf1f653e4172e81486d4941f2237e20dc2d0cf4ddff1" +checksum = "6e0c28dcc82d7c8ead5cb13beb15405b57b8546e93215673ff8ca0349a028107" dependencies = [ "proc-macro2", "quote", - "syn 2.0.77", + "syn 2.0.75", ] [[package]] @@ -503,9 +503,9 @@ dependencies = [ [[package]] name = "aws-credential-types" -version = "1.2.1" +version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "60e8f6b615cb5fc60a98132268508ad104310f0cfb25a1c22eee76efdf9154da" +checksum = "e16838e6c9e12125face1c1eff1343c75e3ff540de98ff7ebd61874a89bcfeb9" dependencies = [ "aws-smithy-async", "aws-smithy-runtime-api", @@ -515,15 +515,14 @@ dependencies = [ [[package]] name = "aws-runtime" -version = "1.4.2" +version = "1.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2424565416eef55906f9f8cece2072b6b6a76075e3ff81483ebe938a89a4c05f" +checksum = "f42c2d4218de4dcd890a109461e2f799a1a2ba3bcd2cde9af88360f5df9266c6" dependencies = [ "aws-credential-types", "aws-sigv4", "aws-smithy-async", "aws-smithy-http", - "aws-smithy-runtime", "aws-smithy-runtime-api", "aws-smithy-types", "aws-types", @@ -540,9 +539,9 @@ dependencies = [ [[package]] name = "aws-sdk-sso" 
-version = "1.41.0" +version = "1.39.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "af0a3f676cba2c079c9563acc9233998c8951cdbe38629a0bef3c8c1b02f3658" +checksum = "11822090cf501c316c6f75711d77b96fba30658e3867a7762e5e2f5d32d31e81" dependencies = [ "aws-credential-types", "aws-runtime", @@ -562,9 +561,9 @@ dependencies = [ [[package]] name = "aws-sdk-ssooidc" -version = "1.42.0" +version = "1.40.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c91b6a04495547162cf52b075e3c15a17ab6608bf9c5785d3e5a5509b3f09f5c" +checksum = "78a2a06ff89176123945d1bbe865603c4d7101bea216a550bb4d2e4e9ba74d74" dependencies = [ "aws-credential-types", "aws-runtime", @@ -931,9 +930,9 @@ dependencies = [ [[package]] name = "cc" -version = "1.1.16" +version = "1.1.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e9d013ecb737093c0e86b151a7b837993cf9ec6c502946cfb44bedc392421e0b" +checksum = "72db2f7947ecee9b03b510377e8bb9077afa27176fdbff55c51027e976fdcc48" dependencies = [ "jobserver", "libc", @@ -1012,7 +1011,7 @@ dependencies = [ "heck 0.5.0", "proc-macro2", "quote", - "syn 2.0.77", + "syn 2.0.75", ] [[package]] @@ -1071,9 +1070,9 @@ dependencies = [ [[package]] name = "constant_time_eq" -version = "0.3.1" +version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7c74b8349d32d297c9134b8c88677813a227df8f779daa29bfc29c183fe3dca6" +checksum = "f7144d30dcf0fafbce74250a3963025d8d52177934239851c917d29f1df280c2" [[package]] name = "core-foundation" @@ -1168,7 +1167,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "edb49164822f3ee45b17acd4a208cfc1251410cf0cad9a833234c9890774dd9f" dependencies = [ "quote", - "syn 2.0.77", + "syn 2.0.75", ] [[package]] @@ -1699,9 +1698,9 @@ dependencies = [ [[package]] name = "fastrand" -version = "2.1.1" +version = "2.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"e8c02a5121d4ea3eb16a80748c74f5549a5665e4c21333c6098f283870fbdea6" +checksum = "9fc0510504f03c51ada170672ac806f1f105a88aa97a5281117e1ddc3368e51a" [[package]] name = "fd-lock" @@ -1732,9 +1731,9 @@ dependencies = [ [[package]] name = "flate2" -version = "1.0.33" +version = "1.0.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "324a1be68054ef05ad64b861cc9eaf1d623d2d8cb25b4bf2cb9cdd902b4bf253" +checksum = "9c0596c1eac1f9e04ed902702e9878208b336edc9d6fddc8a48387349bab3666" dependencies = [ "crc32fast", "miniz_oxide 0.8.0", @@ -1820,7 +1819,7 @@ checksum = "87750cf4b7a4c0625b1529e4c543c2182106e4dedc60a2a6455e00d212c489ac" dependencies = [ "proc-macro2", "quote", - "syn 2.0.77", + "syn 2.0.75", ] [[package]] @@ -2129,16 +2128,16 @@ dependencies = [ [[package]] name = "hyper-rustls" -version = "0.27.3" +version = "0.27.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "08afdbb5c31130e3034af566421053ab03787c640246a446327f550d11bcb333" +checksum = "5ee4be2c948921a1a5320b629c4193916ed787a7f7f293fd3f7f5a6c9de74155" dependencies = [ "futures-util", "http 1.1.0", "hyper 1.4.1", "hyper-util", "rustls 0.23.12", - "rustls-native-certs 0.8.0", + "rustls-native-certs 0.7.2", "rustls-pki-types", "tokio", "tokio-rustls 0.26.0", @@ -2200,9 +2199,9 @@ dependencies = [ [[package]] name = "indexmap" -version = "2.5.0" +version = "2.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "68b900aa2f7301e21c36462b170ee99994de34dff39a4a6a528e80e7376d07e5" +checksum = "93ead53efc7ea8ed3cfb0c79fc8023fbb782a5432b52830b6518941cebe6505c" dependencies = [ "equivalent", "hashbrown", @@ -2618,9 +2617,9 @@ dependencies = [ [[package]] name = "object" -version = "0.36.4" +version = "0.36.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "084f1a5821ac4c651660a94a7153d27ac9d8a53736203f58b31945ded098070a" +checksum = "27b64972346851a39438c60b341ebc01bba47464ae329e55cf343eb93964efd9" 
dependencies = [ "memchr", ] @@ -2828,7 +2827,7 @@ checksum = "2f38a4412a78282e09a2cf38d195ea5420d15ba0602cb375210efbc877243965" dependencies = [ "proc-macro2", "quote", - "syn 2.0.77", + "syn 2.0.75", ] [[package]] @@ -2921,9 +2920,9 @@ dependencies = [ [[package]] name = "quinn" -version = "0.11.5" +version = "0.11.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8c7c5fdde3cdae7203427dc4f0a68fe0ed09833edc525a03456b153b79828684" +checksum = "b22d8e7369034b9a7132bc2008cac12f2013c8132b45e0554e6e20e2617f2156" dependencies = [ "bytes", "pin-project-lite", @@ -2939,9 +2938,9 @@ dependencies = [ [[package]] name = "quinn-proto" -version = "0.11.8" +version = "0.11.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fadfaed2cd7f389d0161bb73eeb07b7b78f8691047a6f3e73caaeae55310a4a6" +checksum = "ba92fb39ec7ad06ca2582c0ca834dfeadcaf06ddfc8e635c80aa7e1c05315fdd" dependencies = [ "bytes", "rand", @@ -2956,22 +2955,22 @@ dependencies = [ [[package]] name = "quinn-udp" -version = "0.5.5" +version = "0.5.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4fe68c2e9e1a1234e218683dbdf9f9dfcb094113c5ac2b938dfcb9bab4c4140b" +checksum = "8bffec3605b73c6f1754535084a85229fa8a30f86014e6c81aeec4abb68b0285" dependencies = [ "libc", "once_cell", "socket2", "tracing", - "windows-sys 0.59.0", + "windows-sys 0.52.0", ] [[package]] name = "quote" -version = "1.0.37" +version = "1.0.36" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b5b9d34b8991d19d98081b46eacdd8eb58c6f2b201139f7c5f643cc155a633af" +checksum = "0fa76aaf39101c457836aec0ce2316dbdc3ab723cdda1c6bd4e6ad4208acaca7" dependencies = [ "proc-macro2", ] @@ -3086,7 +3085,7 @@ dependencies = [ "http-body 1.0.1", "http-body-util", "hyper 1.4.1", - "hyper-rustls 0.27.3", + "hyper-rustls 0.27.2", "hyper-util", "ipnet", "js-sys", @@ -3097,7 +3096,7 @@ dependencies = [ "pin-project-lite", "quinn", "rustls 0.23.12", - 
"rustls-native-certs 0.7.3", + "rustls-native-certs 0.7.2", "rustls-pemfile 2.1.3", "rustls-pki-types", "serde", @@ -3177,18 +3176,18 @@ checksum = "583034fd73374156e66797ed8e5b0d5690409c9226b22d87cb7f19821c05d152" [[package]] name = "rustc_version" -version = "0.4.1" +version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cfcb3a22ef46e85b45de6ee7e79d063319ebb6594faafcf1c225ea92ab6e9b92" +checksum = "bfa0f585226d2e68097d4f95d113b15b83a82e819ab25717ec0590d9584ef366" dependencies = [ "semver", ] [[package]] name = "rustix" -version = "0.38.35" +version = "0.38.34" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a85d50532239da68e9addb745ba38ff4612a242c1c7ceea689c4bc7c2f43c36f" +checksum = "70dc5ec042f7a43c4a73241207cecc9873a06d45debb38b329f8541d85c2730f" dependencies = [ "bitflags 2.6.0", "errno", @@ -3218,7 +3217,7 @@ dependencies = [ "once_cell", "ring", "rustls-pki-types", - "rustls-webpki 0.102.7", + "rustls-webpki 0.102.6", "subtle", "zeroize", ] @@ -3237,22 +3236,9 @@ dependencies = [ [[package]] name = "rustls-native-certs" -version = "0.7.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e5bfb394eeed242e909609f56089eecfe5fda225042e8b171791b9c95f5931e5" -dependencies = [ - "openssl-probe", - "rustls-pemfile 2.1.3", - "rustls-pki-types", - "schannel", - "security-framework", -] - -[[package]] -name = "rustls-native-certs" -version = "0.8.0" +version = "0.7.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fcaf18a4f2be7326cd874a5fa579fae794320a0f388d365dca7e480e55f83f8a" +checksum = "04182dffc9091a404e0fc069ea5cd60e5b866c3adf881eff99a32d048242dffa" dependencies = [ "openssl-probe", "rustls-pemfile 2.1.3", @@ -3298,9 +3284,9 @@ dependencies = [ [[package]] name = "rustls-webpki" -version = "0.102.7" +version = "0.102.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"84678086bd54edf2b415183ed7a94d0efb049f1b646a33e22a36f3794be6ae56" +checksum = "8e6b52d4fda176fd835fdc55a835d4a89b8499cad995885a21149d5ad62f852e" dependencies = [ "ring", "rustls-pki-types", @@ -3413,29 +3399,29 @@ checksum = "a3f0bf26fd526d2a95683cd0f87bf103b8539e2ca1ef48ce002d67aad59aa0b4" [[package]] name = "serde" -version = "1.0.209" +version = "1.0.208" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "99fce0ffe7310761ca6bf9faf5115afbc19688edd00171d81b1bb1b116c63e09" +checksum = "cff085d2cb684faa248efb494c39b68e522822ac0de72ccf08109abde717cfb2" dependencies = [ "serde_derive", ] [[package]] name = "serde_derive" -version = "1.0.209" +version = "1.0.208" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a5831b979fd7b5439637af1752d535ff49f4860c0f341d1baeb6faf0f4242170" +checksum = "24008e81ff7613ed8e5ba0cfaf24e2c2f1e5b8a0495711e44fcd4882fca62bcf" dependencies = [ "proc-macro2", "quote", - "syn 2.0.77", + "syn 2.0.75", ] [[package]] name = "serde_json" -version = "1.0.127" +version = "1.0.125" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8043c06d9f82bd7271361ed64f415fe5e12a77fdb52e573e7f06a516dea329ad" +checksum = "83c8e735a073ccf5be70aa8066aa984eaf2fa000db6c8d0100ae605b366d31ed" dependencies = [ "itoa", "memchr", @@ -3564,7 +3550,7 @@ checksum = "01b2e185515564f15375f593fb966b5718bc624ba77fe49fa4616ad619690554" dependencies = [ "proc-macro2", "quote", - "syn 2.0.77", + "syn 2.0.75", ] [[package]] @@ -3610,7 +3596,7 @@ dependencies = [ "proc-macro2", "quote", "rustversion", - "syn 2.0.77", + "syn 2.0.75", ] [[package]] @@ -3623,7 +3609,7 @@ dependencies = [ "proc-macro2", "quote", "rustversion", - "syn 2.0.77", + "syn 2.0.75", ] [[package]] @@ -3645,9 +3631,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.77" +version = "2.0.75" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"9f35bcdf61fd8e7be6caf75f429fdca8beb3ed76584befb503b1569faee373ed" +checksum = "f6af063034fc1935ede7be0122941bafa9bacb949334d090b77ca98b5817c7d9" dependencies = [ "proc-macro2", "quote", @@ -3708,7 +3694,7 @@ checksum = "a4558b58466b9ad7ca0f102865eccc95938dca1a74a856f2b57b6629050da261" dependencies = [ "proc-macro2", "quote", - "syn 2.0.77", + "syn 2.0.75", ] [[package]] @@ -3778,9 +3764,9 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" [[package]] name = "tokio" -version = "1.40.0" +version = "1.39.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e2b070231665d27ad9ec9b8df639893f46727666c6767db40317fbe920a5d998" +checksum = "9babc99b9923bfa4804bd74722ff02c0381021eafa4db9949217e3be8e84fff5" dependencies = [ "backtrace", "bytes", @@ -3802,7 +3788,7 @@ checksum = "693d596312e88961bc67d7f1f97af8a70227d9f90c31bba5806eec004978d752" dependencies = [ "proc-macro2", "quote", - "syn 2.0.77", + "syn 2.0.75", ] [[package]] @@ -3885,7 +3871,7 @@ checksum = "34704c8d6ebcbc939824180af020566b01a7c01f80641264eba0999f6c2b6be7" dependencies = [ "proc-macro2", "quote", - "syn 2.0.77", + "syn 2.0.75", ] [[package]] @@ -3930,7 +3916,7 @@ checksum = "f03ca4cb38206e2bef0700092660bb74d696f808514dae47fa1467cbfe26e96e" dependencies = [ "proc-macro2", "quote", - "syn 2.0.77", + "syn 2.0.75", ] [[package]] @@ -4079,7 +4065,7 @@ dependencies = [ "once_cell", "proc-macro2", "quote", - "syn 2.0.77", + "syn 2.0.75", "wasm-bindgen-shared", ] @@ -4113,7 +4099,7 @@ checksum = "afc340c74d9005395cf9dd098506f7f44e38f2b4a21c6aaacf9a105ea5e1e836" dependencies = [ "proc-macro2", "quote", - "syn 2.0.77", + "syn 2.0.75", "wasm-bindgen-backend", "wasm-bindgen-shared", ] @@ -4398,7 +4384,7 @@ checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e" dependencies = [ "proc-macro2", "quote", - "syn 2.0.77", + "syn 2.0.75", ] [[package]] From b1a922c1c43d03e50ee523518390b247a11f3dac Mon Sep 17 00:00:00 2001 From: Jia-Xuan Liu 
Date: Sat, 7 Sep 2024 23:22:32 +0800 Subject: [PATCH 51/53] address review comment and enhance the doc --- .../catalog/src/dynamic_file/catalog.rs | 3 +-- .../core/src/datasource/dynamic_file.rs | 8 +++---- datafusion/core/src/execution/context/mod.rs | 23 ++++++++++--------- .../sqllogictest/test_files/arrow_files.slt | 3 ++- .../sqllogictest/test_files/csv_files.slt | 3 ++- .../sqllogictest/test_files/dynamic_file.slt | 3 +++ datafusion/sqllogictest/test_files/json.slt | 3 ++- .../sqllogictest/test_files/parquet.slt | 3 ++- 8 files changed, 28 insertions(+), 21 deletions(-) diff --git a/datafusion/catalog/src/dynamic_file/catalog.rs b/datafusion/catalog/src/dynamic_file/catalog.rs index 4c4400540e52..b14a0ece48bc 100644 --- a/datafusion/catalog/src/dynamic_file/catalog.rs +++ b/datafusion/catalog/src/dynamic_file/catalog.rs @@ -15,8 +15,7 @@ // specific language governing permissions and limitations // under the License. -//! dynamic_file contains [`DynamicFileCatalog`] that creates tables from file paths -//! if the wrapped [`CatalogProviderList`] doesn't have the table provider. +//! [`DynamicFileCatalog`] that creates tables from file paths use crate::{CatalogProvider, CatalogProviderList, SchemaProvider, TableProvider}; use async_trait::async_trait; diff --git a/datafusion/core/src/datasource/dynamic_file.rs b/datafusion/core/src/datasource/dynamic_file.rs index acee2bd3d000..69f070b16aec 100644 --- a/datafusion/core/src/datasource/dynamic_file.rs +++ b/datafusion/core/src/datasource/dynamic_file.rs @@ -33,17 +33,17 @@ use crate::execution::context::SessionState; #[derive(Default)] pub struct DynamicListTableFactory { /// The session store that contains the current session. - session_store: Arc<SessionStore>, + session_store: SessionStore, } impl DynamicListTableFactory { /// Create a new [DynamicListTableFactory] with the given state store.
- pub fn new(session_store: Arc) -> Self { + pub fn new(session_store: SessionStore) -> Self { Self { session_store } } - fn session_store(&self) -> Arc { - Arc::clone(&self.session_store) + pub fn session_store(&self) -> &SessionStore { + &self.session_store } } diff --git a/datafusion/core/src/execution/context/mod.rs b/datafusion/core/src/execution/context/mod.rs index 0b21ad3a5e77..621b214818e9 100644 --- a/datafusion/core/src/execution/context/mod.rs +++ b/datafusion/core/src/execution/context/mod.rs @@ -73,7 +73,7 @@ use crate::datasource::dynamic_file::DynamicListTableFactory; use crate::execution::session_state::SessionStateBuilder; use async_trait::async_trait; use chrono::{DateTime, Utc}; -use datafusion_catalog::{DynamicFileCatalog, SessionStore}; +use datafusion_catalog::{DynamicFileCatalog, SessionStore, UrlTableFactory}; pub use datafusion_execution::config::SessionConfig; pub use datafusion_execution::TaskContext; pub use datafusion_expr::execution_props::ExecutionProps; @@ -357,18 +357,20 @@ impl SessionContext { } } - /// Enable the dynamic file query for the current session. - /// See [DynamicFileCatalog] for more details + /// Enable dynamic file querying for the current session. 
+ /// + /// This allows queries to directly access arbitrary file names via SQL like + /// `SELECT * from 'my_file.parquet'` + /// so it should only be enabled for systems where such access is not a security risk /// - /// # Example: query the url table + /// See [DynamicFileCatalog] for more details /// /// ``` /// # use datafusion::prelude::*; /// # use datafusion::{error::Result, assert_batches_eq}; /// # #[tokio::main] /// # async fn main() -> Result<()> { - /// let cfg = SessionConfig::new().set_str("datafusion.catalog.has_header", "true"); - /// let ctx = SessionContext::new_with_config(cfg).enable_url_table(); + /// let ctx = SessionContext::new().enable_url_table(); /// let results = ctx /// .sql("SELECT a, MIN(b) FROM 'tests/data/example.csv' as example GROUP BY a LIMIT 100") /// .await? @@ -389,17 +391,16 @@ impl SessionContext { /// ``` pub fn enable_url_table(&self) -> Self { let state_ref = self.state(); - let session_store = Arc::new(SessionStore::new()); - let factory = DynamicListTableFactory::new(Arc::clone(&session_store)); + let factory = Arc::new(DynamicListTableFactory::new(SessionStore::new())); let catalog_list = Arc::new(DynamicFileCatalog::new( Arc::clone(state_ref.catalog_list()), - Arc::new(factory), + Arc::clone(&factory) as Arc, )); let new_state = SessionStateBuilder::new_from_existing(self.state()) .with_catalog_list(catalog_list) .build(); let ctx = SessionContext::new_with_state(new_state); - session_store.with_state(ctx.state_weak_ref()); + factory.session_store().with_state(ctx.state_weak_ref()); ctx } @@ -1842,7 +1843,7 @@ mod tests { let path = PathBuf::from(env!("CARGO_MANIFEST_DIR")); let path = path.join("tests/tpch-csv/customer.csv"); let url = format!("file://{}", path.display()); - let cfg = SessionConfig::new().set_str("datafusion.catalog.has_header", "true"); + let cfg = SessionConfig::new(); let session_state = SessionStateBuilder::new() .with_default_features() .with_config(cfg) diff --git 
a/datafusion/sqllogictest/test_files/arrow_files.slt b/datafusion/sqllogictest/test_files/arrow_files.slt index 95dabc3f612e..e66ba7477fc4 100644 --- a/datafusion/sqllogictest/test_files/arrow_files.slt +++ b/datafusion/sqllogictest/test_files/arrow_files.slt @@ -43,7 +43,8 @@ SELECT * FROM arrow_simple 3 baz false 4 NULL true -# url table is only supported by DynamicFileCatalog +# Ensure that local files can not be read by default (a potential security issue) +# (url table is only supported when DynamicFileCatalog is enabled) statement error DataFusion error: Error during planning: table 'datafusion.public.../core/tests/data/example.arrow' not found SELECT * FROM '../core/tests/data/example.arrow'; diff --git a/datafusion/sqllogictest/test_files/csv_files.slt b/datafusion/sqllogictest/test_files/csv_files.slt index 61bdd71ed1e4..dd1f025ed783 100644 --- a/datafusion/sqllogictest/test_files/csv_files.slt +++ b/datafusion/sqllogictest/test_files/csv_files.slt @@ -50,7 +50,8 @@ id7 value7 id8 value8 id9 value9 -# url table is only supported by DynamicFileCatalog +# Ensure that local files can not be read by default (a potential security issue) +# (url table is only supported when DynamicFileCatalog is enabled) statement error DataFusion error: Error during planning: table 'datafusion.public.../core/tests/data/quote.csv' not found select * from '../core/tests/data/quote.csv'; diff --git a/datafusion/sqllogictest/test_files/dynamic_file.slt b/datafusion/sqllogictest/test_files/dynamic_file.slt index af5e5df4ae96..e177fd3de243 100644 --- a/datafusion/sqllogictest/test_files/dynamic_file.slt +++ b/datafusion/sqllogictest/test_files/dynamic_file.slt @@ -15,6 +15,9 @@ # specific language governing permissions and limitations # under the License. 
+# +# Note: This file runs with a SessionContext that has the `enable_url_table` flag set +# # dynamic select arrow file in the folder query ITB SELECT * FROM '../core/tests/data/partitioned_table_arrow/part=123' ORDER BY f0; diff --git a/datafusion/sqllogictest/test_files/json.slt b/datafusion/sqllogictest/test_files/json.slt index 2057f76b3ebc..0903c2427649 100644 --- a/datafusion/sqllogictest/test_files/json.slt +++ b/datafusion/sqllogictest/test_files/json.slt @@ -45,7 +45,8 @@ SELECT a, b FROM json_test 5 -3.5 7 -3.5 -# url table is only supported by DynamicFileCatalog +# Ensure that local files can not be read by default (a potential security issue) +# (url table is only supported when DynamicFileCatalog is enabled) statement error DataFusion error: Error during planning: table 'datafusion.public.../core/tests/data/2.json' not found SELECT a, b FROM '../core/tests/data/2.json' diff --git a/datafusion/sqllogictest/test_files/parquet.slt b/datafusion/sqllogictest/test_files/parquet.slt index 7697b1cbdc66..f8b163adc796 100644 --- a/datafusion/sqllogictest/test_files/parquet.slt +++ b/datafusion/sqllogictest/test_files/parquet.slt @@ -202,7 +202,8 @@ SELECT id, CAST(string_col AS varchar) FROM alltypes_plain 0 0 1 1 -# url table is only supported by DynamicFileCatalog +# Ensure that local files can not be read by default (a potential security issue) +# (url table is only supported when DynamicFileCatalog is enabled) statement error DataFusion error: Error during planning: table 'datafusion.public.../../parquet-testing/data/alltypes_plain.parquet' not found SELECT id, CAST(string_col AS varchar) FROM '../../parquet-testing/data/alltypes_plain.parquet'; From e5ab14d974b691edf9ce336a5edb358a1052c40a Mon Sep 17 00:00:00 2001 From: Jia-Xuan Liu Date: Sat, 7 Sep 2024 23:26:42 +0800 Subject: [PATCH 52/53] remove the legacy comment --- datafusion/catalog/src/dynamic_file/catalog.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/datafusion/catalog/src/dynamic_file/catalog.rs b/datafusion/catalog/src/dynamic_file/catalog.rs index b14a0ece48bc..cd586446f82c 100644 --- a/datafusion/catalog/src/dynamic_file/catalog.rs +++ b/datafusion/catalog/src/dynamic_file/catalog.rs @@ -113,7 +113,7 @@ impl CatalogProvider for DynamicFileCatalogProvider { /// Implements the [DynamicFileSchemaProvider] that can create tables provider from the file path. /// /// The provider will try to create a table provider from the file path if the table provider -/// isn't exist in the inner schema provider. The required object store must be registered in the session context. +/// doesn't exist in the inner schema provider. pub struct DynamicFileSchemaProvider { /// The inner schema provider inner: Arc, From 87d75039dede2e1234620dc94583a7f8ccf0c439 Mon Sep 17 00:00:00 2001 From: Jia-Xuan Liu Date: Sun, 8 Sep 2024 00:38:49 +0800 Subject: [PATCH 53/53] add missing doc --- datafusion/core/src/datasource/dynamic_file.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/datafusion/core/src/datasource/dynamic_file.rs b/datafusion/core/src/datasource/dynamic_file.rs index 69f070b16aec..a95f3abb939b 100644 --- a/datafusion/core/src/datasource/dynamic_file.rs +++ b/datafusion/core/src/datasource/dynamic_file.rs @@ -42,6 +42,7 @@ impl DynamicListTableFactory { Self { session_store } } + /// Get the session store. pub fn session_store(&self) -> &SessionStore { &self.session_store }