Datafusion-federation upstream #17

Merged (12 commits), Aug 28, 2024
Changes from all commits
13 changes: 0 additions & 13 deletions .github/workflows/check.yml

This file was deleted.

30 changes: 0 additions & 30 deletions .github/workflows/pull-request.yml

This file was deleted.

4 changes: 2 additions & 2 deletions .github/workflows/test.yml
@@ -55,7 +55,7 @@ jobs:
       - uses: arduino/setup-protoc@v3
         with:
           repo-token: ${{ secrets.GITHUB_TOKEN }}
-      - run: cargo clippy -- -Dwarnings
+      - run: cargo clippy -- -D warnings

   package:
     name: Package
@@ -69,5 +69,5 @@ jobs:
       - uses: arduino/setup-protoc@v3
         with:
           repo-token: ${{ secrets.GITHUB_TOKEN }}
-      - run: cargo build
+      - run: cargo build --all
       - run: cargo package -p datafusion-federation --allow-dirty
16 changes: 5 additions & 11 deletions Cargo.toml
@@ -3,23 +3,17 @@ resolver = "2"

 members = [
     "datafusion-federation",
-    "examples",
-    "sources/sql",
-    "sources/flight-sql",
+    "datafusion-flight-sql-server",
+    "datafusion-flight-sql-table-provider",
 ]

-[patch.crates-io]
-# connectorx = { path = "../connector-x/connectorx" }
-# datafusion = { path = "../arrow-datafusion/datafusion/core" }
-
 [workspace.package]
 version = "0.1.3"
 edition = "2021"
 license = "MIT"
 readme = "README.md"

-
 [workspace.dependencies]
-async-trait = "0.1.77"
-datafusion = "37.0.0"
-datafusion-substrait = "37.0.0"
+async-trait = "0.1.81"
+datafusion = "41.0.0"
+datafusion-substrait = "41.0.0"
125 changes: 119 additions & 6 deletions README.md
# DataFusion Federation

[![crates.io](https://img.shields.io/crates/v/datafusion-federation.svg)](https://crates.io/crates/datafusion-federation)
[![docs.rs](https://docs.rs/datafusion-federation/badge.svg)](https://docs.rs/datafusion-federation)

DataFusion Federation allows
[DataFusion](https://github.com/apache/arrow-datafusion) to execute (part of) a
query plan on a remote execution engine.

                                             ┌────────────────┐
                          ┌────────────┐     │ Remote DBMS(s) │
          SQL Query ───>  │ DataFusion │ ──> │  ( execution   │
                          └────────────┘     │ happens here ) │
                                             └────────────────┘

The goal is to allow resolving queries across remote query engines while
pushing down as much compute as possible to the remote database(s). This allows
execution to happen as close to the storage as possible. This concept is
referred to as 'query federation'.

> [!TIP]
> This repository implements the federation framework itself. If you want to
> connect to a specific database, check out the compatible providers available
> in
> [datafusion-contrib/datafusion-table-providers](https://github.com/datafusion-contrib/datafusion-table-providers/).

## Usage

Check out the [examples](./datafusion-federation/examples/) to get a feel for
how it works.

## Potential use-cases

- Querying across SQLite, MySQL, PostgreSQL, ...
- Pushing down SQL or [Substrait](https://substrait.io/) plans.
- DataFusion -> Flight SQL -> DataFusion
- ...

## Design concept

Say you have a query plan as follows:

                ┌────────────┐
                │    Join    │
                └────────────┘
               ┌───────┴────────┐
        ┌────────────┐    ┌────────────┐
        │   Scan A   │    │    Join    │
        └────────────┘    └────────────┘
                         ┌───────┴────────┐
                  ┌────────────┐    ┌────────────┐
                  │   Scan B   │    │   Scan C   │
                  └────────────┘    └────────────┘

DataFusion Federation will identify the largest possible sub-plans that
can be executed by an external database:

            ┌────────────┐                Optimizer recognizes
            │    Join    │                that B and C are
            └────────────┘                available in an
                   ▲                      external database
           ┌───────┴──────────┐
           │      ┌ ─ ─ ─ ─ ─ ┴ ─ ── ─ ─ ─ ─ ─┐
    ┌────────────┐     ┌────────────┐
    │   Scan A   │   │ │    Join    │         │
    └────────────┘     └────────────┘
                     │        ▲               │
                       ┌──────┴─────────┐
                     │ ┌────────────┐  ┌────────────┐ │
                       │   Scan B   │  │   Scan C   │
                     │ └────────────┘  └────────────┘ │
                       ─ ── ─ ─ ── ─ ─ ─ ─ ─ ─ ─ ── ─

The sub-plans are cut out and replaced by an opaque federation node in the plan:

            ┌────────────┐
            │    Join    │
            └────────────┘          Rewritten Plan
                   ▲
          ┌────────┴───────────┐
          │                    │
    ┌────────────┐   ┏━━━━━━━━━━━━━━━━━━┓
    │   Scan A   │   ┃     Scan B+C     ┃
    └────────────┘   ┃  (TableProvider  ┃
                     ┃ that can execute ┃
                     ┃  sub-plan in an  ┃
                     ┃external database)┃
                     ┗━━━━━━━━━━━━━━━━━━┛

Different databases may have different query languages and execution
capabilities. To accommodate this, each 'federation provider' self-determines
what part of a sub-plan it will actually federate. This is done by letting each
federation provider define its own optimizer rule. When a sub-plan is 'cut out'
of the overall plan, it is first passed through the federation provider's
optimizer rule, which determines the part of the plan that is cut out based on
the execution capabilities of the database it represents.

## Implementation

A remote database is represented by the `FederationProvider` trait. Table
scans that are available in the same database implement the
`FederatedTableSource` trait, which allows lookup of the corresponding
`FederationProvider`.

Identifying sub-plans to federate is done by the `FederationOptimizerRule`.
This rule needs to be registered in your DataFusion `SessionState`. One easy
way to do this is using `default_session_state`. To do its job, the
`FederationOptimizerRule` currently requires that all table providers that
need to be federated are `FederatedTableProviderAdaptor`s. The
`FederatedTableProviderAdaptor` also has a fallback mechanism that allows
implementations to fall back to a 'vanilla' `TableProvider` in case the
`FederationOptimizerRule` isn't registered.

The `FederationProvider` can provide a `compute_context`. This allows it to
differentiate between multiple remote execution contexts of the same type:
for example, two different MySQL instances, database schemas, access levels,
etc. The `FederationProvider` also returns the `Optimizer` that allows it to
self-determine what part of a sub-plan it can federate.

The `sql` module implements a generic `FederationProvider` for SQL execution
engines. A specific SQL engine implements the `SQLExecutor` trait for its
engine-specific execution. A number of compatible providers are available in
[datafusion-contrib/datafusion-table-providers](https://github.com/datafusion-contrib/datafusion-table-providers/).

## Status

The project is in alpha status. Contributions welcome; land a PR = commit
access.

- [Docs (release)](https://docs.rs/datafusion-federation)
- [Docs (main)](https://datafusion-contrib.github.io/datafusion-federation/)
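The sub-plan detection sketched in the diagrams above can be illustrated with a toy model (plain Python, not the crate's API; the `Node`, `annotate`, and `federate` names are invented for this sketch): walk the plan bottom-up and replace the largest subtrees whose scans all resolve to the same federation provider with an opaque federated node.

```python
# Toy illustration of federated sub-plan detection. Not the crate's API:
# the real optimizer works on DataFusion LogicalPlan nodes and delegates
# the final cut to each provider's own optimizer rule.
from dataclasses import dataclass, field
from typing import List, Optional


@dataclass
class Node:
    name: str
    provider: Optional[str] = None          # set on scans only
    children: List["Node"] = field(default_factory=list)


def annotate(node: Node) -> Optional[str]:
    """Return the single provider that can execute this whole subtree, or None."""
    if not node.children:
        return node.provider
    providers = {annotate(c) for c in node.children}
    if len(providers) == 1 and None not in providers:
        return providers.pop()
    return None


def federate(node: Node) -> Node:
    """Replace the largest single-provider subtrees with opaque federated nodes."""
    p = annotate(node)
    if p is not None:
        return Node(f"Federated[{p}]({node.name})")
    return Node(node.name, children=[federate(c) for c in node.children])


# The plan from the README: Scan A is local, B and C live in one remote DB.
plan = Node("Join", children=[
    Node("Scan A"),
    Node("Join", children=[
        Node("Scan B", provider="sqlite"),
        Node("Scan C", provider="sqlite"),
    ]),
])

rewritten = federate(plan)
# The inner Join over B and C collapses into a single federated node,
# while Scan A and the top-level Join stay in the local plan.
```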
8 changes: 0 additions & 8 deletions commitlint.config.js

This file was deleted.

21 changes: 16 additions & 5 deletions datafusion-federation/Cargo.toml
@@ -10,14 +10,25 @@ description = "Datafusion federation."
 name = "datafusion_federation"
 path = "src/lib.rs"

+[package.metadata.docs.rs]
+# Whether to pass `--all-features` to Cargo (default: false)
+all-features = true
+# Whether to pass `--no-default-features` to Cargo (default: false)
+no-default-features = true
+
+[features]
+sql = ["futures"]
+
 [dependencies]
 async-trait.workspace = true
 datafusion.workspace = true
+futures = { version = "0.3.30", optional = true }

-[package.metadata.docs.rs]
-
-# Whether to pass `--all-features` to Cargo (default: false)
-all-features = true
+[dev-dependencies]
+tokio = { version = "1.39.3", features = ["full"] }

-# Whether to pass `--no-default-features` to Cargo (default: false)
-no-default-features = true
+[[example]]
+name = "df-csv"
+path = "examples/df-csv.rs"
+required-features = ["sql"]
115 changes: 115 additions & 0 deletions datafusion-federation/examples/df-csv.rs
use std::sync::Arc;

use async_trait::async_trait;
use datafusion::{
    arrow::datatypes::SchemaRef,
    catalog::SchemaProvider,
    error::{DataFusionError, Result},
    execution::{
        context::{SessionContext, SessionState},
        options::CsvReadOptions,
    },
    physical_plan::{stream::RecordBatchStreamAdapter, SendableRecordBatchStream},
    sql::sqlparser::dialect::{Dialect, GenericDialect},
};
use datafusion_federation::sql::{SQLExecutor, SQLFederationProvider, SQLSchemaProvider};
use futures::TryStreamExt;

const CSV_PATH: &str = "./examples/test.csv";
const TABLE_NAME: &str = "test";

#[tokio::main]
async fn main() -> Result<()> {
    // Create a remote context
    let remote_ctx = Arc::new(SessionContext::new());

    // Registers a CSV file
    remote_ctx
        .register_csv(TABLE_NAME, CSV_PATH, CsvReadOptions::new())
        .await?;
    let known_tables: Vec<String> = [TABLE_NAME].iter().map(|&x| x.into()).collect();

    // Register schema
    let executor = Arc::new(InMemorySQLExecutor::new(remote_ctx));
    let provider = Arc::new(SQLFederationProvider::new(executor));
    let schema_provider =
        Arc::new(SQLSchemaProvider::new_with_tables(provider, known_tables).await?);

    // Local context
    let state = datafusion_federation::default_session_state();
    overwrite_default_schema(&state, schema_provider)?;
    let ctx = SessionContext::new_with_state(state);

    // Run query
    let query = r#"SELECT * from test"#;
    let df = ctx.sql(query).await?;

    // let explain = df.clone().explain(true, false)?;
    // explain.show().await?;

    df.show().await
}

fn overwrite_default_schema(state: &SessionState, schema: Arc<dyn SchemaProvider>) -> Result<()> {
    let options = &state.config().options().catalog;
    let catalog = state
        .catalog_list()
        .catalog(options.default_catalog.as_str())
        .unwrap();

    catalog.register_schema(options.default_schema.as_str(), schema)?;

    Ok(())
}

pub struct InMemorySQLExecutor {
    session: Arc<SessionContext>,
}

impl InMemorySQLExecutor {
    pub fn new(session: Arc<SessionContext>) -> Self {
        Self { session }
    }
}

#[async_trait]
impl SQLExecutor for InMemorySQLExecutor {
    fn name(&self) -> &str {
        "in_memory_sql_executor"
    }

    fn compute_context(&self) -> Option<String> {
        None
    }

    fn execute(&self, sql: &str, schema: SchemaRef) -> Result<SendableRecordBatchStream> {
        // Execute it using the remote datafusion session context
        let future_stream = _execute(self.session.clone(), sql.to_string());
        let stream = futures::stream::once(future_stream).try_flatten();
        Ok(Box::pin(RecordBatchStreamAdapter::new(
            schema.clone(),
            stream,
        )))
    }

    async fn table_names(&self) -> Result<Vec<String>> {
        Err(DataFusionError::NotImplemented(
            "table inference not implemented".to_string(),
        ))
    }

    async fn get_table_schema(&self, table_name: &str) -> Result<SchemaRef> {
        let sql = format!("select * from {table_name} limit 1");
        let df = self.session.sql(&sql).await?;
        let schema = df.schema().as_arrow().clone();
        Ok(Arc::new(schema))
    }

    fn dialect(&self) -> Arc<dyn Dialect> {
        Arc::new(GenericDialect {})
    }
}

async fn _execute(ctx: Arc<SessionContext>, sql: String) -> Result<SendableRecordBatchStream> {
    ctx.sql(&sql).await?.execute_stream().await
}
4 changes: 4 additions & 0 deletions datafusion-federation/examples/test.csv
foo,bar
a,1
b,2
c,3
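As a quick sanity check, the fixture above is what the example's `SELECT * from test` scans: a header row and three records. A stdlib-only sketch of the parsed content (the `TEST_CSV` literal simply inlines the file):

```python
import csv
import io

# Inline copy of datafusion-federation/examples/test.csv
TEST_CSV = "foo,bar\na,1\nb,2\nc,3\n"

# Parse it the way a CSV scan would: header row names the columns,
# each subsequent line becomes one record.
rows = list(csv.DictReader(io.StringIO(TEST_CSV)))
print(rows)
# [{'foo': 'a', 'bar': '1'}, {'foo': 'b', 'bar': '2'}, {'foo': 'c', 'bar': '3'}]
```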