From e7e0b15389db78e524769ceca4d8f77b19101052 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Wed, 30 Oct 2024 06:58:28 -0400 Subject: [PATCH 01/10] Initial commit for example on using FFI Table provider in rust as a module loading system --- Cargo.toml | 3 ++ .../ffi/ffi_example_table_provider/Cargo.toml | 17 +++++++ .../ffi/ffi_example_table_provider/src/lib.rs | 44 +++++++++++++++++++ .../ffi/ffi_module_interface/Cargo.toml | 8 ++++ .../ffi/ffi_module_interface/src/lib.rs | 27 ++++++++++++ .../examples/ffi/ffi_module_loader/Cargo.toml | 11 +++++ .../ffi/ffi_module_loader/src/main.rs | 39 ++++++++++++++++ 7 files changed, 149 insertions(+) create mode 100644 datafusion-examples/examples/ffi/ffi_example_table_provider/Cargo.toml create mode 100644 datafusion-examples/examples/ffi/ffi_example_table_provider/src/lib.rs create mode 100644 datafusion-examples/examples/ffi/ffi_module_interface/Cargo.toml create mode 100644 datafusion-examples/examples/ffi/ffi_module_interface/src/lib.rs create mode 100644 datafusion-examples/examples/ffi/ffi_module_loader/Cargo.toml create mode 100644 datafusion-examples/examples/ffi/ffi_module_loader/src/main.rs diff --git a/Cargo.toml b/Cargo.toml index 2f8896f7d90c..760c3a5971e5 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -47,6 +47,9 @@ members = [ "datafusion/substrait", "datafusion/wasmtest", "datafusion-examples", + "datafusion-examples/examples/ffi/ffi_example_table_provider", + "datafusion-examples/examples/ffi/ffi_module_interface", + "datafusion-examples/examples/ffi/ffi_module_loader", "test-utils", "benchmarks", ] diff --git a/datafusion-examples/examples/ffi/ffi_example_table_provider/Cargo.toml b/datafusion-examples/examples/ffi/ffi_example_table_provider/Cargo.toml new file mode 100644 index 000000000000..3121a28a7801 --- /dev/null +++ b/datafusion-examples/examples/ffi/ffi_example_table_provider/Cargo.toml @@ -0,0 +1,17 @@ +[package] +name = "ffi_example_table_provider" +version = "0.1.0" +edition = "2021" + 
+[dependencies] +datafusion = { workspace = true } +datafusion-ffi = { workspace = true } +abi_stable = "0.11.3" +arrow = { workspace = true } +arrow-array = { workspace = true } +arrow-schema = { workspace = true } +ffi_module_interface = { path = "../ffi_module_interface" } + +[lib] +name = "ffi_example_table_provider" +crate-type = ["cdylib",'rlib'] \ No newline at end of file diff --git a/datafusion-examples/examples/ffi/ffi_example_table_provider/src/lib.rs b/datafusion-examples/examples/ffi/ffi_example_table_provider/src/lib.rs new file mode 100644 index 000000000000..e8af63478518 --- /dev/null +++ b/datafusion-examples/examples/ffi/ffi_example_table_provider/src/lib.rs @@ -0,0 +1,44 @@ +use std::sync::Arc; + +use abi_stable::{export_root_module, prefix_type::PrefixTypeTrait}; +use arrow_array::RecordBatch; +use datafusion::{ + arrow::datatypes::{DataType, Field, Schema}, + common::record_batch, + datasource::MemTable, +}; +use datafusion_ffi::table_provider::FFI_TableProvider; +use ffi_module_interface::{TableProviderModule, TableProviderModuleRef}; + +fn create_record_batch(start_value: i32, num_values: usize) -> RecordBatch { + let end_value = start_value + num_values as i32; + let a_vals: Vec = (start_value..end_value).collect(); + let b_vals: Vec = a_vals.iter().map(|v| *v as f64).collect(); + + record_batch!(("a", Int32, a_vals), ("b", Float64, b_vals)).unwrap() +} + +extern "C" fn construct_simple_table_provider() -> FFI_TableProvider { + let schema = Arc::new(Schema::new(vec![ + Field::new("a", DataType::Int32, true), + Field::new("b", DataType::Float64, true), + ])); + + let batches = vec![ + create_record_batch(1, 5), + create_record_batch(6, 1), + create_record_batch(7, 5), + ]; + + let table_provider = MemTable::try_new(schema, vec![batches]).unwrap(); + + FFI_TableProvider::new(Arc::new(table_provider), true) +} + +#[export_root_module] +pub fn get_simple_memory_table() -> TableProviderModuleRef { + TableProviderModule { + create_table: 
construct_simple_table_provider, + } + .leak_into_prefix() +} diff --git a/datafusion-examples/examples/ffi/ffi_module_interface/Cargo.toml b/datafusion-examples/examples/ffi/ffi_module_interface/Cargo.toml new file mode 100644 index 000000000000..7938af611ab6 --- /dev/null +++ b/datafusion-examples/examples/ffi/ffi_module_interface/Cargo.toml @@ -0,0 +1,8 @@ +[package] +name = "ffi_module_interface" +version = "0.1.0" +edition = "2021" + +[dependencies] +abi_stable = "0.11.3" +datafusion-ffi = { workspace = true } diff --git a/datafusion-examples/examples/ffi/ffi_module_interface/src/lib.rs b/datafusion-examples/examples/ffi/ffi_module_interface/src/lib.rs new file mode 100644 index 000000000000..a86b37791c0f --- /dev/null +++ b/datafusion-examples/examples/ffi/ffi_module_interface/src/lib.rs @@ -0,0 +1,27 @@ +use abi_stable::{ + declare_root_module_statics, + library::{LibraryError, RootModule}, + package_version_strings, + sabi_types::VersionStrings, + StableAbi, +}; +use datafusion_ffi::table_provider::FFI_TableProvider; + +#[repr(C)] +#[derive(StableAbi)] +#[sabi(kind(Prefix(prefix_ref = TableProviderModuleRef)))] +pub struct TableProviderModule { + /// Constructs the table provider + pub create_table: extern "C" fn() -> FFI_TableProvider, +} + +impl RootModule for TableProviderModuleRef { + declare_root_module_statics! 
{TableProviderModuleRef} + const BASE_NAME: &'static str = "ffi_example_table_provider"; + const NAME: &'static str = "ffi_example_table_provider"; + const VERSION_STRINGS: VersionStrings = package_version_strings!(); + + fn initialization(self) -> Result { + Ok(self) + } +} diff --git a/datafusion-examples/examples/ffi/ffi_module_loader/Cargo.toml b/datafusion-examples/examples/ffi/ffi_module_loader/Cargo.toml new file mode 100644 index 000000000000..6fab89975fa4 --- /dev/null +++ b/datafusion-examples/examples/ffi/ffi_module_loader/Cargo.toml @@ -0,0 +1,11 @@ +[package] +name = "ffi_module_loader" +version = "0.1.0" +edition = "2021" + +[dependencies] +tokio = { workspace = true, features = ["rt-multi-thread", "parking_lot"] } +datafusion = { workspace = true } +datafusion-ffi = { workspace = true } +ffi_module_interface = { path = "../ffi_module_interface" } +abi_stable = "0.11.3" diff --git a/datafusion-examples/examples/ffi/ffi_module_loader/src/main.rs b/datafusion-examples/examples/ffi/ffi_module_loader/src/main.rs new file mode 100644 index 000000000000..382a5e172d20 --- /dev/null +++ b/datafusion-examples/examples/ffi/ffi_module_loader/src/main.rs @@ -0,0 +1,39 @@ +use std::sync::Arc; + +use datafusion::{ + error::{DataFusionError, Result}, + prelude::SessionContext, +}; + +use abi_stable::library::{development_utils::compute_library_path, RootModule}; +use datafusion_ffi::table_provider::ForeignTableProvider; +use ffi_module_interface::TableProviderModuleRef; + +#[tokio::main] +async fn main() -> Result<()> { + let target: &std::path::Path = "../../../../target/".as_ref(); + let library_path = compute_library_path::(target).unwrap(); + + let table_provider_module = + TableProviderModuleRef::load_from_directory(&library_path) + .unwrap_or_else(|e| panic!("{}", e)); + + let ffi_table_provider = + table_provider_module + .create_table() + .ok_or(DataFusionError::NotImplemented( + "External table provider failed to implement create_table".to_string(), + 
))?(); + + let foreign_table_provider: ForeignTableProvider = (&ffi_table_provider).into(); + + let ctx = SessionContext::new(); + + ctx.register_table("external_table", Arc::new(foreign_table_provider))?; + + let df = ctx.table("external_table").await?; + + df.show().await?; + + Ok(()) +} From d692a1f15fc2d905cfb4a147c1c4672a73fd2439 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Wed, 30 Oct 2024 07:17:01 -0400 Subject: [PATCH 02/10] Add license text --- .../ffi/ffi_example_table_provider/Cargo.toml | 17 +++++++++++++++++ .../ffi/ffi_example_table_provider/src/lib.rs | 17 +++++++++++++++++ .../ffi/ffi_module_interface/Cargo.toml | 17 +++++++++++++++++ .../ffi/ffi_module_interface/src/lib.rs | 17 +++++++++++++++++ .../examples/ffi/ffi_module_loader/Cargo.toml | 17 +++++++++++++++++ .../examples/ffi/ffi_module_loader/src/main.rs | 17 +++++++++++++++++ 6 files changed, 102 insertions(+) diff --git a/datafusion-examples/examples/ffi/ffi_example_table_provider/Cargo.toml b/datafusion-examples/examples/ffi/ffi_example_table_provider/Cargo.toml index 3121a28a7801..6fbd75ccfba3 100644 --- a/datafusion-examples/examples/ffi/ffi_example_table_provider/Cargo.toml +++ b/datafusion-examples/examples/ffi/ffi_example_table_provider/Cargo.toml @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under the License. + [package] name = "ffi_example_table_provider" version = "0.1.0" diff --git a/datafusion-examples/examples/ffi/ffi_example_table_provider/src/lib.rs b/datafusion-examples/examples/ffi/ffi_example_table_provider/src/lib.rs index e8af63478518..9c2b4daec3b1 100644 --- a/datafusion-examples/examples/ffi/ffi_example_table_provider/src/lib.rs +++ b/datafusion-examples/examples/ffi/ffi_example_table_provider/src/lib.rs @@ -1,3 +1,20 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + use std::sync::Arc; use abi_stable::{export_root_module, prefix_type::PrefixTypeTrait}; diff --git a/datafusion-examples/examples/ffi/ffi_module_interface/Cargo.toml b/datafusion-examples/examples/ffi/ffi_module_interface/Cargo.toml index 7938af611ab6..33c7ba1394c4 100644 --- a/datafusion-examples/examples/ffi/ffi_module_interface/Cargo.toml +++ b/datafusion-examples/examples/ffi/ffi_module_interface/Cargo.toml @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + [package] name = "ffi_module_interface" version = "0.1.0" diff --git a/datafusion-examples/examples/ffi/ffi_module_interface/src/lib.rs b/datafusion-examples/examples/ffi/ffi_module_interface/src/lib.rs index a86b37791c0f..5c39b31f24a9 100644 --- a/datafusion-examples/examples/ffi/ffi_module_interface/src/lib.rs +++ b/datafusion-examples/examples/ffi/ffi_module_interface/src/lib.rs @@ -1,3 +1,20 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ use abi_stable::{ declare_root_module_statics, library::{LibraryError, RootModule}, diff --git a/datafusion-examples/examples/ffi/ffi_module_loader/Cargo.toml b/datafusion-examples/examples/ffi/ffi_module_loader/Cargo.toml index 6fab89975fa4..8d103e5ec403 100644 --- a/datafusion-examples/examples/ffi/ffi_module_loader/Cargo.toml +++ b/datafusion-examples/examples/ffi/ffi_module_loader/Cargo.toml @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + [package] name = "ffi_module_loader" version = "0.1.0" diff --git a/datafusion-examples/examples/ffi/ffi_module_loader/src/main.rs b/datafusion-examples/examples/ffi/ffi_module_loader/src/main.rs index 382a5e172d20..945bfeeb7b43 100644 --- a/datafusion-examples/examples/ffi/ffi_module_loader/src/main.rs +++ b/datafusion-examples/examples/ffi/ffi_module_loader/src/main.rs @@ -1,3 +1,20 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + use std::sync::Arc; use datafusion::{ From 9c8dc801246aa1702fd2cffcae0dd44e399efa5d Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Wed, 30 Oct 2024 08:21:15 -0400 Subject: [PATCH 03/10] formatting cargo.toml files --- .../examples/ffi/ffi_example_table_provider/Cargo.toml | 6 +++--- .../examples/ffi/ffi_module_loader/Cargo.toml | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/datafusion-examples/examples/ffi/ffi_example_table_provider/Cargo.toml b/datafusion-examples/examples/ffi/ffi_example_table_provider/Cargo.toml index 6fbd75ccfba3..339be1cdb82d 100644 --- a/datafusion-examples/examples/ffi/ffi_example_table_provider/Cargo.toml +++ b/datafusion-examples/examples/ffi/ffi_example_table_provider/Cargo.toml @@ -21,14 +21,14 @@ version = "0.1.0" edition = "2021" [dependencies] -datafusion = { workspace = true } -datafusion-ffi = { workspace = true } abi_stable = "0.11.3" arrow = { workspace = true } arrow-array = { workspace = true } arrow-schema = { workspace = true } +datafusion = { workspace = true } +datafusion-ffi = { workspace = true } ffi_module_interface = { path = "../ffi_module_interface" } [lib] name = "ffi_example_table_provider" -crate-type = ["cdylib",'rlib'] \ No newline at end of file +crate-type = ["cdylib", 'rlib'] diff --git a/datafusion-examples/examples/ffi/ffi_module_loader/Cargo.toml b/datafusion-examples/examples/ffi/ffi_module_loader/Cargo.toml index 8d103e5ec403..0f0aafbe2aa5 100644 --- a/datafusion-examples/examples/ffi/ffi_module_loader/Cargo.toml +++ 
b/datafusion-examples/examples/ffi/ffi_module_loader/Cargo.toml @@ -21,8 +21,8 @@ version = "0.1.0" edition = "2021" [dependencies] -tokio = { workspace = true, features = ["rt-multi-thread", "parking_lot"] } +abi_stable = "0.11.3" datafusion = { workspace = true } datafusion-ffi = { workspace = true } ffi_module_interface = { path = "../ffi_module_interface" } -abi_stable = "0.11.3" +tokio = { workspace = true, features = ["rt-multi-thread", "parking_lot"] } From 19b4a647947a57d8ff0448599da80cc9653f36cc Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Thu, 31 Oct 2024 06:59:13 -0400 Subject: [PATCH 04/10] Correct typos in readme --- datafusion/ffi/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/datafusion/ffi/README.md b/datafusion/ffi/README.md index ba4bb8b961a1..4b88d2234a11 100644 --- a/datafusion/ffi/README.md +++ b/datafusion/ffi/README.md @@ -25,7 +25,7 @@ with functions from other languages using a stable interface. See [API Docs] for details and examples. We expect this crate may be used by both sides of the FFI. This allows users -to create modules that can interoperate with the necessity of using the same +to create modules that can interoperate without the necessity of using the same version of DataFusion. The driving use case has been the `datafusion-python` repository, but many other use cases may exist. We envision at least two use cases. @@ -43,7 +43,7 @@ In this crate we have a variety of structs which closely mimic the behavior of their internal counterparts. In the following example, we will refer to the `TableProvider`, but the same pattern exists for other structs. -Each of the exposted structs in this crate is provided with a variant prefixed +Each of the exposed structs in this crate is provided with a variant prefixed with `Foreign`. This variant is designed to be used by the consumer of the foreign code. The `Foreign` structs should _never_ access the `private_data` fields. 
Instead they should only access the data returned through the function From 29746b5f1b24d81bf5db2bd878fc32c4039b859a Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Thu, 31 Oct 2024 08:37:04 -0400 Subject: [PATCH 05/10] Update documentation per PR feedback --- datafusion/ffi/README.md | 112 ++++++++++++++++----------- datafusion/ffi/src/table_provider.rs | 38 +++++++++ 2 files changed, 106 insertions(+), 44 deletions(-) diff --git a/datafusion/ffi/README.md b/datafusion/ffi/README.md index 4b88d2234a11..6b82e00b24df 100644 --- a/datafusion/ffi/README.md +++ b/datafusion/ffi/README.md @@ -19,63 +19,87 @@ # `datafusion-ffi`: Apache DataFusion Foreign Function Interface -This crate contains code to allow interoperability of Apache [DataFusion] -with functions from other languages using a stable interface. +This crate contains code to allow interoperability of Apache [DataFusion] with +functions from other libraries and/or [DataFusion] versions using a stable +interface. + +One of the limitations of the Rust programming language is that there is no +stable [Rust ABI] (Application Binary Interface). If a library is compiled with +one version of the Rust compiler and you attempt to use that library with a +program compiled by a different Rust compiler, there is no guarantee that you +can access the data structures. In order to share code between libraries loaded +at runtime, you need to use Rust's [FFI] (Foreign Function Interface). + +The purpose of this crate is to define interfaces between [DataFusion] libraries +that will remain stable across different versions of [DataFusion]. This allows +users to write libraries that can interface between each other at runtime rather +than require compiling all of the code into a single executable. + +In general, it is recommended to run the same version of DataFusion by both the +producer and consumer of the data and functions shared across the [FFI], but +this is not strictly required. 
See [API Docs] for details and examples. -We expect this crate may be used by both sides of the FFI. This allows users -to create modules that can interoperate without the necessity of using the same -version of DataFusion. The driving use case has been the `datafusion-python` -repository, but many other use cases may exist. We envision at least two -use cases. +## Use Cases + +Two use cases have been identified for this crate, but they are not intended to +be all inclusive. 1. `datafusion-python` which will use the FFI to provide external services such as a `TableProvider` without needing to re-export the entire `datafusion-python` code base. With `datafusion-ffi` these packages do not need `datafusion-python` as a dependency at all. 2. Users may want to create a modular interface that allows runtime loading of - libraries. + libraries. For example, you may wish to design a program that only uses the + built in table sources, but also allows for extension from the community led + [datafusion-contrib] repositories. You could enable module loading so that + users could at runtime load a library to access additional data sources. + Alternatively, you could use this approach so that customers could interface + with their own proprietary data sources. + +## Limitations + +One limitation of the approach in this crate is that it is designed specifically +to work across Rust libraries. In general, you can use Rust's [FFI] to +operate across different programming languages, but that is not the design +intent of this crate. Instead, we are using external crates that provide +stable interfaces that closely mirror the Rust native approach. To learn more +about this approach see the [abi_stable] and [async-ffi] crates. + +If you have a library in another language that you wish to interface to +[DataFusion] the recommendation is to create a Rust wrapper crate to interface +with your library and then to connect it to [DataFusion] using this crate. 
+Alternatively, you could use [bindgen] to interface directly to the [FFI] provided +by this crate, but that is currently not supported. + +## FFI Boundary + +We expect this crate to be used by both sides of the FFI Boundary. This should +provide ergonomic ways to both produce and consume structs and functions across +this layer. + +For example, if you have a library that provides a custom `TableProvider`, you +can expose it by using `FFI_TableProvider::new()`. When you need to consume a +`FFI_TableProvider`, you can access it by converting using +`ForeignTableProvider::from()` which will create a struct that implements +`TableProvider`. + +There is a complete end-to-end demonstration in the +[examples](https://github.com/apache/datafusion/tree/main/datafusion-examples/examples/ffi). ## Struct Layout In this crate we have a variety of structs which closely mimic the behavior of -their internal counterparts. In the following example, we will refer to the -`TableProvider`, but the same pattern exists for other structs. - -Each of the exposed structs in this crate is provided with a variant prefixed -with `Foreign`. This variant is designed to be used by the consumer of the -foreign code. The `Foreign` structs should _never_ access the `private_data` -fields. Instead they should only access the data returned through the function -calls defined on the `FFI_` structs. The second purpose of the `Foreign` -structs is to contain additional data that may be needed by the traits that -are implemented on them. Some of these traits require borrowing data which -can be far more convienent to be locally stored. - -For example, we have a struct `FFI_TableProvider` to give access to the -`TableProvider` functions like `table_type()` and `scan()`. If we write a -library that wishes to expose it's `TableProvider`, then we can access the -private data that contains the Arc reference to the `TableProvider` via -`FFI_TableProvider`. This data is local to the library. 
- -If we have a program that accesses a `TableProvider` via FFI, then it -will use `ForeignTableProvider`. When using `ForeignTableProvider` we **must** -not attempt to access the `private_data` field in `FFI_TableProvider`. If a -user is testing locally, you may be able to successfully access this field, but -it will only work if you are building against the exact same version of -`DataFusion` for both libraries **and** the same compiler. It will not work -in general. - -It is worth noting that which library is the `local` and which is `foreign` -depends on which interface we are considering. For example, suppose we have a -Python library called `my_provider` that exposes a `TableProvider` called -`MyProvider` via `FFI_TableProvider`. Within the library `my_provider` we can -access the `private_data` via `FFI_TableProvider`. We connect this to -`datafusion-python`, where we access it as a `ForeignTableProvider`. Now when -we call `scan()` on this interface, we have to pass it a `FFI_SessionConfig`. -The `SessionConfig` is local to `datafusion-python` and **not** `my_provider`. -It is important to be careful when expanding these functions to be certain which -side of the interface each object refers to. +their internal counterparts. To see detailed notes about how to use them, see +the example in `FFI_TableProvider`. 
[datafusion]: https://datafusion.apache.org [api docs]: http://docs.rs/datafusion-ffi/latest +[Rust ABI]: https://doc.rust-lang.org/reference/abi.html +[FFI]: https://doc.rust-lang.org/nomicon/ffi.html +[abi_stable]: https://crates.io/crates/abi_stable +[async-ffi]: https://crates.io/crates/async-ffi +[bindgen]: https://crates.io/crates/bindgen +[datafusion-python]: https://datafusion.apache.org/python/ +[datafusion-contrib]: https://github.com/datafusion-contrib diff --git a/datafusion/ffi/src/table_provider.rs b/datafusion/ffi/src/table_provider.rs index 011ad96e423d..01f7c46106a2 100644 --- a/datafusion/ffi/src/table_provider.rs +++ b/datafusion/ffi/src/table_provider.rs @@ -54,6 +54,44 @@ use super::{ use datafusion::error::Result; /// A stable struct for sharing [`TableProvider`] across FFI boundaries. +/// +/// # Struct Layout +/// +/// The following description applies to all structs provided in this crate. +/// +/// Each of the exposed structs in this crate is provided with a variant prefixed +/// with `Foreign`. This variant is designed to be used by the consumer of the +/// foreign code. The `Foreign` structs should _never_ access the `private_data` +/// fields. Instead they should only access the data returned through the function +/// calls defined on the `FFI_` structs. The second purpose of the `Foreign` +/// structs is to contain additional data that may be needed by the traits that +/// are implemented on them. Some of these traits require borrowing data which +/// can be far more convenient to store locally. +/// +/// For example, we have a struct `FFI_TableProvider` to give access to the +/// `TableProvider` functions like `table_type()` and `scan()`. If we write a +/// library that wishes to expose its `TableProvider`, then we can access the +/// private data that contains the Arc reference to the `TableProvider` via +/// `FFI_TableProvider`. This data is local to the library. 
+/// +/// If we have a program that accesses a `TableProvider` via FFI, then it +/// will use `ForeignTableProvider`. When using `ForeignTableProvider` we **must** +/// not attempt to access the `private_data` field in `FFI_TableProvider`. If a +/// user is testing locally, you may be able to successfully access this field, but +/// it will only work if you are building against the exact same version of +/// `DataFusion` for both libraries **and** the same compiler. It will not work +/// in general. +/// +/// It is worth noting that which library is the `local` and which is `foreign` +/// depends on which interface we are considering. For example, suppose we have a +/// Python library called `my_provider` that exposes a `TableProvider` called +/// `MyProvider` via `FFI_TableProvider`. Within the library `my_provider` we can +/// access the `private_data` via `FFI_TableProvider`. We connect this to +/// `datafusion-python`, where we access it as a `ForeignTableProvider`. Now when +/// we call `scan()` on this interface, we have to pass it a `FFI_SessionConfig`. +/// The `SessionConfig` is local to `datafusion-python` and **not** `my_provider`. +/// It is important to be careful when expanding these functions to be certain which +/// side of the interface each object refers to. 
#[repr(C)] #[derive(Debug, StableAbi)] #[allow(non_camel_case_types)] From 114b239ec1d1cefe0861e5984ebc5749fc4a037a Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Thu, 31 Oct 2024 08:37:25 -0400 Subject: [PATCH 06/10] Add additional documentation to example --- datafusion-examples/examples/ffi/README.md | 48 +++++++++++++++++++ .../ffi/ffi_example_table_provider/src/lib.rs | 5 ++ .../ffi/ffi_module_interface/src/lib.rs | 5 ++ .../ffi/ffi_module_loader/src/main.rs | 15 ++++-- 4 files changed, 69 insertions(+), 4 deletions(-) create mode 100644 datafusion-examples/examples/ffi/README.md diff --git a/datafusion-examples/examples/ffi/README.md b/datafusion-examples/examples/ffi/README.md new file mode 100644 index 000000000000..f29e0012f318 --- /dev/null +++ b/datafusion-examples/examples/ffi/README.md @@ -0,0 +1,48 @@ + + +# Example FFI Usage + +The purpose of these crates is to provide an example of how one can use the +DataFusion Foreign Function Interface (FFI). See [API Docs] for detailed +usage. + +This example is broken into three crates. + +- `ffi_module_interface` is a common library to be shared by both the module + to be loaded and the program that will load it. It defines how the module + is to be structured. +- `ffi_example_table_provider` creates a library that exposes the module. +- `ffi_module_loader` is an example program that loads the module, gets data + from it, and displays this data to the user. + +## Building and running + +In order for the program to run successfully, the module to be loaded must be +built first. This example expects both the module and the program to be +built using the same build mode (debug or release). 
+ +```shell +cd ffi_example_table_provider +cargo build +cd ../ffi_module_loader +cargo run +``` + +[api docs]: http://docs.rs/datafusion-ffi/latest diff --git a/datafusion-examples/examples/ffi/ffi_example_table_provider/src/lib.rs b/datafusion-examples/examples/ffi/ffi_example_table_provider/src/lib.rs index 9c2b4daec3b1..c7eea8a8070b 100644 --- a/datafusion-examples/examples/ffi/ffi_example_table_provider/src/lib.rs +++ b/datafusion-examples/examples/ffi/ffi_example_table_provider/src/lib.rs @@ -35,12 +35,16 @@ fn create_record_batch(start_value: i32, num_values: usize) -> RecordBatch { record_batch!(("a", Int32, a_vals), ("b", Float64, b_vals)).unwrap() } +/// Here we only wish to create a simple table provider as an example. +/// We create an in-memory table and convert it to its FFI counterpart. extern "C" fn construct_simple_table_provider() -> FFI_TableProvider { let schema = Arc::new(Schema::new(vec![ Field::new("a", DataType::Int32, true), Field::new("b", DataType::Float64, true), ])); + // It is useful to create these as multiple record batches + // so that we can demonstrate the FFI stream. let batches = vec![ create_record_batch(1, 5), create_record_batch(6, 1), @@ -53,6 +57,7 @@ extern "C" fn construct_simple_table_provider() -> FFI_TableProvider { } #[export_root_module] +/// This defines the entry point for using the module. 
pub fn get_simple_memory_table() -> TableProviderModuleRef { TableProviderModule { create_table: construct_simple_table_provider, diff --git a/datafusion-examples/examples/ffi/ffi_module_interface/src/lib.rs b/datafusion-examples/examples/ffi/ffi_module_interface/src/lib.rs index 5c39b31f24a9..88690e929713 100644 --- a/datafusion-examples/examples/ffi/ffi_module_interface/src/lib.rs +++ b/datafusion-examples/examples/ffi/ffi_module_interface/src/lib.rs @@ -27,6 +27,11 @@ use datafusion_ffi::table_provider::FFI_TableProvider; #[repr(C)] #[derive(StableAbi)] #[sabi(kind(Prefix(prefix_ref = TableProviderModuleRef)))] +/// This struct defines the module interfaces. It is to be shared by +/// both the module loading program and library that implements the +/// module. It is possible to move this definition into the loading +/// program and reference it in the modules, but this example shows +/// how a user may wish to separate these concerns. pub struct TableProviderModule { /// Constructs the table provider pub create_table: extern "C" fn() -> FFI_TableProvider, diff --git a/datafusion-examples/examples/ffi/ffi_module_loader/src/main.rs b/datafusion-examples/examples/ffi/ffi_module_loader/src/main.rs index 945bfeeb7b43..6e376ca866e8 100644 --- a/datafusion-examples/examples/ffi/ffi_module_loader/src/main.rs +++ b/datafusion-examples/examples/ffi/ffi_module_loader/src/main.rs @@ -28,13 +28,19 @@ use ffi_module_interface::TableProviderModuleRef; #[tokio::main] async fn main() -> Result<()> { + // Find the location of the library. This is specific to the build environment, + // so you will need to change the approach here based on your use case. 
let target: &std::path::Path = "../../../../target/".as_ref(); - let library_path = compute_library_path::<TableProviderModuleRef>(target).unwrap(); + let library_path = compute_library_path::<TableProviderModuleRef>(target) + .map_err(|e| DataFusionError::External(Box::new(e)))?; + // Load the module let table_provider_module = TableProviderModuleRef::load_from_directory(&library_path) - .unwrap_or_else(|e| panic!("{}", e)); + .map_err(|e| DataFusionError::External(Box::new(e)))?; + // By calling the code below, the table provider will be created within + // the module's code. let ffi_table_provider = table_provider_module .create_table() @@ -42,14 +48,15 @@ async fn main() -> Result<()> { "External table provider failed to implement create_table".to_string(), ))?(); + // In order to access the table provider within this executable, we need to + // turn it into a `ForeignTableProvider`. let foreign_table_provider: ForeignTableProvider = (&ffi_table_provider).into(); let ctx = SessionContext::new(); + // Display the data to show the full cycle works. 
ctx.register_table("external_table", Arc::new(foreign_table_provider))?; - let df = ctx.table("external_table").await?; - df.show().await?; Ok(()) From 41fa74ad189de2a7848d26ef95b3b40ea32d7ac5 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Thu, 31 Oct 2024 08:44:24 -0400 Subject: [PATCH 07/10] Do not publish example --- .../examples/ffi/ffi_example_table_provider/Cargo.toml | 1 + datafusion-examples/examples/ffi/ffi_module_interface/Cargo.toml | 1 + datafusion-examples/examples/ffi/ffi_module_loader/Cargo.toml | 1 + 3 files changed, 3 insertions(+) diff --git a/datafusion-examples/examples/ffi/ffi_example_table_provider/Cargo.toml b/datafusion-examples/examples/ffi/ffi_example_table_provider/Cargo.toml index 339be1cdb82d..2ed773cf4a8a 100644 --- a/datafusion-examples/examples/ffi/ffi_example_table_provider/Cargo.toml +++ b/datafusion-examples/examples/ffi/ffi_example_table_provider/Cargo.toml @@ -19,6 +19,7 @@ name = "ffi_example_table_provider" version = "0.1.0" edition = "2021" +publish = false [dependencies] abi_stable = "0.11.3" diff --git a/datafusion-examples/examples/ffi/ffi_module_interface/Cargo.toml b/datafusion-examples/examples/ffi/ffi_module_interface/Cargo.toml index 33c7ba1394c4..612a21932476 100644 --- a/datafusion-examples/examples/ffi/ffi_module_interface/Cargo.toml +++ b/datafusion-examples/examples/ffi/ffi_module_interface/Cargo.toml @@ -19,6 +19,7 @@ name = "ffi_module_interface" version = "0.1.0" edition = "2021" +publish = false [dependencies] abi_stable = "0.11.3" diff --git a/datafusion-examples/examples/ffi/ffi_module_loader/Cargo.toml b/datafusion-examples/examples/ffi/ffi_module_loader/Cargo.toml index 0f0aafbe2aa5..028a366aab1c 100644 --- a/datafusion-examples/examples/ffi/ffi_module_loader/Cargo.toml +++ b/datafusion-examples/examples/ffi/ffi_module_loader/Cargo.toml @@ -19,6 +19,7 @@ name = "ffi_module_loader" version = "0.1.0" edition = "2021" +publish = false [dependencies] abi_stable = "0.11.3" From 
5fd8f6cfb071fb796301671c34e0619e440cd039 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Thu, 31 Oct 2024 08:52:13 -0400 Subject: [PATCH 08/10] apply prettier --- datafusion/ffi/README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/datafusion/ffi/README.md b/datafusion/ffi/README.md index 6b82e00b24df..b0910be1b552 100644 --- a/datafusion/ffi/README.md +++ b/datafusion/ffi/README.md @@ -51,7 +51,7 @@ be all inclusive. code base. With `datafusion-ffi` these packages do not need `datafusion-python` as a dependency at all. 2. Users may want to create a modular interface that allows runtime loading of - libraries. For example, you may wish to design a program that only uses the + libraries. For example, you may wish to design a program that only uses the built in table sources, but also allows for extension from the community led [datafusion-contrib] repositories. You could enable module loading so that users could at runtime load a library to access additional data sources. @@ -96,8 +96,8 @@ the example in `FFI_TableProvider`. 
[datafusion]: https://datafusion.apache.org [api docs]: http://docs.rs/datafusion-ffi/latest -[Rust ABI]: https://doc.rust-lang.org/reference/abi.html -[FFI]: https://doc.rust-lang.org/nomicon/ffi.html +[rust abi]: https://doc.rust-lang.org/reference/abi.html +[ffi]: https://doc.rust-lang.org/nomicon/ffi.html [abi_stable]: https://crates.io/crates/abi_stable [async-ffi]: https://crates.io/crates/async-ffi [bindgen]: https://crates.io/crates/bindgen From ac400bd223d4adf18b3a54131bae10a54ee226ed Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Fri, 1 Nov 2024 07:39:43 -0400 Subject: [PATCH 09/10] Add text describing async calls --- datafusion/ffi/README.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/datafusion/ffi/README.md b/datafusion/ffi/README.md index b0910be1b552..48283f4cfdc1 100644 --- a/datafusion/ffi/README.md +++ b/datafusion/ffi/README.md @@ -88,6 +88,13 @@ can expose it by using `FFI_TableProvider::new()`. When you need to consume a There is a complete end to end demonstration in the [examples](https://github.com/apache/datafusion/tree/main/datafusion-examples/examples/ffi). +## Asynchronous Calls + +Some of the functions within this crate require asynchronous operation. These +will perform similarly to their pure Rust counterparts by using the [async-ffi] +crate. In general, any call to an asynchronous function in this interface will +not block the rest of the program's execution. 
+ ## Struct Layout In this crate we have a variety of structs which closely mimic the behavior of From f0695be6258880af48f23000faf89039b749fdc4 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Tue, 5 Nov 2024 11:58:52 -0500 Subject: [PATCH 10/10] Update datafusion-examples/examples/ffi/ffi_example_table_provider/Cargo.toml Co-authored-by: Alexander Hirner <6055037+ahirner@users.noreply.github.com> --- .../examples/ffi/ffi_example_table_provider/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion-examples/examples/ffi/ffi_example_table_provider/Cargo.toml b/datafusion-examples/examples/ffi/ffi_example_table_provider/Cargo.toml index 2ed773cf4a8a..52efdb7461ab 100644 --- a/datafusion-examples/examples/ffi/ffi_example_table_provider/Cargo.toml +++ b/datafusion-examples/examples/ffi/ffi_example_table_provider/Cargo.toml @@ -18,7 +18,7 @@ [package] name = "ffi_example_table_provider" version = "0.1.0" -edition = "2021" +edition = { workspace = true } publish = false [dependencies]