From 4bb3d584b049ee8b83adf1157c5e137c80a9c6aa Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Thu, 10 Oct 2024 09:16:06 -0400 Subject: [PATCH 01/28] Initial commit of FFI table provider code --- Cargo.toml | 2 + datafusion/ffi/Cargo.toml | 45 ++ datafusion/ffi/README.md | 28 ++ datafusion/ffi/src/execution_plan.rs | 573 ++++++++++++++++++++++ datafusion/ffi/src/lib.rs | 26 + datafusion/ffi/src/record_batch_stream.rs | 182 +++++++ datafusion/ffi/src/session_config.rs | 48 ++ datafusion/ffi/src/table_provider.rs | 296 +++++++++++ 8 files changed, 1200 insertions(+) create mode 100644 datafusion/ffi/Cargo.toml create mode 100644 datafusion/ffi/README.md create mode 100644 datafusion/ffi/src/execution_plan.rs create mode 100644 datafusion/ffi/src/lib.rs create mode 100644 datafusion/ffi/src/record_batch_stream.rs create mode 100644 datafusion/ffi/src/session_config.rs create mode 100644 datafusion/ffi/src/table_provider.rs diff --git a/Cargo.toml b/Cargo.toml index 0a7184ad2d99..2f8896f7d90c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -26,6 +26,7 @@ members = [ "datafusion/expr", "datafusion/expr-common", "datafusion/execution", + "datafusion/ffi", "datafusion/functions", "datafusion/functions-aggregate", "datafusion/functions-aggregate-common", @@ -99,6 +100,7 @@ datafusion-common-runtime = { path = "datafusion/common-runtime", version = "42. datafusion-execution = { path = "datafusion/execution", version = "42.1.0" } datafusion-expr = { path = "datafusion/expr", version = "42.1.0" } datafusion-expr-common = { path = "datafusion/expr-common", version = "42.1.0" } +datafusion-ffi = { path = "datafusion/ffi", version = "42.1.0" } datafusion-functions = { path = "datafusion/functions", version = "42.1.0" } datafusion-functions-aggregate = { path = "datafusion/functions-aggregate", version = "42.1.0" } datafusion-functions-aggregate-common = { path = "datafusion/functions-aggregate-common", version = "42.1.0" } diff --git a/datafusion/ffi/Cargo.toml b/datafusion/ffi/Cargo.toml new file mode 100644 index 000000000000..189b4d0cb1d0 --- /dev/null +++ b/datafusion/ffi/Cargo.toml @@ -0,0 +1,45 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +[package] +name = "datafusion-ffi" +description = "Foreign Function Interface implementation for DataFusion" +readme = "README.md" +version = { workspace = true } +edition = { workspace = true } +homepage = { workspace = true } +repository = { workspace = true } +license = { workspace = true } +authors = { workspace = true } +# Specify MSRV here as `cargo msrv` doesn't support workspace version +rust-version = "1.76" + +[lints] +workspace = true + +[lib] +name = "datafusion_ffi" +path = "src/lib.rs" + +[dependencies] +arrow = { workspace = true, features = ["ffi"] } +datafusion = { workspace = true, default-features = true } +datafusion-proto = { workspace = true } +futures = { workspace = true } +async-trait = { workspace = true } +tokio = { workspace = true } +prost = { workspace = true } diff --git a/datafusion/ffi/README.md b/datafusion/ffi/README.md new file mode 100644 index 000000000000..a4353f1a0c4f --- /dev/null +++ b/datafusion/ffi/README.md @@ -0,0 +1,28 @@ + + +# `datafusion-ffi`: Apache DataFusion Foreign Function Interface + +This crate contains code to allow interoperability of Apache [DataFusion] +with functions from other languages using a stable interface. + +See [API Docs] for details and examples. + +[datafusion]: https://datafusion.apache.org +[api docs]: http://docs.rs/datafusion-ffi/latest diff --git a/datafusion/ffi/src/execution_plan.rs b/datafusion/ffi/src/execution_plan.rs new file mode 100644 index 000000000000..b04709ae8118 --- /dev/null +++ b/datafusion/ffi/src/execution_plan.rs @@ -0,0 +1,573 @@ +use std::{ + ffi::{c_char, c_void, CString}, + pin::Pin, + ptr::null_mut, + slice, + sync::Arc, +}; + +use arrow::{datatypes::Schema, ffi::FFI_ArrowSchema, ffi_stream::FFI_ArrowArrayStream}; +use datafusion::{ + error::DataFusionError, + execution::{SendableRecordBatchStream, TaskContext}, + physical_plan::{DisplayAs, ExecutionMode, ExecutionPlan, PlanProperties}, +}; +use datafusion::{error::Result, physical_expr::EquivalenceProperties, prelude::SessionContext}; +use datafusion_proto::{ + physical_plan::{ + from_proto::{parse_physical_sort_exprs, parse_protobuf_partitioning}, + to_proto::{serialize_partitioning, serialize_physical_sort_exprs}, + DefaultPhysicalExtensionCodec, + }, + protobuf::{Partitioning, PhysicalSortExprNodeCollection}, +}; +use prost::Message; + +use super::record_batch_stream::{record_batch_to_arrow_stream, ConsumerRecordBatchStream}; + +#[repr(C)] +#[derive(Debug)] +#[allow(missing_docs)] +#[allow(non_camel_case_types)] +pub struct FFI_ExecutionPlan { + pub properties: + Option FFI_PlanProperties>, + pub children: Option< + unsafe extern "C" fn( + plan: *const FFI_ExecutionPlan, + num_children: &mut usize, + err_code: &mut i32, + ) -> *mut *const FFI_ExecutionPlan, + >, + pub name: unsafe extern "C" fn(plan: *const FFI_ExecutionPlan) -> *const c_char, + + pub execute: unsafe extern "C" fn( + plan: *const FFI_ExecutionPlan, + partition: usize, + err_code: &mut i32, + ) -> FFI_ArrowArrayStream, + + pub private_data: *mut c_void, +} + +pub struct ExecutionPlanPrivateData { + pub plan: Arc, + pub last_error: Option, + pub children: Vec<*const FFI_ExecutionPlan>, + pub context: Arc, +} + +unsafe extern "C" fn properties_fn_wrapper(plan: *const FFI_ExecutionPlan) -> FFI_PlanProperties { + let private_data = (*plan).private_data as *const ExecutionPlanPrivateData; + let properties = (*private_data).plan.properties(); + properties.clone().into() +} + +unsafe extern "C" fn children_fn_wrapper( + plan: *const FFI_ExecutionPlan, + num_children: &mut 
usize, + err_code: &mut i32, +) -> *mut *const FFI_ExecutionPlan { + let private_data = (*plan).private_data as *const ExecutionPlanPrivateData; + + *num_children = (*private_data).children.len(); + *err_code = 0; + + let mut children: Vec<_> = (*private_data).children.to_owned(); + let children_ptr = children.as_mut_ptr(); + + std::mem::forget(children); + + children_ptr +} + +unsafe extern "C" fn execute_fn_wrapper( + plan: *const FFI_ExecutionPlan, + partition: usize, + err_code: &mut i32, +) -> FFI_ArrowArrayStream { + let private_data = (*plan).private_data as *const ExecutionPlanPrivateData; + + let record_batch_stream = match (*private_data) + .plan + .execute(partition, (*private_data).context.clone()) + { + Ok(rbs) => rbs, + Err(_e) => { + *err_code = 1; + return FFI_ArrowArrayStream::empty(); + } + }; + + record_batch_to_arrow_stream(record_batch_stream) +} +unsafe extern "C" fn name_fn_wrapper(plan: *const FFI_ExecutionPlan) -> *const c_char { + let private_data = (*plan).private_data as *const ExecutionPlanPrivateData; + + let name = (*private_data).plan.name(); + + CString::new(name) + .unwrap_or(CString::new("unable to parse execution plan name").unwrap()) + .into_raw() +} + +// Since the trait ExecutionPlan requires borrowed values, we wrap our FFI. +// This struct exists on the consumer side (datafusion-python, for example) and not +// in the provider's side. +#[derive(Debug)] +pub struct ExportedExecutionPlan { + name: String, + plan: *const FFI_ExecutionPlan, + properties: PlanProperties, + children: Vec>, +} + +unsafe impl Send for ExportedExecutionPlan {} +unsafe impl Sync for ExportedExecutionPlan {} + +impl DisplayAs for ExportedExecutionPlan { + fn fmt_as( + &self, + _t: datafusion::physical_plan::DisplayFormatType, + f: &mut std::fmt::Formatter, + ) -> std::fmt::Result { + write!( + f, + "FFI_ExecutionPlan(number_of_children={})", + self.children.len(), + ) + } +} + +impl FFI_ExecutionPlan { + /// This function is called on the provider's side. 
+ pub fn new(plan: Arc, context: Arc) -> Self { + let children = plan + .children() + .into_iter() + .map(|child| Box::new(FFI_ExecutionPlan::new(child.clone(), context.clone()))) + .map(|child| Box::into_raw(child) as *const FFI_ExecutionPlan) + .collect(); + println!("children collected"); + + let private_data = Box::new(ExecutionPlanPrivateData { + plan, + children, + context, + last_error: None, + }); + println!("generated private data, ready to return"); + + Self { + properties: Some(properties_fn_wrapper), + children: Some(children_fn_wrapper), + name: name_fn_wrapper, + execute: execute_fn_wrapper, + private_data: Box::into_raw(private_data) as *mut c_void, + } + } + + // pub fn empty() -> Self { + // Self { + // properties: None, + // children: None, + // private_data: std::ptr::null_mut(), + // } + // } +} + +impl ExportedExecutionPlan { + /// Wrap a FFI Execution Plan + /// + /// # Safety + /// + /// The caller must ensure the pointer provided points to a valid implementation + /// of FFI_ExecutionPlan + pub unsafe fn new(plan: *const FFI_ExecutionPlan) -> Result { + let name_fn = (*plan).name; + let name_cstr = name_fn(plan); + let name = CString::from_raw(name_cstr as *mut c_char) + .to_str() + .unwrap_or("Unable to parse FFI_ExecutionPlan name") + .to_string(); + + println!("entered ExportedExecutionPlan::new"); + let properties = unsafe { + let properties_fn = (*plan).properties.ok_or(DataFusionError::NotImplemented( + "properties not implemented on FFI_ExecutionPlan".to_string(), + ))?; + println!("About to call properties fn"); + properties_fn(plan).try_into()? + }; + + println!("created properties"); + let children = unsafe { + let children_fn = (*plan).children.ok_or(DataFusionError::NotImplemented( + "children not implemented on FFI_ExecutionPlan".to_string(), + ))?; + let mut num_children = 0; + let mut err_code = 0; + let children_ptr = children_fn(plan, &mut num_children, &mut err_code); + + println!( + "We called the FFI function children so the provider told us we have {} children", + num_children + ); + + if err_code != 0 { + return Err(DataFusionError::Plan( + "Error getting children for FFI_ExecutionPlan".to_string(), + )); + } + + let ffi_vec = Vec::from_raw_parts(children_ptr, num_children, num_children); + let maybe_children: Result> = ffi_vec + .into_iter() + .map(|child| { + println!("Ok, we are about to examine a child ffi_executionplan"); + if let Some(props_fn) = (*child).properties { + println!("We do have properties on the child "); + let child_props = props_fn(child); + println!("Child schema {:?}", child_props.schema); + } + + let child_plan = ExportedExecutionPlan::new(child); + + child_plan.map(|c| Arc::new(c) as Arc) + }) + .collect(); + println!("finsihed maybe children"); + + maybe_children? 
+ }; + + println!("About to return ExportedExecurtionPlan"); + + Ok(Self { + name, + plan, + properties, + children, + }) + } +} + +impl ExecutionPlan for ExportedExecutionPlan { + fn name(&self) -> &str { + &self.name + } + + fn as_any(&self) -> &dyn std::any::Any { + self + } + + fn properties(&self) -> &datafusion::physical_plan::PlanProperties { + &self.properties + } + + fn children(&self) -> Vec<&Arc> { + self.children + .iter() + .map(|p| p as &Arc) + .collect() + } + + fn with_new_children( + self: Arc, + children: Vec>, + ) -> datafusion::error::Result> { + Ok(Arc::new(ExportedExecutionPlan { + plan: self.plan, + name: self.name.clone(), + children, + properties: self.properties.clone(), + })) + } + + fn execute( + &self, + partition: usize, + _context: Arc, + ) -> datafusion::error::Result { + unsafe { + let execute_fn = (*self.plan).execute; + let mut err_code = 0; + let arrow_stream = execute_fn(self.plan, partition, &mut err_code); + + match err_code { + 0 => ConsumerRecordBatchStream::try_from(arrow_stream) + .map(|v| Pin::new(Box::new(v)) as SendableRecordBatchStream), + _ => Err(DataFusionError::Execution( + "Error occurred during FFI call to FFI_ExecutionPlan execute.".to_string(), + )), + } + } + } +} + +#[repr(C)] +#[derive(Debug)] +#[allow(missing_docs)] +#[allow(non_camel_case_types)] +pub struct FFI_PlanProperties { + // Returns protobuf serialized bytes of the partitioning + pub output_partitioning: Option< + unsafe extern "C" fn( + plan: *const FFI_PlanProperties, + buffer_size: &mut usize, + buffer_bytes: &mut *mut u8, + ) -> i32, + >, + + pub execution_mode: + Option FFI_ExecutionMode>, + + // PhysicalSortExprNodeCollection proto + pub output_ordering: Option< + unsafe extern "C" fn( + plan: *const FFI_PlanProperties, + buffer_size: &mut usize, + buffer_bytes: &mut *mut u8, + ) -> i32, + >, + + pub schema: Option FFI_ArrowSchema>, + + pub private_data: *mut c_void, +} + +unsafe extern "C" fn output_partitioning_fn_wrapper( + properties: *const FFI_PlanProperties, + buffer_size: &mut usize, + buffer_bytes: &mut *mut u8, +) -> i32 { + // let private_data = (*plan).private_data as *const ExecutionPlanPrivateData; + // let properties = (*private_data).plan.properties(); + // properties.clone().into() + let private_data = (*properties).private_data as *const PlanProperties; + let partitioning = (*private_data).output_partitioning(); + + let codec = DefaultPhysicalExtensionCodec {}; + let partitioning_data = match serialize_partitioning(partitioning, &codec) { + Ok(p) => p, + Err(_) => return 1, + }; + + let mut partition_bytes = partitioning_data.encode_to_vec(); + *buffer_size = partition_bytes.len(); + *buffer_bytes = partition_bytes.as_mut_ptr(); + + std::mem::forget(partition_bytes); + + 0 +} + +unsafe extern "C" fn execution_mode_fn_wrapper( + properties: *const FFI_PlanProperties, +) -> FFI_ExecutionMode { + // let private_data = (*plan).private_data as *const ExecutionPlanPrivateData; + // let properties = (*private_data).plan.properties(); + // properties.clone().into() + let private_data = (*properties).private_data as *const PlanProperties; + let execution_mode = (*private_data).execution_mode(); + + execution_mode.into() +} + +unsafe extern "C" fn output_ordering_fn_wrapper( + properties: *const FFI_PlanProperties, + buffer_size: &mut usize, + buffer_bytes: &mut *mut u8, +) -> i32 { + // let private_data = (*plan).private_data as *const ExecutionPlanPrivateData; + // let properties = (*private_data).plan.properties(); + // properties.clone().into() + let 
private_data = (*properties).private_data as *const PlanProperties; + let output_ordering = match (*private_data).output_ordering() { + Some(o) => o, + None => { + *buffer_size = 0; + return 0; + } + } + .to_owned(); + + let codec = DefaultPhysicalExtensionCodec {}; + let physical_sort_expr_nodes = match serialize_physical_sort_exprs(output_ordering, &codec) { + Ok(p) => p, + Err(_) => return 1, + }; + + let ordering_data = PhysicalSortExprNodeCollection { + physical_sort_expr_nodes, + }; + + let mut ordering_bytes = ordering_data.encode_to_vec(); + *buffer_size = ordering_bytes.len(); + *buffer_bytes = ordering_bytes.as_mut_ptr(); + std::mem::forget(ordering_bytes); + + 0 +} + +// pub schema: Option FFI_ArrowSchema>, +unsafe extern "C" fn schema_fn_wrapper(properties: *const FFI_PlanProperties) -> FFI_ArrowSchema { + let private_data = (*properties).private_data as *const PlanProperties; + let schema = (*private_data).eq_properties.schema(); + + // This does silently fail because TableProvider does not return a result + // so we expect it to always pass. Maybe some logging should be added. + FFI_ArrowSchema::try_from(schema.as_ref()).unwrap_or(FFI_ArrowSchema::empty()) +} + +impl From for FFI_PlanProperties { + fn from(value: PlanProperties) -> Self { + let private_data = Box::new(value); + + Self { + output_partitioning: Some(output_partitioning_fn_wrapper), + execution_mode: Some(execution_mode_fn_wrapper), + output_ordering: Some(output_ordering_fn_wrapper), + schema: Some(schema_fn_wrapper), + private_data: Box::into_raw(private_data) as *mut c_void, + } + } +} + +// /// Creates a new [`FFI_TableProvider`]. +// pub fn new(provider: Box) -> Self { +// let private_data = Box::new(ProviderPrivateData { +// provider, +// last_error: None, +// }); + +// Self { +// version: 2, +// schema: Some(provider_schema), +// scan: Some(provider_scan), +// private_data: Box::into_raw(private_data) as *mut c_void, +// } +// } + +impl TryFrom for PlanProperties { + type Error = DataFusionError; + + fn try_from(value: FFI_PlanProperties) -> std::result::Result { + unsafe { + let schema_fn = value.schema.ok_or(DataFusionError::NotImplemented( + "schema() not implemented on FFI_PlanProperties".to_string(), + ))?; + let ffi_schema = schema_fn(&value); + let schema: Schema = (&ffi_schema).try_into()?; + + let ordering_fn = value + .output_ordering + .ok_or(DataFusionError::NotImplemented( + "output_ordering() not implemented on FFI_PlanProperties".to_string(), + ))?; + let mut buff_size = 0; + let mut buff = null_mut(); + if ordering_fn(&value, &mut buff_size, &mut buff) != 0 { + return Err(DataFusionError::Plan( + "Error occurred during FFI call to output_ordering in FFI_PlanProperties" + .to_string(), + )); + } + + // TODO we will need to get these, but unsure if it happesn on the provider or consumer right now. + let default_ctx = SessionContext::new(); + let codex = DefaultPhysicalExtensionCodec {}; + + let orderings = match buff_size == 0 { + true => None, + false => { + let data = slice::from_raw_parts(buff, buff_size); + + let proto_output_ordering = PhysicalSortExprNodeCollection::decode(data) + .map_err(|e| DataFusionError::External(Box::new(e)))?; + + Some(parse_physical_sort_exprs( + &proto_output_ordering.physical_sort_expr_nodes, + &default_ctx, + &schema, + &codex, + )?) 
+ } + }; + + let partitioning_fn = + value + .output_partitioning + .ok_or(DataFusionError::NotImplemented( + "output_partitioning() not implemented on FFI_PlanProperties".to_string(), + ))?; + if partitioning_fn(&value, &mut buff_size, &mut buff) != 0 { + return Err(DataFusionError::Plan( + "Error occurred during FFI call to output_partitioning in FFI_PlanProperties" + .to_string(), + )); + } + let data = slice::from_raw_parts(buff, buff_size); + + let proto_partitioning = + Partitioning::decode(data).map_err(|e| DataFusionError::External(Box::new(e)))?; + // TODO: Validate this unwrap is safe. + let partitioning = parse_protobuf_partitioning( + Some(&proto_partitioning), + &default_ctx, + &schema, + &codex, + )? + .unwrap(); + + let execution_mode_fn = value.execution_mode.ok_or(DataFusionError::NotImplemented( + "execution_mode() not implemented on FFI_PlanProperties".to_string(), + ))?; + let execution_mode = execution_mode_fn(&value).into(); + + let eq_properties = match orderings { + Some(ordering) => { + EquivalenceProperties::new_with_orderings(Arc::new(schema), &[ordering]) + } + None => EquivalenceProperties::new(Arc::new(schema)), + }; + + Ok(Self::new(eq_properties, partitioning, execution_mode)) + } + } + // fn from(value: FFI_PlanProperties) -> Self { + // let schema = self.schema() + + // let equiv_prop = EquivalenceProperties::new_with_orderings(schema, orderings); + // } +} + +#[repr(C)] +#[allow(non_camel_case_types)] +pub enum FFI_ExecutionMode { + Bounded, + + Unbounded, + + PipelineBreaking, +} + +impl From for FFI_ExecutionMode { + fn from(value: ExecutionMode) -> Self { + match value { + ExecutionMode::Bounded => FFI_ExecutionMode::Bounded, + ExecutionMode::Unbounded => FFI_ExecutionMode::Unbounded, + ExecutionMode::PipelineBreaking => FFI_ExecutionMode::PipelineBreaking, + } + } +} + +impl From for ExecutionMode { + fn from(value: FFI_ExecutionMode) -> Self { + match value { + FFI_ExecutionMode::Bounded => ExecutionMode::Bounded, + FFI_ExecutionMode::Unbounded => ExecutionMode::Unbounded, + FFI_ExecutionMode::PipelineBreaking => ExecutionMode::PipelineBreaking, + } + } +} diff --git a/datafusion/ffi/src/lib.rs b/datafusion/ffi/src/lib.rs new file mode 100644 index 000000000000..e6767e76007e --- /dev/null +++ b/datafusion/ffi/src/lib.rs @@ -0,0 +1,26 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
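+// Illustrative sketch only (an assumption for orientation, not part of this
+// crate's tested API): a provider-side library could expose one of its tables
+// across the FFI boundary roughly like this, where `MyTableProvider` is a
+// placeholder for any `TableProvider` implementation:
+//
+//     use datafusion_ffi::table_provider::FFI_TableProvider;
+//
+//     let provider = Box::new(MyTableProvider::default());
+//     let ffi_provider = FFI_TableProvider::new(provider);
+//
+// The resulting `FFI_TableProvider` is a `#[repr(C)]` struct of function
+// pointers plus an opaque `private_data` pointer, so it can be handed to a
+// consumer compiled separately, which wraps it back into a `dyn TableProvider`
+// using the types in the `table_provider` module.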
+// Make cheap clones clear: https://github.com/apache/datafusion/issues/11143 +#![deny(clippy::clone_on_ref_ptr)] + +pub mod execution_plan; +pub mod record_batch_stream; +pub mod session_config; +pub mod table_provider; + +#[cfg(doctest)] +doc_comment::doctest!("../README.md", readme_example_test); diff --git a/datafusion/ffi/src/record_batch_stream.rs b/datafusion/ffi/src/record_batch_stream.rs new file mode 100644 index 000000000000..5e1d4ff2acdf --- /dev/null +++ b/datafusion/ffi/src/record_batch_stream.rs @@ -0,0 +1,182 @@ +use std::{ + ffi::{c_char, c_int, c_void, CString}, + ptr::addr_of, +}; + +use arrow::{ + array::StructArray, + ffi::{FFI_ArrowArray, FFI_ArrowSchema}, + ffi_stream::FFI_ArrowArrayStream, +}; +use arrow::{ + array::{Array, RecordBatch, RecordBatchReader}, + ffi_stream::ArrowArrayStreamReader, +}; +use datafusion::error::Result; +use datafusion::{ + error::DataFusionError, + execution::{RecordBatchStream, SendableRecordBatchStream}, +}; +use futures::{executor::block_on, Stream, TryStreamExt}; + +pub fn record_batch_to_arrow_stream(stream: SendableRecordBatchStream) -> FFI_ArrowArrayStream { + let private_data = Box::new(RecoredBatchStreamPrivateData { + stream, + last_error: None, + }); + + FFI_ArrowArrayStream { + get_schema: Some(get_schema), + get_next: Some(get_next), + get_last_error: Some(get_last_error), + release: Some(release_stream), + private_data: Box::into_raw(private_data) as *mut c_void, + } +} + +struct RecoredBatchStreamPrivateData { + stream: SendableRecordBatchStream, + last_error: Option, +} + +// callback used to drop [FFI_ArrowArrayStream] when it is exported. +unsafe extern "C" fn release_stream(stream: *mut FFI_ArrowArrayStream) { + if stream.is_null() { + return; + } + let stream = &mut *stream; + + stream.get_schema = None; + stream.get_next = None; + stream.get_last_error = None; + + let private_data = Box::from_raw(stream.private_data as *mut RecoredBatchStreamPrivateData); + drop(private_data); + + stream.release = None; +} + +// The callback used to get array schema +unsafe extern "C" fn get_schema( + stream: *mut FFI_ArrowArrayStream, + schema: *mut FFI_ArrowSchema, +) -> c_int { + ExportedRecordBatchStream { stream }.get_schema(schema) +} + +// The callback used to get next array +unsafe extern "C" fn get_next( + stream: *mut FFI_ArrowArrayStream, + array: *mut FFI_ArrowArray, +) -> c_int { + ExportedRecordBatchStream { stream }.get_next(array) +} + +// The callback used to get the error from last operation on the `FFI_ArrowArrayStream` +unsafe extern "C" fn get_last_error(stream: *mut FFI_ArrowArrayStream) -> *const c_char { + let mut ffi_stream = ExportedRecordBatchStream { stream }; + // The consumer should not take ownership of this string, we should return + // a const pointer to it. 
+ match ffi_stream.get_last_error() { + Some(err_string) => err_string.as_ptr(), + None => std::ptr::null(), + } +} + +struct ExportedRecordBatchStream { + stream: *mut FFI_ArrowArrayStream, +} + +impl ExportedRecordBatchStream { + fn get_private_data(&mut self) -> &mut RecoredBatchStreamPrivateData { + unsafe { &mut *((*self.stream).private_data as *mut RecoredBatchStreamPrivateData) } + } + + pub fn get_schema(&mut self, out: *mut FFI_ArrowSchema) -> i32 { + let private_data = self.get_private_data(); + let stream = &private_data.stream; + + let schema = FFI_ArrowSchema::try_from(stream.schema().as_ref()); + + match schema { + Ok(schema) => { + unsafe { std::ptr::copy(addr_of!(schema), out, 1) }; + std::mem::forget(schema); + 0 + } + Err(ref err) => { + private_data.last_error = Some( + CString::new(err.to_string()).expect("Error string has a null byte in it."), + ); + 1 + } + } + } + + pub fn get_next(&mut self, out: *mut FFI_ArrowArray) -> i32 { + let private_data = self.get_private_data(); + + let maybe_batch = block_on(private_data.stream.try_next()); + + match maybe_batch { + Ok(None) => { + // Marks ArrowArray released to indicate reaching the end of stream. + unsafe { std::ptr::write(out, FFI_ArrowArray::empty()) } + 0 + } + Ok(Some(batch)) => { + let struct_array = StructArray::from(batch); + let array = FFI_ArrowArray::new(&struct_array.to_data()); + + unsafe { std::ptr::write_unaligned(out, array) }; + 0 + } + Err(err) => { + private_data.last_error = Some( + CString::new(err.to_string()).expect("Error string has a null byte in it."), + ); + 1 + } + } + } + + pub fn get_last_error(&mut self) -> Option<&CString> { + self.get_private_data().last_error.as_ref() + } +} + +pub struct ConsumerRecordBatchStream { + reader: ArrowArrayStreamReader, +} + +impl TryFrom for ConsumerRecordBatchStream { + type Error = DataFusionError; + + fn try_from(value: FFI_ArrowArrayStream) -> std::result::Result { + let reader = ArrowArrayStreamReader::try_new(value)?; + + Ok(Self { reader }) + } +} + +impl RecordBatchStream for ConsumerRecordBatchStream { + fn schema(&self) -> arrow::datatypes::SchemaRef { + self.reader.schema() + } +} + +impl Stream for ConsumerRecordBatchStream { + type Item = Result; + + fn poll_next( + mut self: std::pin::Pin<&mut Self>, + _cx: &mut std::task::Context<'_>, + ) -> std::task::Poll> { + let batch = self + .reader + .next() + .map(|v| v.map_err(|e| DataFusionError::ArrowError(e, None))); + + std::task::Poll::Ready(batch) + } +} diff --git a/datafusion/ffi/src/session_config.rs b/datafusion/ffi/src/session_config.rs new file mode 100644 index 000000000000..121f7e0db807 --- /dev/null +++ b/datafusion/ffi/src/session_config.rs @@ -0,0 +1,48 @@ +use std::{ + ffi::{c_void, CString}, +}; + +use datafusion::{catalog::Session, prelude::SessionConfig}; + +#[repr(C)] +#[derive(Debug)] +#[allow(missing_docs)] +#[allow(non_camel_case_types)] +pub struct FFI_SessionConfig { + pub version: i64, + + pub private_data: *mut c_void, +} + +unsafe impl Send for FFI_SessionConfig {} + +pub struct SessionConfigPrivateData { + pub config: SessionConfig, + pub last_error: Option, +} + +struct ExportedSessionConfig { + session: *mut FFI_SessionConfig, +} + +impl ExportedSessionConfig { + fn get_private_data(&mut self) -> &mut SessionConfigPrivateData { + unsafe { &mut *((*self.session).private_data as *mut SessionConfigPrivateData) } + } +} + +impl FFI_SessionConfig { + /// Creates a new [`FFI_TableProvider`]. 
+ pub fn new(session: &dyn Session) -> Self { + let config = session.config().clone(); + let private_data = Box::new(SessionConfigPrivateData { + config, + last_error: None, + }); + + Self { + version: 2, + private_data: Box::into_raw(private_data) as *mut c_void, + } + } +} diff --git a/datafusion/ffi/src/table_provider.rs b/datafusion/ffi/src/table_provider.rs new file mode 100644 index 000000000000..60fd8d677a6f --- /dev/null +++ b/datafusion/ffi/src/table_provider.rs @@ -0,0 +1,296 @@ +use std::{ + any::Any, + ffi::{c_char, c_int, c_void, CStr, CString}, + ptr::null_mut, + sync::Arc, +}; + +use arrow::{ + datatypes::{Schema, SchemaRef}, + ffi::FFI_ArrowSchema, +}; +use async_trait::async_trait; +use datafusion::{ + catalog::{Session, TableProvider}, + common::DFSchema, + datasource::TableType, + error::DataFusionError, + execution::session_state::SessionStateBuilder, + logical_expr::TableProviderFilterPushDown, + physical_plan::ExecutionPlan, + prelude::{Expr, SessionContext}, +}; +use futures::executor::block_on; + +use super::{ + execution_plan::{ExportedExecutionPlan, FFI_ExecutionPlan}, + session_config::{FFI_SessionConfig, SessionConfigPrivateData}, +}; +use datafusion::error::Result; + +#[repr(C)] +#[derive(Debug)] +#[allow(missing_docs)] +#[allow(non_camel_case_types)] +pub struct FFI_TableProvider { + pub version: i64, + pub schema: Option FFI_ArrowSchema>, + pub scan: Option< + unsafe extern "C" fn( + provider: *const FFI_TableProvider, + session_config: *const FFI_SessionConfig, + n_projections: c_int, + projections: *const c_int, + n_filters: c_int, + filters: *const *const c_char, + limit: c_int, + err_code: *mut c_int, + ) -> *mut FFI_ExecutionPlan, + >, + pub private_data: *mut c_void, +} + +unsafe impl Send for FFI_TableProvider {} +unsafe impl Sync for FFI_TableProvider {} + +struct ProviderPrivateData { + provider: Box, +} + +struct ExportedTableProvider(*const FFI_TableProvider); + +// The callback used to get array schema +unsafe extern "C" fn provider_schema(provider: *const FFI_TableProvider) -> FFI_ArrowSchema { + ExportedTableProvider(provider).provider_schema() +} + +unsafe extern "C" fn scan_fn_wrapper( + provider: *const FFI_TableProvider, + session_config: *const FFI_SessionConfig, + n_projections: c_int, + projections: *const c_int, + n_filters: c_int, + filters: *const *const c_char, + limit: c_int, + err_code: *mut c_int, +) -> *mut FFI_ExecutionPlan { + println!("entered scan_fn_wrapper"); + let config = unsafe { (*session_config).private_data as *const SessionConfigPrivateData }; + let session = SessionStateBuilder::new() + .with_config((*config).config.clone()) + .build(); + let ctx = SessionContext::new_with_state(session); + + let num_projections: usize = n_projections.try_into().unwrap_or(0); + + let projections: Vec = std::slice::from_raw_parts(projections, num_projections) + .iter() + .filter_map(|v| (*v).try_into().ok()) + .collect(); + let maybe_projections = match projections.is_empty() { + true => None, + false => Some(&projections), + }; + + let filters_slice = std::slice::from_raw_parts(filters, n_filters as usize); + let filters_vec: Vec = filters_slice + .iter() + .map(|&s| CStr::from_ptr(s).to_string_lossy().to_string()) + .collect(); + + let limit = limit.try_into().ok(); + + let plan = + ExportedTableProvider(provider).provider_scan(&ctx, maybe_projections, filters_vec, limit); + + println!("leaving scan_fn_wrapper, has plan? 
{}", plan.is_ok()); + + match plan { + Ok(plan) => { + *err_code = 0; + plan + } + Err(_) => { + *err_code = 1; + null_mut() + } + } +} + +impl ExportedTableProvider { + fn get_private_data(&self) -> &ProviderPrivateData { + unsafe { &*((*self.0).private_data as *const ProviderPrivateData) } + } + + pub fn provider_schema(&self) -> FFI_ArrowSchema { + let private_data = self.get_private_data(); + let provider = &private_data.provider; + + // This does silently fail because TableProvider does not return a result + // so we expect it to always pass. Maybe some logging should be added. + FFI_ArrowSchema::try_from(provider.schema().as_ref()).unwrap_or(FFI_ArrowSchema::empty()) + } + + pub fn provider_scan( + &mut self, + ctx: &SessionContext, + projections: Option<&Vec>, + filters: Vec, + limit: Option, + ) -> Result<*mut FFI_ExecutionPlan> { + let private_data = self.get_private_data(); + let provider = &private_data.provider; + + let schema = provider.schema(); + let df_schema: DFSchema = schema.try_into()?; + + let filter_exprs = filters + .into_iter() + .map(|expr_str| ctx.state().create_logical_expr(&expr_str, &df_schema)) + .collect::>>()?; + + let plan = + block_on(provider.scan(&ctx.state(), projections, &filter_exprs, limit))?; + + let plan_boxed = Box::new(FFI_ExecutionPlan::new(plan, ctx.task_ctx())); + Ok(Box::into_raw(plan_boxed)) + } +} + +impl FFI_TableProvider { + /// Creates a new [`FFI_TableProvider`]. + pub fn new(provider: Box) -> Self { + let private_data = Box::new(ProviderPrivateData { provider }); + + Self { + version: 2, + schema: Some(provider_schema), + scan: Some(scan_fn_wrapper), + private_data: Box::into_raw(private_data) as *mut c_void, + } + } + + /** + Replace temporary pointer with updated + # Safety + User must validate the raw pointer is valid. + */ + pub unsafe fn from_raw(raw_provider: *mut FFI_TableProvider) -> Self { + std::ptr::replace(raw_provider, Self::empty()) + } + + /// Creates a new empty [FFI_ArrowArrayStream]. Used to import from the C Stream Interface. + pub fn empty() -> Self { + Self { + version: 0, + schema: None, + scan: None, + private_data: std::ptr::null_mut(), + } + } +} + +#[async_trait] +impl TableProvider for FFI_TableProvider { + /// Returns the table provider as [`Any`](std::any::Any) so that it can be + /// downcast to a specific implementation. + fn as_any(&self) -> &dyn Any { + self + } + + /// Get a reference to the schema for this table + fn schema(&self) -> SchemaRef { + let schema = match self.schema { + Some(func) => unsafe { Schema::try_from(&func(self)).ok() }, + None => None, + }; + Arc::new(schema.unwrap_or(Schema::empty())) + } + + /// Get the type of this table for metadata/catalog purposes. + fn table_type(&self) -> TableType { + todo!() + } + + /// Create an ExecutionPlan that will scan the table. + /// The table provider will be usually responsible of grouping + /// the source data into partitions that can be efficiently + /// parallelized or distributed. + async fn scan( + &self, + session: &dyn Session, + projection: Option<&Vec>, + filters: &[Expr], + // limit can be used to reduce the amount scanned + // from the datasource as a performance optimization. + // If set, it contains the amount of rows needed by the `LogicalPlan`, + // The datasource should return *at least* this number of rows if available. 
+ limit: Option, + ) -> Result> { + let scan_fn = self.scan.ok_or(DataFusionError::NotImplemented( + "Scan not defined on FFI_TableProvider".to_string(), + ))?; + + let session_config = FFI_SessionConfig::new(session); + + let n_projections = projection.map(|p| p.len()).unwrap_or(0) as c_int; + let projections: Vec = projection + .map(|p| p.iter().map(|v| *v as c_int).collect()) + .unwrap_or_default(); + let projections_ptr = projections.as_ptr(); + + let n_filters = filters.len() as c_int; + let filters: Vec = filters + .iter() + .filter_map(|f| CString::new(f.to_string()).ok()) + .collect(); + let filters_ptr: Vec<*const i8> = filters.iter().map(|s| s.as_ptr()).collect(); + + let limit = match limit { + Some(l) => l as c_int, + None => -1, + }; + + println!("Within scan about to call unsafe scan_fn"); + let mut err_code = 0; + let plan = unsafe { + let plan_ptr = scan_fn( + self, + &session_config, + n_projections, + projections_ptr, + n_filters, + filters_ptr.as_ptr(), + limit, + &mut err_code, + ); + + if 0 != err_code { + return Err(datafusion::error::DataFusionError::Internal( + "Unable to perform scan via FFI".to_string(), + )); + } + + println!( + "Finished scan_fn inside FFI_TableProvider::scan {}", + plan_ptr.is_null() + ); + + let p = ExportedExecutionPlan::new(plan_ptr)?; + println!("ExportedExecutionPlan::new returned inside scan()"); + p + }; + println!("Scan returned with some plan."); + + Ok(Arc::new(plan)) + } + + /// Tests whether the table provider can make use of a filter expression + /// to optimise data retrieval. + fn supports_filters_pushdown( + &self, + filter: &[&Expr], + ) -> Result> { + todo!() + } +} From 4d31722ba58ec73cfdda1a33f589d6f9bf96c30e Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Thu, 10 Oct 2024 09:53:20 -0400 Subject: [PATCH 02/28] Add table type --- datafusion/ffi/src/lib.rs | 1 + datafusion/ffi/src/table_provider.rs | 91 +++++++++++++++------------- datafusion/ffi/src/table_source.rs | 60 ++++++++++++++++++ 3 files changed, 111 insertions(+), 41 deletions(-) create mode 100644 datafusion/ffi/src/table_source.rs diff --git a/datafusion/ffi/src/lib.rs b/datafusion/ffi/src/lib.rs index e6767e76007e..ef6825c43d42 100644 --- a/datafusion/ffi/src/lib.rs +++ b/datafusion/ffi/src/lib.rs @@ -21,6 +21,7 @@ pub mod execution_plan; pub mod record_batch_stream; pub mod session_config; pub mod table_provider; +pub mod table_source; #[cfg(doctest)] doc_comment::doctest!("../README.md", readme_example_test); diff --git a/datafusion/ffi/src/table_provider.rs b/datafusion/ffi/src/table_provider.rs index 60fd8d677a6f..57d63a1a4be6 100644 --- a/datafusion/ffi/src/table_provider.rs +++ b/datafusion/ffi/src/table_provider.rs @@ -22,15 +22,17 @@ use datafusion::{ }; use futures::executor::block_on; +use crate::table_source::FFI_TableType; + use super::{ execution_plan::{ExportedExecutionPlan, FFI_ExecutionPlan}, session_config::{FFI_SessionConfig, SessionConfigPrivateData}, }; use datafusion::error::Result; +/// A stable interface for creating a DataFusion TableProvider. #[repr(C)] #[derive(Debug)] -#[allow(missing_docs)] #[allow(non_camel_case_types)] pub struct FFI_TableProvider { pub version: i64, @@ -47,6 +49,8 @@ pub struct FFI_TableProvider { err_code: *mut c_int, ) -> *mut FFI_ExecutionPlan, >, + pub table_type: Option FFI_TableType>, + pub private_data: *mut c_void, } @@ -57,11 +61,17 @@ struct ProviderPrivateData { provider: Box, } +/// Wrapper struct to provide access functions from the FFI interface to the underlying +/// TableProvider. 
This struct is allowed to access `private_data` because it lives on +/// the provider's side of the FFI interace. struct ExportedTableProvider(*const FFI_TableProvider); -// The callback used to get array schema -unsafe extern "C" fn provider_schema(provider: *const FFI_TableProvider) -> FFI_ArrowSchema { - ExportedTableProvider(provider).provider_schema() +unsafe extern "C" fn schema_fn_wrapper(provider: *const FFI_TableProvider) -> FFI_ArrowSchema { + ExportedTableProvider(provider).schema() +} + +unsafe extern "C" fn table_type_fn_wrapper(provider: *const FFI_TableProvider) -> FFI_TableType { + ExportedTableProvider(provider).table_type() } unsafe extern "C" fn scan_fn_wrapper( @@ -74,7 +84,6 @@ unsafe extern "C" fn scan_fn_wrapper( limit: c_int, err_code: *mut c_int, ) -> *mut FFI_ExecutionPlan { - println!("entered scan_fn_wrapper"); let config = unsafe { (*session_config).private_data as *const SessionConfigPrivateData }; let session = SessionStateBuilder::new() .with_config((*config).config.clone()) @@ -103,8 +112,6 @@ unsafe extern "C" fn scan_fn_wrapper( let plan = ExportedTableProvider(provider).provider_scan(&ctx, maybe_projections, filters_vec, limit); - println!("leaving scan_fn_wrapper, has plan? {}", plan.is_ok()); - match plan { Ok(plan) => { *err_code = 0; @@ -122,15 +129,22 @@ impl ExportedTableProvider { unsafe { &*((*self.0).private_data as *const ProviderPrivateData) } } - pub fn provider_schema(&self) -> FFI_ArrowSchema { + pub fn schema(&self) -> FFI_ArrowSchema { let private_data = self.get_private_data(); let provider = &private_data.provider; // This does silently fail because TableProvider does not return a result - // so we expect it to always pass. Maybe some logging should be added. + // so we expect it to always pass. FFI_ArrowSchema::try_from(provider.schema().as_ref()).unwrap_or(FFI_ArrowSchema::empty()) } + pub fn table_type(&self) -> FFI_TableType { + let private_data = self.get_private_data(); + let provider = &private_data.provider; + + provider.table_type().into() + } + pub fn provider_scan( &mut self, ctx: &SessionContext, @@ -164,8 +178,9 @@ impl FFI_TableProvider { Self { version: 2, - schema: Some(provider_schema), + schema: Some(schema_fn_wrapper), scan: Some(scan_fn_wrapper), + table_type: Some(table_type_fn_wrapper), private_data: Box::into_raw(private_data) as *mut c_void, } } @@ -185,51 +200,54 @@ impl FFI_TableProvider { version: 0, schema: None, scan: None, + table_type: None, private_data: std::ptr::null_mut(), } } } +/// This wrapper struct exists on the reciever side of the FFI interface, so it has +/// no guarantees about being able to access the data in `private_data`. Any functions +/// defined on this struct must only use the stable functions provided in +/// FFI_TableProvider to interact with the foreign table provider. +#[derive(Debug)] +struct ForeignTableProvider(*const FFI_TableProvider); + +unsafe impl Send for ForeignTableProvider {} +unsafe impl Sync for ForeignTableProvider {} + #[async_trait] -impl TableProvider for FFI_TableProvider { - /// Returns the table provider as [`Any`](std::any::Any) so that it can be - /// downcast to a specific implementation. 
+impl TableProvider for ForeignTableProvider { fn as_any(&self) -> &dyn Any { self } - /// Get a reference to the schema for this table fn schema(&self) -> SchemaRef { - let schema = match self.schema { - Some(func) => unsafe { Schema::try_from(&func(self)).ok() }, - None => None, + let schema = unsafe { + let schema_fn = (*self.0).schema; + schema_fn.map(|func| func(self.0)).and_then(|s| Schema::try_from(&s).ok()).unwrap_or(Schema::empty()) }; - Arc::new(schema.unwrap_or(Schema::empty())) + + Arc::new(schema) } - /// Get the type of this table for metadata/catalog purposes. fn table_type(&self) -> TableType { - todo!() + unsafe { + let table_type_fn = (*self.0).table_type; + table_type_fn.map(|func| func(self.0)).unwrap_or(FFI_TableType::Base).into() + } } - /// Create an ExecutionPlan that will scan the table. - /// The table provider will be usually responsible of grouping - /// the source data into partitions that can be efficiently - /// parallelized or distributed. async fn scan( &self, session: &dyn Session, projection: Option<&Vec>, filters: &[Expr], - // limit can be used to reduce the amount scanned - // from the datasource as a performance optimization. - // If set, it contains the amount of rows needed by the `LogicalPlan`, - // The datasource should return *at least* this number of rows if available. limit: Option, ) -> Result> { - let scan_fn = self.scan.ok_or(DataFusionError::NotImplemented( + let scan_fn = unsafe { (*self.0).scan.ok_or(DataFusionError::NotImplemented( "Scan not defined on FFI_TableProvider".to_string(), - ))?; + ))?}; let session_config = FFI_SessionConfig::new(session); @@ -251,11 +269,10 @@ impl TableProvider for FFI_TableProvider { None => -1, }; - println!("Within scan about to call unsafe scan_fn"); let mut err_code = 0; let plan = unsafe { let plan_ptr = scan_fn( - self, + self.0, &session_config, n_projections, projections_ptr, @@ -271,16 +288,8 @@ impl TableProvider for FFI_TableProvider { )); } - println!( - "Finished scan_fn inside FFI_TableProvider::scan {}", - plan_ptr.is_null() - ); - - let p = ExportedExecutionPlan::new(plan_ptr)?; - println!("ExportedExecutionPlan::new returned inside scan()"); - p + ExportedExecutionPlan::new(plan_ptr)? }; - println!("Scan returned with some plan."); Ok(Arc::new(plan)) } diff --git a/datafusion/ffi/src/table_source.rs b/datafusion/ffi/src/table_source.rs new file mode 100644 index 000000000000..f6dbf7bc5885 --- /dev/null +++ b/datafusion/ffi/src/table_source.rs @@ -0,0 +1,60 @@ +use datafusion::{datasource::TableType, logical_expr::TableProviderFilterPushDown}; + +// TODO Should we just define TableProviderFilterPushDown as repr(C)? 
+#[repr(C)] +#[allow(non_camel_case_types)] +pub enum FFI_TableProviderFilterPushDown { + Unsupported, + Inexact, + Exact, +} + +impl From for TableProviderFilterPushDown { + fn from(value: FFI_TableProviderFilterPushDown) -> Self { + match value { + FFI_TableProviderFilterPushDown::Unsupported => TableProviderFilterPushDown::Unsupported, + FFI_TableProviderFilterPushDown::Inexact => TableProviderFilterPushDown::Inexact, + FFI_TableProviderFilterPushDown::Exact => TableProviderFilterPushDown::Exact, + } + } +} + +impl From for FFI_TableProviderFilterPushDown { + fn from(value: TableProviderFilterPushDown) -> Self { + match value { + TableProviderFilterPushDown::Unsupported => FFI_TableProviderFilterPushDown::Unsupported, + TableProviderFilterPushDown::Inexact => FFI_TableProviderFilterPushDown::Inexact, + TableProviderFilterPushDown::Exact => FFI_TableProviderFilterPushDown::Exact, + } + } +} + +// TODO Should we just define FFI_TableType as repr(C)? +#[repr(C)] +#[allow(non_camel_case_types)] +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum FFI_TableType { + Base, + View, + Temporary, +} + +impl From for TableType { + fn from(value: FFI_TableType) -> Self { + match value { + FFI_TableType::Base => TableType::Base, + FFI_TableType::View => TableType::View, + FFI_TableType::Temporary => TableType::Temporary, + } + } +} + +impl From for FFI_TableType { + fn from(value: TableType) -> Self { + match value { + TableType::Base => FFI_TableType::Base, + TableType::View => FFI_TableType::View, + TableType::Temporary => FFI_TableType::Temporary, + } + } +} \ No newline at end of file From d5f541e486e6fcf3ff6c3342bfb0375b18fa81f8 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Thu, 10 Oct 2024 10:00:04 -0400 Subject: [PATCH 03/28] Make struct pub --- datafusion/ffi/src/table_provider.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/datafusion/ffi/src/table_provider.rs b/datafusion/ffi/src/table_provider.rs index 57d63a1a4be6..25b7f12b03e8 100644 --- a/datafusion/ffi/src/table_provider.rs +++ b/datafusion/ffi/src/table_provider.rs @@ -35,7 +35,7 @@ use datafusion::error::Result; #[derive(Debug)] #[allow(non_camel_case_types)] pub struct FFI_TableProvider { - pub version: i64, + pub version: c_int, pub schema: Option FFI_ArrowSchema>, pub scan: Option< unsafe extern "C" fn( @@ -177,7 +177,7 @@ impl FFI_TableProvider { let private_data = Box::new(ProviderPrivateData { provider }); Self { - version: 2, + version: 1, schema: Some(schema_fn_wrapper), scan: Some(scan_fn_wrapper), table_type: Some(table_type_fn_wrapper), @@ -211,7 +211,7 @@ impl FFI_TableProvider { /// defined on this struct must only use the stable functions provided in /// FFI_TableProvider to interact with the foreign table provider. 
#[derive(Debug)] -struct ForeignTableProvider(*const FFI_TableProvider); +pub struct ForeignTableProvider(pub *const FFI_TableProvider); unsafe impl Send for ForeignTableProvider {} unsafe impl Sync for ForeignTableProvider {} From afeb4398c0496397b1566cb7e8e4076e06ca645b Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Thu, 10 Oct 2024 11:06:46 -0400 Subject: [PATCH 04/28] Implementing supports_filters_pushdown --- datafusion/ffi/src/execution_plan.rs | 86 +++++++----- datafusion/ffi/src/record_batch_stream.rs | 17 ++- datafusion/ffi/src/session_config.rs | 6 +- datafusion/ffi/src/table_provider.rs | 159 ++++++++++++++++++---- datafusion/ffi/src/table_source.rs | 26 ++-- 5 files changed, 220 insertions(+), 74 deletions(-) diff --git a/datafusion/ffi/src/execution_plan.rs b/datafusion/ffi/src/execution_plan.rs index b04709ae8118..d32670457fea 100644 --- a/datafusion/ffi/src/execution_plan.rs +++ b/datafusion/ffi/src/execution_plan.rs @@ -12,7 +12,9 @@ use datafusion::{ execution::{SendableRecordBatchStream, TaskContext}, physical_plan::{DisplayAs, ExecutionMode, ExecutionPlan, PlanProperties}, }; -use datafusion::{error::Result, physical_expr::EquivalenceProperties, prelude::SessionContext}; +use datafusion::{ + error::Result, physical_expr::EquivalenceProperties, prelude::SessionContext, +}; use datafusion_proto::{ physical_plan::{ from_proto::{parse_physical_sort_exprs, parse_protobuf_partitioning}, @@ -23,15 +25,18 @@ use datafusion_proto::{ }; use prost::Message; -use super::record_batch_stream::{record_batch_to_arrow_stream, ConsumerRecordBatchStream}; +use super::record_batch_stream::{ + record_batch_to_arrow_stream, ConsumerRecordBatchStream, +}; #[repr(C)] #[derive(Debug)] #[allow(missing_docs)] #[allow(non_camel_case_types)] pub struct FFI_ExecutionPlan { - pub properties: - Option FFI_PlanProperties>, + pub properties: Option< + unsafe extern "C" fn(plan: *const FFI_ExecutionPlan) -> FFI_PlanProperties, + >, pub children: Option< unsafe extern "C" fn( plan: *const FFI_ExecutionPlan, @@ -57,7 +62,9 @@ pub struct ExecutionPlanPrivateData { pub context: Arc, } -unsafe extern "C" fn properties_fn_wrapper(plan: *const FFI_ExecutionPlan) -> FFI_PlanProperties { +unsafe extern "C" fn properties_fn_wrapper( + plan: *const FFI_ExecutionPlan, +) -> FFI_PlanProperties { let private_data = (*plan).private_data as *const ExecutionPlanPrivateData; let properties = (*private_data).plan.properties(); properties.clone().into() @@ -193,9 +200,10 @@ impl ExportedExecutionPlan { println!("entered ExportedExecutionPlan::new"); let properties = unsafe { - let properties_fn = (*plan).properties.ok_or(DataFusionError::NotImplemented( - "properties not implemented on FFI_ExecutionPlan".to_string(), - ))?; + let properties_fn = + (*plan).properties.ok_or(DataFusionError::NotImplemented( + "properties not implemented on FFI_ExecutionPlan".to_string(), + ))?; println!("About to call properties fn"); properties_fn(plan).try_into()? }; @@ -298,7 +306,8 @@ impl ExecutionPlan for ExportedExecutionPlan { 0 => ConsumerRecordBatchStream::try_from(arrow_stream) .map(|v| Pin::new(Box::new(v)) as SendableRecordBatchStream), _ => Err(DataFusionError::Execution( - "Error occurred during FFI call to FFI_ExecutionPlan execute.".to_string(), + "Error occurred during FFI call to FFI_ExecutionPlan execute." 
+ .to_string(), )), } } @@ -319,8 +328,9 @@ pub struct FFI_PlanProperties { ) -> i32, >, - pub execution_mode: - Option FFI_ExecutionMode>, + pub execution_mode: Option< + unsafe extern "C" fn(plan: *const FFI_PlanProperties) -> FFI_ExecutionMode, + >, // PhysicalSortExprNodeCollection proto pub output_ordering: Option< @@ -331,7 +341,8 @@ pub struct FFI_PlanProperties { ) -> i32, >, - pub schema: Option FFI_ArrowSchema>, + pub schema: + Option FFI_ArrowSchema>, pub private_data: *mut c_void, } @@ -393,10 +404,11 @@ unsafe extern "C" fn output_ordering_fn_wrapper( .to_owned(); let codec = DefaultPhysicalExtensionCodec {}; - let physical_sort_expr_nodes = match serialize_physical_sort_exprs(output_ordering, &codec) { - Ok(p) => p, - Err(_) => return 1, - }; + let physical_sort_expr_nodes = + match serialize_physical_sort_exprs(output_ordering, &codec) { + Ok(p) => p, + Err(_) => return 1, + }; let ordering_data = PhysicalSortExprNodeCollection { physical_sort_expr_nodes, @@ -411,7 +423,9 @@ unsafe extern "C" fn output_ordering_fn_wrapper( } // pub schema: Option FFI_ArrowSchema>, -unsafe extern "C" fn schema_fn_wrapper(properties: *const FFI_PlanProperties) -> FFI_ArrowSchema { +unsafe extern "C" fn schema_fn_wrapper( + properties: *const FFI_PlanProperties, +) -> FFI_ArrowSchema { let private_data = (*properties).private_data as *const PlanProperties; let schema = (*private_data).eq_properties.schema(); @@ -460,11 +474,13 @@ impl TryFrom for PlanProperties { let ffi_schema = schema_fn(&value); let schema: Schema = (&ffi_schema).try_into()?; - let ordering_fn = value - .output_ordering - .ok_or(DataFusionError::NotImplemented( - "output_ordering() not implemented on FFI_PlanProperties".to_string(), - ))?; + let ordering_fn = + value + .output_ordering + .ok_or(DataFusionError::NotImplemented( + "output_ordering() not implemented on FFI_PlanProperties" + .to_string(), + ))?; let mut buff_size = 0; let mut buff = null_mut(); if ordering_fn(&value, &mut buff_size, &mut buff) != 0 { @@ -483,8 +499,9 @@ impl TryFrom for PlanProperties { false => { let data = slice::from_raw_parts(buff, buff_size); - let proto_output_ordering = PhysicalSortExprNodeCollection::decode(data) - .map_err(|e| DataFusionError::External(Box::new(e)))?; + let proto_output_ordering = + PhysicalSortExprNodeCollection::decode(data) + .map_err(|e| DataFusionError::External(Box::new(e)))?; Some(parse_physical_sort_exprs( &proto_output_ordering.physical_sort_expr_nodes, @@ -499,7 +516,8 @@ impl TryFrom for PlanProperties { value .output_partitioning .ok_or(DataFusionError::NotImplemented( - "output_partitioning() not implemented on FFI_PlanProperties".to_string(), + "output_partitioning() not implemented on FFI_PlanProperties" + .to_string(), ))?; if partitioning_fn(&value, &mut buff_size, &mut buff) != 0 { return Err(DataFusionError::Plan( @@ -509,8 +527,8 @@ impl TryFrom for PlanProperties { } let data = slice::from_raw_parts(buff, buff_size); - let proto_partitioning = - Partitioning::decode(data).map_err(|e| DataFusionError::External(Box::new(e)))?; + let proto_partitioning = Partitioning::decode(data) + .map_err(|e| DataFusionError::External(Box::new(e)))?; // TODO: Validate this unwrap is safe. let partitioning = parse_protobuf_partitioning( Some(&proto_partitioning), @@ -520,15 +538,17 @@ impl TryFrom for PlanProperties { )? 
.unwrap(); - let execution_mode_fn = value.execution_mode.ok_or(DataFusionError::NotImplemented( - "execution_mode() not implemented on FFI_PlanProperties".to_string(), - ))?; + let execution_mode_fn = + value.execution_mode.ok_or(DataFusionError::NotImplemented( + "execution_mode() not implemented on FFI_PlanProperties".to_string(), + ))?; let execution_mode = execution_mode_fn(&value).into(); let eq_properties = match orderings { - Some(ordering) => { - EquivalenceProperties::new_with_orderings(Arc::new(schema), &[ordering]) - } + Some(ordering) => EquivalenceProperties::new_with_orderings( + Arc::new(schema), + &[ordering], + ), None => EquivalenceProperties::new(Arc::new(schema)), }; diff --git a/datafusion/ffi/src/record_batch_stream.rs b/datafusion/ffi/src/record_batch_stream.rs index 5e1d4ff2acdf..afe49c8c1328 100644 --- a/datafusion/ffi/src/record_batch_stream.rs +++ b/datafusion/ffi/src/record_batch_stream.rs @@ -19,7 +19,9 @@ use datafusion::{ }; use futures::{executor::block_on, Stream, TryStreamExt}; -pub fn record_batch_to_arrow_stream(stream: SendableRecordBatchStream) -> FFI_ArrowArrayStream { +pub fn record_batch_to_arrow_stream( + stream: SendableRecordBatchStream, +) -> FFI_ArrowArrayStream { let private_data = Box::new(RecoredBatchStreamPrivateData { stream, last_error: None, @@ -50,7 +52,8 @@ unsafe extern "C" fn release_stream(stream: *mut FFI_ArrowArrayStream) { stream.get_next = None; stream.get_last_error = None; - let private_data = Box::from_raw(stream.private_data as *mut RecoredBatchStreamPrivateData); + let private_data = + Box::from_raw(stream.private_data as *mut RecoredBatchStreamPrivateData); drop(private_data); stream.release = None; @@ -89,7 +92,9 @@ struct ExportedRecordBatchStream { impl ExportedRecordBatchStream { fn get_private_data(&mut self) -> &mut RecoredBatchStreamPrivateData { - unsafe { &mut *((*self.stream).private_data as *mut RecoredBatchStreamPrivateData) } + unsafe { + &mut *((*self.stream).private_data as *mut RecoredBatchStreamPrivateData) + } } pub fn get_schema(&mut self, out: *mut FFI_ArrowSchema) -> i32 { @@ -106,7 +111,8 @@ impl ExportedRecordBatchStream { } Err(ref err) => { private_data.last_error = Some( - CString::new(err.to_string()).expect("Error string has a null byte in it."), + CString::new(err.to_string()) + .expect("Error string has a null byte in it."), ); 1 } @@ -133,7 +139,8 @@ impl ExportedRecordBatchStream { } Err(err) => { private_data.last_error = Some( - CString::new(err.to_string()).expect("Error string has a null byte in it."), + CString::new(err.to_string()) + .expect("Error string has a null byte in it."), ); 1 } diff --git a/datafusion/ffi/src/session_config.rs b/datafusion/ffi/src/session_config.rs index 121f7e0db807..7a436ca3b288 100644 --- a/datafusion/ffi/src/session_config.rs +++ b/datafusion/ffi/src/session_config.rs @@ -1,6 +1,4 @@ -use std::{ - ffi::{c_void, CString}, -}; +use std::ffi::{c_void, CString}; use datafusion::{catalog::Session, prelude::SessionConfig}; @@ -21,7 +19,7 @@ pub struct SessionConfigPrivateData { pub last_error: Option, } -struct ExportedSessionConfig { +pub struct ExportedSessionConfig { session: *mut FFI_SessionConfig, } diff --git a/datafusion/ffi/src/table_provider.rs b/datafusion/ffi/src/table_provider.rs index 25b7f12b03e8..638cb1c5dd5e 100644 --- a/datafusion/ffi/src/table_provider.rs +++ b/datafusion/ffi/src/table_provider.rs @@ -2,6 +2,7 @@ use std::{ any::Any, ffi::{c_char, c_int, c_void, CStr, CString}, ptr::null_mut, + slice, sync::Arc, }; @@ -20,9 +21,14 @@ use 
datafusion::{ physical_plan::ExecutionPlan, prelude::{Expr, SessionContext}, }; +use datafusion_proto::{ + logical_plan::{from_proto::parse_exprs, to_proto::serialize_exprs, DefaultLogicalExtensionCodec}, + protobuf::LogicalExprList, +}; use futures::executor::block_on; +use prost::Message; -use crate::table_source::FFI_TableType; +use crate::table_source::{FFI_TableProviderFilterPushDown, FFI_TableType}; use super::{ execution_plan::{ExportedExecutionPlan, FFI_ExecutionPlan}, @@ -35,8 +41,9 @@ use datafusion::error::Result; #[derive(Debug)] #[allow(non_camel_case_types)] pub struct FFI_TableProvider { - pub version: c_int, - pub schema: Option FFI_ArrowSchema>, + pub schema: Option< + unsafe extern "C" fn(provider: *const FFI_TableProvider) -> FFI_ArrowSchema, + >, pub scan: Option< unsafe extern "C" fn( provider: *const FFI_TableProvider, @@ -49,7 +56,18 @@ pub struct FFI_TableProvider { err_code: *mut c_int, ) -> *mut FFI_ExecutionPlan, >, - pub table_type: Option FFI_TableType>, + pub table_type: + Option FFI_TableType>, + + pub supports_filters_pushdown: Option< + unsafe extern "C" fn( + provider: *const FFI_TableProvider, + filter_buff_size: c_int, + filter_buffer: *const u8, + num_filters: &mut c_int, + out: *mut FFI_TableProviderFilterPushDown, + ) -> c_int, + >, pub private_data: *mut c_void, } @@ -66,14 +84,39 @@ struct ProviderPrivateData { /// the provider's side of the FFI interace. struct ExportedTableProvider(*const FFI_TableProvider); -unsafe extern "C" fn schema_fn_wrapper(provider: *const FFI_TableProvider) -> FFI_ArrowSchema { +unsafe extern "C" fn schema_fn_wrapper( + provider: *const FFI_TableProvider, +) -> FFI_ArrowSchema { ExportedTableProvider(provider).schema() } -unsafe extern "C" fn table_type_fn_wrapper(provider: *const FFI_TableProvider) -> FFI_TableType { +unsafe extern "C" fn table_type_fn_wrapper( + provider: *const FFI_TableProvider, +) -> FFI_TableType { ExportedTableProvider(provider).table_type() } +unsafe extern "C" fn supports_filters_pushdown_fn_wrapper( + provider: *const FFI_TableProvider, + filter_buff_size: c_int, + filter_buffer: *const u8, + num_filters: &mut c_int, + out: *mut FFI_TableProviderFilterPushDown, +) -> c_int { + let results = ExportedTableProvider(provider) + .supports_filters_pushdown(filter_buff_size, filter_buffer); + + match results { + Ok(pushdowns) => { + *num_filters = pushdowns.len() as c_int; + std::ptr::copy(pushdowns.as_ptr(), out, 1); + std::mem::forget(pushdowns); + 0 + } + Err(_e) => 1, + } +} + unsafe extern "C" fn scan_fn_wrapper( provider: *const FFI_TableProvider, session_config: *const FFI_SessionConfig, @@ -84,7 +127,8 @@ unsafe extern "C" fn scan_fn_wrapper( limit: c_int, err_code: *mut c_int, ) -> *mut FFI_ExecutionPlan { - let config = unsafe { (*session_config).private_data as *const SessionConfigPrivateData }; + let config = + unsafe { (*session_config).private_data as *const SessionConfigPrivateData }; let session = SessionStateBuilder::new() .with_config((*config).config.clone()) .build(); @@ -92,10 +136,11 @@ unsafe extern "C" fn scan_fn_wrapper( let num_projections: usize = n_projections.try_into().unwrap_or(0); - let projections: Vec = std::slice::from_raw_parts(projections, num_projections) - .iter() - .filter_map(|v| (*v).try_into().ok()) - .collect(); + let projections: Vec = + std::slice::from_raw_parts(projections, num_projections) + .iter() + .filter_map(|v| (*v).try_into().ok()) + .collect(); let maybe_projections = match projections.is_empty() { true => None, false => Some(&projections), @@ 
-109,8 +154,12 @@ unsafe extern "C" fn scan_fn_wrapper( let limit = limit.try_into().ok(); - let plan = - ExportedTableProvider(provider).provider_scan(&ctx, maybe_projections, filters_vec, limit); + let plan = ExportedTableProvider(provider).provider_scan( + &ctx, + maybe_projections, + filters_vec, + limit, + ); match plan { Ok(plan) => { @@ -135,7 +184,8 @@ impl ExportedTableProvider { // This does silently fail because TableProvider does not return a result // so we expect it to always pass. - FFI_ArrowSchema::try_from(provider.schema().as_ref()).unwrap_or(FFI_ArrowSchema::empty()) + FFI_ArrowSchema::try_from(provider.schema().as_ref()) + .unwrap_or(FFI_ArrowSchema::empty()) } pub fn table_type(&self) -> FFI_TableType { @@ -169,6 +219,37 @@ impl ExportedTableProvider { let plan_boxed = Box::new(FFI_ExecutionPlan::new(plan, ctx.task_ctx())); Ok(Box::into_raw(plan_boxed)) } + + pub fn supports_filters_pushdown( + &self, + buffer_size: c_int, + filter_buffer: *const u8, + ) -> Result> { + unsafe { + let default_ctx = SessionContext::new(); + let codec = DefaultLogicalExtensionCodec {}; + + let filters = match buffer_size > 0 { + false => vec![], + true => { + let data = slice::from_raw_parts(filter_buffer, buffer_size as usize); + + let proto_filters = LogicalExprList::decode(data) + .map_err(|e| DataFusionError::External(Box::new(e)))?; + + parse_exprs(proto_filters.expr.iter(), &default_ctx, &codec)? + } + }; + let filters_borrowed: Vec<&Expr> = filters.iter().collect(); + + let private_data = self.get_private_data(); + let provider = &private_data.provider; + + provider + .supports_filters_pushdown(&filters_borrowed) + .map(|f| f.iter().map(|v| v.into()).collect()) + } + } } impl FFI_TableProvider { @@ -177,10 +258,10 @@ impl FFI_TableProvider { let private_data = Box::new(ProviderPrivateData { provider }); Self { - version: 1, schema: Some(schema_fn_wrapper), scan: Some(scan_fn_wrapper), table_type: Some(table_type_fn_wrapper), + supports_filters_pushdown: Some(supports_filters_pushdown_fn_wrapper), private_data: Box::into_raw(private_data) as *mut c_void, } } @@ -197,10 +278,10 @@ impl FFI_TableProvider { /// Creates a new empty [FFI_ArrowArrayStream]. Used to import from the C Stream Interface. pub fn empty() -> Self { Self { - version: 0, schema: None, scan: None, table_type: None, + supports_filters_pushdown: None, private_data: std::ptr::null_mut(), } } @@ -208,7 +289,7 @@ impl FFI_TableProvider { /// This wrapper struct exists on the reciever side of the FFI interface, so it has /// no guarantees about being able to access the data in `private_data`. Any functions -/// defined on this struct must only use the stable functions provided in +/// defined on this struct must only use the stable functions provided in /// FFI_TableProvider to interact with the foreign table provider. 
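// ---------------------------------------------------------------------------------
// Editor's note: the pushdown hooks above receive filters as protobuf bytes, and the
// matching consumer-side encoding step appears just below. The following standalone
// sketch shows that round trip using the same datafusion-proto helpers this patch
// calls (`serialize_exprs`, `LogicalExprList`, `parse_exprs`); the function name
// `roundtrip_filters` is invented for illustration and the sketch is not part of
// the patch itself.
// ---------------------------------------------------------------------------------
use datafusion::error::{DataFusionError, Result};
use datafusion::prelude::{col, lit, Expr, SessionContext};
use datafusion_proto::logical_plan::{
    from_proto::parse_exprs, to_proto::serialize_exprs, DefaultLogicalExtensionCodec,
};
use datafusion_proto::protobuf::LogicalExprList;
use prost::Message;

/// Encode a filter list into bytes (what the consumer hands across the FFI
/// boundary) and decode it back into owned `Expr` values (what the provider
/// does before answering the pushdown question).
fn roundtrip_filters(filters: &[&Expr]) -> Result<Vec<Expr>> {
    let codec = DefaultLogicalExtensionCodec {};

    // Consumer side: serialize to a protobuf message, then to raw bytes.
    let expr_list = LogicalExprList {
        expr: serialize_exprs(filters.iter().map(|f| f.to_owned()), &codec)?,
    };
    let filter_bytes = expr_list.encode_to_vec();

    // Provider side: decode the bytes and rebuild the expressions.
    let default_ctx = SessionContext::new();
    let proto_filters = LogicalExprList::decode(filter_bytes.as_slice())
        .map_err(|e| DataFusionError::External(Box::new(e)))?;
    let decoded_filters = parse_exprs(proto_filters.expr.iter(), &default_ctx, &codec)?;
    Ok(decoded_filters)
}

fn main() -> Result<()> {
    let filter = col("a").gt(lit(64.0_f32));
    let decoded = roundtrip_filters(&[&filter])?;
    assert_eq!(decoded.len(), 1);
    Ok(())
}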
#[derive(Debug)] pub struct ForeignTableProvider(pub *const FFI_TableProvider); @@ -225,7 +306,10 @@ impl TableProvider for ForeignTableProvider { fn schema(&self) -> SchemaRef { let schema = unsafe { let schema_fn = (*self.0).schema; - schema_fn.map(|func| func(self.0)).and_then(|s| Schema::try_from(&s).ok()).unwrap_or(Schema::empty()) + schema_fn + .map(|func| func(self.0)) + .and_then(|s| Schema::try_from(&s).ok()) + .unwrap_or(Schema::empty()) }; Arc::new(schema) @@ -234,7 +318,10 @@ impl TableProvider for ForeignTableProvider { fn table_type(&self) -> TableType { unsafe { let table_type_fn = (*self.0).table_type; - table_type_fn.map(|func| func(self.0)).unwrap_or(FFI_TableType::Base).into() + table_type_fn + .map(|func| func(self.0)) + .unwrap_or(FFI_TableType::Base) + .into() } } @@ -245,9 +332,11 @@ impl TableProvider for ForeignTableProvider { filters: &[Expr], limit: Option, ) -> Result> { - let scan_fn = unsafe { (*self.0).scan.ok_or(DataFusionError::NotImplemented( - "Scan not defined on FFI_TableProvider".to_string(), - ))?}; + let scan_fn = unsafe { + (*self.0).scan.ok_or(DataFusionError::NotImplemented( + "Scan not defined on FFI_TableProvider".to_string(), + ))? + }; let session_config = FFI_SessionConfig::new(session); @@ -300,6 +389,30 @@ impl TableProvider for ForeignTableProvider { &self, filter: &[&Expr], ) -> Result> { - todo!() + unsafe { + let codec = DefaultLogicalExtensionCodec {}; + + let expr_list = LogicalExprList { + expr: serialize_exprs(filter.iter().map(|f| f.to_owned()), &codec)? + }; + let expr_bytes = expr_list.encode_to_vec(); + let buffer_size = expr_bytes.len(); + let buffer = expr_bytes.as_ptr(); + + let pushdown_fn = (*self.0).supports_filters_pushdown.ok_or(DataFusionError::Plan("FFI_TableProvider does not implement supports_filters_pushdown".to_string()))?; + let mut num_return = 0; + let pushdowns = null_mut(); + let err_code = pushdown_fn(self.0, buffer_size as c_int, buffer, &mut num_return, pushdowns); + let pushdowns_slice = slice::from_raw_parts(pushdowns, num_return as usize); + + match err_code { + 0 => { + Ok(pushdowns_slice.iter().map(|v| v.into()).collect()) + } + _ => { + Err(DataFusionError::Plan("Error occurred during FFI call to supports_filters_pushdown".to_string())) + } + } + } } } diff --git a/datafusion/ffi/src/table_source.rs b/datafusion/ffi/src/table_source.rs index f6dbf7bc5885..077470d6411a 100644 --- a/datafusion/ffi/src/table_source.rs +++ b/datafusion/ffi/src/table_source.rs @@ -9,21 +9,29 @@ pub enum FFI_TableProviderFilterPushDown { Exact, } -impl From for TableProviderFilterPushDown { - fn from(value: FFI_TableProviderFilterPushDown) -> Self { +impl From<&FFI_TableProviderFilterPushDown> for TableProviderFilterPushDown { + fn from(value: &FFI_TableProviderFilterPushDown) -> Self { match value { - FFI_TableProviderFilterPushDown::Unsupported => TableProviderFilterPushDown::Unsupported, - FFI_TableProviderFilterPushDown::Inexact => TableProviderFilterPushDown::Inexact, + FFI_TableProviderFilterPushDown::Unsupported => { + TableProviderFilterPushDown::Unsupported + } + FFI_TableProviderFilterPushDown::Inexact => { + TableProviderFilterPushDown::Inexact + } FFI_TableProviderFilterPushDown::Exact => TableProviderFilterPushDown::Exact, } } } -impl From for FFI_TableProviderFilterPushDown { - fn from(value: TableProviderFilterPushDown) -> Self { +impl From<&TableProviderFilterPushDown> for FFI_TableProviderFilterPushDown { + fn from(value: &TableProviderFilterPushDown) -> Self { match value { - 
TableProviderFilterPushDown::Unsupported => FFI_TableProviderFilterPushDown::Unsupported, - TableProviderFilterPushDown::Inexact => FFI_TableProviderFilterPushDown::Inexact, + TableProviderFilterPushDown::Unsupported => { + FFI_TableProviderFilterPushDown::Unsupported + } + TableProviderFilterPushDown::Inexact => { + FFI_TableProviderFilterPushDown::Inexact + } TableProviderFilterPushDown::Exact => FFI_TableProviderFilterPushDown::Exact, } } @@ -57,4 +65,4 @@ impl From for FFI_TableType { TableType::Temporary => FFI_TableType::Temporary, } } -} \ No newline at end of file +} From 7fb77da40e63addd96ceeca3c38610857f5c1e3b Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Thu, 10 Oct 2024 12:01:55 -0400 Subject: [PATCH 05/28] Move plan properties over to its own file --- datafusion/ffi/src/execution_plan.rs | 324 +------------------------- datafusion/ffi/src/lib.rs | 1 + datafusion/ffi/src/plan_properties.rs | 265 +++++++++++++++++++++ datafusion/ffi/src/session_config.rs | 18 +- datafusion/ffi/src/table_provider.rs | 9 +- 5 files changed, 287 insertions(+), 330 deletions(-) create mode 100644 datafusion/ffi/src/plan_properties.rs diff --git a/datafusion/ffi/src/execution_plan.rs b/datafusion/ffi/src/execution_plan.rs index d32670457fea..5769c80a2cfd 100644 --- a/datafusion/ffi/src/execution_plan.rs +++ b/datafusion/ffi/src/execution_plan.rs @@ -1,29 +1,18 @@ use std::{ ffi::{c_char, c_void, CString}, pin::Pin, - ptr::null_mut, - slice, sync::Arc, }; -use arrow::{datatypes::Schema, ffi::FFI_ArrowSchema, ffi_stream::FFI_ArrowArrayStream}; +use arrow::ffi_stream::FFI_ArrowArrayStream; use datafusion::{ error::DataFusionError, execution::{SendableRecordBatchStream, TaskContext}, - physical_plan::{DisplayAs, ExecutionMode, ExecutionPlan, PlanProperties}, + physical_plan::{DisplayAs, ExecutionPlan, PlanProperties}, }; -use datafusion::{ - error::Result, physical_expr::EquivalenceProperties, prelude::SessionContext, -}; -use datafusion_proto::{ - physical_plan::{ - from_proto::{parse_physical_sort_exprs, parse_protobuf_partitioning}, - to_proto::{serialize_partitioning, serialize_physical_sort_exprs}, - DefaultPhysicalExtensionCodec, - }, - protobuf::{Partitioning, PhysicalSortExprNodeCollection}, -}; -use prost::Message; +use datafusion::error::Result; + +use crate::plan_properties::FFI_PlanProperties; use super::record_batch_stream::{ record_batch_to_arrow_stream, ConsumerRecordBatchStream, @@ -97,7 +86,7 @@ unsafe extern "C" fn execute_fn_wrapper( let record_batch_stream = match (*private_data) .plan - .execute(partition, (*private_data).context.clone()) + .execute(partition, Arc::clone(&(*private_data).context)) { Ok(rbs) => rbs, Err(_e) => { @@ -148,14 +137,13 @@ impl DisplayAs for ExportedExecutionPlan { impl FFI_ExecutionPlan { /// This function is called on the provider's side. 
- pub fn new(plan: Arc, context: Arc) -> Self { + pub fn new(plan: Arc, context: Arc) -> Self { let children = plan .children() .into_iter() - .map(|child| Box::new(FFI_ExecutionPlan::new(child.clone(), context.clone()))) + .map(|child| Box::new(FFI_ExecutionPlan::new(Arc::clone(child), Arc::clone(&context)))) .map(|child| Box::into_raw(child) as *const FFI_ExecutionPlan) .collect(); - println!("children collected"); let private_data = Box::new(ExecutionPlanPrivateData { plan, @@ -163,7 +151,6 @@ impl FFI_ExecutionPlan { context, last_error: None, }); - println!("generated private data, ready to return"); Self { properties: Some(properties_fn_wrapper), @@ -198,17 +185,14 @@ impl ExportedExecutionPlan { .unwrap_or("Unable to parse FFI_ExecutionPlan name") .to_string(); - println!("entered ExportedExecutionPlan::new"); let properties = unsafe { let properties_fn = (*plan).properties.ok_or(DataFusionError::NotImplemented( "properties not implemented on FFI_ExecutionPlan".to_string(), ))?; - println!("About to call properties fn"); properties_fn(plan).try_into()? }; - println!("created properties"); let children = unsafe { let children_fn = (*plan).children.ok_or(DataFusionError::NotImplemented( "children not implemented on FFI_ExecutionPlan".to_string(), @@ -217,11 +201,6 @@ impl ExportedExecutionPlan { let mut err_code = 0; let children_ptr = children_fn(plan, &mut num_children, &mut err_code); - println!( - "We called the FFI function children so the provider told us we have {} children", - num_children - ); - if err_code != 0 { return Err(DataFusionError::Plan( "Error getting children for FFI_ExecutionPlan".to_string(), @@ -232,25 +211,16 @@ impl ExportedExecutionPlan { let maybe_children: Result> = ffi_vec .into_iter() .map(|child| { - println!("Ok, we are about to examine a child ffi_executionplan"); - if let Some(props_fn) = (*child).properties { - println!("We do have properties on the child "); - let child_props = props_fn(child); - println!("Child schema {:?}", child_props.schema); - } let child_plan = ExportedExecutionPlan::new(child); child_plan.map(|c| Arc::new(c) as Arc) }) .collect(); - println!("finsihed maybe children"); maybe_children? 
}; - println!("About to return ExportedExecurtionPlan"); - Ok(Self { name, plan, @@ -313,281 +283,3 @@ impl ExecutionPlan for ExportedExecutionPlan { } } } - -#[repr(C)] -#[derive(Debug)] -#[allow(missing_docs)] -#[allow(non_camel_case_types)] -pub struct FFI_PlanProperties { - // Returns protobuf serialized bytes of the partitioning - pub output_partitioning: Option< - unsafe extern "C" fn( - plan: *const FFI_PlanProperties, - buffer_size: &mut usize, - buffer_bytes: &mut *mut u8, - ) -> i32, - >, - - pub execution_mode: Option< - unsafe extern "C" fn(plan: *const FFI_PlanProperties) -> FFI_ExecutionMode, - >, - - // PhysicalSortExprNodeCollection proto - pub output_ordering: Option< - unsafe extern "C" fn( - plan: *const FFI_PlanProperties, - buffer_size: &mut usize, - buffer_bytes: &mut *mut u8, - ) -> i32, - >, - - pub schema: - Option FFI_ArrowSchema>, - - pub private_data: *mut c_void, -} - -unsafe extern "C" fn output_partitioning_fn_wrapper( - properties: *const FFI_PlanProperties, - buffer_size: &mut usize, - buffer_bytes: &mut *mut u8, -) -> i32 { - // let private_data = (*plan).private_data as *const ExecutionPlanPrivateData; - // let properties = (*private_data).plan.properties(); - // properties.clone().into() - let private_data = (*properties).private_data as *const PlanProperties; - let partitioning = (*private_data).output_partitioning(); - - let codec = DefaultPhysicalExtensionCodec {}; - let partitioning_data = match serialize_partitioning(partitioning, &codec) { - Ok(p) => p, - Err(_) => return 1, - }; - - let mut partition_bytes = partitioning_data.encode_to_vec(); - *buffer_size = partition_bytes.len(); - *buffer_bytes = partition_bytes.as_mut_ptr(); - - std::mem::forget(partition_bytes); - - 0 -} - -unsafe extern "C" fn execution_mode_fn_wrapper( - properties: *const FFI_PlanProperties, -) -> FFI_ExecutionMode { - // let private_data = (*plan).private_data as *const ExecutionPlanPrivateData; - // let properties = (*private_data).plan.properties(); - // properties.clone().into() - let private_data = (*properties).private_data as *const PlanProperties; - let execution_mode = (*private_data).execution_mode(); - - execution_mode.into() -} - -unsafe extern "C" fn output_ordering_fn_wrapper( - properties: *const FFI_PlanProperties, - buffer_size: &mut usize, - buffer_bytes: &mut *mut u8, -) -> i32 { - // let private_data = (*plan).private_data as *const ExecutionPlanPrivateData; - // let properties = (*private_data).plan.properties(); - // properties.clone().into() - let private_data = (*properties).private_data as *const PlanProperties; - let output_ordering = match (*private_data).output_ordering() { - Some(o) => o, - None => { - *buffer_size = 0; - return 0; - } - } - .to_owned(); - - let codec = DefaultPhysicalExtensionCodec {}; - let physical_sort_expr_nodes = - match serialize_physical_sort_exprs(output_ordering, &codec) { - Ok(p) => p, - Err(_) => return 1, - }; - - let ordering_data = PhysicalSortExprNodeCollection { - physical_sort_expr_nodes, - }; - - let mut ordering_bytes = ordering_data.encode_to_vec(); - *buffer_size = ordering_bytes.len(); - *buffer_bytes = ordering_bytes.as_mut_ptr(); - std::mem::forget(ordering_bytes); - - 0 -} - -// pub schema: Option FFI_ArrowSchema>, -unsafe extern "C" fn schema_fn_wrapper( - properties: *const FFI_PlanProperties, -) -> FFI_ArrowSchema { - let private_data = (*properties).private_data as *const PlanProperties; - let schema = (*private_data).eq_properties.schema(); - - // This does silently fail because TableProvider does 
not return a result - // so we expect it to always pass. Maybe some logging should be added. - FFI_ArrowSchema::try_from(schema.as_ref()).unwrap_or(FFI_ArrowSchema::empty()) -} - -impl From for FFI_PlanProperties { - fn from(value: PlanProperties) -> Self { - let private_data = Box::new(value); - - Self { - output_partitioning: Some(output_partitioning_fn_wrapper), - execution_mode: Some(execution_mode_fn_wrapper), - output_ordering: Some(output_ordering_fn_wrapper), - schema: Some(schema_fn_wrapper), - private_data: Box::into_raw(private_data) as *mut c_void, - } - } -} - -// /// Creates a new [`FFI_TableProvider`]. -// pub fn new(provider: Box) -> Self { -// let private_data = Box::new(ProviderPrivateData { -// provider, -// last_error: None, -// }); - -// Self { -// version: 2, -// schema: Some(provider_schema), -// scan: Some(provider_scan), -// private_data: Box::into_raw(private_data) as *mut c_void, -// } -// } - -impl TryFrom for PlanProperties { - type Error = DataFusionError; - - fn try_from(value: FFI_PlanProperties) -> std::result::Result { - unsafe { - let schema_fn = value.schema.ok_or(DataFusionError::NotImplemented( - "schema() not implemented on FFI_PlanProperties".to_string(), - ))?; - let ffi_schema = schema_fn(&value); - let schema: Schema = (&ffi_schema).try_into()?; - - let ordering_fn = - value - .output_ordering - .ok_or(DataFusionError::NotImplemented( - "output_ordering() not implemented on FFI_PlanProperties" - .to_string(), - ))?; - let mut buff_size = 0; - let mut buff = null_mut(); - if ordering_fn(&value, &mut buff_size, &mut buff) != 0 { - return Err(DataFusionError::Plan( - "Error occurred during FFI call to output_ordering in FFI_PlanProperties" - .to_string(), - )); - } - - // TODO we will need to get these, but unsure if it happesn on the provider or consumer right now. - let default_ctx = SessionContext::new(); - let codex = DefaultPhysicalExtensionCodec {}; - - let orderings = match buff_size == 0 { - true => None, - false => { - let data = slice::from_raw_parts(buff, buff_size); - - let proto_output_ordering = - PhysicalSortExprNodeCollection::decode(data) - .map_err(|e| DataFusionError::External(Box::new(e)))?; - - Some(parse_physical_sort_exprs( - &proto_output_ordering.physical_sort_expr_nodes, - &default_ctx, - &schema, - &codex, - )?) - } - }; - - let partitioning_fn = - value - .output_partitioning - .ok_or(DataFusionError::NotImplemented( - "output_partitioning() not implemented on FFI_PlanProperties" - .to_string(), - ))?; - if partitioning_fn(&value, &mut buff_size, &mut buff) != 0 { - return Err(DataFusionError::Plan( - "Error occurred during FFI call to output_partitioning in FFI_PlanProperties" - .to_string(), - )); - } - let data = slice::from_raw_parts(buff, buff_size); - - let proto_partitioning = Partitioning::decode(data) - .map_err(|e| DataFusionError::External(Box::new(e)))?; - // TODO: Validate this unwrap is safe. - let partitioning = parse_protobuf_partitioning( - Some(&proto_partitioning), - &default_ctx, - &schema, - &codex, - )? 
- .unwrap(); - - let execution_mode_fn = - value.execution_mode.ok_or(DataFusionError::NotImplemented( - "execution_mode() not implemented on FFI_PlanProperties".to_string(), - ))?; - let execution_mode = execution_mode_fn(&value).into(); - - let eq_properties = match orderings { - Some(ordering) => EquivalenceProperties::new_with_orderings( - Arc::new(schema), - &[ordering], - ), - None => EquivalenceProperties::new(Arc::new(schema)), - }; - - Ok(Self::new(eq_properties, partitioning, execution_mode)) - } - } - // fn from(value: FFI_PlanProperties) -> Self { - // let schema = self.schema() - - // let equiv_prop = EquivalenceProperties::new_with_orderings(schema, orderings); - // } -} - -#[repr(C)] -#[allow(non_camel_case_types)] -pub enum FFI_ExecutionMode { - Bounded, - - Unbounded, - - PipelineBreaking, -} - -impl From for FFI_ExecutionMode { - fn from(value: ExecutionMode) -> Self { - match value { - ExecutionMode::Bounded => FFI_ExecutionMode::Bounded, - ExecutionMode::Unbounded => FFI_ExecutionMode::Unbounded, - ExecutionMode::PipelineBreaking => FFI_ExecutionMode::PipelineBreaking, - } - } -} - -impl From for ExecutionMode { - fn from(value: FFI_ExecutionMode) -> Self { - match value { - FFI_ExecutionMode::Bounded => ExecutionMode::Bounded, - FFI_ExecutionMode::Unbounded => ExecutionMode::Unbounded, - FFI_ExecutionMode::PipelineBreaking => ExecutionMode::PipelineBreaking, - } - } -} diff --git a/datafusion/ffi/src/lib.rs b/datafusion/ffi/src/lib.rs index ef6825c43d42..72d3bfba1b9f 100644 --- a/datafusion/ffi/src/lib.rs +++ b/datafusion/ffi/src/lib.rs @@ -18,6 +18,7 @@ #![deny(clippy::clone_on_ref_ptr)] pub mod execution_plan; +pub mod plan_properties; pub mod record_batch_stream; pub mod session_config; pub mod table_provider; diff --git a/datafusion/ffi/src/plan_properties.rs b/datafusion/ffi/src/plan_properties.rs new file mode 100644 index 000000000000..e45b64d0da5a --- /dev/null +++ b/datafusion/ffi/src/plan_properties.rs @@ -0,0 +1,265 @@ +use std::{ffi::c_void, ptr::null_mut, slice, sync::Arc}; + +use arrow::ffi::FFI_ArrowSchema; +use datafusion::{error::DataFusionError, physical_expr::EquivalenceProperties, physical_plan::{ExecutionMode, PlanProperties}, prelude::SessionContext}; +use datafusion_proto::{physical_plan::{from_proto::{parse_physical_sort_exprs, parse_protobuf_partitioning}, to_proto::{serialize_partitioning, serialize_physical_sort_exprs}, DefaultPhysicalExtensionCodec}, protobuf::{Partitioning, PhysicalSortExprNodeCollection}}; +use prost::Message; + + +#[repr(C)] +#[allow(non_camel_case_types)] +pub enum FFI_ExecutionMode { + Bounded, + + Unbounded, + + PipelineBreaking, +} + +impl From for FFI_ExecutionMode { + fn from(value: ExecutionMode) -> Self { + match value { + ExecutionMode::Bounded => FFI_ExecutionMode::Bounded, + ExecutionMode::Unbounded => FFI_ExecutionMode::Unbounded, + ExecutionMode::PipelineBreaking => FFI_ExecutionMode::PipelineBreaking, + } + } +} + +impl From for ExecutionMode { + fn from(value: FFI_ExecutionMode) -> Self { + match value { + FFI_ExecutionMode::Bounded => ExecutionMode::Bounded, + FFI_ExecutionMode::Unbounded => ExecutionMode::Unbounded, + FFI_ExecutionMode::PipelineBreaking => ExecutionMode::PipelineBreaking, + } + } +} + +#[repr(C)] +#[derive(Debug)] +#[allow(missing_docs)] +#[allow(non_camel_case_types)] +pub struct FFI_PlanProperties { + // Returns protobuf serialized bytes of the partitioning + pub output_partitioning: Option< + unsafe extern "C" fn( + plan: *const FFI_PlanProperties, + buffer_size: &mut usize, + 
buffer_bytes: &mut *mut u8, + ) -> i32, + >, + + pub execution_mode: Option< + unsafe extern "C" fn(plan: *const FFI_PlanProperties) -> FFI_ExecutionMode, + >, + + // PhysicalSortExprNodeCollection proto + pub output_ordering: Option< + unsafe extern "C" fn( + plan: *const FFI_PlanProperties, + buffer_size: &mut usize, + buffer_bytes: &mut *mut u8, + ) -> i32, + >, + + pub schema: + Option FFI_ArrowSchema>, + + pub private_data: *mut c_void, +} + +unsafe extern "C" fn output_partitioning_fn_wrapper( + properties: *const FFI_PlanProperties, + buffer_size: &mut usize, + buffer_bytes: &mut *mut u8, +) -> i32 { + // let private_data = (*plan).private_data as *const ExecutionPlanPrivateData; + // let properties = (*private_data).plan.properties(); + // properties.clone().into() + let private_data = (*properties).private_data as *const PlanProperties; + let partitioning = (*private_data).output_partitioning(); + + let codec = DefaultPhysicalExtensionCodec {}; + let partitioning_data = match serialize_partitioning(partitioning, &codec) { + Ok(p) => p, + Err(_) => return 1, + }; + + let mut partition_bytes = partitioning_data.encode_to_vec(); + *buffer_size = partition_bytes.len(); + *buffer_bytes = partition_bytes.as_mut_ptr(); + + std::mem::forget(partition_bytes); + + 0 +} + +unsafe extern "C" fn execution_mode_fn_wrapper( + properties: *const FFI_PlanProperties, +) -> FFI_ExecutionMode { + // let private_data = (*plan).private_data as *const ExecutionPlanPrivateData; + // let properties = (*private_data).plan.properties(); + // properties.clone().into() + let private_data = (*properties).private_data as *const PlanProperties; + let execution_mode = (*private_data).execution_mode(); + + execution_mode.into() +} + +unsafe extern "C" fn output_ordering_fn_wrapper( + properties: *const FFI_PlanProperties, + buffer_size: &mut usize, + buffer_bytes: &mut *mut u8, +) -> i32 { + // let private_data = (*plan).private_data as *const ExecutionPlanPrivateData; + // let properties = (*private_data).plan.properties(); + // properties.clone().into() + let private_data = (*properties).private_data as *const PlanProperties; + let output_ordering = match (*private_data).output_ordering() { + Some(o) => o, + None => { + *buffer_size = 0; + return 0; + } + } + .to_owned(); + + let codec = DefaultPhysicalExtensionCodec {}; + let physical_sort_expr_nodes = + match serialize_physical_sort_exprs(output_ordering, &codec) { + Ok(p) => p, + Err(_) => return 1, + }; + + let ordering_data = PhysicalSortExprNodeCollection { + physical_sort_expr_nodes, + }; + + let mut ordering_bytes = ordering_data.encode_to_vec(); + *buffer_size = ordering_bytes.len(); + *buffer_bytes = ordering_bytes.as_mut_ptr(); + std::mem::forget(ordering_bytes); + + 0 +} + +// pub schema: Option FFI_ArrowSchema>, +unsafe extern "C" fn schema_fn_wrapper( + properties: *const FFI_PlanProperties, +) -> FFI_ArrowSchema { + let private_data = (*properties).private_data as *const PlanProperties; + let schema = (*private_data).eq_properties.schema(); + + // This does silently fail because TableProvider does not return a result + // so we expect it to always pass. Maybe some logging should be added. 
+ FFI_ArrowSchema::try_from(schema.as_ref()).unwrap_or(FFI_ArrowSchema::empty()) +} + +impl From for FFI_PlanProperties { + fn from(value: PlanProperties) -> Self { + let private_data = Box::new(value); + + Self { + output_partitioning: Some(output_partitioning_fn_wrapper), + execution_mode: Some(execution_mode_fn_wrapper), + output_ordering: Some(output_ordering_fn_wrapper), + schema: Some(schema_fn_wrapper), + private_data: Box::into_raw(private_data) as *mut c_void, + } + } +} + +impl TryFrom for PlanProperties { + type Error = DataFusionError; + + fn try_from(value: FFI_PlanProperties) -> std::result::Result { + unsafe { + let schema_fn = value.schema.ok_or(DataFusionError::NotImplemented( + "schema() not implemented on FFI_PlanProperties".to_string(), + ))?; + let ffi_schema = schema_fn(&value); + let schema = (&ffi_schema).try_into()?; + + let ordering_fn = + value + .output_ordering + .ok_or(DataFusionError::NotImplemented( + "output_ordering() not implemented on FFI_PlanProperties" + .to_string(), + ))?; + let mut buff_size = 0; + let mut buff = null_mut(); + if ordering_fn(&value, &mut buff_size, &mut buff) != 0 { + return Err(DataFusionError::Plan( + "Error occurred during FFI call to output_ordering in FFI_PlanProperties" + .to_string(), + )); + } + + // TODO we will need to get these, but unsure if it happesn on the provider or consumer right now. + let default_ctx = SessionContext::new(); + let codex = DefaultPhysicalExtensionCodec {}; + + let orderings = match buff_size == 0 { + true => None, + false => { + let data = slice::from_raw_parts(buff, buff_size); + + let proto_output_ordering = + PhysicalSortExprNodeCollection::decode(data) + .map_err(|e| DataFusionError::External(Box::new(e)))?; + + Some(parse_physical_sort_exprs( + &proto_output_ordering.physical_sort_expr_nodes, + &default_ctx, + &schema, + &codex, + )?) + } + }; + + let partitioning_fn = + value + .output_partitioning + .ok_or(DataFusionError::NotImplemented( + "output_partitioning() not implemented on FFI_PlanProperties" + .to_string(), + ))?; + if partitioning_fn(&value, &mut buff_size, &mut buff) != 0 { + return Err(DataFusionError::Plan( + "Error occurred during FFI call to output_partitioning in FFI_PlanProperties" + .to_string(), + )); + } + let data = slice::from_raw_parts(buff, buff_size); + + let proto_partitioning = Partitioning::decode(data) + .map_err(|e| DataFusionError::External(Box::new(e)))?; + // TODO: Validate this unwrap is safe. + let partitioning = parse_protobuf_partitioning( + Some(&proto_partitioning), + &default_ctx, + &schema, + &codex, + )? 
+ .unwrap(); + + let execution_mode_fn = + value.execution_mode.ok_or(DataFusionError::NotImplemented( + "execution_mode() not implemented on FFI_PlanProperties".to_string(), + ))?; + let execution_mode = execution_mode_fn(&value).into(); + + let eq_properties = match orderings { + Some(ordering) => EquivalenceProperties::new_with_orderings( + Arc::new(schema), + &[ordering], + ), + None => EquivalenceProperties::new(Arc::new(schema)), + }; + + Ok(Self::new(eq_properties, partitioning, execution_mode)) + } + } +} diff --git a/datafusion/ffi/src/session_config.rs b/datafusion/ffi/src/session_config.rs index 7a436ca3b288..e16609ff7888 100644 --- a/datafusion/ffi/src/session_config.rs +++ b/datafusion/ffi/src/session_config.rs @@ -1,4 +1,4 @@ -use std::ffi::{c_void, CString}; +use std::ffi::c_void; use datafusion::{catalog::Session, prelude::SessionConfig}; @@ -14,18 +14,19 @@ pub struct FFI_SessionConfig { unsafe impl Send for FFI_SessionConfig {} -pub struct SessionConfigPrivateData { +struct SessionConfigPrivateData { pub config: SessionConfig, - pub last_error: Option, } -pub struct ExportedSessionConfig { - session: *mut FFI_SessionConfig, -} +pub struct ExportedSessionConfig(pub *const FFI_SessionConfig); impl ExportedSessionConfig { - fn get_private_data(&mut self) -> &mut SessionConfigPrivateData { - unsafe { &mut *((*self.session).private_data as *mut SessionConfigPrivateData) } + fn get_private_data(&self) -> &SessionConfigPrivateData { + unsafe { &*((*self.0).private_data as *const SessionConfigPrivateData) } + } + + pub fn session_config(&self) -> &SessionConfig { + &self.get_private_data().config } } @@ -35,7 +36,6 @@ impl FFI_SessionConfig { let config = session.config().clone(); let private_data = Box::new(SessionConfigPrivateData { config, - last_error: None, }); Self { diff --git a/datafusion/ffi/src/table_provider.rs b/datafusion/ffi/src/table_provider.rs index 638cb1c5dd5e..252058dbcbb1 100644 --- a/datafusion/ffi/src/table_provider.rs +++ b/datafusion/ffi/src/table_provider.rs @@ -28,11 +28,11 @@ use datafusion_proto::{ use futures::executor::block_on; use prost::Message; -use crate::table_source::{FFI_TableProviderFilterPushDown, FFI_TableType}; +use crate::{session_config::ExportedSessionConfig, table_source::{FFI_TableProviderFilterPushDown, FFI_TableType}}; use super::{ execution_plan::{ExportedExecutionPlan, FFI_ExecutionPlan}, - session_config::{FFI_SessionConfig, SessionConfigPrivateData}, + session_config::FFI_SessionConfig, }; use datafusion::error::Result; @@ -127,10 +127,9 @@ unsafe extern "C" fn scan_fn_wrapper( limit: c_int, err_code: *mut c_int, ) -> *mut FFI_ExecutionPlan { - let config = - unsafe { (*session_config).private_data as *const SessionConfigPrivateData }; + let config = ExportedSessionConfig(session_config).session_config().clone(); let session = SessionStateBuilder::new() - .with_config((*config).config.clone()) + .with_config(config) .build(); let ctx = SessionContext::new_with_state(session); From 3ab0f9f31109f8b4b84c232345a3977efda1d741 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Thu, 10 Oct 2024 12:25:23 -0400 Subject: [PATCH 06/28] Adding release function --- datafusion/ffi/src/plan_properties.rs | 28 +++++++++++++++++++++----- datafusion/ffi/src/session_config.rs | 29 +++++++++++++++++++++++---- 2 files changed, 48 insertions(+), 9 deletions(-) diff --git a/datafusion/ffi/src/plan_properties.rs b/datafusion/ffi/src/plan_properties.rs index e45b64d0da5a..120d3c7f62bf 100644 --- a/datafusion/ffi/src/plan_properties.rs +++ 
b/datafusion/ffi/src/plan_properties.rs @@ -6,13 +6,12 @@ use datafusion_proto::{physical_plan::{from_proto::{parse_physical_sort_exprs, p use prost::Message; +// TODO: should we just make ExecutionMode repr(C)? #[repr(C)] #[allow(non_camel_case_types)] pub enum FFI_ExecutionMode { Bounded, - Unbounded, - PipelineBreaking, } @@ -66,6 +65,8 @@ pub struct FFI_PlanProperties { pub schema: Option FFI_ArrowSchema>, + pub release: Option, + pub private_data: *mut c_void, } @@ -74,9 +75,6 @@ unsafe extern "C" fn output_partitioning_fn_wrapper( buffer_size: &mut usize, buffer_bytes: &mut *mut u8, ) -> i32 { - // let private_data = (*plan).private_data as *const ExecutionPlanPrivateData; - // let properties = (*private_data).plan.properties(); - // properties.clone().into() let private_data = (*properties).private_data as *const PlanProperties; let partitioning = (*private_data).output_partitioning(); @@ -156,6 +154,25 @@ unsafe extern "C" fn schema_fn_wrapper( FFI_ArrowSchema::try_from(schema.as_ref()).unwrap_or(FFI_ArrowSchema::empty()) } +unsafe extern "C" fn release_fn_wrapper(props: *mut FFI_PlanProperties) { + if props.is_null() { + return; + } + let props = &mut *props; + + props.execution_mode = None; + props.output_partitioning = None; + props.execution_mode = None; + props.output_ordering = None; + props.schema = None; + + let private_data = Box::from_raw(props.private_data as *mut PlanProperties); + drop(private_data); + + props.release = None; +} + + impl From for FFI_PlanProperties { fn from(value: PlanProperties) -> Self { let private_data = Box::new(value); @@ -165,6 +182,7 @@ impl From for FFI_PlanProperties { execution_mode: Some(execution_mode_fn_wrapper), output_ordering: Some(output_ordering_fn_wrapper), schema: Some(schema_fn_wrapper), + release: Some(release_fn_wrapper), private_data: Box::into_raw(private_data) as *mut c_void, } } diff --git a/datafusion/ffi/src/session_config.rs b/datafusion/ffi/src/session_config.rs index e16609ff7888..fd8065c038e3 100644 --- a/datafusion/ffi/src/session_config.rs +++ b/datafusion/ffi/src/session_config.rs @@ -7,13 +7,25 @@ use datafusion::{catalog::Session, prelude::SessionConfig}; #[allow(missing_docs)] #[allow(non_camel_case_types)] pub struct FFI_SessionConfig { - pub version: i64, - pub private_data: *mut c_void, + pub release: Option, } unsafe impl Send for FFI_SessionConfig {} +unsafe extern "C" fn release_fn_wrapper(config: *mut FFI_SessionConfig) { + if config.is_null() { + return; + } + let config = &mut *config; + + let private_data = Box::from_raw(config.private_data as *mut SessionConfigPrivateData); + drop(private_data); + + config.release = None; +} + + struct SessionConfigPrivateData { pub config: SessionConfig, } @@ -31,7 +43,7 @@ impl ExportedSessionConfig { } impl FFI_SessionConfig { - /// Creates a new [`FFI_TableProvider`]. + /// Creates a new [`FFI_SessionConfig`]. 
pub fn new(session: &dyn Session) -> Self { let config = session.config().clone(); let private_data = Box::new(SessionConfigPrivateData { @@ -39,8 +51,17 @@ impl FFI_SessionConfig { }); Self { - version: 2, private_data: Box::into_raw(private_data) as *mut c_void, + release: Some(release_fn_wrapper), } } } + +impl Drop for FFI_SessionConfig { + fn drop(&mut self) { + match self.release { + None => (), + Some(release) => unsafe { release(self) }, + }; + } +} \ No newline at end of file From c6162273560384ed8e56429510a989d68fc79156 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Thu, 10 Oct 2024 12:42:32 -0400 Subject: [PATCH 07/28] Adding release functions to additional structs --- datafusion/ffi/src/execution_plan.rs | 66 +++++++++++++++++++-------- datafusion/ffi/src/plan_properties.rs | 9 ++++ datafusion/ffi/src/table_provider.rs | 31 +++++++++++++ 3 files changed, 88 insertions(+), 18 deletions(-) diff --git a/datafusion/ffi/src/execution_plan.rs b/datafusion/ffi/src/execution_plan.rs index 5769c80a2cfd..5d7fedc11936 100644 --- a/datafusion/ffi/src/execution_plan.rs +++ b/datafusion/ffi/src/execution_plan.rs @@ -33,14 +33,15 @@ pub struct FFI_ExecutionPlan { err_code: &mut i32, ) -> *mut *const FFI_ExecutionPlan, >, - pub name: unsafe extern "C" fn(plan: *const FFI_ExecutionPlan) -> *const c_char, + pub name: Option *const c_char>, - pub execute: unsafe extern "C" fn( + pub execute: Option FFI_ArrowArrayStream, + ) -> FFI_ArrowArrayStream>, + pub release: Option, pub private_data: *mut c_void, } @@ -107,6 +108,24 @@ unsafe extern "C" fn name_fn_wrapper(plan: *const FFI_ExecutionPlan) -> *const c .into_raw() } +unsafe extern "C" fn release_fn_wrapper(plan: *mut FFI_ExecutionPlan) { + if plan.is_null() { + return; + } + let plan = &mut *plan; + + plan.properties = None; + plan.children = None; + plan.name = None; + plan.execute = None; + + let private_data = Box::from_raw(plan.private_data as *mut ExecutionPlanPrivateData); + drop(private_data); + + plan.release = None; +} + + // Since the trait ExecutionPlan requires borrowed values, we wrap our FFI. // This struct exists on the consumer side (datafusion-python, for example) and not // in the provider's side. 
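// ---------------------------------------------------------------------------------
// Editor's note: a self-contained, std-only sketch of the release-callback pattern
// the hunks above introduce. `FFI_Example` and its fields are invented for
// illustration; the point is that the struct owns its `private_data` through an
// explicit `release` function pointer, and `Drop` on the Rust side simply forwards
// to that callback so the side that allocated the data is the side that frees it.
// ---------------------------------------------------------------------------------
use std::ffi::c_void;
use std::ptr::null_mut;

#[repr(C)]
pub struct FFI_Example {
    pub release: Option<unsafe extern "C" fn(example: *mut FFI_Example)>,
    pub private_data: *mut c_void,
}

#[allow(dead_code)]
struct ExamplePrivateData {
    rows_scanned: usize,
}

unsafe extern "C" fn release_fn_wrapper(example: *mut FFI_Example) {
    if example.is_null() {
        return;
    }
    let example = &mut *example;

    // Reclaim the Box leaked in `new` so the private data is freed exactly once.
    let private_data = Box::from_raw(example.private_data as *mut ExamplePrivateData);
    drop(private_data);

    example.private_data = null_mut();
    example.release = None;
}

impl FFI_Example {
    pub fn new(rows_scanned: usize) -> Self {
        let private_data = Box::new(ExamplePrivateData { rows_scanned });
        Self {
            release: Some(release_fn_wrapper),
            private_data: Box::into_raw(private_data) as *mut c_void,
        }
    }
}

impl Drop for FFI_Example {
    fn drop(&mut self) {
        match self.release {
            None => (),
            Some(release) => unsafe { release(self) },
        };
    }
}

fn main() {
    // Dropping the handle routes through the release callback.
    let example = FFI_Example::new(3);
    drop(example);
}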
@@ -155,19 +174,21 @@ impl FFI_ExecutionPlan { Self { properties: Some(properties_fn_wrapper), children: Some(children_fn_wrapper), - name: name_fn_wrapper, - execute: execute_fn_wrapper, + name: Some(name_fn_wrapper), + execute: Some(execute_fn_wrapper), + release: Some(release_fn_wrapper), private_data: Box::into_raw(private_data) as *mut c_void, } } +} - // pub fn empty() -> Self { - // Self { - // properties: None, - // children: None, - // private_data: std::ptr::null_mut(), - // } - // } +impl Drop for FFI_ExecutionPlan { + fn drop(&mut self) { + match self.release { + None => (), + Some(release) => unsafe { release(self) }, + }; + } } impl ExportedExecutionPlan { @@ -178,12 +199,16 @@ impl ExportedExecutionPlan { /// The caller must ensure the pointer provided points to a valid implementation /// of FFI_ExecutionPlan pub unsafe fn new(plan: *const FFI_ExecutionPlan) -> Result { - let name_fn = (*plan).name; - let name_cstr = name_fn(plan); - let name = CString::from_raw(name_cstr as *mut c_char) - .to_str() - .unwrap_or("Unable to parse FFI_ExecutionPlan name") - .to_string(); + let name_ptr = (*plan).name.map(|func| func(plan)); + let name = match name_ptr { + Some(name_cstr) => { + CString::from_raw(name_cstr as *mut c_char) + .to_str() + .unwrap_or("Unable to parse FFI_ExecutionPlan name") + .to_string() + } + None => "Plan has no name".to_string() + }; let properties = unsafe { let properties_fn = @@ -269,6 +294,11 @@ impl ExecutionPlan for ExportedExecutionPlan { ) -> datafusion::error::Result { unsafe { let execute_fn = (*self.plan).execute; + if execute_fn.is_none() { + return Err(DataFusionError::Execution("execute is not defined on FFI_ExecutionPlan".to_string())); + } + let execute_fn = execute_fn.unwrap(); + let mut err_code = 0; let arrow_stream = execute_fn(self.plan, partition, &mut err_code); diff --git a/datafusion/ffi/src/plan_properties.rs b/datafusion/ffi/src/plan_properties.rs index 120d3c7f62bf..a10d95bd5c71 100644 --- a/datafusion/ffi/src/plan_properties.rs +++ b/datafusion/ffi/src/plan_properties.rs @@ -173,6 +173,15 @@ unsafe extern "C" fn release_fn_wrapper(props: *mut FFI_PlanProperties) { } +impl Drop for FFI_PlanProperties { + fn drop(&mut self) { + match self.release { + None => (), + Some(release) => unsafe { release(self) }, + }; + } +} + impl From for FFI_PlanProperties { fn from(value: PlanProperties) -> Self { let private_data = Box::new(value); diff --git a/datafusion/ffi/src/table_provider.rs b/datafusion/ffi/src/table_provider.rs index 252058dbcbb1..dc35b1339a5f 100644 --- a/datafusion/ffi/src/table_provider.rs +++ b/datafusion/ffi/src/table_provider.rs @@ -69,6 +69,7 @@ pub struct FFI_TableProvider { ) -> c_int, >, + pub release: Option, pub private_data: *mut c_void, } @@ -172,6 +173,34 @@ unsafe extern "C" fn scan_fn_wrapper( } } + +unsafe extern "C" fn release_fn_wrapper(provider: *mut FFI_TableProvider) { + if provider.is_null() { + return; + } + let provider = &mut *provider; + + provider.schema = None; + provider.scan = None; + provider.table_type = None; + provider.supports_filters_pushdown = None; + + let private_data = Box::from_raw(provider.private_data as *mut ProviderPrivateData); + drop(private_data); + + provider.release = None; +} + + +impl Drop for FFI_TableProvider { + fn drop(&mut self) { + match self.release { + None => (), + Some(release) => unsafe { release(self) }, + }; + } +} + impl ExportedTableProvider { fn get_private_data(&self) -> &ProviderPrivateData { unsafe { &*((*self.0).private_data as *const ProviderPrivateData) 
} @@ -261,6 +290,7 @@ impl FFI_TableProvider { scan: Some(scan_fn_wrapper), table_type: Some(table_type_fn_wrapper), supports_filters_pushdown: Some(supports_filters_pushdown_fn_wrapper), + release: Some(release_fn_wrapper), private_data: Box::into_raw(private_data) as *mut c_void, } } @@ -281,6 +311,7 @@ impl FFI_TableProvider { scan: None, table_type: None, supports_filters_pushdown: None, + release: None, private_data: std::ptr::null_mut(), } } From 461b4a97de81f09238c43ea2db9f242eb53f436e Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Thu, 10 Oct 2024 13:37:05 -0400 Subject: [PATCH 08/28] Resolve memory leaks --- datafusion/ffi/src/execution_plan.rs | 65 ++++++++++++++++++----- datafusion/ffi/src/plan_properties.rs | 27 ++++++++-- datafusion/ffi/src/record_batch_stream.rs | 17 ++++++ datafusion/ffi/src/session_config.rs | 17 ++++++ datafusion/ffi/src/table_provider.rs | 20 ++++++- datafusion/ffi/src/table_source.rs | 17 ++++++ 6 files changed, 143 insertions(+), 20 deletions(-) diff --git a/datafusion/ffi/src/execution_plan.rs b/datafusion/ffi/src/execution_plan.rs index 5d7fedc11936..bd20f4516c70 100644 --- a/datafusion/ffi/src/execution_plan.rs +++ b/datafusion/ffi/src/execution_plan.rs @@ -1,3 +1,20 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
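// ---------------------------------------------------------------------------------
// Editor's note: a std-only sketch of the buffer hand-off this commit settles on for
// the protobuf-encoded partitioning and ordering data. The producing side leaks an
// exact-size allocation and returns a pointer plus length; the consuming side
// rebuilds an owning Vec with `Vec::from_raw_parts` so the allocation is freed
// exactly once, which is why later hunks replace `slice::from_raw_parts` with
// `Vec::from_raw_parts`. The helper names are invented for illustration.
// ---------------------------------------------------------------------------------
use std::ptr::null_mut;

// Producer: encode into an exact-size allocation and hand it to the caller.
fn produce_buffer(payload: &[u8], buffer_size: &mut usize, buffer_bytes: &mut *mut u8) {
    // A boxed slice guarantees capacity == length, which is what the consumer's
    // `Vec::from_raw_parts(ptr, len, len)` call relies on.
    let bytes: Box<[u8]> = payload.to_vec().into_boxed_slice();
    *buffer_size = bytes.len();
    *buffer_bytes = Box::into_raw(bytes) as *mut u8;
}

// Consumer: take ownership back so the bytes are dropped instead of leaked.
unsafe fn consume_buffer(buffer_size: usize, buffer_bytes: *mut u8) -> Vec<u8> {
    Vec::from_raw_parts(buffer_bytes, buffer_size, buffer_size)
}

fn main() {
    let mut size = 0usize;
    let mut ptr: *mut u8 = null_mut();
    produce_buffer(b"protobuf-encoded plan properties", &mut size, &mut ptr);

    let bytes = unsafe { consume_buffer(size, ptr) };
    assert_eq!(bytes, b"protobuf-encoded plan properties".to_vec());
}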
+ use std::{ ffi::{c_char, c_void, CString}, pin::Pin, @@ -28,10 +45,10 @@ pub struct FFI_ExecutionPlan { >, pub children: Option< unsafe extern "C" fn( - plan: *const FFI_ExecutionPlan, + plan: *mut FFI_ExecutionPlan, num_children: &mut usize, err_code: &mut i32, - ) -> *mut *const FFI_ExecutionPlan, + ) -> *mut *mut FFI_ExecutionPlan, >, pub name: Option *const c_char>, @@ -41,14 +58,16 @@ pub struct FFI_ExecutionPlan { err_code: &mut i32, ) -> FFI_ArrowArrayStream>, + pub clone: Option FFI_ExecutionPlan>, + pub release: Option, pub private_data: *mut c_void, } pub struct ExecutionPlanPrivateData { - pub plan: Arc, + pub plan: Arc, pub last_error: Option, - pub children: Vec<*const FFI_ExecutionPlan>, + pub children: Vec<*mut FFI_ExecutionPlan>, pub context: Arc, } @@ -61,10 +80,10 @@ unsafe extern "C" fn properties_fn_wrapper( } unsafe extern "C" fn children_fn_wrapper( - plan: *const FFI_ExecutionPlan, + plan: *mut FFI_ExecutionPlan, num_children: &mut usize, err_code: &mut i32, -) -> *mut *const FFI_ExecutionPlan { +) -> *mut *mut FFI_ExecutionPlan { let private_data = (*plan).private_data as *const ExecutionPlanPrivateData; *num_children = (*private_data).children.len(); @@ -125,6 +144,14 @@ unsafe extern "C" fn release_fn_wrapper(plan: *mut FFI_ExecutionPlan) { plan.release = None; } +unsafe extern "C" fn clone_fn_wrapper(plan: *const FFI_ExecutionPlan) -> FFI_ExecutionPlan { + let private_data = (*plan).private_data as *const ExecutionPlanPrivateData; + let plan_data = &(*private_data); + + FFI_ExecutionPlan::new(Arc::clone(&plan_data.plan), Arc::clone(&plan_data.context)) +} + + // Since the trait ExecutionPlan requires borrowed values, we wrap our FFI. // This struct exists on the consumer side (datafusion-python, for example) and not @@ -132,7 +159,7 @@ unsafe extern "C" fn release_fn_wrapper(plan: *mut FFI_ExecutionPlan) { #[derive(Debug)] pub struct ExportedExecutionPlan { name: String, - plan: *const FFI_ExecutionPlan, + plan: Box, properties: PlanProperties, children: Vec>, } @@ -161,7 +188,7 @@ impl FFI_ExecutionPlan { .children() .into_iter() .map(|child| Box::new(FFI_ExecutionPlan::new(Arc::clone(child), Arc::clone(&context)))) - .map(|child| Box::into_raw(child) as *const FFI_ExecutionPlan) + .map(Box::into_raw) .collect(); let private_data = Box::new(ExecutionPlanPrivateData { @@ -177,6 +204,7 @@ impl FFI_ExecutionPlan { name: Some(name_fn_wrapper), execute: Some(execute_fn_wrapper), release: Some(release_fn_wrapper), + clone: Some(clone_fn_wrapper), private_data: Box::into_raw(private_data) as *mut c_void, } } @@ -192,13 +220,13 @@ impl Drop for FFI_ExecutionPlan { } impl ExportedExecutionPlan { - /// Wrap a FFI Execution Plan + /// Takes ownership of a FFI_ExecutionPlan /// /// # Safety /// /// The caller must ensure the pointer provided points to a valid implementation /// of FFI_ExecutionPlan - pub unsafe fn new(plan: *const FFI_ExecutionPlan) -> Result { + pub unsafe fn new(plan: *mut FFI_ExecutionPlan) -> Result { let name_ptr = (*plan).name.map(|func| func(plan)); let name = match name_ptr { Some(name_cstr) => { @@ -248,13 +276,22 @@ impl ExportedExecutionPlan { Ok(Self { name, - plan, + plan: Box::from_raw(plan), properties, children, }) } } +impl Clone for FFI_ExecutionPlan { + fn clone(&self) -> Self { + unsafe { + let clone_fn = self.clone.expect("FFI_ExecutionPlan does not have clone defined."); + clone_fn(self) + } + } +} + impl ExecutionPlan for ExportedExecutionPlan { fn name(&self) -> &str { &self.name @@ -280,7 +317,7 @@ impl ExecutionPlan for 
ExportedExecutionPlan { children: Vec>, ) -> datafusion::error::Result> { Ok(Arc::new(ExportedExecutionPlan { - plan: self.plan, + plan: self.plan.clone(), name: self.name.clone(), children, properties: self.properties.clone(), @@ -293,14 +330,14 @@ impl ExecutionPlan for ExportedExecutionPlan { _context: Arc, ) -> datafusion::error::Result { unsafe { - let execute_fn = (*self.plan).execute; + let execute_fn = self.plan.execute; if execute_fn.is_none() { return Err(DataFusionError::Execution("execute is not defined on FFI_ExecutionPlan".to_string())); } let execute_fn = execute_fn.unwrap(); let mut err_code = 0; - let arrow_stream = execute_fn(self.plan, partition, &mut err_code); + let arrow_stream = execute_fn(self.plan.as_ref(), partition, &mut err_code); match err_code { 0 => ConsumerRecordBatchStream::try_from(arrow_stream) diff --git a/datafusion/ffi/src/plan_properties.rs b/datafusion/ffi/src/plan_properties.rs index a10d95bd5c71..e13fc5f35f10 100644 --- a/datafusion/ffi/src/plan_properties.rs +++ b/datafusion/ffi/src/plan_properties.rs @@ -1,4 +1,21 @@ -use std::{ffi::c_void, ptr::null_mut, slice, sync::Arc}; +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::{ffi::c_void, ptr::null_mut, sync::Arc}; use arrow::ffi::FFI_ArrowSchema; use datafusion::{error::DataFusionError, physical_expr::EquivalenceProperties, physical_plan::{ExecutionMode, PlanProperties}, prelude::SessionContext}; @@ -231,10 +248,10 @@ impl TryFrom for PlanProperties { let orderings = match buff_size == 0 { true => None, false => { - let data = slice::from_raw_parts(buff, buff_size); + let data = Vec::from_raw_parts(buff, buff_size, buff_size); let proto_output_ordering = - PhysicalSortExprNodeCollection::decode(data) + PhysicalSortExprNodeCollection::decode(data.as_ref()) .map_err(|e| DataFusionError::External(Box::new(e)))?; Some(parse_physical_sort_exprs( @@ -259,9 +276,9 @@ impl TryFrom for PlanProperties { .to_string(), )); } - let data = slice::from_raw_parts(buff, buff_size); + let data = Vec::from_raw_parts(buff, buff_size, buff_size); - let proto_partitioning = Partitioning::decode(data) + let proto_partitioning = Partitioning::decode(data.as_ref()) .map_err(|e| DataFusionError::External(Box::new(e)))?; // TODO: Validate this unwrap is safe. let partitioning = parse_protobuf_partitioning( diff --git a/datafusion/ffi/src/record_batch_stream.rs b/datafusion/ffi/src/record_batch_stream.rs index afe49c8c1328..983c46544613 100644 --- a/datafusion/ffi/src/record_batch_stream.rs +++ b/datafusion/ffi/src/record_batch_stream.rs @@ -1,3 +1,20 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + use std::{ ffi::{c_char, c_int, c_void, CString}, ptr::addr_of, diff --git a/datafusion/ffi/src/session_config.rs b/datafusion/ffi/src/session_config.rs index fd8065c038e3..803f52dd2669 100644 --- a/datafusion/ffi/src/session_config.rs +++ b/datafusion/ffi/src/session_config.rs @@ -1,3 +1,20 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + use std::ffi::c_void; use datafusion::{catalog::Session, prelude::SessionConfig}; diff --git a/datafusion/ffi/src/table_provider.rs b/datafusion/ffi/src/table_provider.rs index dc35b1339a5f..e7d85306b5b7 100644 --- a/datafusion/ffi/src/table_provider.rs +++ b/datafusion/ffi/src/table_provider.rs @@ -1,3 +1,20 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
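// ---------------------------------------------------------------------------------
// Editor's note: a std-only sketch of the `clone` callback added to
// FFI_ExecutionPlan earlier in this commit. `FFI_Handle` and `HandleState` are
// invented for illustration; the idea is that `Clone` on the Rust side cannot copy
// `private_data` directly, so it calls back into the owning side, which builds a
// fresh handle around its own boxed copy of the state.
// ---------------------------------------------------------------------------------
use std::ffi::c_void;

#[repr(C)]
pub struct FFI_Handle {
    pub clone: Option<unsafe extern "C" fn(handle: *const FFI_Handle) -> FFI_Handle>,
    pub release: Option<unsafe extern "C" fn(handle: *mut FFI_Handle)>,
    pub private_data: *mut c_void,
}

struct HandleState {
    label: String,
}

unsafe extern "C" fn clone_fn_wrapper(handle: *const FFI_Handle) -> FFI_Handle {
    let state = &*((*handle).private_data as *const HandleState);
    // Re-box the state so each handle owns an independent copy.
    FFI_Handle::new(state.label.clone())
}

unsafe extern "C" fn release_fn_wrapper(handle: *mut FFI_Handle) {
    if handle.is_null() {
        return;
    }
    let handle = &mut *handle;
    drop(Box::from_raw(handle.private_data as *mut HandleState));
    handle.release = None;
}

impl FFI_Handle {
    fn new(label: String) -> Self {
        Self {
            clone: Some(clone_fn_wrapper),
            release: Some(release_fn_wrapper),
            private_data: Box::into_raw(Box::new(HandleState { label })) as *mut c_void,
        }
    }
}

impl Clone for FFI_Handle {
    fn clone(&self) -> Self {
        let clone_fn = self.clone.expect("FFI_Handle does not have clone defined");
        unsafe { clone_fn(self) }
    }
}

impl Drop for FFI_Handle {
    fn drop(&mut self) {
        if let Some(release) = self.release {
            unsafe { release(self) };
        }
    }
}

fn main() {
    let original = FFI_Handle::new("scan".to_string());
    let copy = original.clone();
    drop(original);
    drop(copy); // each handle frees its own private data
}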
+ use std::{ any::Any, ffi::{c_char, c_int, c_void, CStr, CString}, @@ -433,7 +450,8 @@ impl TableProvider for ForeignTableProvider { let mut num_return = 0; let pushdowns = null_mut(); let err_code = pushdown_fn(self.0, buffer_size as c_int, buffer, &mut num_return, pushdowns); - let pushdowns_slice = slice::from_raw_parts(pushdowns, num_return as usize); + let num_return = num_return as usize; + let pushdowns_slice = Vec::from_raw_parts(pushdowns, num_return, num_return); match err_code { 0 => { diff --git a/datafusion/ffi/src/table_source.rs b/datafusion/ffi/src/table_source.rs index 077470d6411a..03774b88948a 100644 --- a/datafusion/ffi/src/table_source.rs +++ b/datafusion/ffi/src/table_source.rs @@ -1,3 +1,20 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + use datafusion::{datasource::TableType, logical_expr::TableProviderFilterPushDown}; // TODO Should we just define TableProviderFilterPushDown as repr(C)? From 0408a9b730aef55aa1dae5a695c8628fb1f56f54 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Thu, 10 Oct 2024 14:00:18 -0400 Subject: [PATCH 09/28] Rename ForeignExecutionPlan for consistency --- datafusion/ffi/src/execution_plan.rs | 16 +++++------ datafusion/ffi/src/table_provider.rs | 41 ++++++++++++++++++++++++++-- 2 files changed, 47 insertions(+), 10 deletions(-) diff --git a/datafusion/ffi/src/execution_plan.rs b/datafusion/ffi/src/execution_plan.rs index bd20f4516c70..ced510821a06 100644 --- a/datafusion/ffi/src/execution_plan.rs +++ b/datafusion/ffi/src/execution_plan.rs @@ -157,17 +157,17 @@ unsafe extern "C" fn clone_fn_wrapper(plan: *const FFI_ExecutionPlan) -> FFI_Exe // This struct exists on the consumer side (datafusion-python, for example) and not // in the provider's side. 
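// ---------------------------------------------------------------------------------
// Editor's note: a std-only sketch of the provider/consumer split described in the
// comment above. The trait `PartitionedPlan`, the `FFI_PartitionedPlan` handle, and
// the `Foreign*` wrapper are invented for illustration: the #[repr(C)] struct is
// what crosses the boundary, while the consumer-side wrapper implements the native
// trait by calling only the function pointers the provider filled in, never by
// touching `private_data`.
// ---------------------------------------------------------------------------------
use std::ffi::c_void;

// The native trait the consumer ultimately wants to use.
trait PartitionedPlan {
    fn partition_count(&self) -> usize;
}

// Provider-side state behind the handle.
struct ProviderPlan {
    partitions: Vec<String>,
}

#[repr(C)]
pub struct FFI_PartitionedPlan {
    pub partition_count:
        Option<unsafe extern "C" fn(plan: *const FFI_PartitionedPlan) -> usize>,
    pub private_data: *mut c_void,
}

unsafe extern "C" fn partition_count_fn_wrapper(
    plan: *const FFI_PartitionedPlan,
) -> usize {
    let private_data = (*plan).private_data as *const ProviderPlan;
    (*private_data).partitions.len()
}

impl FFI_PartitionedPlan {
    fn new(plan: ProviderPlan) -> Self {
        Self {
            partition_count: Some(partition_count_fn_wrapper),
            private_data: Box::into_raw(Box::new(plan)) as *mut c_void,
        }
    }
}

// Consumer-side wrapper: implements the trait purely through the stable ABI.
struct ForeignPartitionedPlan(FFI_PartitionedPlan);

impl PartitionedPlan for ForeignPartitionedPlan {
    fn partition_count(&self) -> usize {
        match self.0.partition_count {
            Some(func) => unsafe { func(&self.0) },
            None => 0,
        }
    }
}

fn main() {
    let provider = ProviderPlan {
        partitions: vec!["part-0".to_string(), "part-1".to_string()],
    };
    let foreign = ForeignPartitionedPlan(FFI_PartitionedPlan::new(provider));
    assert_eq!(foreign.partition_count(), 2);
    // Kept deliberately short: a real handle would pair this with the release
    // callback shown earlier so the boxed ProviderPlan is reclaimed.
}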
#[derive(Debug)] -pub struct ExportedExecutionPlan { +pub struct ForeignExecutionPlan { name: String, plan: Box, properties: PlanProperties, children: Vec>, } -unsafe impl Send for ExportedExecutionPlan {} -unsafe impl Sync for ExportedExecutionPlan {} +unsafe impl Send for ForeignExecutionPlan {} +unsafe impl Sync for ForeignExecutionPlan {} -impl DisplayAs for ExportedExecutionPlan { +impl DisplayAs for ForeignExecutionPlan { fn fmt_as( &self, _t: datafusion::physical_plan::DisplayFormatType, @@ -219,7 +219,7 @@ impl Drop for FFI_ExecutionPlan { } } -impl ExportedExecutionPlan { +impl ForeignExecutionPlan { /// Takes ownership of a FFI_ExecutionPlan /// /// # Safety @@ -265,7 +265,7 @@ impl ExportedExecutionPlan { .into_iter() .map(|child| { - let child_plan = ExportedExecutionPlan::new(child); + let child_plan = ForeignExecutionPlan::new(child); child_plan.map(|c| Arc::new(c) as Arc) }) @@ -292,7 +292,7 @@ impl Clone for FFI_ExecutionPlan { } } -impl ExecutionPlan for ExportedExecutionPlan { +impl ExecutionPlan for ForeignExecutionPlan { fn name(&self) -> &str { &self.name } @@ -316,7 +316,7 @@ impl ExecutionPlan for ExportedExecutionPlan { self: Arc, children: Vec>, ) -> datafusion::error::Result> { - Ok(Arc::new(ExportedExecutionPlan { + Ok(Arc::new(ForeignExecutionPlan { plan: self.plan.clone(), name: self.name.clone(), children, diff --git a/datafusion/ffi/src/table_provider.rs b/datafusion/ffi/src/table_provider.rs index e7d85306b5b7..6a288fc4949b 100644 --- a/datafusion/ffi/src/table_provider.rs +++ b/datafusion/ffi/src/table_provider.rs @@ -48,7 +48,7 @@ use prost::Message; use crate::{session_config::ExportedSessionConfig, table_source::{FFI_TableProviderFilterPushDown, FFI_TableType}}; use super::{ - execution_plan::{ExportedExecutionPlan, FFI_ExecutionPlan}, + execution_plan::{ForeignExecutionPlan, FFI_ExecutionPlan}, session_config::FFI_SessionConfig, }; use datafusion::error::Result; @@ -424,7 +424,7 @@ impl TableProvider for ForeignTableProvider { )); } - ExportedExecutionPlan::new(plan_ptr)? + ForeignExecutionPlan::new(plan_ptr)? }; Ok(Arc::new(plan)) @@ -464,3 +464,40 @@ impl TableProvider for ForeignTableProvider { } } } + + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_round_trip_ffi_table_provider() -> Result<()> { + use datafusion::datasource::MemTable; + use datafusion::arrow::{ + array::Float32Array, datatypes::DataType, record_batch::RecordBatch, + }; + use arrow::datatypes::Field; + + let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Float32, false)])); + + // define data in two partitions + let batch1 = RecordBatch::try_new( + Arc::clone(&schema), + vec![Arc::new(Float32Array::from(vec![2.0, 4.0, 8.0]))], + )?; + let batch2 = RecordBatch::try_new( + Arc::clone(&schema), + vec![Arc::new(Float32Array::from(vec![64.0]))], + )?; + + // declare a new context. In spark API, this corresponds to a new spark SQLsession + let ctx = SessionContext::new(); + + // declare a table in memory. In spark API, this corresponds to createDataFrame(...). 
+ let provider = MemTable::try_new(schema, vec![vec![batch1], vec![batch2]])?; + ctx.register_table("t", Arc::new(provider))?; + + Ok(()) + } + +} \ No newline at end of file From 7922d8ff686108bc998a2b79b40952bbef543090 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Mon, 14 Oct 2024 08:47:49 -0400 Subject: [PATCH 10/28] Resolving memory leak issues --- datafusion/ffi/src/execution_plan.rs | 251 ++++++++++++----- datafusion/ffi/src/plan_properties.rs | 372 +++++++++++++++----------- datafusion/ffi/src/session_config.rs | 162 +++++++++-- datafusion/ffi/src/table_provider.rs | 306 +++++++++++++-------- 4 files changed, 728 insertions(+), 363 deletions(-) diff --git a/datafusion/ffi/src/execution_plan.rs b/datafusion/ffi/src/execution_plan.rs index ced510821a06..9fb614783eb2 100644 --- a/datafusion/ffi/src/execution_plan.rs +++ b/datafusion/ffi/src/execution_plan.rs @@ -16,20 +16,22 @@ // under the License. use std::{ - ffi::{c_char, c_void, CString}, + ffi::{c_char, c_void, CStr, CString}, pin::Pin, + ptr::null_mut, + slice, sync::Arc, }; use arrow::ffi_stream::FFI_ArrowArrayStream; +use datafusion::error::Result; use datafusion::{ error::DataFusionError, execution::{SendableRecordBatchStream, TaskContext}, physical_plan::{DisplayAs, ExecutionPlan, PlanProperties}, }; -use datafusion::error::Result; -use crate::plan_properties::FFI_PlanProperties; +use crate::plan_properties::{FFI_PlanProperties, ForeignPlanProperties}; use super::record_batch_stream::{ record_batch_to_arrow_stream, ConsumerRecordBatchStream, @@ -43,22 +45,28 @@ pub struct FFI_ExecutionPlan { pub properties: Option< unsafe extern "C" fn(plan: *const FFI_ExecutionPlan) -> FFI_PlanProperties, >, + pub children: Option< unsafe extern "C" fn( - plan: *mut FFI_ExecutionPlan, + plan: *const FFI_ExecutionPlan, num_children: &mut usize, err_code: &mut i32, - ) -> *mut *mut FFI_ExecutionPlan, + ) -> *const FFI_ExecutionPlan, >, - pub name: Option *const c_char>, - pub execute: Option FFI_ArrowArrayStream>, + pub name: + Option *const c_char>, - pub clone: Option FFI_ExecutionPlan>, + pub execute: Option< + unsafe extern "C" fn( + plan: *const FFI_ExecutionPlan, + partition: usize, + err_code: &mut i32, + ) -> FFI_ArrowArrayStream, + >, + + pub clone: + Option FFI_ExecutionPlan>, pub release: Option, pub private_data: *mut c_void, @@ -67,8 +75,9 @@ pub struct FFI_ExecutionPlan { pub struct ExecutionPlanPrivateData { pub plan: Arc, pub last_error: Option, - pub children: Vec<*mut FFI_ExecutionPlan>, + pub children: Vec, pub context: Arc, + pub name: CString, } unsafe extern "C" fn properties_fn_wrapper( @@ -76,25 +85,21 @@ unsafe extern "C" fn properties_fn_wrapper( ) -> FFI_PlanProperties { let private_data = (*plan).private_data as *const ExecutionPlanPrivateData; let properties = (*private_data).plan.properties(); - properties.clone().into() + + FFI_PlanProperties::new(properties.clone()).unwrap_or(FFI_PlanProperties::empty()) } unsafe extern "C" fn children_fn_wrapper( - plan: *mut FFI_ExecutionPlan, + plan: *const FFI_ExecutionPlan, num_children: &mut usize, err_code: &mut i32, -) -> *mut *mut FFI_ExecutionPlan { +) -> *const FFI_ExecutionPlan { let private_data = (*plan).private_data as *const ExecutionPlanPrivateData; *num_children = (*private_data).children.len(); *err_code = 0; - let mut children: Vec<_> = (*private_data).children.to_owned(); - let children_ptr = children.as_mut_ptr(); - - std::mem::forget(children); - - children_ptr + (*private_data).children.as_ptr() } unsafe extern "C" fn execute_fn_wrapper( @@ -119,12 
+124,7 @@ unsafe extern "C" fn execute_fn_wrapper( } unsafe extern "C" fn name_fn_wrapper(plan: *const FFI_ExecutionPlan) -> *const c_char { let private_data = (*plan).private_data as *const ExecutionPlanPrivateData; - - let name = (*private_data).plan.name(); - - CString::new(name) - .unwrap_or(CString::new("unable to parse execution plan name").unwrap()) - .into_raw() + (*private_data).name.as_ptr() } unsafe extern "C" fn release_fn_wrapper(plan: *mut FFI_ExecutionPlan) { @@ -144,22 +144,23 @@ unsafe extern "C" fn release_fn_wrapper(plan: *mut FFI_ExecutionPlan) { plan.release = None; } -unsafe extern "C" fn clone_fn_wrapper(plan: *const FFI_ExecutionPlan) -> FFI_ExecutionPlan { +unsafe extern "C" fn clone_fn_wrapper( + plan: *const FFI_ExecutionPlan, +) -> FFI_ExecutionPlan { let private_data = (*plan).private_data as *const ExecutionPlanPrivateData; let plan_data = &(*private_data); FFI_ExecutionPlan::new(Arc::clone(&plan_data.plan), Arc::clone(&plan_data.context)) + .unwrap() } - - // Since the trait ExecutionPlan requires borrowed values, we wrap our FFI. // This struct exists on the consumer side (datafusion-python, for example) and not // in the provider's side. #[derive(Debug)] pub struct ForeignExecutionPlan { name: String, - plan: Box, + plan: FFI_ExecutionPlan, properties: PlanProperties, children: Vec>, } @@ -183,22 +184,31 @@ impl DisplayAs for ForeignExecutionPlan { impl FFI_ExecutionPlan { /// This function is called on the provider's side. - pub fn new(plan: Arc, context: Arc) -> Self { - let children = plan + pub fn new(plan: Arc, context: Arc) -> Result { + let maybe_children: Result> = plan .children() .into_iter() - .map(|child| Box::new(FFI_ExecutionPlan::new(Arc::clone(child), Arc::clone(&context)))) - .map(Box::into_raw) + .map(|child| FFI_ExecutionPlan::new(Arc::clone(child), Arc::clone(&context))) .collect(); + let children = maybe_children?; + + let name = CString::new(plan.name()).map_err(|e| { + DataFusionError::Plan(format!( + "Unable to convert name to CString in FFI_ExecutionPlan: {}", + e + )) + })?; + println!("FFI_ExecutionPlan::new name {:?}", name); let private_data = Box::new(ExecutionPlanPrivateData { plan, children, context, last_error: None, + name, }); - Self { + Ok(Self { properties: Some(properties_fn_wrapper), children: Some(children_fn_wrapper), name: Some(name_fn_wrapper), @@ -206,6 +216,18 @@ impl FFI_ExecutionPlan { release: Some(release_fn_wrapper), clone: Some(clone_fn_wrapper), private_data: Box::into_raw(private_data) as *mut c_void, + }) + } + + pub fn empty() -> Self { + Self { + properties: None, + children: None, + name: None, + execute: None, + release: None, + clone: None, + private_data: null_mut(), } } } @@ -226,33 +248,42 @@ impl ForeignExecutionPlan { /// /// The caller must ensure the pointer provided points to a valid implementation /// of FFI_ExecutionPlan - pub unsafe fn new(plan: *mut FFI_ExecutionPlan) -> Result { - let name_ptr = (*plan).name.map(|func| func(plan)); - let name = match name_ptr { - Some(name_cstr) => { - CString::from_raw(name_cstr as *mut c_char) - .to_str() - .unwrap_or("Unable to parse FFI_ExecutionPlan name") - .to_string() - } - None => "Plan has no name".to_string() - }; + pub unsafe fn new(plan: FFI_ExecutionPlan) -> Result { + let name_fn = plan.name.ok_or(DataFusionError::NotImplemented( + "name() not implemented on FFI_ExecutionPlan".to_string(), + ))?; + let name_ptr = name_fn(&plan); + + println!("ForeignExecutionPlan::new name_ptr {:?}", name_ptr); + let name_cstr = CStr::from_ptr(name_ptr); 
+ + println!(" CStr {:?}", name_cstr); + let name = name_cstr + .to_str() + .map_err(|e| { + DataFusionError::Plan(format!( + "Unable to convert CStr name in FFI_ExecutionPlan: {}", + e + )) + })? + .to_string(); let properties = unsafe { let properties_fn = - (*plan).properties.ok_or(DataFusionError::NotImplemented( + plan.properties.ok_or(DataFusionError::NotImplemented( "properties not implemented on FFI_ExecutionPlan".to_string(), ))?; - properties_fn(plan).try_into()? + + ForeignPlanProperties::new(properties_fn(&plan))? }; let children = unsafe { - let children_fn = (*plan).children.ok_or(DataFusionError::NotImplemented( + let children_fn = plan.children.ok_or(DataFusionError::NotImplemented( "children not implemented on FFI_ExecutionPlan".to_string(), ))?; let mut num_children = 0; let mut err_code = 0; - let children_ptr = children_fn(plan, &mut num_children, &mut err_code); + let children_ptr = children_fn(&plan, &mut num_children, &mut err_code); if err_code != 0 { return Err(DataFusionError::Plan( @@ -260,12 +291,11 @@ impl ForeignExecutionPlan { )); } - let ffi_vec = Vec::from_raw_parts(children_ptr, num_children, num_children); + let ffi_vec = slice::from_raw_parts(children_ptr, num_children); let maybe_children: Result> = ffi_vec - .into_iter() + .iter() .map(|child| { - - let child_plan = ForeignExecutionPlan::new(child); + let child_plan = ForeignExecutionPlan::new(child.clone()); child_plan.map(|c| Arc::new(c) as Arc) }) @@ -276,8 +306,8 @@ impl ForeignExecutionPlan { Ok(Self { name, - plan: Box::from_raw(plan), - properties, + plan, + properties: properties.0, children, }) } @@ -286,7 +316,9 @@ impl ForeignExecutionPlan { impl Clone for FFI_ExecutionPlan { fn clone(&self) -> Self { unsafe { - let clone_fn = self.clone.expect("FFI_ExecutionPlan does not have clone defined."); + let clone_fn = self + .clone + .expect("FFI_ExecutionPlan does not have clone defined."); clone_fn(self) } } @@ -316,6 +348,7 @@ impl ExecutionPlan for ForeignExecutionPlan { self: Arc, children: Vec>, ) -> datafusion::error::Result> { + println!("Cloning ForeignExecutionPlan {}", self.name); Ok(Arc::new(ForeignExecutionPlan { plan: self.plan.clone(), name: self.name.clone(), @@ -330,14 +363,12 @@ impl ExecutionPlan for ForeignExecutionPlan { _context: Arc, ) -> datafusion::error::Result { unsafe { - let execute_fn = self.plan.execute; - if execute_fn.is_none() { - return Err(DataFusionError::Execution("execute is not defined on FFI_ExecutionPlan".to_string())); - } - let execute_fn = execute_fn.unwrap(); + let execute_fn = self.plan.execute.ok_or(DataFusionError::Execution( + "execute is not defined on FFI_ExecutionPlan".to_string(), + ))?; let mut err_code = 0; - let arrow_stream = execute_fn(self.plan.as_ref(), partition, &mut err_code); + let arrow_stream = execute_fn(&self.plan, partition, &mut err_code); match err_code { 0 => ConsumerRecordBatchStream::try_from(arrow_stream) @@ -350,3 +381,93 @@ impl ExecutionPlan for ForeignExecutionPlan { } } } + +#[cfg(test)] +mod tests { + use datafusion::{physical_plan::Partitioning, prelude::SessionContext}; + + use super::*; + + #[derive(Debug)] + pub struct EmptyExec { + props: PlanProperties, + } + + impl EmptyExec { + pub fn new(schema: arrow::datatypes::SchemaRef) -> Self { + Self { + props: PlanProperties::new( + datafusion::physical_expr::EquivalenceProperties::new(schema), + Partitioning::UnknownPartitioning(3), + datafusion::physical_plan::ExecutionMode::Unbounded, + ), + } + } + } + + impl DisplayAs for EmptyExec { + fn fmt_as( + &self, + _t: 
datafusion::physical_plan::DisplayFormatType, + _f: &mut std::fmt::Formatter, + ) -> std::fmt::Result { + unimplemented!() + } + } + + impl ExecutionPlan for EmptyExec { + fn name(&self) -> &'static str { + "empty-exec" + } + + fn as_any(&self) -> &dyn std::any::Any { + self + } + + fn properties(&self) -> &PlanProperties { + &self.props + } + + fn children(&self) -> Vec<&Arc> { + vec![] + } + + fn with_new_children( + self: Arc, + _: Vec>, + ) -> Result> { + unimplemented!() + } + + fn execute( + &self, + _partition: usize, + _context: Arc, + ) -> Result { + unimplemented!() + } + + fn statistics(&self) -> Result { + unimplemented!() + } + } + + #[test] + fn test_round_trip_ffi_execution_plan() -> Result<()> { + use arrow::datatypes::{DataType, Field, Schema}; + let schema = + Arc::new(Schema::new(vec![Field::new("a", DataType::Float32, false)])); + let ctx = SessionContext::new(); + + let original_plan = Arc::new(EmptyExec::new(schema)); + let original_name = original_plan.name().to_string(); + + let local_plan = FFI_ExecutionPlan::new(original_plan, ctx.task_ctx())?; + + let foreign_plan = unsafe { ForeignExecutionPlan::new(local_plan)? }; + + assert!(original_name == foreign_plan.name()); + + Ok(()) + } +} diff --git a/datafusion/ffi/src/plan_properties.rs b/datafusion/ffi/src/plan_properties.rs index e13fc5f35f10..78f8e8e82879 100644 --- a/datafusion/ffi/src/plan_properties.rs +++ b/datafusion/ffi/src/plan_properties.rs @@ -15,17 +15,34 @@ // specific language governing permissions and limitations // under the License. -use std::{ffi::c_void, ptr::null_mut, sync::Arc}; - -use arrow::ffi::FFI_ArrowSchema; -use datafusion::{error::DataFusionError, physical_expr::EquivalenceProperties, physical_plan::{ExecutionMode, PlanProperties}, prelude::SessionContext}; -use datafusion_proto::{physical_plan::{from_proto::{parse_physical_sort_exprs, parse_protobuf_partitioning}, to_proto::{serialize_partitioning, serialize_physical_sort_exprs}, DefaultPhysicalExtensionCodec}, protobuf::{Partitioning, PhysicalSortExprNodeCollection}}; +use core::slice; +use std::{ + ffi::{c_uint, c_void}, + ptr::null_mut, + sync::Arc, +}; + +use arrow::{datatypes::Schema, ffi::FFI_ArrowSchema}; +use datafusion::{ + error::{DataFusionError, Result}, + physical_expr::EquivalenceProperties, + physical_plan::{ExecutionMode, PlanProperties}, + prelude::SessionContext, +}; +use datafusion_proto::{ + physical_plan::{ + from_proto::{parse_physical_sort_exprs, parse_protobuf_partitioning}, + to_proto::{serialize_partitioning, serialize_physical_sort_exprs}, + DefaultPhysicalExtensionCodec, + }, + protobuf::{Partitioning, PhysicalSortExprNodeCollection}, +}; use prost::Message; - // TODO: should we just make ExecutionMode repr(C)? 
#[repr(C)] #[allow(non_camel_case_types)] +#[derive(Clone)] pub enum FFI_ExecutionMode { Bounded, Unbounded, @@ -61,9 +78,8 @@ pub struct FFI_PlanProperties { pub output_partitioning: Option< unsafe extern "C" fn( plan: *const FFI_PlanProperties, - buffer_size: &mut usize, - buffer_bytes: &mut *mut u8, - ) -> i32, + buffer_size: &mut c_uint, + ) -> *const u8, >, pub execution_mode: Option< @@ -75,106 +91,62 @@ pub struct FFI_PlanProperties { unsafe extern "C" fn( plan: *const FFI_PlanProperties, buffer_size: &mut usize, - buffer_bytes: &mut *mut u8, - ) -> i32, + ) -> *const u8, >, pub schema: Option FFI_ArrowSchema>, pub release: Option, - + pub private_data: *mut c_void, } +struct PlanPropertiesPrivateData { + output_partitioning: Vec, + execution_mode: FFI_ExecutionMode, + output_ordering: Vec, + schema: Arc, +} + unsafe extern "C" fn output_partitioning_fn_wrapper( properties: *const FFI_PlanProperties, - buffer_size: &mut usize, - buffer_bytes: &mut *mut u8, -) -> i32 { - let private_data = (*properties).private_data as *const PlanProperties; - let partitioning = (*private_data).output_partitioning(); - - let codec = DefaultPhysicalExtensionCodec {}; - let partitioning_data = match serialize_partitioning(partitioning, &codec) { - Ok(p) => p, - Err(_) => return 1, - }; - - let mut partition_bytes = partitioning_data.encode_to_vec(); - *buffer_size = partition_bytes.len(); - *buffer_bytes = partition_bytes.as_mut_ptr(); - - std::mem::forget(partition_bytes); - - 0 + buffer_size: &mut c_uint, +) -> *const u8 { + let private_data = (*properties).private_data as *const PlanPropertiesPrivateData; + *buffer_size = (*private_data).output_partitioning.len() as c_uint; + (*private_data).output_partitioning.as_ptr() } unsafe extern "C" fn execution_mode_fn_wrapper( properties: *const FFI_PlanProperties, ) -> FFI_ExecutionMode { - // let private_data = (*plan).private_data as *const ExecutionPlanPrivateData; - // let properties = (*private_data).plan.properties(); - // properties.clone().into() - let private_data = (*properties).private_data as *const PlanProperties; - let execution_mode = (*private_data).execution_mode(); - - execution_mode.into() + let private_data = (*properties).private_data as *const PlanPropertiesPrivateData; + (*private_data).execution_mode.clone() } unsafe extern "C" fn output_ordering_fn_wrapper( properties: *const FFI_PlanProperties, buffer_size: &mut usize, - buffer_bytes: &mut *mut u8, -) -> i32 { - // let private_data = (*plan).private_data as *const ExecutionPlanPrivateData; - // let properties = (*private_data).plan.properties(); - // properties.clone().into() - let private_data = (*properties).private_data as *const PlanProperties; - let output_ordering = match (*private_data).output_ordering() { - Some(o) => o, - None => { - *buffer_size = 0; - return 0; - } - } - .to_owned(); - - let codec = DefaultPhysicalExtensionCodec {}; - let physical_sort_expr_nodes = - match serialize_physical_sort_exprs(output_ordering, &codec) { - Ok(p) => p, - Err(_) => return 1, - }; - - let ordering_data = PhysicalSortExprNodeCollection { - physical_sort_expr_nodes, - }; - - let mut ordering_bytes = ordering_data.encode_to_vec(); - *buffer_size = ordering_bytes.len(); - *buffer_bytes = ordering_bytes.as_mut_ptr(); - std::mem::forget(ordering_bytes); - - 0 +) -> *const u8 { + let private_data = (*properties).private_data as *const PlanPropertiesPrivateData; + *buffer_size = (*private_data).output_ordering.len(); + (*private_data).output_ordering.as_ptr() } -// pub schema: Option 
FFI_ArrowSchema>, unsafe extern "C" fn schema_fn_wrapper( properties: *const FFI_PlanProperties, ) -> FFI_ArrowSchema { - let private_data = (*properties).private_data as *const PlanProperties; - let schema = (*private_data).eq_properties.schema(); - - // This does silently fail because TableProvider does not return a result - // so we expect it to always pass. Maybe some logging should be added. - FFI_ArrowSchema::try_from(schema.as_ref()).unwrap_or(FFI_ArrowSchema::empty()) + let private_data = (*properties).private_data as *const PlanPropertiesPrivateData; + FFI_ArrowSchema::try_from((*private_data).schema.as_ref()) + .unwrap_or(FFI_ArrowSchema::empty()) } unsafe extern "C" fn release_fn_wrapper(props: *mut FFI_PlanProperties) { if props.is_null() { return; } + let props = &mut *props; props.execution_mode = None; @@ -183,13 +155,13 @@ unsafe extern "C" fn release_fn_wrapper(props: *mut FFI_PlanProperties) { props.output_ordering = None; props.schema = None; - let private_data = Box::from_raw(props.private_data as *mut PlanProperties); + let private_data = + Box::from_raw(props.private_data as *mut PlanPropertiesPrivateData); drop(private_data); props.release = None; } - impl Drop for FFI_PlanProperties { fn drop(&mut self) { match self.release { @@ -199,111 +171,195 @@ impl Drop for FFI_PlanProperties { } } -impl From for FFI_PlanProperties { - fn from(value: PlanProperties) -> Self { - let private_data = Box::new(value); +impl FFI_PlanProperties { + pub fn new(props: PlanProperties) -> Result { + let partitioning = props.output_partitioning(); - Self { + let codec = DefaultPhysicalExtensionCodec {}; + let partitioning_data = serialize_partitioning(partitioning, &codec)?; + let output_partitioning = partitioning_data.encode_to_vec(); + + let output_ordering = match props.output_ordering() { + Some(ordering) => { + let physical_sort_expr_nodes = + serialize_physical_sort_exprs(ordering.to_owned(), &codec)?; + + let ordering_data = PhysicalSortExprNodeCollection { + physical_sort_expr_nodes, + }; + + ordering_data.encode_to_vec() + } + None => Vec::default(), + }; + + let private_data = Box::new(PlanPropertiesPrivateData { + output_partitioning, + output_ordering, + execution_mode: props.execution_mode.into(), + schema: Arc::clone(props.eq_properties.schema()), + }); + + let ffi_props = FFI_PlanProperties { output_partitioning: Some(output_partitioning_fn_wrapper), execution_mode: Some(execution_mode_fn_wrapper), output_ordering: Some(output_ordering_fn_wrapper), schema: Some(schema_fn_wrapper), release: Some(release_fn_wrapper), private_data: Box::into_raw(private_data) as *mut c_void, + }; + + Ok(ffi_props) + } + + pub fn empty() -> Self { + Self { + output_partitioning: None, + execution_mode: None, + output_ordering: None, + schema: None, + release: None, + private_data: null_mut(), } } } -impl TryFrom for PlanProperties { - type Error = DataFusionError; - - fn try_from(value: FFI_PlanProperties) -> std::result::Result { - unsafe { - let schema_fn = value.schema.ok_or(DataFusionError::NotImplemented( - "schema() not implemented on FFI_PlanProperties".to_string(), - ))?; - let ffi_schema = schema_fn(&value); - let schema = (&ffi_schema).try_into()?; - - let ordering_fn = - value - .output_ordering - .ok_or(DataFusionError::NotImplemented( - "output_ordering() not implemented on FFI_PlanProperties" - .to_string(), - ))?; - let mut buff_size = 0; - let mut buff = null_mut(); - if ordering_fn(&value, &mut buff_size, &mut buff) != 0 { - return Err(DataFusionError::Plan( - "Error occurred 
during FFI call to output_ordering in FFI_PlanProperties" - .to_string(), - )); +#[derive(Debug)] +pub struct ForeignPlanProperties(pub PlanProperties); + +impl ForeignPlanProperties { + /// Construct a ForeignPlanProperties object from a FFI Plan Properties. + /// + /// # Safety + /// + /// This function will call the unsafe interfaces on FFI_PlanProperties + /// provided, so the user must ensure it remains valid for the lifetime + /// of the returned struct. + pub unsafe fn new(ffi_props: FFI_PlanProperties) -> Result { + let schema_fn = ffi_props.schema.ok_or(DataFusionError::NotImplemented( + "schema() not implemented on FFI_PlanProperties".to_string(), + ))?; + let ffi_schema = schema_fn(&ffi_props); + let schema = (&ffi_schema).try_into()?; + + let ordering_fn = + ffi_props + .output_ordering + .ok_or(DataFusionError::NotImplemented( + "output_ordering() not implemented on FFI_PlanProperties".to_string(), + ))?; + let mut buff_size = 0; + let buff = ordering_fn(&ffi_props, &mut buff_size); + if buff.is_null() { + return Err(DataFusionError::Plan( + "Error occurred during FFI call to output_ordering in FFI_PlanProperties" + .to_string(), + )); + } + + // TODO Extend FFI to get the registry and codex + let default_ctx = SessionContext::new(); + let codex = DefaultPhysicalExtensionCodec {}; + + let orderings = match buff_size == 0 { + true => None, + false => { + let data = slice::from_raw_parts(buff, buff_size); + + let proto_output_ordering = PhysicalSortExprNodeCollection::decode(data) + .map_err(|e| DataFusionError::External(Box::new(e)))?; + + Some(parse_physical_sort_exprs( + &proto_output_ordering.physical_sort_expr_nodes, + &default_ctx, + &schema, + &codex, + )?) } + }; - // TODO we will need to get these, but unsure if it happesn on the provider or consumer right now. - let default_ctx = SessionContext::new(); - let codex = DefaultPhysicalExtensionCodec {}; - - let orderings = match buff_size == 0 { - true => None, - false => { - let data = Vec::from_raw_parts(buff, buff_size, buff_size); - - let proto_output_ordering = - PhysicalSortExprNodeCollection::decode(data.as_ref()) - .map_err(|e| DataFusionError::External(Box::new(e)))?; - - Some(parse_physical_sort_exprs( - &proto_output_ordering.physical_sort_expr_nodes, - &default_ctx, - &schema, - &codex, - )?) 
- } - }; - - let partitioning_fn = - value - .output_partitioning - .ok_or(DataFusionError::NotImplemented( - "output_partitioning() not implemented on FFI_PlanProperties" - .to_string(), - ))?; - if partitioning_fn(&value, &mut buff_size, &mut buff) != 0 { - return Err(DataFusionError::Plan( - "Error occurred during FFI call to output_partitioning in FFI_PlanProperties" + let mut buff_size = 0; + + let partitioning_fn = + ffi_props + .output_partitioning + .ok_or(DataFusionError::NotImplemented( + "output_partitioning() not implemented on FFI_PlanProperties" .to_string(), - )); - } - let data = Vec::from_raw_parts(buff, buff_size, buff_size); + ))?; + let buff = partitioning_fn(&ffi_props, &mut buff_size); + if buff.is_null() && buff_size != 0 { + return Err(DataFusionError::Plan( + "Error occurred during FFI call to output_partitioning in FFI_PlanProperties" + .to_string(), + )); + } - let proto_partitioning = Partitioning::decode(data.as_ref()) + let partitioning = { + println!("ForeignPlanProperties::new buff {:?}", buff); + let data = slice::from_raw_parts(buff, buff_size as usize); + + let proto_partitioning = Partitioning::decode(data) .map_err(|e| DataFusionError::External(Box::new(e)))?; // TODO: Validate this unwrap is safe. - let partitioning = parse_protobuf_partitioning( + parse_protobuf_partitioning( Some(&proto_partitioning), &default_ctx, &schema, &codex, )? - .unwrap(); + .unwrap() + }; - let execution_mode_fn = - value.execution_mode.ok_or(DataFusionError::NotImplemented( + let execution_mode_fn = + ffi_props + .execution_mode + .ok_or(DataFusionError::NotImplemented( "execution_mode() not implemented on FFI_PlanProperties".to_string(), ))?; - let execution_mode = execution_mode_fn(&value).into(); + let execution_mode: ExecutionMode = execution_mode_fn(&ffi_props).into(); + + let eq_properties = match orderings { + Some(ordering) => { + EquivalenceProperties::new_with_orderings(Arc::new(schema), &[ordering]) + } + None => EquivalenceProperties::new(Arc::new(schema)), + }; - let eq_properties = match orderings { - Some(ordering) => EquivalenceProperties::new_with_orderings( - Arc::new(schema), - &[ordering], - ), - None => EquivalenceProperties::new(Arc::new(schema)), - }; + Ok(Self(PlanProperties::new( + eq_properties, + partitioning, + execution_mode, + ))) + } +} - Ok(Self::new(eq_properties, partitioning, execution_mode)) - } +#[cfg(test)] +mod tests { + use datafusion::physical_plan::Partitioning; + + use super::*; + + #[test] + fn test_round_trip_ffi_plan_properties() -> Result<()> { + use arrow::datatypes::{DataType, Field, Schema}; + let schema = + Arc::new(Schema::new(vec![Field::new("a", DataType::Float32, false)])); + + let original_props = PlanProperties::new( + EquivalenceProperties::new(schema), + Partitioning::UnknownPartitioning(3), + ExecutionMode::Unbounded, + ); + + let local_props_ptr = FFI_PlanProperties::new(original_props.clone())?; + + let foreign_props = unsafe { ForeignPlanProperties::new(local_props_ptr)? }; + + let returned_props: PlanProperties = foreign_props.0; + + assert!(format!("{:?}", returned_props) == format!("{:?}", original_props)); + + Ok(()) } } diff --git a/datafusion/ffi/src/session_config.rs b/datafusion/ffi/src/session_config.rs index 803f52dd2669..4980b28693f0 100644 --- a/datafusion/ffi/src/session_config.rs +++ b/datafusion/ffi/src/session_config.rs @@ -15,59 +15,102 @@ // specific language governing permissions and limitations // under the License. 
-use std::ffi::c_void; +use std::{ + collections::HashMap, + ffi::{c_char, c_uint, c_void, CStr, CString}, + ptr::null_mut, + slice, +}; -use datafusion::{catalog::Session, prelude::SessionConfig}; +use datafusion::error::Result; +use datafusion::{error::DataFusionError, prelude::SessionConfig}; #[repr(C)] #[derive(Debug)] #[allow(missing_docs)] #[allow(non_camel_case_types)] pub struct FFI_SessionConfig { + pub config_options: Option< + unsafe extern "C" fn( + config: *const FFI_SessionConfig, + num_options: &mut c_uint, + keys: &mut *const *const c_char, + values: &mut *const *const c_char, + ) -> (), + >, + pub private_data: *mut c_void, pub release: Option, } unsafe impl Send for FFI_SessionConfig {} +unsafe extern "C" fn config_options_fn_wrapper( + config: *const FFI_SessionConfig, + num_options: &mut c_uint, + keys: &mut *const *const c_char, + values: &mut *const *const c_char, +) { + let private_data = (*config).private_data as *mut SessionConfigPrivateData; + + *num_options = (*private_data).config_keys.len() as c_uint; + *keys = (*private_data).config_keys.as_ptr(); + *values = (*private_data).config_values.as_ptr(); +} + unsafe extern "C" fn release_fn_wrapper(config: *mut FFI_SessionConfig) { if config.is_null() { return; } let config = &mut *config; - let private_data = Box::from_raw(config.private_data as *mut SessionConfigPrivateData); + let mut private_data = + Box::from_raw(config.private_data as *mut SessionConfigPrivateData); + let _removed_keys: Vec<_> = private_data + .config_keys + .drain(..) + .map(|key| CString::from_raw(key as *mut c_char)) + .collect(); + let _removed_values: Vec<_> = private_data + .config_values + .drain(..) + .map(|key| CString::from_raw(key as *mut c_char)) + .collect(); + drop(private_data); config.release = None; } - struct SessionConfigPrivateData { - pub config: SessionConfig, -} - -pub struct ExportedSessionConfig(pub *const FFI_SessionConfig); - -impl ExportedSessionConfig { - fn get_private_data(&self) -> &SessionConfigPrivateData { - unsafe { &*((*self.0).private_data as *const SessionConfigPrivateData) } - } - - pub fn session_config(&self) -> &SessionConfig { - &self.get_private_data().config - } + pub config_keys: Vec<*const c_char>, + pub config_values: Vec<*const c_char>, } impl FFI_SessionConfig { /// Creates a new [`FFI_SessionConfig`]. - pub fn new(session: &dyn Session) -> Self { - let config = session.config().clone(); + pub fn new(session: &SessionConfig) -> Self { + let mut config_keys = Vec::new(); + let mut config_values = Vec::new(); + for config_entry in session.options().entries() { + if let Some(value) = config_entry.value { + let key_cstr = CString::new(config_entry.key).unwrap_or_default(); + let key_ptr = key_cstr.into_raw() as *const c_char; + config_keys.push(key_ptr); + + config_values + .push(CString::new(value).unwrap_or_default().into_raw() + as *const c_char); + } + } + let private_data = Box::new(SessionConfigPrivateData { - config, + config_keys, + config_values, }); Self { + config_options: Some(config_options_fn_wrapper), private_data: Box::into_raw(private_data) as *mut c_void, release: Some(release_fn_wrapper), } @@ -81,4 +124,81 @@ impl Drop for FFI_SessionConfig { Some(release) => unsafe { release(self) }, }; } -} \ No newline at end of file +} + +pub struct ForeignSessionConfig(pub SessionConfig); + +impl ForeignSessionConfig { + /// Create a session config object from a foreign provider. 
+ /// + /// # Safety + /// + /// This function will dereference the provided config pointer and will + /// access its unsafe methods. It is the provider's responsibility that + /// this pointer and its internal functions remain valid for the lifetime + /// of the returned struct. + pub unsafe fn new(config: *const FFI_SessionConfig) -> Result { + let (keys, values) = unsafe { + let config_options = + (*config) + .config_options + .ok_or(DataFusionError::NotImplemented( + "config_options not implemented on FFI_SessionConfig".to_string(), + ))?; + let mut num_keys = 0; + let mut keys: *const *const c_char = null_mut(); + let mut values: *const *const c_char = null_mut(); + config_options(config, &mut num_keys, &mut keys, &mut values); + let num_keys = num_keys as usize; + println!("Received {} key value pairs", num_keys); + + let keys: Vec = slice::from_raw_parts(keys, num_keys) + .iter() + .map(|key| CStr::from_ptr(*key)) + .map(|key| key.to_str().unwrap_or_default().to_string()) + .collect(); + let values: Vec = slice::from_raw_parts(values, num_keys) + .iter() + .map(|value| CStr::from_ptr(*value)) + .map(|val| val.to_str().unwrap_or_default().to_string()) + .collect(); + + (keys, values) + }; + + let mut options_map = HashMap::new(); + keys.into_iter().zip(values).for_each(|(key, value)| { + options_map.insert(key, value); + }); + + Ok(Self(SessionConfig::from_string_hash_map(&options_map)?)) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_round_trip_ffi_session_config() -> Result<()> { + let session_config = SessionConfig::new(); + let original_options = session_config.options().entries(); + + let ffi_config = FFI_SessionConfig::new(&session_config); + let ffi_config_ptr = Box::into_raw(Box::new(ffi_config)); + + let foreign_config = unsafe { ForeignSessionConfig::new(ffi_config_ptr)?
}; + + let returned_options = foreign_config.0.options().entries(); + + println!( + "Length of original options: {} returned {}", + original_options.len(), + returned_options.len() + ); + + let _ = unsafe { Box::from_raw(ffi_config_ptr) }; + + Ok(()) + } +} diff --git a/datafusion/ffi/src/table_provider.rs b/datafusion/ffi/src/table_provider.rs index 6a288fc4949b..9c708183ce52 100644 --- a/datafusion/ffi/src/table_provider.rs +++ b/datafusion/ffi/src/table_provider.rs @@ -17,8 +17,8 @@ use std::{ any::Any, - ffi::{c_char, c_int, c_void, CStr, CString}, - ptr::null_mut, + ffi::{c_int, c_uint, c_void}, + ptr::{addr_of, null}, slice, sync::Arc, }; @@ -30,7 +30,6 @@ use arrow::{ use async_trait::async_trait; use datafusion::{ catalog::{Session, TableProvider}, - common::DFSchema, datasource::TableType, error::DataFusionError, execution::session_state::SessionStateBuilder, @@ -39,16 +38,21 @@ use datafusion::{ prelude::{Expr, SessionContext}, }; use datafusion_proto::{ - logical_plan::{from_proto::parse_exprs, to_proto::serialize_exprs, DefaultLogicalExtensionCodec}, + logical_plan::{ + from_proto::parse_exprs, to_proto::serialize_exprs, DefaultLogicalExtensionCodec, + }, protobuf::LogicalExprList, }; use futures::executor::block_on; use prost::Message; -use crate::{session_config::ExportedSessionConfig, table_source::{FFI_TableProviderFilterPushDown, FFI_TableType}}; +use crate::{ + session_config::ForeignSessionConfig, + table_source::{FFI_TableProviderFilterPushDown, FFI_TableType}, +}; use super::{ - execution_plan::{ForeignExecutionPlan, FFI_ExecutionPlan}, + execution_plan::{FFI_ExecutionPlan, ForeignExecutionPlan}, session_config::FFI_SessionConfig, }; use datafusion::error::Result; @@ -65,13 +69,13 @@ pub struct FFI_TableProvider { unsafe extern "C" fn( provider: *const FFI_TableProvider, session_config: *const FFI_SessionConfig, - n_projections: c_int, - projections: *const c_int, - n_filters: c_int, - filters: *const *const c_char, + n_projections: c_uint, + projections: *const c_uint, + filter_buffer_size: c_uint, + filter_buffer: *const u8, limit: c_int, err_code: *mut c_int, - ) -> *mut FFI_ExecutionPlan, + ) -> FFI_ExecutionPlan, >, pub table_type: Option FFI_TableType>, @@ -79,13 +83,14 @@ pub struct FFI_TableProvider { pub supports_filters_pushdown: Option< unsafe extern "C" fn( provider: *const FFI_TableProvider, - filter_buff_size: c_int, + filter_buff_size: c_uint, filter_buffer: *const u8, num_filters: &mut c_int, - out: *mut FFI_TableProviderFilterPushDown, + out: &mut *const FFI_TableProviderFilterPushDown, ) -> c_int, >, + pub clone: Option Self>, pub release: Option, pub private_data: *mut c_void, } @@ -94,7 +99,8 @@ unsafe impl Send for FFI_TableProvider {} unsafe impl Sync for FFI_TableProvider {} struct ProviderPrivateData { - provider: Box, + provider: Arc, + last_filter_pushdowns: Vec, } /// Wrapper struct to provide access functions from the FFI interface to the underlying @@ -116,19 +122,18 @@ unsafe extern "C" fn table_type_fn_wrapper( unsafe extern "C" fn supports_filters_pushdown_fn_wrapper( provider: *const FFI_TableProvider, - filter_buff_size: c_int, + filter_buff_size: c_uint, filter_buffer: *const u8, num_filters: &mut c_int, - out: *mut FFI_TableProviderFilterPushDown, + out: &mut *const FFI_TableProviderFilterPushDown, ) -> c_int { let results = ExportedTableProvider(provider) .supports_filters_pushdown(filter_buff_size, filter_buffer); match results { - Ok(pushdowns) => { - *num_filters = pushdowns.len() as c_int; - std::ptr::copy(pushdowns.as_ptr(), 
out, 1); - std::mem::forget(pushdowns); + Ok((num_pushdowns, pushdowns_ptr)) => { + *num_filters = num_pushdowns as c_int; + std::ptr::copy(addr_of!(pushdowns_ptr), out, 1); 0 } Err(_e) => 1, @@ -138,16 +143,23 @@ unsafe extern "C" fn supports_filters_pushdown_fn_wrapper( unsafe extern "C" fn scan_fn_wrapper( provider: *const FFI_TableProvider, session_config: *const FFI_SessionConfig, - n_projections: c_int, - projections: *const c_int, - n_filters: c_int, - filters: *const *const c_char, + n_projections: c_uint, + projections: *const c_uint, + filter_buffer_size: c_uint, + filter_buffer: *const u8, limit: c_int, err_code: *mut c_int, -) -> *mut FFI_ExecutionPlan { - let config = ExportedSessionConfig(session_config).session_config().clone(); +) -> FFI_ExecutionPlan { + let config = match ForeignSessionConfig::new(session_config) { + Ok(c) => c, + Err(_) => { + *err_code = 1; + return FFI_ExecutionPlan::empty(); + } + }; let session = SessionStateBuilder::new() - .with_config(config) + .with_default_features() + .with_config(config.0) .build(); let ctx = SessionContext::new_with_state(session); @@ -163,18 +175,13 @@ unsafe extern "C" fn scan_fn_wrapper( false => Some(&projections), }; - let filters_slice = std::slice::from_raw_parts(filters, n_filters as usize); - let filters_vec: Vec = filters_slice - .iter() - .map(|&s| CStr::from_ptr(s).to_string_lossy().to_string()) - .collect(); - let limit = limit.try_into().ok(); let plan = ExportedTableProvider(provider).provider_scan( &ctx, maybe_projections, - filters_vec, + filter_buffer_size, + filter_buffer, limit, ); @@ -185,11 +192,22 @@ unsafe extern "C" fn scan_fn_wrapper( } Err(_) => { *err_code = 1; - null_mut() + FFI_ExecutionPlan::empty() } } } +unsafe extern "C" fn clone_fn_wrapper( + provider: *const FFI_TableProvider, +) -> FFI_TableProvider { + if provider.is_null() { + return FFI_TableProvider::empty(); + } + + let private_data = (*provider).private_data as *const ProviderPrivateData; + let table_provider = unsafe { Arc::clone(&(*private_data).provider) }; + FFI_TableProvider::new(table_provider) +} unsafe extern "C" fn release_fn_wrapper(provider: *mut FFI_TableProvider) { if provider.is_null() { @@ -203,28 +221,32 @@ unsafe extern "C" fn release_fn_wrapper(provider: *mut FFI_TableProvider) { provider.supports_filters_pushdown = None; let private_data = Box::from_raw(provider.private_data as *mut ProviderPrivateData); + drop(private_data); provider.release = None; } - impl Drop for FFI_TableProvider { fn drop(&mut self) { match self.release { None => (), - Some(release) => unsafe { release(self) }, + Some(release) => unsafe { release(self as *mut FFI_TableProvider) }, }; } } impl ExportedTableProvider { - fn get_private_data(&self) -> &ProviderPrivateData { + fn private_data(&self) -> &ProviderPrivateData { unsafe { &*((*self.0).private_data as *const ProviderPrivateData) } } + fn mut_private_data(&mut self) -> &mut ProviderPrivateData { + unsafe { &mut *((*self.0).private_data as *mut ProviderPrivateData) } + } + pub fn schema(&self) -> FFI_ArrowSchema { - let private_data = self.get_private_data(); + let private_data = self.private_data(); let provider = &private_data.provider; // This does silently fail because TableProvider does not return a result @@ -234,7 +256,7 @@ impl ExportedTableProvider { } pub fn table_type(&self) -> FFI_TableType { - let private_data = self.get_private_data(); + let private_data = self.private_data(); let provider = &private_data.provider; provider.table_type().into() @@ -244,32 +266,40 @@ impl 
ExportedTableProvider { &mut self, ctx: &SessionContext, projections: Option<&Vec>, - filters: Vec, + filter_buffer_size: c_uint, + filter_buffer: *const u8, limit: Option, - ) -> Result<*mut FFI_ExecutionPlan> { - let private_data = self.get_private_data(); + ) -> Result { + let private_data = self.private_data(); let provider = &private_data.provider; - let schema = provider.schema(); - let df_schema: DFSchema = schema.try_into()?; + let filters = match filter_buffer_size > 0 { + false => vec![], + true => { + let default_ctx = SessionContext::new(); + let codec = DefaultLogicalExtensionCodec {}; - let filter_exprs = filters - .into_iter() - .map(|expr_str| ctx.state().create_logical_expr(&expr_str, &df_schema)) - .collect::>>()?; + let data = unsafe { + slice::from_raw_parts(filter_buffer, filter_buffer_size as usize) + }; - let plan = - block_on(provider.scan(&ctx.state(), projections, &filter_exprs, limit))?; + let proto_filters = LogicalExprList::decode(data) + .map_err(|e| DataFusionError::External(Box::new(e)))?; - let plan_boxed = Box::new(FFI_ExecutionPlan::new(plan, ctx.task_ctx())); - Ok(Box::into_raw(plan_boxed)) + parse_exprs(proto_filters.expr.iter(), &default_ctx, &codec)? + } + }; + + let plan = block_on(provider.scan(&ctx.state(), projections, &filters, limit))?; + + FFI_ExecutionPlan::new(plan, ctx.task_ctx()) } pub fn supports_filters_pushdown( - &self, - buffer_size: c_int, + &mut self, + buffer_size: c_uint, filter_buffer: *const u8, - ) -> Result> { + ) -> Result<(usize, *const FFI_TableProviderFilterPushDown)> { unsafe { let default_ctx = SessionContext::new(); let codec = DefaultLogicalExtensionCodec {}; @@ -287,20 +317,29 @@ impl ExportedTableProvider { }; let filters_borrowed: Vec<&Expr> = filters.iter().collect(); - let private_data = self.get_private_data(); - let provider = &private_data.provider; - - provider - .supports_filters_pushdown(&filters_borrowed) - .map(|f| f.iter().map(|v| v.into()).collect()) + let private_data = self.mut_private_data(); + private_data.last_filter_pushdowns = private_data + .provider + .supports_filters_pushdown(&filters_borrowed)? + .iter() + .map(|v| v.into()) + .collect(); + + Ok(( + private_data.last_filter_pushdowns.len(), + private_data.last_filter_pushdowns.as_ptr(), + )) } } } impl FFI_TableProvider { /// Creates a new [`FFI_TableProvider`]. - pub fn new(provider: Box) -> Self { - let private_data = Box::new(ProviderPrivateData { provider }); + pub fn new(provider: Arc) -> Self { + let private_data = Box::new(ProviderPrivateData { + provider, + last_filter_pushdowns: Vec::default(), + }); Self { schema: Some(schema_fn_wrapper), @@ -308,15 +347,16 @@ impl FFI_TableProvider { table_type: Some(table_type_fn_wrapper), supports_filters_pushdown: Some(supports_filters_pushdown_fn_wrapper), release: Some(release_fn_wrapper), + clone: Some(clone_fn_wrapper), private_data: Box::into_raw(private_data) as *mut c_void, } } - /** - Replace temporary pointer with updated - # Safety - User must validate the raw pointer is valid. - */ + /// Create a FFI_TableProvider from a raw pointer + /// + /// # Safety + /// + /// This function assumes the raw pointer is valid and takes ownership of it.
pub unsafe fn from_raw(raw_provider: *mut FFI_TableProvider) -> Self { std::ptr::replace(raw_provider, Self::empty()) } @@ -329,6 +369,7 @@ impl FFI_TableProvider { table_type: None, supports_filters_pushdown: None, release: None, + clone: None, private_data: std::ptr::null_mut(), } } @@ -339,11 +380,17 @@ impl FFI_TableProvider { /// defined on this struct must only use the stable functions provided in /// FFI_TableProvider to interact with the foreign table provider. #[derive(Debug)] -pub struct ForeignTableProvider(pub *const FFI_TableProvider); +pub struct ForeignTableProvider(FFI_TableProvider); unsafe impl Send for ForeignTableProvider {} unsafe impl Sync for ForeignTableProvider {} +impl ForeignTableProvider { + pub fn new(provider: FFI_TableProvider) -> Self { + Self(provider) + } +} + #[async_trait] impl TableProvider for ForeignTableProvider { fn as_any(&self) -> &dyn Any { @@ -352,21 +399,20 @@ impl TableProvider for ForeignTableProvider { fn schema(&self) -> SchemaRef { let schema = unsafe { - let schema_fn = (*self.0).schema; + let schema_fn = self.0.schema; schema_fn - .map(|func| func(self.0)) + .map(|func| func(&self.0)) .and_then(|s| Schema::try_from(&s).ok()) .unwrap_or(Schema::empty()) }; - Arc::new(schema) } fn table_type(&self) -> TableType { unsafe { - let table_type_fn = (*self.0).table_type; + let table_type_fn = self.0.table_type; table_type_fn - .map(|func| func(self.0)) + .map(|func| func(&self.0)) .unwrap_or(FFI_TableType::Base) .into() } @@ -379,26 +425,25 @@ impl TableProvider for ForeignTableProvider { filters: &[Expr], limit: Option, ) -> Result> { - let scan_fn = unsafe { - (*self.0).scan.ok_or(DataFusionError::NotImplemented( - "Scan not defined on FFI_TableProvider".to_string(), - ))? - }; + let scan_fn = self.0.scan.ok_or(DataFusionError::NotImplemented( + "Scan not defined on FFI_TableProvider".to_string(), + ))?; - let session_config = FFI_SessionConfig::new(session); + let session_config = FFI_SessionConfig::new(session.config()); - let n_projections = projection.map(|p| p.len()).unwrap_or(0) as c_int; - let projections: Vec = projection - .map(|p| p.iter().map(|v| *v as c_int).collect()) + let n_projections = projection.map(|p| p.len()).unwrap_or(0) as c_uint; + let projections: Vec = projection + .map(|p| p.iter().map(|v| *v as c_uint).collect()) .unwrap_or_default(); let projections_ptr = projections.as_ptr(); - let n_filters = filters.len() as c_int; - let filters: Vec = filters - .iter() - .filter_map(|f| CString::new(f.to_string()).ok()) - .collect(); - let filters_ptr: Vec<*const i8> = filters.iter().map(|s| s.as_ptr()).collect(); + let codec = DefaultLogicalExtensionCodec {}; + let filter_list = LogicalExprList { + expr: serialize_exprs(filters, &codec)?, + }; + let filter_bytes = filter_list.encode_to_vec(); + let filter_buffer_size = filter_bytes.len() as c_uint; + let filter_buffer = filter_bytes.as_ptr(); let limit = match limit { Some(l) => l as c_int, @@ -408,12 +453,12 @@ impl TableProvider for ForeignTableProvider { let mut err_code = 0; let plan = unsafe { let plan_ptr = scan_fn( - self.0, + &self.0, &session_config, n_projections, projections_ptr, - n_filters, - filters_ptr.as_ptr(), + filter_buffer_size, + filter_buffer, limit, &mut err_code, ); @@ -440,46 +485,59 @@ impl TableProvider for ForeignTableProvider { let codec = DefaultLogicalExtensionCodec {}; let expr_list = LogicalExprList { - expr: serialize_exprs(filter.iter().map(|f| f.to_owned()), &codec)? 
+ expr: serialize_exprs(filter.iter().map(|f| f.to_owned()), &codec)?, }; let expr_bytes = expr_list.encode_to_vec(); let buffer_size = expr_bytes.len(); let buffer = expr_bytes.as_ptr(); - let pushdown_fn = (*self.0).supports_filters_pushdown.ok_or(DataFusionError::Plan("FFI_TableProvider does not implement supports_filters_pushdown".to_string()))?; + let pushdown_fn = + self.0 + .supports_filters_pushdown + .ok_or(DataFusionError::Plan( + "FFI_TableProvider does not implement supports_filters_pushdown" + .to_string(), + ))?; let mut num_return = 0; - let pushdowns = null_mut(); - let err_code = pushdown_fn(self.0, buffer_size as c_int, buffer, &mut num_return, pushdowns); + let mut pushdowns = null(); + let err_code = pushdown_fn( + &self.0, + buffer_size as c_uint, + buffer, + &mut num_return, + &mut pushdowns, + ); let num_return = num_return as usize; - let pushdowns_slice = Vec::from_raw_parts(pushdowns, num_return, num_return); + let pushdowns_slice = slice::from_raw_parts(pushdowns, num_return); match err_code { - 0 => { - Ok(pushdowns_slice.iter().map(|v| v.into()).collect()) - } - _ => { - Err(DataFusionError::Plan("Error occurred during FFI call to supports_filters_pushdown".to_string())) - } + 0 => Ok(pushdowns_slice.iter().map(|v| v.into()).collect()), + _ => Err(DataFusionError::Plan( + "Error occurred during FFI call to supports_filters_pushdown" + .to_string(), + )), } } } } - #[cfg(test)] mod tests { + use datafusion::prelude::{col, lit}; + use super::*; - #[test] - fn test_round_trip_ffi_table_provider() -> Result<()> { - use datafusion::datasource::MemTable; + #[tokio::test] + async fn test_round_trip_ffi_table_provider() -> Result<()> { + use arrow::datatypes::Field; use datafusion::arrow::{ array::Float32Array, datatypes::DataType, record_batch::RecordBatch, }; - use arrow::datatypes::Field; + use datafusion::datasource::MemTable; + + let schema = + Arc::new(Schema::new(vec![Field::new("a", DataType::Float32, false)])); - let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Float32, false)])); - // define data in two partitions let batch1 = RecordBatch::try_new( Arc::clone(&schema), @@ -489,15 +547,25 @@ mod tests { Arc::clone(&schema), vec![Arc::new(Float32Array::from(vec![64.0]))], )?; - - // declare a new context. In spark API, this corresponds to a new spark SQLsession + let ctx = SessionContext::new(); - - // declare a table in memory. In spark API, this corresponds to createDataFrame(...). - let provider = MemTable::try_new(schema, vec![vec![batch1], vec![batch2]])?; - ctx.register_table("t", Arc::new(provider))?; + + let provider = + Arc::new(MemTable::try_new(schema, vec![vec![batch1], vec![batch2]])?); + + let ffi_provider = FFI_TableProvider::new(provider); + + let foreign_table_provider = ForeignTableProvider::new(ffi_provider); + + ctx.register_table("t", Arc::new(foreign_table_provider))?; + + let df = ctx.table("t").await?; + + df.select(vec![col("a")])? + .filter(col("a").gt(lit(3.0)))? + .show() + .await?; Ok(()) } - -} \ No newline at end of file +} From afd06bef98fa14492d683d5b2e4700700a3187ad Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Tue, 15 Oct 2024 08:19:21 -0400 Subject: [PATCH 11/28] Remove debug statements. 
Create runtime for block_on operations --- datafusion/ffi/src/execution_plan.rs | 4 ---- datafusion/ffi/src/plan_properties.rs | 1 - datafusion/ffi/src/record_batch_stream.rs | 14 ++++++++++++-- datafusion/ffi/src/session_config.rs | 1 - datafusion/ffi/src/table_provider.rs | 17 +++++++++++++++-- 5 files changed, 27 insertions(+), 10 deletions(-) diff --git a/datafusion/ffi/src/execution_plan.rs b/datafusion/ffi/src/execution_plan.rs index 9fb614783eb2..78ba0b5205d8 100644 --- a/datafusion/ffi/src/execution_plan.rs +++ b/datafusion/ffi/src/execution_plan.rs @@ -198,7 +198,6 @@ impl FFI_ExecutionPlan { e )) })?; - println!("FFI_ExecutionPlan::new name {:?}", name); let private_data = Box::new(ExecutionPlanPrivateData { plan, @@ -254,10 +253,8 @@ impl ForeignExecutionPlan { ))?; let name_ptr = name_fn(&plan); - println!("ForeignExecutionPlan::new name_ptr {:?}", name_ptr); let name_cstr = CStr::from_ptr(name_ptr); - println!(" CStr {:?}", name_cstr); let name = name_cstr .to_str() .map_err(|e| { @@ -348,7 +345,6 @@ impl ExecutionPlan for ForeignExecutionPlan { self: Arc, children: Vec>, ) -> datafusion::error::Result> { - println!("Cloning ForeignExecutionPlan {}", self.name); Ok(Arc::new(ForeignExecutionPlan { plan: self.plan.clone(), name: self.name.clone(), diff --git a/datafusion/ffi/src/plan_properties.rs b/datafusion/ffi/src/plan_properties.rs index 78f8e8e82879..7ea8a851f410 100644 --- a/datafusion/ffi/src/plan_properties.rs +++ b/datafusion/ffi/src/plan_properties.rs @@ -296,7 +296,6 @@ impl ForeignPlanProperties { } let partitioning = { - println!("ForeignPlanProperties::new buff {:?}", buff); let data = slice::from_raw_parts(buff, buff_size as usize); let proto_partitioning = Partitioning::decode(data) diff --git a/datafusion/ffi/src/record_batch_stream.rs b/datafusion/ffi/src/record_batch_stream.rs index 983c46544613..453f5ce8d2dc 100644 --- a/datafusion/ffi/src/record_batch_stream.rs +++ b/datafusion/ffi/src/record_batch_stream.rs @@ -34,7 +34,7 @@ use datafusion::{ error::DataFusionError, execution::{RecordBatchStream, SendableRecordBatchStream}, }; -use futures::{executor::block_on, Stream, TryStreamExt}; +use futures::{Stream, TryStreamExt}; pub fn record_batch_to_arrow_stream( stream: SendableRecordBatchStream, @@ -139,7 +139,17 @@ impl ExportedRecordBatchStream { pub fn get_next(&mut self, out: *mut FFI_ArrowArray) -> i32 { let private_data = self.get_private_data(); - let maybe_batch = block_on(private_data.stream.try_next()); + let runtime = match tokio::runtime::Builder::new_current_thread() + .enable_all() + .build() + { + Ok(r) => r, + Err(_e) => { + return 1; + } + }; + // let maybe_batch = block_on(private_data.stream.try_next()); + let maybe_batch = runtime.block_on(private_data.stream.try_next()); match maybe_batch { Ok(None) => { diff --git a/datafusion/ffi/src/session_config.rs b/datafusion/ffi/src/session_config.rs index 4980b28693f0..3f70875dd2ed 100644 --- a/datafusion/ffi/src/session_config.rs +++ b/datafusion/ffi/src/session_config.rs @@ -150,7 +150,6 @@ impl ForeignSessionConfig { let mut values: *const *const c_char = null_mut(); config_options(config, &mut num_keys, &mut keys, &mut values); let num_keys = num_keys as usize; - println!("Received {} key value pairs", num_keys); let keys: Vec = slice::from_raw_parts(keys, num_keys) .iter() diff --git a/datafusion/ffi/src/table_provider.rs b/datafusion/ffi/src/table_provider.rs index 9c708183ce52..86357d13b4cb 100644 --- a/datafusion/ffi/src/table_provider.rs +++ b/datafusion/ffi/src/table_provider.rs @@ -43,7 
+43,6 @@ use datafusion_proto::{ }, protobuf::LogicalExprList, }; -use futures::executor::block_on; use prost::Message; use crate::{ @@ -290,7 +289,21 @@ impl ExportedTableProvider { } }; - let plan = block_on(provider.scan(&ctx.state(), projections, &filters, limit))?; + let runtime = tokio::runtime::Builder::new_current_thread() + .enable_all() + .build() + .map_err(|e| { + DataFusionError::Execution(format!( + "Error getting runtime during scan(): {}", + e + )) + })?; + let plan = runtime.block_on(provider.scan( + &ctx.state(), + projections, + &filters, + limit, + ))?; FFI_ExecutionPlan::new(plan, ctx.task_ctx()) } From dafc982af7921a5759dac87de1ac77e95e49e8e0 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Wed, 16 Oct 2024 19:17:26 -0400 Subject: [PATCH 12/28] Switching over to stable abi and async-ffi --- datafusion/ffi/Cargo.toml | 2 + datafusion/ffi/README.md | 49 +++ datafusion/ffi/src/execution_plan.rs | 297 +++++-------- datafusion/ffi/src/plan_properties.rs | 294 +++++-------- datafusion/ffi/src/record_batch_stream.rs | 276 ++++++------ datafusion/ffi/src/session_config.rs | 152 +++---- datafusion/ffi/src/table_provider.rs | 503 ++++++++-------- datafusion/ffi/src/table_source.rs | 4 +- 8 files changed, 644 insertions(+), 933 deletions(-) diff --git a/datafusion/ffi/Cargo.toml b/datafusion/ffi/Cargo.toml index 189b4d0cb1d0..227cd3c7ee61 100644 --- a/datafusion/ffi/Cargo.toml +++ b/datafusion/ffi/Cargo.toml @@ -43,3 +43,5 @@ futures = { workspace = true } async-trait = { workspace = true } tokio = { workspace = true } prost = { workspace = true } +abi_stable = "0.11.3" +async-ffi = { version = "0.5.0", features = ["abi_stable"] } diff --git a/datafusion/ffi/README.md b/datafusion/ffi/README.md index a4353f1a0c4f..a8b4cb2b766a 100644 --- a/datafusion/ffi/README.md +++ b/datafusion/ffi/README.md @@ -24,5 +24,54 @@ with functions from other languages using a stable interface. See [API Docs] for details and examples. +We expect this crate may be used by both sides of the FFI. This allows users +to create modules that can interoperate without the necessity of using the same +version of DataFusion. The driving use case has been the `datafusion-python` +repository, but many other use cases may exist. We envision at least two +use cases. + +1. `datafusion-python` which will use the FFI to provide external services such +as a `TableProvider` without needing to re-export the entire `datafusion-python` +code base. With `datafusion-ffi` these packages do not need `datafusion-python` +as a dependency at all. +2. Users may want to create a modular interface that allows runtime loading of +libraries. + +## Struct Layout + +In this crate we have a variety of structs which closely mimic the behavior of +their internal counterparts. In the following example, we will refer to the +`TableProvider`, but the same pattern exists for other structs. + +Each of the exposed structs in this crate is provided with two variants that +are to be used based on which side of the interface you are viewing it from. +The structs that begin with `Exported` are designed to be used by the side that +provides the functions. The receiver of these structs begins with `Foreign`. + +For example, we have a struct `FFI_TableProvider` to give access to the +`TableProvider` functions like `table_type()` and `scan()`. If we write a +library that wishes to expose its `TableProvider`, then we can access the +private data that contains the Arc reference to the `TableProvider` via +`ExportedTableProvider`.
This data is local to the library. + +If we have a program that accesses a `TableProvider` provided via FFI, then it +will use `ForeignTableProvider`. When using `ForeignTableProvider` we **must** +not attempt to access the `private_data` field in `FFI_TableProvider`. If a +user is testing locally, you may be able to successfully access this field, but +it will only work if you are building against the exact same version of +`DataFusion` for both libraries **and** the same compiler. It is not guaranteed +to work in general. + +It is worth noting that which library is the `local` and which is `foreign` +depends on which interface we are considering. For example, suppose we have a +Python library called `my_provider` that exposes a `TableProvider` called +`MyProvider` via `FFI_TableProvider`. Within the library `my_provider` we can +access the `private_data` via `ExportedTableProvider`. We connect this to +`datafusion-python`, where we access it as a `ForeignTableProvider`. Now when +we call `scan()` on this interface, we have to pass it a `FFI_SessionConfig`. +The `SessionConfig` is local to `datafusion-python` and **not** `my_provider`. +It is important to be careful when expanding these functions to be certain which +side of the interface each object refers to. + [datafusion]: https://datafusion.apache.org [api docs]: http://docs.rs/datafusion-ffi/latest diff --git a/datafusion/ffi/src/execution_plan.rs b/datafusion/ffi/src/execution_plan.rs index 78ba0b5205d8..6dc3bb3da554 100644 --- a/datafusion/ffi/src/execution_plan.rs +++ b/datafusion/ffi/src/execution_plan.rs @@ -16,13 +16,15 @@ // under the License. use std::{ - ffi::{c_char, c_void, CStr, CString}, + ffi::{c_void, CString}, pin::Pin, - ptr::null_mut, - slice, sync::Arc, }; +use abi_stable::{ + std_types::{RResult, RStr, RString, RVec}, + StableAbi, +}; use arrow::ffi_stream::FFI_ArrowArrayStream; use datafusion::error::Result; use datafusion::{ @@ -31,129 +33,120 @@ use datafusion::{ physical_plan::{DisplayAs, ExecutionPlan, PlanProperties}, }; -use crate::plan_properties::{FFI_PlanProperties, ForeignPlanProperties}; - -use super::record_batch_stream::{ - record_batch_to_arrow_stream, ConsumerRecordBatchStream, +use crate::{ + plan_properties::{FFI_PlanProperties, ForeignPlanProperties}, + record_batch_stream::FFI_RecordBatchStream, }; #[repr(C)] -#[derive(Debug)] +#[derive(Debug, StableAbi)] +pub struct WrappedArrayStream(#[sabi(unsafe_opaque_field)] pub FFI_ArrowArrayStream); + +#[repr(C)] +#[derive(Debug, StableAbi)] #[allow(missing_docs)] #[allow(non_camel_case_types)] pub struct FFI_ExecutionPlan { - pub properties: Option< - unsafe extern "C" fn(plan: *const FFI_ExecutionPlan) -> FFI_PlanProperties, - >, - - pub children: Option< - unsafe extern "C" fn( - plan: *const FFI_ExecutionPlan, - num_children: &mut usize, - err_code: &mut i32, - ) -> *const FFI_ExecutionPlan, - >, - - pub name: - Option *const c_char>, - - pub execute: Option< - unsafe extern "C" fn( - plan: *const FFI_ExecutionPlan, - partition: usize, - err_code: &mut i32, - ) -> FFI_ArrowArrayStream, - >, - - pub clone: - Option FFI_ExecutionPlan>, - - pub release: Option, + pub properties: unsafe extern "C" fn(plan: &Self) -> FFI_PlanProperties, + + pub children: unsafe extern "C" fn( + plan: &Self, + ) + -> RResult, RStr<'static>>, + + pub name: unsafe extern "C" fn(plan: &Self) -> RString, + + pub execute: unsafe extern "C" fn( + plan: &Self, + partition: usize, + ) -> RResult, + + pub clone: unsafe extern "C" fn(plan: &Self) -> Self, + pub release: unsafe 
extern "C" fn(arg: &mut Self), pub private_data: *mut c_void, } +unsafe impl Send for FFI_ExecutionPlan {} +unsafe impl Sync for FFI_ExecutionPlan {} + pub struct ExecutionPlanPrivateData { pub plan: Arc, - pub last_error: Option, pub children: Vec, pub context: Arc, pub name: CString, } unsafe extern "C" fn properties_fn_wrapper( - plan: *const FFI_ExecutionPlan, + plan: &FFI_ExecutionPlan, ) -> FFI_PlanProperties { - let private_data = (*plan).private_data as *const ExecutionPlanPrivateData; - let properties = (*private_data).plan.properties(); + let private_data = plan.private_data as *const ExecutionPlanPrivateData; + let plan = &(*private_data).plan; - FFI_PlanProperties::new(properties.clone()).unwrap_or(FFI_PlanProperties::empty()) + FFI_PlanProperties::new(plan.properties()) } unsafe extern "C" fn children_fn_wrapper( - plan: *const FFI_ExecutionPlan, - num_children: &mut usize, - err_code: &mut i32, -) -> *const FFI_ExecutionPlan { - let private_data = (*plan).private_data as *const ExecutionPlanPrivateData; - - *num_children = (*private_data).children.len(); - *err_code = 0; - - (*private_data).children.as_ptr() + plan: &FFI_ExecutionPlan, +) -> RResult, RStr<'static>> { + let private_data = plan.private_data as *const ExecutionPlanPrivateData; + let plan = &(*private_data).plan; + let ctx = &(*private_data).context; + + let maybe_children: Result> = plan + .children() + .into_iter() + .map(|child| FFI_ExecutionPlan::new(Arc::clone(child), Arc::clone(ctx))) + .collect(); + + match maybe_children { + Ok(c) => RResult::ROk(c.into()), + Err(_) => RResult::RErr( + "Error occurred during collection of FFI_ExecutionPlan children".into(), + ), + } } unsafe extern "C" fn execute_fn_wrapper( - plan: *const FFI_ExecutionPlan, + plan: &FFI_ExecutionPlan, partition: usize, - err_code: &mut i32, -) -> FFI_ArrowArrayStream { - let private_data = (*plan).private_data as *const ExecutionPlanPrivateData; - - let record_batch_stream = match (*private_data) - .plan - .execute(partition, Arc::clone(&(*private_data).context)) - { - Ok(rbs) => rbs, - Err(_e) => { - *err_code = 1; - return FFI_ArrowArrayStream::empty(); - } - }; - - record_batch_to_arrow_stream(record_batch_stream) -} -unsafe extern "C" fn name_fn_wrapper(plan: *const FFI_ExecutionPlan) -> *const c_char { - let private_data = (*plan).private_data as *const ExecutionPlanPrivateData; - (*private_data).name.as_ptr() -} - -unsafe extern "C" fn release_fn_wrapper(plan: *mut FFI_ExecutionPlan) { - if plan.is_null() { - return; +) -> RResult { + let private_data = plan.private_data as *const ExecutionPlanPrivateData; + let plan = &(*private_data).plan; + let ctx = &(*private_data).context; + + match plan.execute(partition, Arc::clone(ctx)) { + Ok(rbs) => RResult::ROk(FFI_RecordBatchStream::new(rbs)), + Err(e) => RResult::RErr( + format!("Error occurred during FFI_ExecutionPlan execute: {}", e).into(), + ), } - let plan = &mut *plan; +} +unsafe extern "C" fn name_fn_wrapper(plan: &FFI_ExecutionPlan) -> RString { + let private_data = plan.private_data as *const ExecutionPlanPrivateData; + let plan = &(*private_data).plan; - plan.properties = None; - plan.children = None; - plan.name = None; - plan.execute = None; + plan.name().into() +} +unsafe extern "C" fn release_fn_wrapper(plan: &mut FFI_ExecutionPlan) { let private_data = Box::from_raw(plan.private_data as *mut ExecutionPlanPrivateData); drop(private_data); - - plan.release = None; } -unsafe extern "C" fn clone_fn_wrapper( - plan: *const FFI_ExecutionPlan, -) -> FFI_ExecutionPlan { - let 
private_data = (*plan).private_data as *const ExecutionPlanPrivateData; +unsafe extern "C" fn clone_fn_wrapper(plan: &FFI_ExecutionPlan) -> FFI_ExecutionPlan { + let private_data = plan.private_data as *const ExecutionPlanPrivateData; let plan_data = &(*private_data); FFI_ExecutionPlan::new(Arc::clone(&plan_data.plan), Arc::clone(&plan_data.context)) .unwrap() } +impl Clone for FFI_ExecutionPlan { + fn clone(&self) -> Self { + unsafe { (self.clone)(self) } + } +} + // Since the trait ExecutionPlan requires borrowed values, we wrap our FFI. // This struct exists on the consumer side (datafusion-python, for example) and not // in the provider's side. @@ -203,40 +196,24 @@ impl FFI_ExecutionPlan { plan, children, context, - last_error: None, name, }); Ok(Self { - properties: Some(properties_fn_wrapper), - children: Some(children_fn_wrapper), - name: Some(name_fn_wrapper), - execute: Some(execute_fn_wrapper), - release: Some(release_fn_wrapper), - clone: Some(clone_fn_wrapper), + properties: properties_fn_wrapper, + children: children_fn_wrapper, + name: name_fn_wrapper, + execute: execute_fn_wrapper, + clone: clone_fn_wrapper, + release: release_fn_wrapper, private_data: Box::into_raw(private_data) as *mut c_void, }) } - - pub fn empty() -> Self { - Self { - properties: None, - children: None, - name: None, - execute: None, - release: None, - clone: None, - private_data: null_mut(), - } - } } impl Drop for FFI_ExecutionPlan { fn drop(&mut self) { - match self.release { - None => (), - Some(release) => unsafe { release(self) }, - }; + unsafe { (self.release)(self) } } } @@ -248,57 +225,25 @@ impl ForeignExecutionPlan { /// The caller must ensure the pointer provided points to a valid implementation /// of FFI_ExecutionPlan pub unsafe fn new(plan: FFI_ExecutionPlan) -> Result { - let name_fn = plan.name.ok_or(DataFusionError::NotImplemented( - "name() not implemented on FFI_ExecutionPlan".to_string(), - ))?; - let name_ptr = name_fn(&plan); - - let name_cstr = CStr::from_ptr(name_ptr); - - let name = name_cstr - .to_str() - .map_err(|e| { - DataFusionError::Plan(format!( - "Unable to convert CStr name in FFI_ExecutionPlan: {}", - e - )) - })? - .to_string(); + let name = (plan.name)(&plan).into(); - let properties = unsafe { - let properties_fn = - plan.properties.ok_or(DataFusionError::NotImplemented( - "properties not implemented on FFI_ExecutionPlan".to_string(), - ))?; + let properties = ForeignPlanProperties::new((plan.properties)(&plan))?; - ForeignPlanProperties::new(properties_fn(&plan))? - }; + let children = match (plan.children)(&plan) { + RResult::ROk(children_rvec) => { + let maybe_children: Result> = children_rvec + .iter() + .map(|child| ForeignExecutionPlan::new(child.clone())) + .collect(); - let children = unsafe { - let children_fn = plan.children.ok_or(DataFusionError::NotImplemented( - "children not implemented on FFI_ExecutionPlan".to_string(), - ))?; - let mut num_children = 0; - let mut err_code = 0; - let children_ptr = children_fn(&plan, &mut num_children, &mut err_code); - - if err_code != 0 { - return Err(DataFusionError::Plan( - "Error getting children for FFI_ExecutionPlan".to_string(), - )); + maybe_children? 
+ .into_iter() + .map(|child| Arc::new(child) as Arc) + .collect() + } + RResult::RErr(e) => { + return Err(DataFusionError::Execution(e.to_string())); } - - let ffi_vec = slice::from_raw_parts(children_ptr, num_children); - let maybe_children: Result> = ffi_vec - .iter() - .map(|child| { - let child_plan = ForeignExecutionPlan::new(child.clone()); - - child_plan.map(|c| Arc::new(c) as Arc) - }) - .collect(); - - maybe_children? }; Ok(Self { @@ -310,17 +255,6 @@ impl ForeignExecutionPlan { } } -impl Clone for FFI_ExecutionPlan { - fn clone(&self) -> Self { - unsafe { - let clone_fn = self - .clone - .expect("FFI_ExecutionPlan does not have clone defined."); - clone_fn(self) - } - } -} - impl ExecutionPlan for ForeignExecutionPlan { fn name(&self) -> &str { &self.name @@ -356,23 +290,18 @@ impl ExecutionPlan for ForeignExecutionPlan { fn execute( &self, partition: usize, - _context: Arc, - ) -> datafusion::error::Result { + _context: Arc, + ) -> Result { unsafe { - let execute_fn = self.plan.execute.ok_or(DataFusionError::Execution( - "execute is not defined on FFI_ExecutionPlan".to_string(), - ))?; - - let mut err_code = 0; - let arrow_stream = execute_fn(&self.plan, partition, &mut err_code); - - match err_code { - 0 => ConsumerRecordBatchStream::try_from(arrow_stream) - .map(|v| Pin::new(Box::new(v)) as SendableRecordBatchStream), - _ => Err(DataFusionError::Execution( - "Error occurred during FFI call to FFI_ExecutionPlan execute." - .to_string(), - )), + match (self.plan.execute)(&self.plan, partition) { + RResult::ROk(stream) => { + let stream = Pin::new(Box::new(stream)) as SendableRecordBatchStream; + Ok(stream) + } + RResult::RErr(e) => Err(DataFusionError::Execution(format!( + "Error occurred during FFI call to FFI_ExecutionPlan execute. {}", + e + ))), } } } diff --git a/datafusion/ffi/src/plan_properties.rs b/datafusion/ffi/src/plan_properties.rs index 7ea8a851f410..8b65944963bc 100644 --- a/datafusion/ffi/src/plan_properties.rs +++ b/datafusion/ffi/src/plan_properties.rs @@ -15,14 +15,16 @@ // specific language governing permissions and limitations // under the License. -use core::slice; -use std::{ - ffi::{c_uint, c_void}, - ptr::null_mut, - sync::Arc, -}; +use std::{ffi::c_void, sync::Arc}; -use arrow::{datatypes::Schema, ffi::FFI_ArrowSchema}; +use abi_stable::{ + std_types::{ + RResult::{self, RErr, ROk}, + RStr, RVec, + }, + StableAbi, +}; +use arrow::ffi::FFI_ArrowSchema; use datafusion::{ error::{DataFusionError, Result}, physical_expr::EquivalenceProperties, @@ -42,7 +44,7 @@ use prost::Message; // TODO: should we just make ExecutionMode repr(C)? 
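To make the hand-off between `FFI_ExecutionPlan` and `ForeignExecutionPlan` above concrete, here is a hedged sketch of exporting a plan and rebuilding it on the consumer side; the `datafusion_ffi::execution_plan` path and the choice of `EmptyExec` as the wrapped plan are assumptions for illustration only.

use std::sync::Arc;

use arrow::datatypes::{DataType, Field, Schema};
use datafusion::error::Result;
use datafusion::physical_plan::{empty::EmptyExec, ExecutionPlan};
use datafusion::prelude::SessionContext;
// Module path assumed for illustration.
use datafusion_ffi::execution_plan::{FFI_ExecutionPlan, ForeignExecutionPlan};

fn main() -> Result<()> {
    let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, false)]));
    let plan: Arc<dyn ExecutionPlan> = Arc::new(EmptyExec::new(schema));

    // Provider side: capture the plan and a TaskContext behind the FFI struct.
    let ctx = SessionContext::new();
    let ffi_plan = FFI_ExecutionPlan::new(Arc::clone(&plan), ctx.task_ctx())?;

    // Consumer side: rebuild a usable ExecutionPlan from the FFI struct alone.
    let foreign_plan = unsafe { ForeignExecutionPlan::new(ffi_plan)? };
    assert_eq!(foreign_plan.name(), plan.name());
    assert!(foreign_plan.children().is_empty());

    Ok(())
}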
#[repr(C)] #[allow(non_camel_case_types)] -#[derive(Clone)] +#[derive(Clone, StableAbi)] pub enum FFI_ExecutionMode { Bounded, Unbounded, @@ -70,156 +72,130 @@ impl From for ExecutionMode { } #[repr(C)] -#[derive(Debug)] +#[derive(Debug, StableAbi)] +pub struct WrappedSchema(#[sabi(unsafe_opaque_field)] pub FFI_ArrowSchema); + +#[repr(C)] +#[derive(Debug, StableAbi)] #[allow(missing_docs)] #[allow(non_camel_case_types)] pub struct FFI_PlanProperties { // Returns protobuf serialized bytes of the partitioning - pub output_partitioning: Option< - unsafe extern "C" fn( - plan: *const FFI_PlanProperties, - buffer_size: &mut c_uint, - ) -> *const u8, - >, + pub output_partitioning: + unsafe extern "C" fn(plan: &Self) -> RResult, RStr<'static>>, - pub execution_mode: Option< - unsafe extern "C" fn(plan: *const FFI_PlanProperties) -> FFI_ExecutionMode, - >, + pub execution_mode: unsafe extern "C" fn(plan: &Self) -> FFI_ExecutionMode, // PhysicalSortExprNodeCollection proto - pub output_ordering: Option< - unsafe extern "C" fn( - plan: *const FFI_PlanProperties, - buffer_size: &mut usize, - ) -> *const u8, - >, + pub output_ordering: + unsafe extern "C" fn(plan: &Self) -> RResult, RStr<'static>>, - pub schema: - Option FFI_ArrowSchema>, + pub schema: unsafe extern "C" fn(plan: &Self) -> WrappedSchema, - pub release: Option, + pub release: unsafe extern "C" fn(arg: &mut Self), pub private_data: *mut c_void, } struct PlanPropertiesPrivateData { - output_partitioning: Vec, - execution_mode: FFI_ExecutionMode, - output_ordering: Vec, - schema: Arc, + props: PlanProperties, } unsafe extern "C" fn output_partitioning_fn_wrapper( - properties: *const FFI_PlanProperties, - buffer_size: &mut c_uint, -) -> *const u8 { - let private_data = (*properties).private_data as *const PlanPropertiesPrivateData; - *buffer_size = (*private_data).output_partitioning.len() as c_uint; - (*private_data).output_partitioning.as_ptr() + properties: &FFI_PlanProperties, +) -> RResult, RStr<'static>> { + let private_data = properties.private_data as *const PlanPropertiesPrivateData; + let props = &(*private_data).props; + + let codec = DefaultPhysicalExtensionCodec {}; + let partitioning_data = + match serialize_partitioning(props.output_partitioning(), &codec) { + Ok(p) => p, + Err(_) => { + return RErr( + "unable to serialize output_partitioning in FFI_PlanProperties" + .into(), + ) + } + }; + let output_partitioning = partitioning_data.encode_to_vec(); + + ROk(output_partitioning.into()) } unsafe extern "C" fn execution_mode_fn_wrapper( - properties: *const FFI_PlanProperties, + properties: &FFI_PlanProperties, ) -> FFI_ExecutionMode { - let private_data = (*properties).private_data as *const PlanPropertiesPrivateData; - (*private_data).execution_mode.clone() + let private_data = properties.private_data as *const PlanPropertiesPrivateData; + let props = &(*private_data).props; + props.execution_mode().into() } unsafe extern "C" fn output_ordering_fn_wrapper( - properties: *const FFI_PlanProperties, - buffer_size: &mut usize, -) -> *const u8 { - let private_data = (*properties).private_data as *const PlanPropertiesPrivateData; - *buffer_size = (*private_data).output_ordering.len(); - (*private_data).output_ordering.as_ptr() -} + properties: &FFI_PlanProperties, +) -> RResult, RStr<'static>> { + let private_data = properties.private_data as *const PlanPropertiesPrivateData; + let props = &(*private_data).props; + + let codec = DefaultPhysicalExtensionCodec {}; + let output_ordering = + match props.output_ordering() { + 
Some(ordering) => { + let physical_sort_expr_nodes = + match serialize_physical_sort_exprs(ordering.to_owned(), &codec) { + Ok(v) => v, + Err(_) => return RErr( + "unable to serialize output_ordering in FFI_PlanProperties" + .into(), + ), + }; -unsafe extern "C" fn schema_fn_wrapper( - properties: *const FFI_PlanProperties, -) -> FFI_ArrowSchema { - let private_data = (*properties).private_data as *const PlanPropertiesPrivateData; - FFI_ArrowSchema::try_from((*private_data).schema.as_ref()) - .unwrap_or(FFI_ArrowSchema::empty()) -} + let ordering_data = PhysicalSortExprNodeCollection { + physical_sort_expr_nodes, + }; -unsafe extern "C" fn release_fn_wrapper(props: *mut FFI_PlanProperties) { - if props.is_null() { - return; - } + ordering_data.encode_to_vec() + } + None => Vec::default(), + }; + ROk(output_ordering.into()) +} - let props = &mut *props; +unsafe extern "C" fn schema_fn_wrapper(properties: &FFI_PlanProperties) -> WrappedSchema { + let private_data = properties.private_data as *const PlanPropertiesPrivateData; + let props = &(*private_data).props; - props.execution_mode = None; - props.output_partitioning = None; - props.execution_mode = None; - props.output_ordering = None; - props.schema = None; + let schema = props.eq_properties.schema(); + WrappedSchema( + FFI_ArrowSchema::try_from(schema.as_ref()).unwrap_or(FFI_ArrowSchema::empty()), + ) +} +unsafe extern "C" fn release_fn_wrapper(props: &mut FFI_PlanProperties) { let private_data = Box::from_raw(props.private_data as *mut PlanPropertiesPrivateData); drop(private_data); - - props.release = None; } impl Drop for FFI_PlanProperties { fn drop(&mut self) { - match self.release { - None => (), - Some(release) => unsafe { release(self) }, - }; + unsafe { (self.release)(self) } } } impl FFI_PlanProperties { - pub fn new(props: PlanProperties) -> Result { - let partitioning = props.output_partitioning(); - - let codec = DefaultPhysicalExtensionCodec {}; - let partitioning_data = serialize_partitioning(partitioning, &codec)?; - let output_partitioning = partitioning_data.encode_to_vec(); - - let output_ordering = match props.output_ordering() { - Some(ordering) => { - let physical_sort_expr_nodes = - serialize_physical_sort_exprs(ordering.to_owned(), &codec)?; - - let ordering_data = PhysicalSortExprNodeCollection { - physical_sort_expr_nodes, - }; - - ordering_data.encode_to_vec() - } - None => Vec::default(), - }; - + pub fn new(props: &PlanProperties) -> Self { let private_data = Box::new(PlanPropertiesPrivateData { - output_partitioning, - output_ordering, - execution_mode: props.execution_mode.into(), - schema: Arc::clone(props.eq_properties.schema()), + props: props.clone(), }); - let ffi_props = FFI_PlanProperties { - output_partitioning: Some(output_partitioning_fn_wrapper), - execution_mode: Some(execution_mode_fn_wrapper), - output_ordering: Some(output_ordering_fn_wrapper), - schema: Some(schema_fn_wrapper), - release: Some(release_fn_wrapper), + FFI_PlanProperties { + output_partitioning: output_partitioning_fn_wrapper, + execution_mode: execution_mode_fn_wrapper, + output_ordering: output_ordering_fn_wrapper, + schema: schema_fn_wrapper, + release: release_fn_wrapper, private_data: Box::into_raw(private_data) as *mut c_void, - }; - - Ok(ffi_props) - } - - pub fn empty() -> Self { - Self { - output_partitioning: None, - execution_mode: None, - output_ordering: None, - schema: None, - release: None, - private_data: null_mut(), } } } @@ -236,39 +212,18 @@ impl ForeignPlanProperties { /// provided, so the user must ensure 
it remains valid for the lifetime /// of the returned struct. pub unsafe fn new(ffi_props: FFI_PlanProperties) -> Result { - let schema_fn = ffi_props.schema.ok_or(DataFusionError::NotImplemented( - "schema() not implemented on FFI_PlanProperties".to_string(), - ))?; - let ffi_schema = schema_fn(&ffi_props); - let schema = (&ffi_schema).try_into()?; - - let ordering_fn = - ffi_props - .output_ordering - .ok_or(DataFusionError::NotImplemented( - "output_ordering() not implemented on FFI_PlanProperties".to_string(), - ))?; - let mut buff_size = 0; - let buff = ordering_fn(&ffi_props, &mut buff_size); - if buff.is_null() { - return Err(DataFusionError::Plan( - "Error occurred during FFI call to output_ordering in FFI_PlanProperties" - .to_string(), - )); - } + let ffi_schema = (ffi_props.schema)(&ffi_props); + let schema = (&ffi_schema.0).try_into()?; // TODO Extend FFI to get the registry and codex let default_ctx = SessionContext::new(); let codex = DefaultPhysicalExtensionCodec {}; - let orderings = match buff_size == 0 { - true => None, - false => { - let data = slice::from_raw_parts(buff, buff_size); - - let proto_output_ordering = PhysicalSortExprNodeCollection::decode(data) - .map_err(|e| DataFusionError::External(Box::new(e)))?; - + let orderings = match (ffi_props.output_ordering)(&ffi_props) { + ROk(ordering_vec) => { + let proto_output_ordering = + PhysicalSortExprNodeCollection::decode(ordering_vec.as_ref()) + .map_err(|e| DataFusionError::External(Box::new(e)))?; Some(parse_physical_sort_exprs( &proto_output_ordering.physical_sort_expr_nodes, &default_ctx, @@ -276,47 +231,26 @@ impl ForeignPlanProperties { &codex, )?) } + RErr(e) => return Err(DataFusionError::Plan(e.to_string())), }; - let mut buff_size = 0; - - let partitioning_fn = - ffi_props - .output_partitioning - .ok_or(DataFusionError::NotImplemented( - "output_partitioning() not implemented on FFI_PlanProperties" - .to_string(), - ))?; - let buff = partitioning_fn(&ffi_props, &mut buff_size); - if buff.is_null() && buff_size != 0 { - return Err(DataFusionError::Plan( - "Error occurred during FFI call to output_partitioning in FFI_PlanProperties" - .to_string(), - )); - } - - let partitioning = { - let data = slice::from_raw_parts(buff, buff_size as usize); - - let proto_partitioning = Partitioning::decode(data) - .map_err(|e| DataFusionError::External(Box::new(e)))?; - // TODO: Validate this unwrap is safe. - parse_protobuf_partitioning( - Some(&proto_partitioning), - &default_ctx, - &schema, - &codex, - )? - .unwrap() + let partitioning = match (ffi_props.output_partitioning)(&ffi_props) { + ROk(partitioning_vec) => { + let proto_output_partitioning = + Partitioning::decode(partitioning_vec.as_ref()) + .map_err(|e| DataFusionError::External(Box::new(e)))?; + parse_protobuf_partitioning( + Some(&proto_output_partitioning), + &default_ctx, + &schema, + &codex, + )? 
+ .unwrap() + } + RErr(e) => return Err(DataFusionError::Plan(e.to_string())), }; - let execution_mode_fn = - ffi_props - .execution_mode - .ok_or(DataFusionError::NotImplemented( - "execution_mode() not implemented on FFI_PlanProperties".to_string(), - ))?; - let execution_mode: ExecutionMode = execution_mode_fn(&ffi_props).into(); + let execution_mode: ExecutionMode = (ffi_props.execution_mode)(&ffi_props).into(); let eq_properties = match orderings { Some(ordering) => { @@ -351,7 +285,7 @@ mod tests { ExecutionMode::Unbounded, ); - let local_props_ptr = FFI_PlanProperties::new(original_props.clone())?; + let local_props_ptr = FFI_PlanProperties::new(&original_props); let foreign_props = unsafe { ForeignPlanProperties::new(local_props_ptr)? }; diff --git a/datafusion/ffi/src/record_batch_stream.rs b/datafusion/ffi/src/record_batch_stream.rs index 453f5ce8d2dc..e160f112603b 100644 --- a/datafusion/ffi/src/record_batch_stream.rs +++ b/datafusion/ffi/src/record_batch_stream.rs @@ -15,20 +15,19 @@ // specific language governing permissions and limitations // under the License. -use std::{ - ffi::{c_char, c_int, c_void, CString}, - ptr::addr_of, -}; +use std::{ffi::c_void, sync::Arc, task::Poll}; -use arrow::{ - array::StructArray, - ffi::{FFI_ArrowArray, FFI_ArrowSchema}, - ffi_stream::FFI_ArrowArrayStream, +use abi_stable::{ + std_types::{ROption, RResult, RString}, + StableAbi, }; +use arrow::array::{Array, RecordBatch}; use arrow::{ - array::{Array, RecordBatch, RecordBatchReader}, - ffi_stream::ArrowArrayStreamReader, + array::{make_array, StructArray}, + datatypes::{Schema, SchemaRef}, + ffi::{from_ffi, to_ffi, FFI_ArrowArray, FFI_ArrowSchema}, }; +use async_ffi::{ContextExt, FfiContext, FfiPoll}; use datafusion::error::Result; use datafusion::{ error::DataFusionError, @@ -36,181 +35,156 @@ use datafusion::{ }; use futures::{Stream, TryStreamExt}; -pub fn record_batch_to_arrow_stream( - stream: SendableRecordBatchStream, -) -> FFI_ArrowArrayStream { - let private_data = Box::new(RecoredBatchStreamPrivateData { - stream, - last_error: None, - }); +#[repr(C)] +#[derive(Debug, StableAbi)] +pub struct WrappedArray { + #[sabi(unsafe_opaque_field)] + array: FFI_ArrowArray, - FFI_ArrowArrayStream { - get_schema: Some(get_schema), - get_next: Some(get_next), - get_last_error: Some(get_last_error), - release: Some(release_stream), - private_data: Box::into_raw(private_data) as *mut c_void, - } + schema: WrappedSchema, } -struct RecoredBatchStreamPrivateData { - stream: SendableRecordBatchStream, - last_error: Option, -} +#[repr(C)] +#[derive(Debug, StableAbi)] +pub struct WrappedSchema(#[sabi(unsafe_opaque_field)] FFI_ArrowSchema); -// callback used to drop [FFI_ArrowArrayStream] when it is exported. 
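The `WrappedSchema` new-type above simply carries arrow's stable `FFI_ArrowSchema`; the round trip it relies on can be exercised with arrow alone, as in this small sketch.

use std::sync::Arc;

use arrow::datatypes::{DataType, Field, Schema};
use arrow::error::ArrowError;
use arrow::ffi::FFI_ArrowSchema;

fn main() -> Result<(), ArrowError> {
    let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, false)]));

    // Export through the Arrow C data interface (what WrappedSchema stores)...
    let ffi_schema = FFI_ArrowSchema::try_from(schema.as_ref())?;

    // ...and import it again on the other side of the boundary.
    let round_tripped = Schema::try_from(&ffi_schema)?;
    assert_eq!(schema.as_ref(), &round_tripped);
    Ok(())
}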
-unsafe extern "C" fn release_stream(stream: *mut FFI_ArrowArrayStream) { - if stream.is_null() { - return; +impl From for WrappedSchema { + fn from(value: SchemaRef) -> Self { + let schema = FFI_ArrowSchema::try_from(value.as_ref()); + WrappedSchema(schema.unwrap_or(FFI_ArrowSchema::empty())) } - let stream = &mut *stream; +} - stream.get_schema = None; - stream.get_next = None; - stream.get_last_error = None; +#[repr(C)] +#[derive(Debug, StableAbi)] +#[allow(missing_docs)] +#[allow(non_camel_case_types)] +pub struct FFI_RecordBatchStream { + pub poll_next: + unsafe extern "C" fn( + stream: &Self, + cx: &mut FfiContext, + ) -> FfiPoll>>, - let private_data = - Box::from_raw(stream.private_data as *mut RecoredBatchStreamPrivateData); - drop(private_data); + pub schema: unsafe extern "C" fn(stream: &Self) -> WrappedSchema, - stream.release = None; + pub private_data: *mut c_void, } -// The callback used to get array schema -unsafe extern "C" fn get_schema( - stream: *mut FFI_ArrowArrayStream, - schema: *mut FFI_ArrowSchema, -) -> c_int { - ExportedRecordBatchStream { stream }.get_schema(schema) +impl FFI_RecordBatchStream { + pub fn new(stream: SendableRecordBatchStream) -> Self { + FFI_RecordBatchStream { + poll_next: poll_next_fn_wrapper, + schema: schema_fn_wrapper, + private_data: Box::into_raw(Box::new(stream)) as *mut c_void, + } + } } -// The callback used to get next array -unsafe extern "C" fn get_next( - stream: *mut FFI_ArrowArrayStream, - array: *mut FFI_ArrowArray, -) -> c_int { - ExportedRecordBatchStream { stream }.get_next(array) -} +unsafe impl Send for FFI_RecordBatchStream {} -// The callback used to get the error from last operation on the `FFI_ArrowArrayStream` -unsafe extern "C" fn get_last_error(stream: *mut FFI_ArrowArrayStream) -> *const c_char { - let mut ffi_stream = ExportedRecordBatchStream { stream }; - // The consumer should not take ownership of this string, we should return - // a const pointer to it. 
- match ffi_stream.get_last_error() { - Some(err_string) => err_string.as_ptr(), - None => std::ptr::null(), - } -} +unsafe extern "C" fn schema_fn_wrapper(stream: &FFI_RecordBatchStream) -> WrappedSchema { + let stream = stream.private_data as *const SendableRecordBatchStream; -struct ExportedRecordBatchStream { - stream: *mut FFI_ArrowArrayStream, + (*stream).schema().into() } -impl ExportedRecordBatchStream { - fn get_private_data(&mut self) -> &mut RecoredBatchStreamPrivateData { - unsafe { - &mut *((*self.stream).private_data as *mut RecoredBatchStreamPrivateData) - } +fn record_batch_to_wrapped_array( + record_batch: RecordBatch, +) -> RResult { + let struct_array = StructArray::from(record_batch); + match to_ffi(&struct_array.to_data()) { + Ok((array, schema)) => RResult::ROk(WrappedArray { + array, + schema: WrappedSchema(schema), + }), + Err(e) => RResult::RErr(e.to_string().into()), } +} - pub fn get_schema(&mut self, out: *mut FFI_ArrowSchema) -> i32 { - let private_data = self.get_private_data(); - let stream = &private_data.stream; - - let schema = FFI_ArrowSchema::try_from(stream.schema().as_ref()); - - match schema { - Ok(schema) => { - unsafe { std::ptr::copy(addr_of!(schema), out, 1) }; - std::mem::forget(schema); - 0 - } - Err(ref err) => { - private_data.last_error = Some( - CString::new(err.to_string()) - .expect("Error string has a null byte in it."), - ); - 1 - } +// probably want to use pub unsafe fn from_ffi(array: FFI_ArrowArray, schema: &FFI_ArrowSchema) -> Result { +fn maybe_record_batch_to_wrapped_stream( + record_batch: Option>, +) -> ROption> { + match record_batch { + Some(Ok(record_batch)) => { + ROption::RSome(record_batch_to_wrapped_array(record_batch)) } + Some(Err(e)) => ROption::RSome(RResult::RErr(e.to_string().into())), + None => ROption::RNone, } +} - pub fn get_next(&mut self, out: *mut FFI_ArrowArray) -> i32 { - let private_data = self.get_private_data(); +unsafe extern "C" fn poll_next_fn_wrapper( + stream: &FFI_RecordBatchStream, + cx: &mut FfiContext, +) -> FfiPoll>> { + let stream = stream.private_data as *mut SendableRecordBatchStream; - let runtime = match tokio::runtime::Builder::new_current_thread() - .enable_all() - .build() - { - Ok(r) => r, - Err(_e) => { - return 1; - } - }; - // let maybe_batch = block_on(private_data.stream.try_next()); - let maybe_batch = runtime.block_on(private_data.stream.try_next()); - - match maybe_batch { - Ok(None) => { - // Marks ArrowArray released to indicate reaching the end of stream. 
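`record_batch_to_wrapped_array` above, and its inverse `wrapped_array_to_record_batch` just below, are arrow's C data interface applied to a batch viewed as a `StructArray`. A self-contained sketch of the same conversion, using only arrow:

use std::sync::Arc;

use arrow::array::{make_array, Array, Int32Array, RecordBatch, StructArray};
use arrow::datatypes::{DataType, Field, Schema};
use arrow::error::ArrowError;
use arrow::ffi::{from_ffi, to_ffi};

fn main() -> Result<(), ArrowError> {
    let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, false)]));
    let batch =
        RecordBatch::try_new(schema, vec![Arc::new(Int32Array::from(vec![1, 2, 3]))])?;

    // Export: view the batch as a StructArray and hand out its FFI parts.
    let struct_array = StructArray::from(batch.clone());
    let (ffi_array, ffi_schema) = to_ffi(&struct_array.to_data())?;

    // Import: rebuild ArrayData from the FFI parts and recover the batch.
    let data = unsafe { from_ffi(ffi_array, &ffi_schema)? };
    let imported = make_array(data);
    let imported = imported
        .as_any()
        .downcast_ref::<StructArray>()
        .expect("a record batch crosses the boundary as a StructArray");
    let round_tripped = RecordBatch::from(imported);

    assert_eq!(batch, round_tripped);
    Ok(())
}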
- unsafe { std::ptr::write(out, FFI_ArrowArray::empty()) } - 0 - } - Ok(Some(batch)) => { - let struct_array = StructArray::from(batch); - let array = FFI_ArrowArray::new(&struct_array.to_data()); - - unsafe { std::ptr::write_unaligned(out, array) }; - 0 - } - Err(err) => { - private_data.last_error = Some( - CString::new(err.to_string()) - .expect("Error string has a null byte in it."), - ); - 1 - } - } - } + let poll_result = cx.with_context(|std_cx| { + (*stream) + .try_poll_next_unpin(std_cx) + .map(maybe_record_batch_to_wrapped_stream) + }); - pub fn get_last_error(&mut self) -> Option<&CString> { - self.get_private_data().last_error.as_ref() - } + poll_result.into() } -pub struct ConsumerRecordBatchStream { - reader: ArrowArrayStreamReader, +impl RecordBatchStream for FFI_RecordBatchStream { + fn schema(&self) -> arrow::datatypes::SchemaRef { + let wrapped_schema = unsafe { (self.schema)(self) }; + let schema = Schema::try_from(&wrapped_schema.0); + Arc::new(schema.unwrap_or(Schema::empty())) + } } -impl TryFrom for ConsumerRecordBatchStream { - type Error = DataFusionError; - - fn try_from(value: FFI_ArrowArrayStream) -> std::result::Result { - let reader = ArrowArrayStreamReader::try_new(value)?; - - Ok(Self { reader }) - } +fn wrapped_array_to_record_batch(array: WrappedArray) -> Result { + let array_data = + unsafe { from_ffi(array.array, &array.schema.0).map_err(DataFusionError::from)? }; + let array = make_array(array_data); + let struct_array = array + .as_any() + .downcast_ref::() + .ok_or(DataFusionError::Execution( + "Unexpected array type during record batch collection in FFI_RecordBatchStream" + .to_string(), + ))?; + + Ok(struct_array.into()) } -impl RecordBatchStream for ConsumerRecordBatchStream { - fn schema(&self) -> arrow::datatypes::SchemaRef { - self.reader.schema() +fn maybe_wrapped_array_to_record_batch( + array: ROption>, +) -> Option> { + match array { + ROption::RSome(RResult::ROk(wrapped_array)) => { + Some(wrapped_array_to_record_batch(wrapped_array)) + } + ROption::RSome(RResult::RErr(e)) => { + Some(Err(DataFusionError::Execution(e.to_string()))) + } + ROption::RNone => None, } } -impl Stream for ConsumerRecordBatchStream { +impl Stream for FFI_RecordBatchStream { type Item = Result; fn poll_next( - mut self: std::pin::Pin<&mut Self>, - _cx: &mut std::task::Context<'_>, - ) -> std::task::Poll> { - let batch = self - .reader - .next() - .map(|v| v.map_err(|e| DataFusionError::ArrowError(e, None))); - - std::task::Poll::Ready(batch) + self: std::pin::Pin<&mut Self>, + cx: &mut std::task::Context<'_>, + ) -> Poll> { + let poll_result = + unsafe { cx.with_ffi_context(|ffi_cx| (self.poll_next)(&self, ffi_cx)) }; + + match poll_result { + FfiPoll::Ready(array) => { + Poll::Ready(maybe_wrapped_array_to_record_batch(array)) + } + FfiPoll::Pending => Poll::Pending, + FfiPoll::Panicked => Poll::Ready(Some(Err(DataFusionError::Execution( + "Error occurred during poll_next on FFI_RecordBatchStream".to_string(), + )))), + } } } diff --git a/datafusion/ffi/src/session_config.rs b/datafusion/ffi/src/session_config.rs index 3f70875dd2ed..f2571ceb9163 100644 --- a/datafusion/ffi/src/session_config.rs +++ b/datafusion/ffi/src/session_config.rs @@ -17,74 +17,71 @@ use std::{ collections::HashMap, - ffi::{c_char, c_uint, c_void, CStr, CString}, - ptr::null_mut, - slice, + ffi::{c_char, c_void, CString}, }; -use datafusion::error::Result; -use datafusion::{error::DataFusionError, prelude::SessionConfig}; +use abi_stable::{ + std_types::{RHashMap, RString}, + StableAbi, +}; 
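With the `poll_next` bridge and the `Stream` implementation for `FFI_RecordBatchStream` shown above, a consumer can drive the FFI stream like any other stream. A hedged sketch follows, assuming the `datafusion_ffi::record_batch_stream` path and fabricating the provider-side stream with `RecordBatchStreamAdapter`.

use std::sync::Arc;

use arrow::array::{Int32Array, RecordBatch};
use arrow::datatypes::{DataType, Field, Schema};
use datafusion::error::Result;
use datafusion::execution::SendableRecordBatchStream;
use datafusion::physical_plan::stream::RecordBatchStreamAdapter;
use futures::{stream, TryStreamExt};
// Module path assumed for illustration.
use datafusion_ffi::record_batch_stream::FFI_RecordBatchStream;

#[tokio::main]
async fn main() -> Result<()> {
    let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, false)]));
    let batch = RecordBatch::try_new(
        Arc::clone(&schema),
        vec![Arc::new(Int32Array::from(vec![1, 2, 3]))],
    )?;

    // Provider side: any SendableRecordBatchStream can be exported.
    let batches: Vec<Result<RecordBatch>> = vec![Ok(batch)];
    let local: SendableRecordBatchStream = Box::pin(RecordBatchStreamAdapter::new(
        Arc::clone(&schema),
        stream::iter(batches),
    ));
    let mut ffi_stream = FFI_RecordBatchStream::new(local);

    // Consumer side: poll it like an ordinary stream of record batches.
    let mut rows = 0;
    while let Some(batch) = ffi_stream.try_next().await? {
        rows += batch.num_rows();
    }
    assert_eq!(rows, 3);
    Ok(())
}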
+use datafusion::prelude::SessionConfig; +use datafusion::{config::ConfigOptions, error::Result}; #[repr(C)] -#[derive(Debug)] +#[derive(Debug, StableAbi)] #[allow(missing_docs)] #[allow(non_camel_case_types)] pub struct FFI_SessionConfig { - pub config_options: Option< - unsafe extern "C" fn( - config: *const FFI_SessionConfig, - num_options: &mut c_uint, - keys: &mut *const *const c_char, - values: &mut *const *const c_char, - ) -> (), - >, + pub config_options: unsafe extern "C" fn(config: &Self) -> RHashMap, pub private_data: *mut c_void, - pub release: Option, + pub clone: unsafe extern "C" fn(&Self) -> Self, + pub release: unsafe extern "C" fn(config: &mut Self), } unsafe impl Send for FFI_SessionConfig {} +unsafe impl Sync for FFI_SessionConfig {} unsafe extern "C" fn config_options_fn_wrapper( - config: *const FFI_SessionConfig, - num_options: &mut c_uint, - keys: &mut *const *const c_char, - values: &mut *const *const c_char, -) { - let private_data = (*config).private_data as *mut SessionConfigPrivateData; - - *num_options = (*private_data).config_keys.len() as c_uint; - *keys = (*private_data).config_keys.as_ptr(); - *values = (*private_data).config_values.as_ptr(); -} - -unsafe extern "C" fn release_fn_wrapper(config: *mut FFI_SessionConfig) { - if config.is_null() { - return; + config: &FFI_SessionConfig, +) -> RHashMap { + let private_data = config.private_data as *mut SessionConfigPrivateData; + let config_options = &(*private_data).config; + + let mut options = RHashMap::default(); + for config_entry in config_options.entries() { + if let Some(value) = config_entry.value { + options.insert(config_entry.key.into(), value.into()); + } } - let config = &mut *config; - let mut private_data = - Box::from_raw(config.private_data as *mut SessionConfigPrivateData); - let _removed_keys: Vec<_> = private_data - .config_keys - .drain(..) - .map(|key| CString::from_raw(key as *mut c_char)) - .collect(); - let _removed_values: Vec<_> = private_data - .config_values - .drain(..) 
- .map(|key| CString::from_raw(key as *mut c_char)) - .collect(); + options +} +unsafe extern "C" fn release_fn_wrapper(config: &mut FFI_SessionConfig) { + let private_data = + Box::from_raw(config.private_data as *mut SessionConfigPrivateData); drop(private_data); +} - config.release = None; +unsafe extern "C" fn clone_fn_wrapper(config: &FFI_SessionConfig) -> FFI_SessionConfig { + let old_private_data = config.private_data as *mut SessionConfigPrivateData; + let old_config = &(*old_private_data).config; + + let private_data = Box::new(SessionConfigPrivateData { + config: old_config.clone(), + }); + + FFI_SessionConfig { + config_options: config_options_fn_wrapper, + private_data: Box::into_raw(private_data) as *mut c_void, + clone: clone_fn_wrapper, + release: release_fn_wrapper, + } } struct SessionConfigPrivateData { - pub config_keys: Vec<*const c_char>, - pub config_values: Vec<*const c_char>, + pub config: ConfigOptions, } impl FFI_SessionConfig { @@ -105,24 +102,27 @@ impl FFI_SessionConfig { } let private_data = Box::new(SessionConfigPrivateData { - config_keys, - config_values, + config: session.options().clone(), }); Self { - config_options: Some(config_options_fn_wrapper), + config_options: config_options_fn_wrapper, private_data: Box::into_raw(private_data) as *mut c_void, - release: Some(release_fn_wrapper), + clone: clone_fn_wrapper, + release: release_fn_wrapper, } } } +impl Clone for FFI_SessionConfig { + fn clone(&self) -> Self { + unsafe { (self.clone)(self) } + } +} + impl Drop for FFI_SessionConfig { fn drop(&mut self) { - match self.release { - None => (), - Some(release) => unsafe { release(self) }, - }; + unsafe { (self.release)(self) }; } } @@ -137,37 +137,12 @@ impl ForeignSessionConfig { /// access it's unsafe methods. It is the provider's responsibility that /// this pointer and it's internal functions remain valid for the lifetime /// of the returned struct. - pub unsafe fn new(config: *const FFI_SessionConfig) -> Result { - let (keys, values) = unsafe { - let config_options = - (*config) - .config_options - .ok_or(DataFusionError::NotImplemented( - "config_options not implemented on FFI_SessionConfig".to_string(), - ))?; - let mut num_keys = 0; - let mut keys: *const *const c_char = null_mut(); - let mut values: *const *const c_char = null_mut(); - config_options(config, &mut num_keys, &mut keys, &mut values); - let num_keys = num_keys as usize; - - let keys: Vec = slice::from_raw_parts(keys, num_keys) - .iter() - .map(|key| CStr::from_ptr(*key)) - .map(|key| key.to_str().unwrap_or_default().to_string()) - .collect(); - let values: Vec = slice::from_raw_parts(values, num_keys) - .iter() - .map(|value| CStr::from_ptr(*value)) - .map(|val| val.to_str().unwrap_or_default().to_string()) - .collect(); - - (keys, values) - }; + pub unsafe fn new(config: &FFI_SessionConfig) -> Result { + let config_options = (config.config_options)(config); let mut options_map = HashMap::new(); - keys.into_iter().zip(values).for_each(|(key, value)| { - options_map.insert(key, value); + config_options.iter().for_each(|kv_pair| { + options_map.insert(kv_pair.0.to_string(), kv_pair.1.to_string()); }); Ok(Self(SessionConfig::from_string_hash_map(&options_map)?)) @@ -184,19 +159,12 @@ mod tests { let original_options = session_config.options().entries(); let ffi_config = FFI_SessionConfig::new(&session_config); - let ffi_config_ptr = Box::into_raw(Box::new(ffi_config)); - let foreign_config = unsafe { ForeignSessionConfig::new(ffi_config_ptr)? 
}; + let foreign_config = unsafe { ForeignSessionConfig::new(&ffi_config)? }; let returned_options = foreign_config.0.options().entries(); - println!( - "Length of original options: {} returned {}", - original_options.len(), - returned_options.len() - ); - - let _ = unsafe { Box::from_raw(ffi_config_ptr) }; + assert!(original_options.len() == returned_options.len()); Ok(()) } diff --git a/datafusion/ffi/src/table_provider.rs b/datafusion/ffi/src/table_provider.rs index 86357d13b4cb..e68bb00fbc69 100644 --- a/datafusion/ffi/src/table_provider.rs +++ b/datafusion/ffi/src/table_provider.rs @@ -15,18 +15,17 @@ // specific language governing permissions and limitations // under the License. -use std::{ - any::Any, - ffi::{c_int, c_uint, c_void}, - ptr::{addr_of, null}, - slice, - sync::Arc, -}; +use std::{any::Any, ffi::c_void, sync::Arc}; +use abi_stable::{ + std_types::{ROption, RResult, RString, RVec}, + StableAbi, +}; use arrow::{ datatypes::{Schema, SchemaRef}, ffi::FFI_ArrowSchema, }; +use async_ffi::{FfiFuture, FutureExt}; use async_trait::async_trait; use datafusion::{ catalog::{Session, TableProvider}, @@ -46,6 +45,7 @@ use datafusion_proto::{ use prost::Message; use crate::{ + plan_properties::WrappedSchema, session_config::ForeignSessionConfig, table_source::{FFI_TableProviderFilterPushDown, FFI_TableType}, }; @@ -58,39 +58,29 @@ use datafusion::error::Result; /// A stable interface for creating a DataFusion TableProvider. #[repr(C)] -#[derive(Debug)] +#[derive(Debug, StableAbi)] #[allow(non_camel_case_types)] pub struct FFI_TableProvider { - pub schema: Option< - unsafe extern "C" fn(provider: *const FFI_TableProvider) -> FFI_ArrowSchema, - >, - pub scan: Option< + pub schema: unsafe extern "C" fn(provider: &Self) -> WrappedSchema, + pub scan: unsafe extern "C" fn( + provider: &Self, + session_config: &FFI_SessionConfig, + projections: RVec, + filters_serialized: RVec, + limit: ROption, + ) -> FfiFuture>, + + pub table_type: unsafe extern "C" fn(provider: &Self) -> FFI_TableType, + + pub supports_filters_pushdown: unsafe extern "C" fn( - provider: *const FFI_TableProvider, - session_config: *const FFI_SessionConfig, - n_projections: c_uint, - projections: *const c_uint, - filter_buffer_size: c_uint, - filter_buffer: *const u8, - limit: c_int, - err_code: *mut c_int, - ) -> FFI_ExecutionPlan, - >, - pub table_type: - Option FFI_TableType>, - - pub supports_filters_pushdown: Option< - unsafe extern "C" fn( - provider: *const FFI_TableProvider, - filter_buff_size: c_uint, - filter_buffer: *const u8, - num_filters: &mut c_int, - out: &mut *const FFI_TableProviderFilterPushDown, - ) -> c_int, - >, - - pub clone: Option Self>, - pub release: Option, + provider: &Self, + filters_serialized: RVec, + ) + -> RResult, RString>, + + pub clone: unsafe extern "C" fn(provider: &Self) -> Self, + pub release: unsafe extern "C" fn(arg: &mut Self), pub private_data: *mut c_void, } @@ -99,149 +89,131 @@ unsafe impl Sync for FFI_TableProvider {} struct ProviderPrivateData { provider: Arc, - last_filter_pushdowns: Vec, } /// Wrapper struct to provide access functions from the FFI interface to the underlying /// TableProvider. This struct is allowed to access `private_data` because it lives on /// the provider's side of the FFI interace. 
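Note that `scan` above receives the `FFI_SessionConfig` covered earlier; the round trip it relies on is small enough to sketch. Only string-valued `ConfigOptions` entries survive the crossing, since `ForeignSessionConfig` rebuilds the config from exported key/value pairs. The module path, the `with_batch_size` choice, and the tuple-field access (as used in the crate's own test) are assumptions here.

use datafusion::error::Result;
use datafusion::prelude::SessionConfig;
// Module path assumed for illustration.
use datafusion_ffi::session_config::{FFI_SessionConfig, ForeignSessionConfig};

fn main() -> Result<()> {
    let config = SessionConfig::new().with_batch_size(1234);

    // Provider side: snapshot the ConfigOptions behind an FFI-safe struct.
    let ffi_config = FFI_SessionConfig::new(&config);

    // Consumer side: rebuild a SessionConfig from the exported key/value pairs.
    let foreign = unsafe { ForeignSessionConfig::new(&ffi_config)? };
    assert_eq!(foreign.0.batch_size(), 1234);
    Ok(())
}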
-struct ExportedTableProvider(*const FFI_TableProvider); +struct ExportedTableProvider<'a>(&'a FFI_TableProvider); -unsafe extern "C" fn schema_fn_wrapper( - provider: *const FFI_TableProvider, -) -> FFI_ArrowSchema { - ExportedTableProvider(provider).schema() +unsafe extern "C" fn schema_fn_wrapper(provider: &FFI_TableProvider) -> WrappedSchema { + WrappedSchema(ExportedTableProvider(provider).schema()) } unsafe extern "C" fn table_type_fn_wrapper( - provider: *const FFI_TableProvider, + provider: &FFI_TableProvider, ) -> FFI_TableType { ExportedTableProvider(provider).table_type() } unsafe extern "C" fn supports_filters_pushdown_fn_wrapper( - provider: *const FFI_TableProvider, - filter_buff_size: c_uint, - filter_buffer: *const u8, - num_filters: &mut c_int, - out: &mut *const FFI_TableProviderFilterPushDown, -) -> c_int { - let results = ExportedTableProvider(provider) - .supports_filters_pushdown(filter_buff_size, filter_buffer); - - match results { - Ok((num_pushdowns, pushdowns_ptr)) => { - *num_filters = num_pushdowns as c_int; - std::ptr::copy(addr_of!(pushdowns_ptr), out, 1); - 0 - } - Err(_e) => 1, - } + provider: &FFI_TableProvider, + filters_serialized: RVec, +) -> RResult, RString> { + ExportedTableProvider(provider) + .supports_filters_pushdown(&filters_serialized) + .map_err(|e| e.to_string().into()) + .into() } unsafe extern "C" fn scan_fn_wrapper( - provider: *const FFI_TableProvider, - session_config: *const FFI_SessionConfig, - n_projections: c_uint, - projections: *const c_uint, - filter_buffer_size: c_uint, - filter_buffer: *const u8, - limit: c_int, - err_code: *mut c_int, -) -> FFI_ExecutionPlan { - let config = match ForeignSessionConfig::new(session_config) { - Ok(c) => c, - Err(_) => { - *err_code = 1; - return FFI_ExecutionPlan::empty(); - } - }; - let session = SessionStateBuilder::new() - .with_default_features() - .with_config(config.0) - .build(); - let ctx = SessionContext::new_with_state(session); + provider: &FFI_TableProvider, + session_config: &FFI_SessionConfig, + projections: RVec, + filters_serialized: RVec, + limit: ROption, +) -> FfiFuture> { + let private_data = provider.private_data as *mut ProviderPrivateData; + let internal_provider = &(*private_data).provider; + let session_config = session_config.clone(); + + async move { + let config = match ForeignSessionConfig::new(&session_config) { + Ok(c) => c, + Err(e) => return RResult::RErr(e.to_string().into()), + }; + let session = SessionStateBuilder::new() + .with_default_features() + .with_config(config.0) + .build(); + let ctx = SessionContext::new_with_state(session); + + let filters = match filters_serialized.is_empty() { + true => vec![], + false => { + let default_ctx = SessionContext::new(); + let codec = DefaultLogicalExtensionCodec {}; - let num_projections: usize = n_projections.try_into().unwrap_or(0); + let proto_filters = + match LogicalExprList::decode(filters_serialized.as_ref()) { + Ok(f) => f, + Err(e) => return RResult::RErr(e.to_string().into()), + }; - let projections: Vec = - std::slice::from_raw_parts(projections, num_projections) - .iter() - .filter_map(|v| (*v).try_into().ok()) - .collect(); - let maybe_projections = match projections.is_empty() { - true => None, - false => Some(&projections), - }; - - let limit = limit.try_into().ok(); - - let plan = ExportedTableProvider(provider).provider_scan( - &ctx, - maybe_projections, - filter_buffer_size, - filter_buffer, - limit, - ); - - match plan { - Ok(plan) => { - *err_code = 0; - plan - } - Err(_) => { - *err_code = 1; - 
FFI_ExecutionPlan::empty() - } - } -} + match parse_exprs(proto_filters.expr.iter(), &default_ctx, &codec) { + Ok(f) => f, + Err(e) => return RResult::RErr(e.to_string().into()), + } + } + }; -unsafe extern "C" fn clone_fn_wrapper( - provider: *const FFI_TableProvider, -) -> FFI_TableProvider { - if provider.is_null() { - return FFI_TableProvider::empty(); - } + let projections: Vec<_> = projections.into_iter().collect(); + let maybe_projections = match projections.is_empty() { + true => None, + false => Some(&projections), + }; - let private_data = (*provider).private_data as *const ProviderPrivateData; - let table_provider = unsafe { Arc::clone(&(*private_data).provider) }; - FFI_TableProvider::new(table_provider) -} + let plan = match internal_provider + .scan(&ctx.state(), maybe_projections, &filters, limit.into()) + .await + { + Ok(p) => p, + Err(e) => return RResult::RErr(e.to_string().into()), + }; -unsafe extern "C" fn release_fn_wrapper(provider: *mut FFI_TableProvider) { - if provider.is_null() { - return; + FFI_ExecutionPlan::new(plan, ctx.task_ctx()) + .map_err(|e| e.to_string().into()) + .into() } - let provider = &mut *provider; - - provider.schema = None; - provider.scan = None; - provider.table_type = None; - provider.supports_filters_pushdown = None; + .into_ffi() +} +unsafe extern "C" fn release_fn_wrapper(provider: &mut FFI_TableProvider) { let private_data = Box::from_raw(provider.private_data as *mut ProviderPrivateData); - drop(private_data); +} - provider.release = None; +unsafe extern "C" fn clone_fn_wrapper(provider: &FFI_TableProvider) -> FFI_TableProvider { + let old_private_data = provider.private_data as *const ProviderPrivateData; + + let private_data = Box::into_raw(Box::new(ProviderPrivateData { + provider: Arc::clone(&(*old_private_data).provider), + })) as *mut c_void; + + FFI_TableProvider { + schema: schema_fn_wrapper, + scan: scan_fn_wrapper, + table_type: table_type_fn_wrapper, + supports_filters_pushdown: supports_filters_pushdown_fn_wrapper, + clone: clone_fn_wrapper, + release: release_fn_wrapper, + private_data, + } } impl Drop for FFI_TableProvider { fn drop(&mut self) { - match self.release { - None => (), - Some(release) => unsafe { release(self as *mut FFI_TableProvider) }, - }; + unsafe { (self.release)(self) } } } -impl ExportedTableProvider { +impl<'a> ExportedTableProvider<'a> { fn private_data(&self) -> &ProviderPrivateData { - unsafe { &*((*self.0).private_data as *const ProviderPrivateData) } + unsafe { &*(self.0.private_data as *const ProviderPrivateData) } } fn mut_private_data(&mut self) -> &mut ProviderPrivateData { - unsafe { &mut *((*self.0).private_data as *mut ProviderPrivateData) } + unsafe { &mut *(self.0.private_data as *mut ProviderPrivateData) } } pub fn schema(&self) -> FFI_ArrowSchema { @@ -261,131 +233,51 @@ impl ExportedTableProvider { provider.table_type().into() } - pub fn provider_scan( + pub fn supports_filters_pushdown( &mut self, - ctx: &SessionContext, - projections: Option<&Vec>, - filter_buffer_size: c_uint, - filter_buffer: *const u8, - limit: Option, - ) -> Result { - let private_data = self.private_data(); - let provider = &private_data.provider; - - let filters = match filter_buffer_size > 0 { - false => vec![], - true => { - let default_ctx = SessionContext::new(); - let codec = DefaultLogicalExtensionCodec {}; - - let data = unsafe { - slice::from_raw_parts(filter_buffer, filter_buffer_size as usize) - }; + filters_serialized: &[u8], + ) -> Result> { + let default_ctx = SessionContext::new(); + let codec = 
DefaultLogicalExtensionCodec {}; - let proto_filters = LogicalExprList::decode(data) - .map_err(|e| DataFusionError::External(Box::new(e)))?; + let filters = match filters_serialized.is_empty() { + true => vec![], + false => { + let proto_filters = LogicalExprList::decode(filters_serialized) + .map_err(|e| DataFusionError::Plan(e.to_string()))?; parse_exprs(proto_filters.expr.iter(), &default_ctx, &codec)? } }; + let filters_borrowed: Vec<&Expr> = filters.iter().collect(); - let runtime = tokio::runtime::Builder::new_current_thread() - .enable_all() - .build() - .map_err(|e| { - DataFusionError::Execution(format!( - "Error getting runtime during scan(): {}", - e - )) - })?; - let plan = runtime.block_on(provider.scan( - &ctx.state(), - projections, - &filters, - limit, - ))?; - - FFI_ExecutionPlan::new(plan, ctx.task_ctx()) - } - - pub fn supports_filters_pushdown( - &mut self, - buffer_size: c_uint, - filter_buffer: *const u8, - ) -> Result<(usize, *const FFI_TableProviderFilterPushDown)> { - unsafe { - let default_ctx = SessionContext::new(); - let codec = DefaultLogicalExtensionCodec {}; - - let filters = match buffer_size > 0 { - false => vec![], - true => { - let data = slice::from_raw_parts(filter_buffer, buffer_size as usize); - - let proto_filters = LogicalExprList::decode(data) - .map_err(|e| DataFusionError::External(Box::new(e)))?; + let private_data = self.mut_private_data(); + let results: RVec<_> = private_data + .provider + .supports_filters_pushdown(&filters_borrowed)? + .iter() + .map(|v| v.into()) + .collect(); - parse_exprs(proto_filters.expr.iter(), &default_ctx, &codec)? - } - }; - let filters_borrowed: Vec<&Expr> = filters.iter().collect(); - - let private_data = self.mut_private_data(); - private_data.last_filter_pushdowns = private_data - .provider - .supports_filters_pushdown(&filters_borrowed)? - .iter() - .map(|v| v.into()) - .collect(); - - Ok(( - private_data.last_filter_pushdowns.len(), - private_data.last_filter_pushdowns.as_ptr(), - )) - } + Ok(results) } } impl FFI_TableProvider { /// Creates a new [`FFI_TableProvider`]. pub fn new(provider: Arc) -> Self { - let private_data = Box::new(ProviderPrivateData { - provider, - last_filter_pushdowns: Vec::default(), - }); + let private_data = Box::new(ProviderPrivateData { provider }); Self { - schema: Some(schema_fn_wrapper), - scan: Some(scan_fn_wrapper), - table_type: Some(table_type_fn_wrapper), - supports_filters_pushdown: Some(supports_filters_pushdown_fn_wrapper), - release: Some(release_fn_wrapper), - clone: Some(clone_fn_wrapper), + schema: schema_fn_wrapper, + scan: scan_fn_wrapper, + table_type: table_type_fn_wrapper, + supports_filters_pushdown: supports_filters_pushdown_fn_wrapper, + clone: clone_fn_wrapper, + release: release_fn_wrapper, private_data: Box::into_raw(private_data) as *mut c_void, } } - - /// Create a FFI_TableProvider from a raw pointer - /// - /// # Safety - /// - /// This function assumes the raw pointer is valid and takes onwership of it. - pub unsafe fn from_raw(raw_provider: *mut FFI_TableProvider) -> Self { - std::ptr::replace(raw_provider, Self::empty()) - } - - /// Creates a new empty [FFI_ArrowArrayStream]. Used to import from the C Stream Interface. 
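Filters cross the interface as protobuf bytes, which the code above decodes with `datafusion-proto`. The full encode/decode cycle, independent of the FFI structs, looks roughly like the following sketch.

use datafusion::error::Result;
use datafusion::logical_expr::Expr;
use datafusion::prelude::{col, lit, SessionContext};
use datafusion_proto::logical_plan::{
    from_proto::parse_exprs, to_proto::serialize_exprs, DefaultLogicalExtensionCodec,
};
use datafusion_proto::protobuf::LogicalExprList;
use prost::Message;

fn main() -> Result<()> {
    let codec = DefaultLogicalExtensionCodec {};
    let filters: Vec<Expr> = vec![col("a").gt(lit(10))];

    // Sending side: serialize the filter expressions into a byte buffer.
    let filter_list = LogicalExprList {
        expr: serialize_exprs(&filters, &codec)?,
    };
    let bytes = filter_list.encode_to_vec();

    // Receiving side: decode the buffer back into Expr values.
    let ctx = SessionContext::new();
    let proto_filters = LogicalExprList::decode(bytes.as_slice())
        .map_err(|e| datafusion::error::DataFusionError::Plan(e.to_string()))?;
    let round_tripped = parse_exprs(proto_filters.expr.iter(), &ctx, &codec)?;

    assert_eq!(filters, round_tripped);
    Ok(())
}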
- pub fn empty() -> Self { - Self { - schema: None, - scan: None, - table_type: None, - supports_filters_pushdown: None, - release: None, - clone: None, - private_data: std::ptr::null_mut(), - } - } } /// This wrapper struct exists on the reciever side of the FFI interface, so it has @@ -399,8 +291,14 @@ unsafe impl Send for ForeignTableProvider {} unsafe impl Sync for ForeignTableProvider {} impl ForeignTableProvider { - pub fn new(provider: FFI_TableProvider) -> Self { - Self(provider) + pub fn new(provider: &FFI_TableProvider) -> Self { + Self(provider.clone()) + } +} + +impl Clone for FFI_TableProvider { + fn clone(&self) -> Self { + unsafe { (self.clone)(self) } } } @@ -412,23 +310,14 @@ impl TableProvider for ForeignTableProvider { fn schema(&self) -> SchemaRef { let schema = unsafe { - let schema_fn = self.0.schema; - schema_fn - .map(|func| func(&self.0)) - .and_then(|s| Schema::try_from(&s).ok()) - .unwrap_or(Schema::empty()) + let wrapped_schema = (self.0.schema)(&self.0); + Schema::try_from(&wrapped_schema.0).unwrap_or(Schema::empty()) }; Arc::new(schema) } fn table_type(&self) -> TableType { - unsafe { - let table_type_fn = self.0.table_type; - table_type_fn - .map(|func| func(&self.0)) - .unwrap_or(FFI_TableType::Base) - .into() - } + unsafe { (self.0.table_type)(&self.0).into() } } async fn scan( @@ -438,51 +327,35 @@ impl TableProvider for ForeignTableProvider { filters: &[Expr], limit: Option, ) -> Result> { - let scan_fn = self.0.scan.ok_or(DataFusionError::NotImplemented( - "Scan not defined on FFI_TableProvider".to_string(), - ))?; - let session_config = FFI_SessionConfig::new(session.config()); - let n_projections = projection.map(|p| p.len()).unwrap_or(0) as c_uint; - let projections: Vec = projection - .map(|p| p.iter().map(|v| *v as c_uint).collect()) - .unwrap_or_default(); - let projections_ptr = projections.as_ptr(); + let projections: Option> = + projection.map(|p| p.iter().map(|v| v.to_owned()).collect()); let codec = DefaultLogicalExtensionCodec {}; let filter_list = LogicalExprList { expr: serialize_exprs(filters, &codec)?, }; - let filter_bytes = filter_list.encode_to_vec(); - let filter_buffer_size = filter_bytes.len() as c_uint; - let filter_buffer = filter_bytes.as_ptr(); - - let limit = match limit { - Some(l) => l as c_int, - None => -1, - }; + let filters_serialized = filter_list.encode_to_vec().into(); - let mut err_code = 0; let plan = unsafe { - let plan_ptr = scan_fn( + let maybe_plan = (self.0.scan)( &self.0, &session_config, - n_projections, - projections_ptr, - filter_buffer_size, - filter_buffer, - limit, - &mut err_code, - ); - - if 0 != err_code { - return Err(datafusion::error::DataFusionError::Internal( - "Unable to perform scan via FFI".to_string(), - )); + projections.unwrap_or_default(), + filters_serialized, + limit.into(), + ) + .await; + + match maybe_plan { + RResult::ROk(p) => ForeignExecutionPlan::new(p)?, + RResult::RErr(_) => { + return Err(datafusion::error::DataFusionError::Internal( + "Unable to perform scan via FFI".to_string(), + )) + } } - - ForeignExecutionPlan::new(plan_ptr)? 
}; Ok(Arc::new(plan)) @@ -500,35 +373,15 @@ impl TableProvider for ForeignTableProvider { let expr_list = LogicalExprList { expr: serialize_exprs(filter.iter().map(|f| f.to_owned()), &codec)?, }; - let expr_bytes = expr_list.encode_to_vec(); - let buffer_size = expr_bytes.len(); - let buffer = expr_bytes.as_ptr(); - - let pushdown_fn = - self.0 - .supports_filters_pushdown - .ok_or(DataFusionError::Plan( - "FFI_TableProvider does not implement supports_filters_pushdown" - .to_string(), - ))?; - let mut num_return = 0; - let mut pushdowns = null(); - let err_code = pushdown_fn( - &self.0, - buffer_size as c_uint, - buffer, - &mut num_return, - &mut pushdowns, - ); - let num_return = num_return as usize; - let pushdowns_slice = slice::from_raw_parts(pushdowns, num_return); - - match err_code { - 0 => Ok(pushdowns_slice.iter().map(|v| v.into()).collect()), - _ => Err(DataFusionError::Plan( - "Error occurred during FFI call to supports_filters_pushdown" - .to_string(), - )), + let serialized_filters = expr_list.encode_to_vec(); + + let pushdown_fn = self.0.supports_filters_pushdown; + + let pushdowns = pushdown_fn(&self.0, serialized_filters.into()); + + match pushdowns { + RResult::ROk(p) => Ok(p.iter().map(|v| v.into()).collect()), + RResult::RErr(e) => Err(DataFusionError::Plan(e.to_string())), } } } @@ -568,7 +421,7 @@ mod tests { let ffi_provider = FFI_TableProvider::new(provider); - let foreign_table_provider = ForeignTableProvider::new(ffi_provider); + let foreign_table_provider = ForeignTableProvider::new(&ffi_provider); ctx.register_table("t", Arc::new(foreign_table_provider))?; diff --git a/datafusion/ffi/src/table_source.rs b/datafusion/ffi/src/table_source.rs index 03774b88948a..200b1b94c3fd 100644 --- a/datafusion/ffi/src/table_source.rs +++ b/datafusion/ffi/src/table_source.rs @@ -15,10 +15,12 @@ // specific language governing permissions and limitations // under the License. +use abi_stable::StableAbi; use datafusion::{datasource::TableType, logical_expr::TableProviderFilterPushDown}; // TODO Should we just define TableProviderFilterPushDown as repr(C)? #[repr(C)] +#[derive(StableAbi)] #[allow(non_camel_case_types)] pub enum FFI_TableProviderFilterPushDown { Unsupported, @@ -57,7 +59,7 @@ impl From<&TableProviderFilterPushDown> for FFI_TableProviderFilterPushDown { // TODO Should we just define FFI_TableType as repr(C)? #[repr(C)] #[allow(non_camel_case_types)] -#[derive(Debug, Clone, Copy, PartialEq, Eq)] +#[derive(Debug, Clone, Copy, PartialEq, Eq, StableAbi)] pub enum FFI_TableType { Base, View, From ff4d2e444f014a3773628a30d95dabff1683f894 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Wed, 16 Oct 2024 19:56:16 -0400 Subject: [PATCH 13/28] Make consistent the use of Foreign and FFI on struct names --- datafusion/ffi/README.md | 22 +++--- datafusion/ffi/src/execution_plan.rs | 79 +++++++------------ datafusion/ffi/src/table_provider.rs | 109 +++++++++++---------------- 3 files changed, 84 insertions(+), 126 deletions(-) diff --git a/datafusion/ffi/README.md b/datafusion/ffi/README.md index a8b4cb2b766a..e2af803c4c11 100644 --- a/datafusion/ffi/README.md +++ b/datafusion/ffi/README.md @@ -43,30 +43,34 @@ In this crate we have a variety of structs which closely mimic the behavior of their internal counterparts. In the following example, we will refer to the `TableProvider`, but the same pattern exists for other structs. -Each of the exposted structs in this crate is provided with two variants that -are to be used based on which side of the interface you are viewing it from. 
-The structs that begin with `Exported` are designed to be used by the side that -provides the functions. The receiver of these structs begins with `Foreign`. +Each of the exposted structs in this crate is provided with a variant prefixed +with `Foreign`. This variant is designed to be used by the consumer of the +foreign code. The `Foreign` structs should *never* access the `private_data` +fields. Instead they should only access the data returned through the function +calls defined on the `FFI_` structs. The second purpose of the `Foreign` +structs is to contain additional data that may be needed by the traits that +are implemented on them. Some of these traits require borrowing data which +can be far more convienent to be locally stored. For example, we have a struct `FFI_TableProvider` to give access to the `TableProvider` functions like `table_type()` and `scan()`. If we write a library that wishes to expose it's `TableProvider`, then we can access the private data that contains the Arc reference to the `TableProvider` via -`ExportedTableProvider`. This data is local to the library. +`FFI_TableProvider`. This data is local to the library. -If we have a program that accesses a `TableProvider` provided via FFI, then it +If we have a program that accesses a `TableProvider` via FFI, then it will use `ForeignTableProvider`. When using `ForeignTableProvider` we **must** not attempt to access the `private_data` field in `FFI_TableProvider`. If a user is testing locally, you may be able to successfully access this field, but it will only work if you are building against the exact same version of -`DataFusion` for both libraries **and** the same compiler. It is not guaranteed -to work in general. +`DataFusion` for both libraries **and** the same compiler. It will not work +in general. It is worth noting that which library is the `local` and which is `foreign` depends on which interface we are considering. For example, suppose we have a Python library called `my_provider` that exposes a `TableProvider` called `MyProvider` via `FFI_TableProvider`. Within the library `my_provider` we can -access the `private_data` via `ExportedTableProvider`. We connect this to +access the `private_data` via `FFI_TableProvider`. We connect this to `datafusion-python`, where we access it as a `ForeignTableProvider`. Now when we call `scan()` on this interface, we have to pass it a `FFI_SessionConfig`. The `SessionConfig` is local to `datafusion-python` and **not** `my_provider`. diff --git a/datafusion/ffi/src/execution_plan.rs b/datafusion/ffi/src/execution_plan.rs index 6dc3bb3da554..421308ba0dee 100644 --- a/datafusion/ffi/src/execution_plan.rs +++ b/datafusion/ffi/src/execution_plan.rs @@ -15,11 +15,7 @@ // specific language governing permissions and limitations // under the License. -use std::{ - ffi::{c_void, CString}, - pin::Pin, - sync::Arc, -}; +use std::{ffi::c_void, pin::Pin, sync::Arc}; use abi_stable::{ std_types::{RResult, RStr, RString, RVec}, @@ -71,9 +67,7 @@ unsafe impl Sync for FFI_ExecutionPlan {} pub struct ExecutionPlanPrivateData { pub plan: Arc, - pub children: Vec, pub context: Arc, - pub name: CString, } unsafe extern "C" fn properties_fn_wrapper( @@ -147,9 +141,32 @@ impl Clone for FFI_ExecutionPlan { } } -// Since the trait ExecutionPlan requires borrowed values, we wrap our FFI. -// This struct exists on the consumer side (datafusion-python, for example) and not -// in the provider's side. +impl FFI_ExecutionPlan { + /// This function is called on the provider's side. 
+ pub fn new(plan: Arc, context: Arc) -> Result { + let private_data = Box::new(ExecutionPlanPrivateData { plan, context }); + + Ok(Self { + properties: properties_fn_wrapper, + children: children_fn_wrapper, + name: name_fn_wrapper, + execute: execute_fn_wrapper, + clone: clone_fn_wrapper, + release: release_fn_wrapper, + private_data: Box::into_raw(private_data) as *mut c_void, + }) + } +} + +impl Drop for FFI_ExecutionPlan { + fn drop(&mut self) { + unsafe { (self.release)(self) } + } +} + +/// The ForeignExecutionPlan is to be used by the caller of the plan, so it has +/// no knowledge or access to the private data. All interaction with the plan +/// must occur through the functions defined in FFI_ExecutionPlan. #[derive(Debug)] pub struct ForeignExecutionPlan { name: String, @@ -175,48 +192,6 @@ impl DisplayAs for ForeignExecutionPlan { } } -impl FFI_ExecutionPlan { - /// This function is called on the provider's side. - pub fn new(plan: Arc, context: Arc) -> Result { - let maybe_children: Result> = plan - .children() - .into_iter() - .map(|child| FFI_ExecutionPlan::new(Arc::clone(child), Arc::clone(&context))) - .collect(); - let children = maybe_children?; - - let name = CString::new(plan.name()).map_err(|e| { - DataFusionError::Plan(format!( - "Unable to convert name to CString in FFI_ExecutionPlan: {}", - e - )) - })?; - - let private_data = Box::new(ExecutionPlanPrivateData { - plan, - children, - context, - name, - }); - - Ok(Self { - properties: properties_fn_wrapper, - children: children_fn_wrapper, - name: name_fn_wrapper, - execute: execute_fn_wrapper, - clone: clone_fn_wrapper, - release: release_fn_wrapper, - private_data: Box::into_raw(private_data) as *mut c_void, - }) - } -} - -impl Drop for FFI_ExecutionPlan { - fn drop(&mut self) { - unsafe { (self.release)(self) } - } -} - impl ForeignExecutionPlan { /// Takes ownership of a FFI_ExecutionPlan /// diff --git a/datafusion/ffi/src/table_provider.rs b/datafusion/ffi/src/table_provider.rs index e68bb00fbc69..0b8150594a30 100644 --- a/datafusion/ffi/src/table_provider.rs +++ b/datafusion/ffi/src/table_provider.rs @@ -91,27 +91,62 @@ struct ProviderPrivateData { provider: Arc, } -/// Wrapper struct to provide access functions from the FFI interface to the underlying -/// TableProvider. This struct is allowed to access `private_data` because it lives on -/// the provider's side of the FFI interace. -struct ExportedTableProvider<'a>(&'a FFI_TableProvider); - unsafe extern "C" fn schema_fn_wrapper(provider: &FFI_TableProvider) -> WrappedSchema { - WrappedSchema(ExportedTableProvider(provider).schema()) + let private_data = provider.private_data as *const ProviderPrivateData; + let provider = &(*private_data).provider; + + // This does silently fail because TableProvider does not return a result. + // It expects schema to always pass. 
+ let ffi_schema = FFI_ArrowSchema::try_from(provider.schema().as_ref()) + .unwrap_or(FFI_ArrowSchema::empty()); + + WrappedSchema(ffi_schema) } unsafe extern "C" fn table_type_fn_wrapper( provider: &FFI_TableProvider, ) -> FFI_TableType { - ExportedTableProvider(provider).table_type() + let private_data = provider.private_data as *const ProviderPrivateData; + let provider = &(*private_data).provider; + + provider.table_type().into() +} + +fn supports_filters_pushdown_internal( + provider: &Arc, + filters_serialized: &[u8], +) -> Result> { + let default_ctx = SessionContext::new(); + let codec = DefaultLogicalExtensionCodec {}; + + let filters = match filters_serialized.is_empty() { + true => vec![], + false => { + let proto_filters = LogicalExprList::decode(filters_serialized) + .map_err(|e| DataFusionError::Plan(e.to_string()))?; + + parse_exprs(proto_filters.expr.iter(), &default_ctx, &codec)? + } + }; + let filters_borrowed: Vec<&Expr> = filters.iter().collect(); + + let results: RVec<_> = provider + .supports_filters_pushdown(&filters_borrowed)? + .iter() + .map(|v| v.into()) + .collect(); + + Ok(results) } unsafe extern "C" fn supports_filters_pushdown_fn_wrapper( provider: &FFI_TableProvider, filters_serialized: RVec, ) -> RResult, RString> { - ExportedTableProvider(provider) - .supports_filters_pushdown(&filters_serialized) + let private_data = provider.private_data as *const ProviderPrivateData; + let provider = &(*private_data).provider; + + supports_filters_pushdown_internal(provider, &filters_serialized) .map_err(|e| e.to_string().into()) .into() } @@ -207,62 +242,6 @@ impl Drop for FFI_TableProvider { } } -impl<'a> ExportedTableProvider<'a> { - fn private_data(&self) -> &ProviderPrivateData { - unsafe { &*(self.0.private_data as *const ProviderPrivateData) } - } - - fn mut_private_data(&mut self) -> &mut ProviderPrivateData { - unsafe { &mut *(self.0.private_data as *mut ProviderPrivateData) } - } - - pub fn schema(&self) -> FFI_ArrowSchema { - let private_data = self.private_data(); - let provider = &private_data.provider; - - // This does silently fail because TableProvider does not return a result - // so we expect it to always pass. - FFI_ArrowSchema::try_from(provider.schema().as_ref()) - .unwrap_or(FFI_ArrowSchema::empty()) - } - - pub fn table_type(&self) -> FFI_TableType { - let private_data = self.private_data(); - let provider = &private_data.provider; - - provider.table_type().into() - } - - pub fn supports_filters_pushdown( - &mut self, - filters_serialized: &[u8], - ) -> Result> { - let default_ctx = SessionContext::new(); - let codec = DefaultLogicalExtensionCodec {}; - - let filters = match filters_serialized.is_empty() { - true => vec![], - false => { - let proto_filters = LogicalExprList::decode(filters_serialized) - .map_err(|e| DataFusionError::Plan(e.to_string()))?; - - parse_exprs(proto_filters.expr.iter(), &default_ctx, &codec)? - } - }; - let filters_borrowed: Vec<&Expr> = filters.iter().collect(); - - let private_data = self.mut_private_data(); - let results: RVec<_> = private_data - .provider - .supports_filters_pushdown(&filters_borrowed)? - .iter() - .map(|v| v.into()) - .collect(); - - Ok(results) - } -} - impl FFI_TableProvider { /// Creates a new [`FFI_TableProvider`]. 
pub fn new(provider: Arc) -> Self { From 1dbca58fb8922b7fd7fa169b2fefa18c0bf8b01b Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Wed, 16 Oct 2024 20:00:44 -0400 Subject: [PATCH 14/28] Apply prettier --- datafusion/ffi/README.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/datafusion/ffi/README.md b/datafusion/ffi/README.md index e2af803c4c11..ba4bb8b961a1 100644 --- a/datafusion/ffi/README.md +++ b/datafusion/ffi/README.md @@ -31,11 +31,11 @@ repository, but many other use cases may exist. We envision at least two use cases. 1. `datafusion-python` which will use the FFI to provide external services such -as a `TableProvider` without needing to re-export the entire `datafusion-python` -code base. With `datafusion-ffi` these packages do not need `datafusion-python` -as a dependency at all. + as a `TableProvider` without needing to re-export the entire `datafusion-python` + code base. With `datafusion-ffi` these packages do not need `datafusion-python` + as a dependency at all. 2. Users may want to create a modular interface that allows runtime loading of -libraries. + libraries. ## Struct Layout @@ -45,7 +45,7 @@ their internal counterparts. In the following example, we will refer to the Each of the exposted structs in this crate is provided with a variant prefixed with `Foreign`. This variant is designed to be used by the consumer of the -foreign code. The `Foreign` structs should *never* access the `private_data` +foreign code. The `Foreign` structs should _never_ access the `private_data` fields. Instead they should only access the data returned through the function calls defined on the `FFI_` structs. The second purpose of the `Foreign` structs is to contain additional data that may be needed by the traits that From 7761c84632fb7d0d30c73619e0ecbcf128813144 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Wed, 16 Oct 2024 20:33:15 -0400 Subject: [PATCH 15/28] Format for linter --- datafusion/ffi/Cargo.toml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/datafusion/ffi/Cargo.toml b/datafusion/ffi/Cargo.toml index 227cd3c7ee61..469844b32eac 100644 --- a/datafusion/ffi/Cargo.toml +++ b/datafusion/ffi/Cargo.toml @@ -36,12 +36,12 @@ name = "datafusion_ffi" path = "src/lib.rs" [dependencies] +abi_stable = "0.11.3" arrow = { workspace = true, features = ["ffi"] } +async-ffi = { version = "0.5.0", features = ["abi_stable"] } +async-trait = { workspace = true } datafusion = { workspace = true, default-features = true } datafusion-proto = { workspace = true } futures = { workspace = true } -async-trait = { workspace = true } -tokio = { workspace = true } prost = { workspace = true } -abi_stable = "0.11.3" -async-ffi = { version = "0.5.0", features = ["abi_stable"] } +tokio = { workspace = true } From d0f8f8802954a2f7c7dcc3d2ec1f4e39d29033f4 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Wed, 16 Oct 2024 20:34:44 -0400 Subject: [PATCH 16/28] Add doc-comment --- datafusion/ffi/Cargo.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/datafusion/ffi/Cargo.toml b/datafusion/ffi/Cargo.toml index 469844b32eac..9e9204dcba32 100644 --- a/datafusion/ffi/Cargo.toml +++ b/datafusion/ffi/Cargo.toml @@ -42,6 +42,7 @@ async-ffi = { version = "0.5.0", features = ["abi_stable"] } async-trait = { workspace = true } datafusion = { workspace = true, default-features = true } datafusion-proto = { workspace = true } +doc-comment = { workspace = true } futures = { workspace = true } prost = { workspace = true } tokio = { workspace = true } From 
6b5227ed5de8372f9ed9b24a9d4ea9abff6f473f Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Mon, 21 Oct 2024 13:47:47 -0400 Subject: [PATCH 17/28] Add option to specify table provider does not support pushdown filters to avoid extra work for some providers --- datafusion/ffi/src/table_provider.rs | 35 ++++++++++++++++++++-------- 1 file changed, 25 insertions(+), 10 deletions(-) diff --git a/datafusion/ffi/src/table_provider.rs b/datafusion/ffi/src/table_provider.rs index 0b8150594a30..cbfad269e667 100644 --- a/datafusion/ffi/src/table_provider.rs +++ b/datafusion/ffi/src/table_provider.rs @@ -72,12 +72,13 @@ pub struct FFI_TableProvider { pub table_type: unsafe extern "C" fn(provider: &Self) -> FFI_TableType, - pub supports_filters_pushdown: + pub supports_filters_pushdown: Option< unsafe extern "C" fn( - provider: &Self, + provider: &FFI_TableProvider, filters_serialized: RVec, ) -> RResult, RString>, + >, pub clone: unsafe extern "C" fn(provider: &Self) -> Self, pub release: unsafe extern "C" fn(arg: &mut Self), @@ -229,7 +230,7 @@ unsafe extern "C" fn clone_fn_wrapper(provider: &FFI_TableProvider) -> FFI_Table schema: schema_fn_wrapper, scan: scan_fn_wrapper, table_type: table_type_fn_wrapper, - supports_filters_pushdown: supports_filters_pushdown_fn_wrapper, + supports_filters_pushdown: provider.supports_filters_pushdown, clone: clone_fn_wrapper, release: release_fn_wrapper, private_data, @@ -244,14 +245,20 @@ impl Drop for FFI_TableProvider { impl FFI_TableProvider { /// Creates a new [`FFI_TableProvider`]. - pub fn new(provider: Arc) -> Self { + pub fn new( + provider: Arc, + can_support_pushdown_filters: bool, + ) -> Self { let private_data = Box::new(ProviderPrivateData { provider }); Self { schema: schema_fn_wrapper, scan: scan_fn_wrapper, table_type: table_type_fn_wrapper, - supports_filters_pushdown: supports_filters_pushdown_fn_wrapper, + supports_filters_pushdown: match can_support_pushdown_filters { + true => Some(supports_filters_pushdown_fn_wrapper), + false => None, + }, clone: clone_fn_wrapper, release: release_fn_wrapper, private_data: Box::into_raw(private_data) as *mut c_void, @@ -344,18 +351,26 @@ impl TableProvider for ForeignTableProvider { /// to optimise data retrieval. 
fn supports_filters_pushdown( &self, - filter: &[&Expr], + filters: &[&Expr], ) -> Result> { unsafe { + let pushdown_fn = match self.0.supports_filters_pushdown { + Some(func) => func, + None => { + return Ok(vec![ + TableProviderFilterPushDown::Unsupported; + filters.len() + ]) + } + }; + let codec = DefaultLogicalExtensionCodec {}; let expr_list = LogicalExprList { - expr: serialize_exprs(filter.iter().map(|f| f.to_owned()), &codec)?, + expr: serialize_exprs(filters.iter().map(|f| f.to_owned()), &codec)?, }; let serialized_filters = expr_list.encode_to_vec(); - let pushdown_fn = self.0.supports_filters_pushdown; - let pushdowns = pushdown_fn(&self.0, serialized_filters.into()); match pushdowns { @@ -398,7 +413,7 @@ mod tests { let provider = Arc::new(MemTable::try_new(schema, vec![vec![batch1], vec![batch2]])?); - let ffi_provider = FFI_TableProvider::new(provider); + let ffi_provider = FFI_TableProvider::new(provider, true); let foreign_table_provider = ForeignTableProvider::new(&ffi_provider); From 400f45a27f1bee3f2f92fed8d537cdbbcc85cc7c Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Mon, 21 Oct 2024 13:52:13 -0400 Subject: [PATCH 18/28] Remove setting default features in cargo file --- datafusion/ffi/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/ffi/Cargo.toml b/datafusion/ffi/Cargo.toml index 9e9204dcba32..2a589fd7dae1 100644 --- a/datafusion/ffi/Cargo.toml +++ b/datafusion/ffi/Cargo.toml @@ -40,7 +40,7 @@ abi_stable = "0.11.3" arrow = { workspace = true, features = ["ffi"] } async-ffi = { version = "0.5.0", features = ["abi_stable"] } async-trait = { workspace = true } -datafusion = { workspace = true, default-features = true } +datafusion = { workspace = true } datafusion-proto = { workspace = true } doc-comment = { workspace = true } futures = { workspace = true } From 8b220cdb227439e27efdc5b3e91b7681b3c602a0 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Mon, 21 Oct 2024 13:54:10 -0400 Subject: [PATCH 19/28] Tokio only needed for unit tests --- datafusion/ffi/Cargo.toml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/datafusion/ffi/Cargo.toml b/datafusion/ffi/Cargo.toml index 2a589fd7dae1..716b8f8a4cef 100644 --- a/datafusion/ffi/Cargo.toml +++ b/datafusion/ffi/Cargo.toml @@ -45,4 +45,6 @@ datafusion-proto = { workspace = true } doc-comment = { workspace = true } futures = { workspace = true } prost = { workspace = true } -tokio = { workspace = true } + +[dev-dependencies] +tokio = { workspace = true } \ No newline at end of file From 8d0f86dcfc1e106aab75aa905ae8a0a4c4b4897a Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Mon, 21 Oct 2024 14:55:30 -0400 Subject: [PATCH 20/28] Provide log errors rather than failing silently on schema requests --- datafusion/ffi/Cargo.toml | 1 + datafusion/ffi/src/execution_plan.rs | 51 +++++++---------------- datafusion/ffi/src/plan_properties.rs | 48 +++++++++++++++++---- datafusion/ffi/src/record_batch_stream.rs | 21 +++------- datafusion/ffi/src/table_provider.rs | 24 +++-------- 5 files changed, 68 insertions(+), 77 deletions(-) diff --git a/datafusion/ffi/Cargo.toml b/datafusion/ffi/Cargo.toml index 716b8f8a4cef..6322e88dc7b9 100644 --- a/datafusion/ffi/Cargo.toml +++ b/datafusion/ffi/Cargo.toml @@ -44,6 +44,7 @@ datafusion = { workspace = true } datafusion-proto = { workspace = true } doc-comment = { workspace = true } futures = { workspace = true } +log = { workspace = true } prost = { workspace = true } [dev-dependencies] diff --git a/datafusion/ffi/src/execution_plan.rs 
b/datafusion/ffi/src/execution_plan.rs index 421308ba0dee..cb6597652ee8 100644 --- a/datafusion/ffi/src/execution_plan.rs +++ b/datafusion/ffi/src/execution_plan.rs @@ -18,7 +18,7 @@ use std::{ffi::c_void, pin::Pin, sync::Arc}; use abi_stable::{ - std_types::{RResult, RStr, RString, RVec}, + std_types::{RResult, RString, RVec}, StableAbi, }; use arrow::ffi_stream::FFI_ArrowArrayStream; @@ -45,10 +45,7 @@ pub struct WrappedArrayStream(#[sabi(unsafe_opaque_field)] pub FFI_ArrowArrayStr pub struct FFI_ExecutionPlan { pub properties: unsafe extern "C" fn(plan: &Self) -> FFI_PlanProperties, - pub children: unsafe extern "C" fn( - plan: &Self, - ) - -> RResult, RStr<'static>>, + pub children: unsafe extern "C" fn(plan: &Self) -> RVec, pub name: unsafe extern "C" fn(plan: &Self) -> RString, @@ -81,23 +78,18 @@ unsafe extern "C" fn properties_fn_wrapper( unsafe extern "C" fn children_fn_wrapper( plan: &FFI_ExecutionPlan, -) -> RResult, RStr<'static>> { +) -> RVec { let private_data = plan.private_data as *const ExecutionPlanPrivateData; let plan = &(*private_data).plan; let ctx = &(*private_data).context; - let maybe_children: Result> = plan + let children: Vec<_> = plan .children() .into_iter() .map(|child| FFI_ExecutionPlan::new(Arc::clone(child), Arc::clone(ctx))) .collect(); - match maybe_children { - Ok(c) => RResult::ROk(c.into()), - Err(_) => RResult::RErr( - "Error occurred during collection of FFI_ExecutionPlan children".into(), - ), - } + children.into() } unsafe extern "C" fn execute_fn_wrapper( @@ -132,7 +124,6 @@ unsafe extern "C" fn clone_fn_wrapper(plan: &FFI_ExecutionPlan) -> FFI_Execution let plan_data = &(*private_data); FFI_ExecutionPlan::new(Arc::clone(&plan_data.plan), Arc::clone(&plan_data.context)) - .unwrap() } impl Clone for FFI_ExecutionPlan { @@ -143,10 +134,10 @@ impl Clone for FFI_ExecutionPlan { impl FFI_ExecutionPlan { /// This function is called on the provider's side. - pub fn new(plan: Arc, context: Arc) -> Result { + pub fn new(plan: Arc, context: Arc) -> Self { let private_data = Box::new(ExecutionPlanPrivateData { plan, context }); - Ok(Self { + Self { properties: properties_fn_wrapper, children: children_fn_wrapper, name: name_fn_wrapper, @@ -154,7 +145,7 @@ impl FFI_ExecutionPlan { clone: clone_fn_wrapper, release: release_fn_wrapper, private_data: Box::into_raw(private_data) as *mut c_void, - }) + } } } @@ -204,28 +195,18 @@ impl ForeignExecutionPlan { let properties = ForeignPlanProperties::new((plan.properties)(&plan))?; - let children = match (plan.children)(&plan) { - RResult::ROk(children_rvec) => { - let maybe_children: Result> = children_rvec - .iter() - .map(|child| ForeignExecutionPlan::new(child.clone())) - .collect(); - - maybe_children? - .into_iter() - .map(|child| Arc::new(child) as Arc) - .collect() - } - RResult::RErr(e) => { - return Err(DataFusionError::Execution(e.to_string())); - } - }; + let children_rvec = (plan.children)(&plan); + let children: Result> = children_rvec + .iter() + .map(|child| ForeignExecutionPlan::new(child.clone())) + .map(|child| child.map(|c| Arc::new(c) as Arc)) + .collect(); Ok(Self { name, plan, properties: properties.0, - children, + children: children?, }) } } @@ -362,7 +343,7 @@ mod tests { let original_plan = Arc::new(EmptyExec::new(schema)); let original_name = original_plan.name().to_string(); - let local_plan = FFI_ExecutionPlan::new(original_plan, ctx.task_ctx())?; + let local_plan = FFI_ExecutionPlan::new(original_plan, ctx.task_ctx()); let foreign_plan = unsafe { ForeignExecutionPlan::new(local_plan)? 
}; diff --git a/datafusion/ffi/src/plan_properties.rs b/datafusion/ffi/src/plan_properties.rs index 8b65944963bc..f1734a778008 100644 --- a/datafusion/ffi/src/plan_properties.rs +++ b/datafusion/ffi/src/plan_properties.rs @@ -24,7 +24,10 @@ use abi_stable::{ }, StableAbi, }; -use arrow::ffi::FFI_ArrowSchema; +use arrow::{ + datatypes::{Schema, SchemaRef}, + ffi::FFI_ArrowSchema, +}; use datafusion::{ error::{DataFusionError, Result}, physical_expr::EquivalenceProperties, @@ -39,6 +42,7 @@ use datafusion_proto::{ }, protobuf::{Partitioning, PhysicalSortExprNodeCollection}, }; +use log::error; use prost::Message; // TODO: should we just make ExecutionMode repr(C)? @@ -75,6 +79,33 @@ impl From for ExecutionMode { #[derive(Debug, StableAbi)] pub struct WrappedSchema(#[sabi(unsafe_opaque_field)] pub FFI_ArrowSchema); +impl From for WrappedSchema { + fn from(value: SchemaRef) -> Self { + let ffi_schema = match FFI_ArrowSchema::try_from(value.as_ref()) { + Ok(s) => s, + Err(e) => { + error!("Unable to convert DataFusion Schema to FFI_ArrowSchema in FFI_PlanProperties. {}", e); + FFI_ArrowSchema::empty() + } + }; + + WrappedSchema(ffi_schema) + } +} + +impl From for SchemaRef { + fn from(value: WrappedSchema) -> Self { + let schema = match Schema::try_from(&value.0) { + Ok(s) => s, + Err(e) => { + error!("Unable to convert from FFI_ArrowSchema to DataFusion Schema in FFI_PlanProperties. {}", e); + Schema::empty() + } + }; + Arc::new(schema) + } +} + #[repr(C)] #[derive(Debug, StableAbi)] #[allow(missing_docs)] @@ -165,10 +196,8 @@ unsafe extern "C" fn schema_fn_wrapper(properties: &FFI_PlanProperties) -> Wrapp let private_data = properties.private_data as *const PlanPropertiesPrivateData; let props = &(*private_data).props; - let schema = props.eq_properties.schema(); - WrappedSchema( - FFI_ArrowSchema::try_from(schema.as_ref()).unwrap_or(FFI_ArrowSchema::empty()), - ) + let schema: SchemaRef = Arc::clone(props.eq_properties.schema()); + schema.into() } unsafe extern "C" fn release_fn_wrapper(props: &mut FFI_PlanProperties) { @@ -245,10 +274,13 @@ impl ForeignPlanProperties { &schema, &codex, )? - .unwrap() + .ok_or(DataFusionError::Plan( + "Unable to deserialize partitioning protobuf in FFI_PlanProperties" + .to_string(), + )) } - RErr(e) => return Err(DataFusionError::Plan(e.to_string())), - }; + RErr(e) => Err(DataFusionError::Plan(e.to_string())), + }?; let execution_mode: ExecutionMode = (ffi_props.execution_mode)(&ffi_props).into(); diff --git a/datafusion/ffi/src/record_batch_stream.rs b/datafusion/ffi/src/record_batch_stream.rs index e160f112603b..e6dd2c21a889 100644 --- a/datafusion/ffi/src/record_batch_stream.rs +++ b/datafusion/ffi/src/record_batch_stream.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. 
-use std::{ffi::c_void, sync::Arc, task::Poll}; +use std::{ffi::c_void, task::Poll}; use abi_stable::{ std_types::{ROption, RResult, RString}, @@ -24,8 +24,7 @@ use abi_stable::{ use arrow::array::{Array, RecordBatch}; use arrow::{ array::{make_array, StructArray}, - datatypes::{Schema, SchemaRef}, - ffi::{from_ffi, to_ffi, FFI_ArrowArray, FFI_ArrowSchema}, + ffi::{from_ffi, to_ffi, FFI_ArrowArray}, }; use async_ffi::{ContextExt, FfiContext, FfiPoll}; use datafusion::error::Result; @@ -35,6 +34,8 @@ use datafusion::{ }; use futures::{Stream, TryStreamExt}; +use crate::plan_properties::WrappedSchema; + #[repr(C)] #[derive(Debug, StableAbi)] pub struct WrappedArray { @@ -44,17 +45,6 @@ pub struct WrappedArray { schema: WrappedSchema, } -#[repr(C)] -#[derive(Debug, StableAbi)] -pub struct WrappedSchema(#[sabi(unsafe_opaque_field)] FFI_ArrowSchema); - -impl From for WrappedSchema { - fn from(value: SchemaRef) -> Self { - let schema = FFI_ArrowSchema::try_from(value.as_ref()); - WrappedSchema(schema.unwrap_or(FFI_ArrowSchema::empty())) - } -} - #[repr(C)] #[derive(Debug, StableAbi)] #[allow(missing_docs)] @@ -133,8 +123,7 @@ unsafe extern "C" fn poll_next_fn_wrapper( impl RecordBatchStream for FFI_RecordBatchStream { fn schema(&self) -> arrow::datatypes::SchemaRef { let wrapped_schema = unsafe { (self.schema)(self) }; - let schema = Schema::try_from(&wrapped_schema.0); - Arc::new(schema.unwrap_or(Schema::empty())) + wrapped_schema.into() } } diff --git a/datafusion/ffi/src/table_provider.rs b/datafusion/ffi/src/table_provider.rs index cbfad269e667..015e5d79cd22 100644 --- a/datafusion/ffi/src/table_provider.rs +++ b/datafusion/ffi/src/table_provider.rs @@ -21,10 +21,7 @@ use abi_stable::{ std_types::{ROption, RResult, RString, RVec}, StableAbi, }; -use arrow::{ - datatypes::{Schema, SchemaRef}, - ffi::FFI_ArrowSchema, -}; +use arrow::datatypes::SchemaRef; use async_ffi::{FfiFuture, FutureExt}; use async_trait::async_trait; use datafusion::{ @@ -96,12 +93,7 @@ unsafe extern "C" fn schema_fn_wrapper(provider: &FFI_TableProvider) -> WrappedS let private_data = provider.private_data as *const ProviderPrivateData; let provider = &(*private_data).provider; - // This does silently fail because TableProvider does not return a result. - // It expects schema to always pass. 
- let ffi_schema = FFI_ArrowSchema::try_from(provider.schema().as_ref()) - .unwrap_or(FFI_ArrowSchema::empty()); - - WrappedSchema(ffi_schema) + provider.schema().into() } unsafe extern "C" fn table_type_fn_wrapper( @@ -207,9 +199,7 @@ unsafe extern "C" fn scan_fn_wrapper( Err(e) => return RResult::RErr(e.to_string().into()), }; - FFI_ExecutionPlan::new(plan, ctx.task_ctx()) - .map_err(|e| e.to_string().into()) - .into() + RResult::ROk(FFI_ExecutionPlan::new(plan, ctx.task_ctx())) } .into_ffi() } @@ -295,11 +285,8 @@ impl TableProvider for ForeignTableProvider { } fn schema(&self) -> SchemaRef { - let schema = unsafe { - let wrapped_schema = (self.0.schema)(&self.0); - Schema::try_from(&wrapped_schema.0).unwrap_or(Schema::empty()) - }; - Arc::new(schema) + let wrapped_schema = unsafe { (self.0.schema)(&self.0) }; + wrapped_schema.into() } fn table_type(&self) -> TableType { @@ -383,6 +370,7 @@ impl TableProvider for ForeignTableProvider { #[cfg(test)] mod tests { + use arrow::datatypes::Schema; use datafusion::prelude::{col, lit}; use super::*; From 9c01f7526b25183e806baefec71ec8c412d22e01 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Mon, 21 Oct 2024 14:58:30 -0400 Subject: [PATCH 21/28] Set default features for datafusion to false in ffi crate --- datafusion/ffi/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/ffi/Cargo.toml b/datafusion/ffi/Cargo.toml index 6322e88dc7b9..153909e3344d 100644 --- a/datafusion/ffi/Cargo.toml +++ b/datafusion/ffi/Cargo.toml @@ -40,7 +40,7 @@ abi_stable = "0.11.3" arrow = { workspace = true, features = ["ffi"] } async-ffi = { version = "0.5.0", features = ["abi_stable"] } async-trait = { workspace = true } -datafusion = { workspace = true } +datafusion = { workspace = true, default-features = false } datafusion-proto = { workspace = true } doc-comment = { workspace = true } futures = { workspace = true } From 61f44aed2db579ad05fe81fb7e0eed491dcae344 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Sat, 26 Oct 2024 09:01:33 -0400 Subject: [PATCH 22/28] Using TryFrom or From instead of implementing new when there is only one parameter --- datafusion/ffi/src/execution_plan.rs | 55 +++++++++++------------ datafusion/ffi/src/plan_properties.rs | 43 ++++++++---------- datafusion/ffi/src/record_batch_stream.rs | 4 +- datafusion/ffi/src/session_config.rs | 27 +++++------ datafusion/ffi/src/table_provider.rs | 12 ++--- 5 files changed, 62 insertions(+), 79 deletions(-) diff --git a/datafusion/ffi/src/execution_plan.rs b/datafusion/ffi/src/execution_plan.rs index cb6597652ee8..9ab0bbe162f7 100644 --- a/datafusion/ffi/src/execution_plan.rs +++ b/datafusion/ffi/src/execution_plan.rs @@ -30,8 +30,7 @@ use datafusion::{ }; use crate::{ - plan_properties::{FFI_PlanProperties, ForeignPlanProperties}, - record_batch_stream::FFI_RecordBatchStream, + plan_properties::FFI_PlanProperties, record_batch_stream::FFI_RecordBatchStream, }; #[repr(C)] @@ -73,7 +72,7 @@ unsafe extern "C" fn properties_fn_wrapper( let private_data = plan.private_data as *const ExecutionPlanPrivateData; let plan = &(*private_data).plan; - FFI_PlanProperties::new(plan.properties()) + plan.properties().into() } unsafe extern "C" fn children_fn_wrapper( @@ -101,7 +100,7 @@ unsafe extern "C" fn execute_fn_wrapper( let ctx = &(*private_data).context; match plan.execute(partition, Arc::clone(ctx)) { - Ok(rbs) => RResult::ROk(FFI_RecordBatchStream::new(rbs)), + Ok(rbs) => RResult::ROk(rbs.into()), Err(e) => RResult::RErr( format!("Error occurred during FFI_ExecutionPlan 
execute: {}", e).into(), ), @@ -183,31 +182,29 @@ impl DisplayAs for ForeignExecutionPlan { } } -impl ForeignExecutionPlan { - /// Takes ownership of a FFI_ExecutionPlan - /// - /// # Safety - /// - /// The caller must ensure the pointer provided points to a valid implementation - /// of FFI_ExecutionPlan - pub unsafe fn new(plan: FFI_ExecutionPlan) -> Result { - let name = (plan.name)(&plan).into(); +impl TryFrom<&FFI_ExecutionPlan> for ForeignExecutionPlan { + type Error = DataFusionError; - let properties = ForeignPlanProperties::new((plan.properties)(&plan))?; - - let children_rvec = (plan.children)(&plan); - let children: Result> = children_rvec - .iter() - .map(|child| ForeignExecutionPlan::new(child.clone())) - .map(|child| child.map(|c| Arc::new(c) as Arc)) - .collect(); - - Ok(Self { - name, - plan, - properties: properties.0, - children: children?, - }) + fn try_from(plan: &FFI_ExecutionPlan) -> Result { + unsafe { + let name = (plan.name)(plan).into(); + + let properties: PlanProperties = (plan.properties)(plan).try_into()?; + + let children_rvec = (plan.children)(plan); + let children: Result> = children_rvec + .iter() + .map(ForeignExecutionPlan::try_from) + .map(|child| child.map(|c| Arc::new(c) as Arc)) + .collect(); + + Ok(Self { + name, + plan: plan.clone(), + properties, + children: children?, + }) + } } } @@ -345,7 +342,7 @@ mod tests { let local_plan = FFI_ExecutionPlan::new(original_plan, ctx.task_ctx()); - let foreign_plan = unsafe { ForeignExecutionPlan::new(local_plan)? }; + let foreign_plan: ForeignExecutionPlan = (&local_plan).try_into()?; assert!(original_name == foreign_plan.name()); diff --git a/datafusion/ffi/src/plan_properties.rs b/datafusion/ffi/src/plan_properties.rs index f1734a778008..77b14e20eb75 100644 --- a/datafusion/ffi/src/plan_properties.rs +++ b/datafusion/ffi/src/plan_properties.rs @@ -212,8 +212,8 @@ impl Drop for FFI_PlanProperties { } } -impl FFI_PlanProperties { - pub fn new(props: &PlanProperties) -> Self { +impl From<&PlanProperties> for FFI_PlanProperties { + fn from(props: &PlanProperties) -> Self { let private_data = Box::new(PlanPropertiesPrivateData { props: props.clone(), }); @@ -229,26 +229,19 @@ impl FFI_PlanProperties { } } -#[derive(Debug)] -pub struct ForeignPlanProperties(pub PlanProperties); - -impl ForeignPlanProperties { - /// Construct a ForeignPlanProperties object from a FFI Plan Properties. - /// - /// # Safety - /// - /// This function will call the unsafe interfaces on FFI_PlanProperties - /// provided, so the user must ensure it remains valid for the lifetime - /// of the returned struct. 
- pub unsafe fn new(ffi_props: FFI_PlanProperties) -> Result { - let ffi_schema = (ffi_props.schema)(&ffi_props); +impl TryFrom for PlanProperties { + type Error = DataFusionError; + + fn try_from(ffi_props: FFI_PlanProperties) -> Result { + let ffi_schema = unsafe { (ffi_props.schema)(&ffi_props) }; let schema = (&ffi_schema.0).try_into()?; // TODO Extend FFI to get the registry and codex let default_ctx = SessionContext::new(); let codex = DefaultPhysicalExtensionCodec {}; - let orderings = match (ffi_props.output_ordering)(&ffi_props) { + let ffi_orderings = unsafe { (ffi_props.output_ordering)(&ffi_props) }; + let orderings = match ffi_orderings { ROk(ordering_vec) => { let proto_output_ordering = PhysicalSortExprNodeCollection::decode(ordering_vec.as_ref()) @@ -263,7 +256,8 @@ impl ForeignPlanProperties { RErr(e) => return Err(DataFusionError::Plan(e.to_string())), }; - let partitioning = match (ffi_props.output_partitioning)(&ffi_props) { + let ffi_partitioning = unsafe { (ffi_props.output_partitioning)(&ffi_props) }; + let partitioning = match ffi_partitioning { ROk(partitioning_vec) => { let proto_output_partitioning = Partitioning::decode(partitioning_vec.as_ref()) @@ -282,7 +276,8 @@ impl ForeignPlanProperties { RErr(e) => Err(DataFusionError::Plan(e.to_string())), }?; - let execution_mode: ExecutionMode = (ffi_props.execution_mode)(&ffi_props).into(); + let execution_mode: ExecutionMode = + unsafe { (ffi_props.execution_mode)(&ffi_props).into() }; let eq_properties = match orderings { Some(ordering) => { @@ -291,11 +286,11 @@ impl ForeignPlanProperties { None => EquivalenceProperties::new(Arc::new(schema)), }; - Ok(Self(PlanProperties::new( + Ok(PlanProperties::new( eq_properties, partitioning, execution_mode, - ))) + )) } } @@ -317,13 +312,11 @@ mod tests { ExecutionMode::Unbounded, ); - let local_props_ptr = FFI_PlanProperties::new(&original_props); - - let foreign_props = unsafe { ForeignPlanProperties::new(local_props_ptr)? 
}; + let local_props_ptr = FFI_PlanProperties::from(&original_props); - let returned_props: PlanProperties = foreign_props.0; + let foreign_props: PlanProperties = local_props_ptr.try_into()?; - assert!(format!("{:?}", returned_props) == format!("{:?}", original_props)); + assert!(format!("{:?}", foreign_props) == format!("{:?}", original_props)); Ok(()) } diff --git a/datafusion/ffi/src/record_batch_stream.rs b/datafusion/ffi/src/record_batch_stream.rs index e6dd2c21a889..2e250a992cb8 100644 --- a/datafusion/ffi/src/record_batch_stream.rs +++ b/datafusion/ffi/src/record_batch_stream.rs @@ -61,8 +61,8 @@ pub struct FFI_RecordBatchStream { pub private_data: *mut c_void, } -impl FFI_RecordBatchStream { - pub fn new(stream: SendableRecordBatchStream) -> Self { +impl From for FFI_RecordBatchStream { + fn from(stream: SendableRecordBatchStream) -> Self { FFI_RecordBatchStream { poll_next: poll_next_fn_wrapper, schema: schema_fn_wrapper, diff --git a/datafusion/ffi/src/session_config.rs b/datafusion/ffi/src/session_config.rs index f2571ceb9163..86499f44c8cc 100644 --- a/datafusion/ffi/src/session_config.rs +++ b/datafusion/ffi/src/session_config.rs @@ -24,8 +24,8 @@ use abi_stable::{ std_types::{RHashMap, RString}, StableAbi, }; -use datafusion::prelude::SessionConfig; use datafusion::{config::ConfigOptions, error::Result}; +use datafusion::{error::DataFusionError, prelude::SessionConfig}; #[repr(C)] #[derive(Debug, StableAbi)] @@ -84,9 +84,8 @@ struct SessionConfigPrivateData { pub config: ConfigOptions, } -impl FFI_SessionConfig { - /// Creates a new [`FFI_SessionConfig`]. - pub fn new(session: &SessionConfig) -> Self { +impl From<&SessionConfig> for FFI_SessionConfig { + fn from(session: &SessionConfig) -> Self { let mut config_keys = Vec::new(); let mut config_values = Vec::new(); for config_entry in session.options().entries() { @@ -128,17 +127,11 @@ impl Drop for FFI_SessionConfig { pub struct ForeignSessionConfig(pub SessionConfig); -impl ForeignSessionConfig { - /// Create a session config object from a foreign provider. - /// - /// # Safety - /// - /// This function will dereference the provided config pointer and will - /// access it's unsafe methods. It is the provider's responsibility that - /// this pointer and it's internal functions remain valid for the lifetime - /// of the returned struct. - pub unsafe fn new(config: &FFI_SessionConfig) -> Result { - let config_options = (config.config_options)(config); +impl TryFrom<&FFI_SessionConfig> for ForeignSessionConfig { + type Error = DataFusionError; + + fn try_from(config: &FFI_SessionConfig) -> Result { + let config_options = unsafe { (config.config_options)(config) }; let mut options_map = HashMap::new(); config_options.iter().for_each(|kv_pair| { @@ -158,9 +151,9 @@ mod tests { let session_config = SessionConfig::new(); let original_options = session_config.options().entries(); - let ffi_config = FFI_SessionConfig::new(&session_config); + let ffi_config: FFI_SessionConfig = (&session_config).into(); - let foreign_config = unsafe { ForeignSessionConfig::new(&ffi_config)? 
}; + let foreign_config: ForeignSessionConfig = (&ffi_config).try_into()?; let returned_options = foreign_config.0.options().entries(); diff --git a/datafusion/ffi/src/table_provider.rs b/datafusion/ffi/src/table_provider.rs index 015e5d79cd22..6a697a56d636 100644 --- a/datafusion/ffi/src/table_provider.rs +++ b/datafusion/ffi/src/table_provider.rs @@ -156,7 +156,7 @@ unsafe extern "C" fn scan_fn_wrapper( let session_config = session_config.clone(); async move { - let config = match ForeignSessionConfig::new(&session_config) { + let config = match ForeignSessionConfig::try_from(&session_config) { Ok(c) => c, Err(e) => return RResult::RErr(e.to_string().into()), }; @@ -266,8 +266,8 @@ pub struct ForeignTableProvider(FFI_TableProvider); unsafe impl Send for ForeignTableProvider {} unsafe impl Sync for ForeignTableProvider {} -impl ForeignTableProvider { - pub fn new(provider: &FFI_TableProvider) -> Self { +impl From<&FFI_TableProvider> for ForeignTableProvider { + fn from(provider: &FFI_TableProvider) -> Self { Self(provider.clone()) } } @@ -300,7 +300,7 @@ impl TableProvider for ForeignTableProvider { filters: &[Expr], limit: Option, ) -> Result> { - let session_config = FFI_SessionConfig::new(session.config()); + let session_config: FFI_SessionConfig = session.config().into(); let projections: Option> = projection.map(|p| p.iter().map(|v| v.to_owned()).collect()); @@ -322,7 +322,7 @@ impl TableProvider for ForeignTableProvider { .await; match maybe_plan { - RResult::ROk(p) => ForeignExecutionPlan::new(p)?, + RResult::ROk(p) => ForeignExecutionPlan::try_from(&p)?, RResult::RErr(_) => { return Err(datafusion::error::DataFusionError::Internal( "Unable to perform scan via FFI".to_string(), @@ -403,7 +403,7 @@ mod tests { let ffi_provider = FFI_TableProvider::new(provider, true); - let foreign_table_provider = ForeignTableProvider::new(&ffi_provider); + let foreign_table_provider: ForeignTableProvider = (&ffi_provider).into(); ctx.register_table("t", Arc::new(foreign_table_provider))?; From 1576520ed9e0d9ef670fc7972c33b0f6bfaecdf7 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Sat, 26 Oct 2024 10:36:55 -0400 Subject: [PATCH 23/28] Move arrow wrappers into their own file --- datafusion/ffi/src/arrow_wrappers.rs | 53 +++++++++++++++++++++++ datafusion/ffi/src/execution_plan.rs | 22 +++++++--- datafusion/ffi/src/lib.rs | 1 + datafusion/ffi/src/plan_properties.rs | 39 ++--------------- datafusion/ffi/src/record_batch_stream.rs | 13 +----- datafusion/ffi/src/table_provider.rs | 2 +- 6 files changed, 76 insertions(+), 54 deletions(-) create mode 100644 datafusion/ffi/src/arrow_wrappers.rs diff --git a/datafusion/ffi/src/arrow_wrappers.rs b/datafusion/ffi/src/arrow_wrappers.rs new file mode 100644 index 000000000000..2772e9d19867 --- /dev/null +++ b/datafusion/ffi/src/arrow_wrappers.rs @@ -0,0 +1,53 @@ +use std::sync::Arc; + +use abi_stable::StableAbi; +use arrow::{ + datatypes::{Schema, SchemaRef}, + ffi::{FFI_ArrowArray, FFI_ArrowSchema}, +}; +use log::error; + +/// This is a wrapper struct around FFI_ArrowSchema simply to indicate +/// to the StableAbi macros that the underlying struct is FFI safe. +#[repr(C)] +#[derive(Debug, StableAbi)] +pub struct WrappedSchema(#[sabi(unsafe_opaque_field)] pub FFI_ArrowSchema); + +impl From for WrappedSchema { + fn from(value: SchemaRef) -> Self { + let ffi_schema = match FFI_ArrowSchema::try_from(value.as_ref()) { + Ok(s) => s, + Err(e) => { + error!("Unable to convert DataFusion Schema to FFI_ArrowSchema in FFI_PlanProperties. 
{}", e); + FFI_ArrowSchema::empty() + } + }; + + WrappedSchema(ffi_schema) + } +} + +impl From for SchemaRef { + fn from(value: WrappedSchema) -> Self { + let schema = match Schema::try_from(&value.0) { + Ok(s) => s, + Err(e) => { + error!("Unable to convert from FFI_ArrowSchema to DataFusion Schema in FFI_PlanProperties. {}", e); + Schema::empty() + } + }; + Arc::new(schema) + } +} + +/// This is a wrapper struct for FFI_ArrowArray to indicate to StableAbi +/// that the struct is FFI Safe. For convenience, we also include the +/// schema needed to create a record batch from the array. +#[repr(C)] +#[derive(Debug, StableAbi)] +pub struct WrappedArray { + #[sabi(unsafe_opaque_field)] + pub array: FFI_ArrowArray, + + pub schema: WrappedSchema, +} diff --git a/datafusion/ffi/src/execution_plan.rs b/datafusion/ffi/src/execution_plan.rs index 9ab0bbe162f7..38cf2103bd72 100644 --- a/datafusion/ffi/src/execution_plan.rs +++ b/datafusion/ffi/src/execution_plan.rs @@ -21,7 +21,6 @@ use abi_stable::{ std_types::{RResult, RString, RVec}, StableAbi, }; -use arrow::ffi_stream::FFI_ArrowArrayStream; use datafusion::error::Result; use datafusion::{ error::DataFusionError, @@ -33,28 +32,36 @@ use crate::{ plan_properties::FFI_PlanProperties, record_batch_stream::FFI_RecordBatchStream, }; +/// A stable struct for sharing a [`ExecutionPlan`] across FFI boundaries. #[repr(C)] #[derive(Debug, StableAbi)] -pub struct WrappedArrayStream(#[sabi(unsafe_opaque_field)] pub FFI_ArrowArrayStream); - -#[repr(C)] -#[derive(Debug, StableAbi)] -#[allow(missing_docs)] #[allow(non_camel_case_types)] pub struct FFI_ExecutionPlan { + /// Return the plan properties pub properties: unsafe extern "C" fn(plan: &Self) -> FFI_PlanProperties, + /// Return a vector of children plans pub children: unsafe extern "C" fn(plan: &Self) -> RVec, + /// Return the plan name. pub name: unsafe extern "C" fn(plan: &Self) -> RString, + /// Execute the plan and return a record batch stream. Errors + /// will be returned as a string. pub execute: unsafe extern "C" fn( plan: &Self, partition: usize, ) -> RResult, + /// Used to create a clone on the provider of the execution plan. This should + /// only need to be called by the receiver of the plan. pub clone: unsafe extern "C" fn(plan: &Self) -> Self, + + /// Release the memory of the private data when it is no longer being used. pub release: unsafe extern "C" fn(arg: &mut Self), + + /// Internal data. This is only to be accessed by the provider of the plan. + /// A [`ForeignExecutionPlan`] should never attempt to access this data. pub private_data: *mut c_void, } @@ -154,6 +161,9 @@ impl Drop for FFI_ExecutionPlan { } } +/// This struct is used to access an execution plan provided by a foreign +/// library across a FFI boundary. +/// /// The ForeignExecutionPlan is to be used by the caller of the plan, so it has /// no knowledge or access to the private data. All interaction with the plan /// must occur through the functions defined in FFI_ExecutionPlan. 
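
A minimal sketch of how the two sides of this boundary interact, mirroring the unit test in `execution_plan.rs`: the provider wraps its native plan in `FFI_ExecutionPlan`, and the consumer rebuilds a `ForeignExecutionPlan` that only goes through the function pointers documented above. The `EmptyExec` plan and the module paths follow the code in this series; everything else is illustrative.

```rust
use std::sync::Arc;

use arrow::datatypes::Schema;
use datafusion::error::Result;
use datafusion::physical_plan::{empty::EmptyExec, ExecutionPlan};
use datafusion::prelude::SessionContext;
use datafusion_ffi::execution_plan::{FFI_ExecutionPlan, ForeignExecutionPlan};

fn execution_plan_round_trip() -> Result<()> {
    let ctx = SessionContext::new();
    let schema = Arc::new(Schema::empty());

    // Provider side: wrap a native plan together with a TaskContext so the
    // FFI struct can execute partitions later on.
    let provider_plan: Arc<dyn ExecutionPlan> = Arc::new(EmptyExec::new(schema));
    let ffi_plan = FFI_ExecutionPlan::new(Arc::clone(&provider_plan), ctx.task_ctx());

    // Consumer side: rebuild a ForeignExecutionPlan. It never reads
    // `private_data`; it only calls the name/properties/children/execute
    // function pointers.
    let foreign_plan: ForeignExecutionPlan = (&ffi_plan).try_into()?;
    assert_eq!(provider_plan.name(), foreign_plan.name());

    Ok(())
}
```

Because `TryFrom<&FFI_ExecutionPlan>` stores its own clone of the FFI struct, the provider's original `FFI_ExecutionPlan` can be dropped without invalidating the foreign plan; each copy releases only its own private data.
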
diff --git a/datafusion/ffi/src/lib.rs b/datafusion/ffi/src/lib.rs index 72d3bfba1b9f..4a74e65dc671 100644 --- a/datafusion/ffi/src/lib.rs +++ b/datafusion/ffi/src/lib.rs @@ -17,6 +17,7 @@ // Make cheap clones clear: https://github.com/apache/datafusion/issues/11143 #![deny(clippy::clone_on_ref_ptr)] +pub mod arrow_wrappers; pub mod execution_plan; pub mod plan_properties; pub mod record_batch_stream; diff --git a/datafusion/ffi/src/plan_properties.rs b/datafusion/ffi/src/plan_properties.rs index 77b14e20eb75..c6a8d2064fb9 100644 --- a/datafusion/ffi/src/plan_properties.rs +++ b/datafusion/ffi/src/plan_properties.rs @@ -24,10 +24,7 @@ use abi_stable::{ }, StableAbi, }; -use arrow::{ - datatypes::{Schema, SchemaRef}, - ffi::FFI_ArrowSchema, -}; +use arrow::datatypes::SchemaRef; use datafusion::{ error::{DataFusionError, Result}, physical_expr::EquivalenceProperties, @@ -42,9 +39,10 @@ use datafusion_proto::{ }, protobuf::{Partitioning, PhysicalSortExprNodeCollection}, }; -use log::error; use prost::Message; +use crate::arrow_wrappers::WrappedSchema; + // TODO: should we just make ExecutionMode repr(C)? #[repr(C)] #[allow(non_camel_case_types)] @@ -75,37 +73,6 @@ impl From for ExecutionMode { } } -#[repr(C)] -#[derive(Debug, StableAbi)] -pub struct WrappedSchema(#[sabi(unsafe_opaque_field)] pub FFI_ArrowSchema); - -impl From for WrappedSchema { - fn from(value: SchemaRef) -> Self { - let ffi_schema = match FFI_ArrowSchema::try_from(value.as_ref()) { - Ok(s) => s, - Err(e) => { - error!("Unable to convert DataFusion Schema to FFI_ArrowSchema in FFI_PlanProperties. {}", e); - FFI_ArrowSchema::empty() - } - }; - - WrappedSchema(ffi_schema) - } -} - -impl From for SchemaRef { - fn from(value: WrappedSchema) -> Self { - let schema = match Schema::try_from(&value.0) { - Ok(s) => s, - Err(e) => { - error!("Unable to convert from FFI_ArrowSchema to DataFusion Schema in FFI_PlanProperties. 
{}", e); - Schema::empty() - } - }; - Arc::new(schema) - } -} - #[repr(C)] #[derive(Debug, StableAbi)] #[allow(missing_docs)] diff --git a/datafusion/ffi/src/record_batch_stream.rs b/datafusion/ffi/src/record_batch_stream.rs index 2e250a992cb8..bdc5a3d24d15 100644 --- a/datafusion/ffi/src/record_batch_stream.rs +++ b/datafusion/ffi/src/record_batch_stream.rs @@ -24,7 +24,7 @@ use abi_stable::{ use arrow::array::{Array, RecordBatch}; use arrow::{ array::{make_array, StructArray}, - ffi::{from_ffi, to_ffi, FFI_ArrowArray}, + ffi::{from_ffi, to_ffi}, }; use async_ffi::{ContextExt, FfiContext, FfiPoll}; use datafusion::error::Result; @@ -34,16 +34,7 @@ use datafusion::{ }; use futures::{Stream, TryStreamExt}; -use crate::plan_properties::WrappedSchema; - -#[repr(C)] -#[derive(Debug, StableAbi)] -pub struct WrappedArray { - #[sabi(unsafe_opaque_field)] - array: FFI_ArrowArray, - - schema: WrappedSchema, -} +use crate::arrow_wrappers::{WrappedArray, WrappedSchema}; #[repr(C)] #[derive(Debug, StableAbi)] diff --git a/datafusion/ffi/src/table_provider.rs b/datafusion/ffi/src/table_provider.rs index 6a697a56d636..925b00987a46 100644 --- a/datafusion/ffi/src/table_provider.rs +++ b/datafusion/ffi/src/table_provider.rs @@ -42,7 +42,7 @@ use datafusion_proto::{ use prost::Message; use crate::{ - plan_properties::WrappedSchema, + arrow_wrappers::WrappedSchema, session_config::ForeignSessionConfig, table_source::{FFI_TableProviderFilterPushDown, FFI_TableType}, }; From 790f454f957eb4975879cb5291739d0adca72285 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Sat, 26 Oct 2024 13:34:43 -0400 Subject: [PATCH 24/28] Add documentation --- datafusion/ffi/src/plan_properties.rs | 73 +++++++++++++---------- datafusion/ffi/src/record_batch_stream.rs | 8 ++- datafusion/ffi/src/session_config.rs | 26 +++++++- datafusion/ffi/src/table_provider.rs | 28 ++++++++- datafusion/ffi/src/table_source.rs | 4 +- 5 files changed, 98 insertions(+), 41 deletions(-) diff --git a/datafusion/ffi/src/plan_properties.rs b/datafusion/ffi/src/plan_properties.rs index c6a8d2064fb9..722681ae4a1d 100644 --- a/datafusion/ffi/src/plan_properties.rs +++ b/datafusion/ffi/src/plan_properties.rs @@ -43,55 +43,32 @@ use prost::Message; use crate::arrow_wrappers::WrappedSchema; -// TODO: should we just make ExecutionMode repr(C)? -#[repr(C)] -#[allow(non_camel_case_types)] -#[derive(Clone, StableAbi)] -pub enum FFI_ExecutionMode { - Bounded, - Unbounded, - PipelineBreaking, -} - -impl From for FFI_ExecutionMode { - fn from(value: ExecutionMode) -> Self { - match value { - ExecutionMode::Bounded => FFI_ExecutionMode::Bounded, - ExecutionMode::Unbounded => FFI_ExecutionMode::Unbounded, - ExecutionMode::PipelineBreaking => FFI_ExecutionMode::PipelineBreaking, - } - } -} - -impl From for ExecutionMode { - fn from(value: FFI_ExecutionMode) -> Self { - match value { - FFI_ExecutionMode::Bounded => ExecutionMode::Bounded, - FFI_ExecutionMode::Unbounded => ExecutionMode::Unbounded, - FFI_ExecutionMode::PipelineBreaking => ExecutionMode::PipelineBreaking, - } - } -} - +/// A stable struct for sharing [`PlanProperties`] across FFI boundaries. #[repr(C)] #[derive(Debug, StableAbi)] -#[allow(missing_docs)] #[allow(non_camel_case_types)] pub struct FFI_PlanProperties { - // Returns protobuf serialized bytes of the partitioning + /// The output partitioning is a [`Partitioning`] protobuf message serialized + /// into bytes to pass across the FFI boundary. 
pub output_partitioning: unsafe extern "C" fn(plan: &Self) -> RResult, RStr<'static>>, + /// Return the execution mode of the plan. pub execution_mode: unsafe extern "C" fn(plan: &Self) -> FFI_ExecutionMode, - // PhysicalSortExprNodeCollection proto + /// The output ordering is a [`PhysicalSortExprNodeCollection`] protobuf message + /// serialized into bytes to pass across the FFI boundary. pub output_ordering: unsafe extern "C" fn(plan: &Self) -> RResult, RStr<'static>>, + /// Return the schema of the plan. pub schema: unsafe extern "C" fn(plan: &Self) -> WrappedSchema, + /// Release the memory of the private data when it is no longer being used. pub release: unsafe extern "C" fn(arg: &mut Self), + /// Internal data. This is only to be accessed by the provider of the plan. + /// The foreign library should never attempt to access this data. pub private_data: *mut c_void, } @@ -261,6 +238,36 @@ impl TryFrom for PlanProperties { } } +/// FFI safe version of [`ExecutionMode`]. +#[repr(C)] +#[allow(non_camel_case_types)] +#[derive(Clone, StableAbi)] +pub enum FFI_ExecutionMode { + Bounded, + Unbounded, + PipelineBreaking, +} + +impl From for FFI_ExecutionMode { + fn from(value: ExecutionMode) -> Self { + match value { + ExecutionMode::Bounded => FFI_ExecutionMode::Bounded, + ExecutionMode::Unbounded => FFI_ExecutionMode::Unbounded, + ExecutionMode::PipelineBreaking => FFI_ExecutionMode::PipelineBreaking, + } + } +} + +impl From for ExecutionMode { + fn from(value: FFI_ExecutionMode) -> Self { + match value { + FFI_ExecutionMode::Bounded => ExecutionMode::Bounded, + FFI_ExecutionMode::Unbounded => ExecutionMode::Unbounded, + FFI_ExecutionMode::PipelineBreaking => ExecutionMode::PipelineBreaking, + } + } +} + #[cfg(test)] mod tests { use datafusion::physical_plan::Partitioning; diff --git a/datafusion/ffi/src/record_batch_stream.rs b/datafusion/ffi/src/record_batch_stream.rs index bdc5a3d24d15..c944e56c5cde 100644 --- a/datafusion/ffi/src/record_batch_stream.rs +++ b/datafusion/ffi/src/record_batch_stream.rs @@ -36,19 +36,25 @@ use futures::{Stream, TryStreamExt}; use crate::arrow_wrappers::{WrappedArray, WrappedSchema}; +/// A stable struct for sharing [`RecordBatchStream`] across FFI boundaries. +/// We use the async-ffi crate for handling async calls across libraries. #[repr(C)] #[derive(Debug, StableAbi)] -#[allow(missing_docs)] #[allow(non_camel_case_types)] pub struct FFI_RecordBatchStream { + /// This mirrors the `poll_next` of [`RecordBatchStream`] but does so + /// in a FFI safe manner. pub poll_next: unsafe extern "C" fn( stream: &Self, cx: &mut FfiContext, ) -> FfiPoll>>, + /// Return the schema of the record batch pub schema: unsafe extern "C" fn(stream: &Self) -> WrappedSchema, + /// Internal data. This is only to be accessed by the provider of the plan. + /// The foreign library should never attempt to access this data. pub private_data: *mut c_void, } diff --git a/datafusion/ffi/src/session_config.rs b/datafusion/ffi/src/session_config.rs index 86499f44c8cc..f5204b9385f6 100644 --- a/datafusion/ffi/src/session_config.rs +++ b/datafusion/ffi/src/session_config.rs @@ -27,16 +27,35 @@ use abi_stable::{ use datafusion::{config::ConfigOptions, error::Result}; use datafusion::{error::DataFusionError, prelude::SessionConfig}; +/// A stable struct for sharing [`SessionConfig`] across FFI boundaries. 
+/// Instead of attempting to expose the entire SessionConfig interface, we +/// convert the config options into a map from a string to string and pass +/// those values across the FFI boundary. On the receiver side, we +/// reconstruct a SessionConfig from those values. +/// +/// It is possible that using different versions of DataFusion across the +/// FFI boundary could have differing expectations of the config options. +/// This is a limitation of this approach, but exposing the entire +/// SessionConfig via a FFI interface would be extensive and provide limited +/// value over this version. #[repr(C)] #[derive(Debug, StableAbi)] -#[allow(missing_docs)] #[allow(non_camel_case_types)] pub struct FFI_SessionConfig { + /// Return a hash map from key to value of the config options represented + /// by string values. pub config_options: unsafe extern "C" fn(config: &Self) -> RHashMap, + /// Used to create a clone on the provider of the execution plan. This should + /// only need to be called by the receiver of the plan. + pub clone: unsafe extern "C" fn(plan: &Self) -> Self, + + /// Release the memory of the private data when it is no longer being used. + pub release: unsafe extern "C" fn(arg: &mut Self), + + /// Internal data. This is only to be accessed by the provider of the plan. + /// A [`ForeignSessionConfig`] should never attempt to access this data. pub private_data: *mut c_void, - pub clone: unsafe extern "C" fn(&Self) -> Self, - pub release: unsafe extern "C" fn(config: &mut Self), } unsafe impl Send for FFI_SessionConfig {} @@ -125,6 +144,7 @@ impl Drop for FFI_SessionConfig { } } +/// A wrapper struct for accessing [`SessionConfig`] across a FFI boundary pub struct ForeignSessionConfig(pub SessionConfig); impl TryFrom<&FFI_SessionConfig> for ForeignSessionConfig { diff --git a/datafusion/ffi/src/table_provider.rs b/datafusion/ffi/src/table_provider.rs index 925b00987a46..1a94a971e670 100644 --- a/datafusion/ffi/src/table_provider.rs +++ b/datafusion/ffi/src/table_provider.rs @@ -53,12 +53,25 @@ use super::{ }; use datafusion::error::Result; -/// A stable interface for creating a DataFusion TableProvider. +/// A stable struct for sharing [`TableProvider`] across FFI boundaries. #[repr(C)] #[derive(Debug, StableAbi)] #[allow(non_camel_case_types)] pub struct FFI_TableProvider { + /// Return the table schema pub schema: unsafe extern "C" fn(provider: &Self) -> WrappedSchema, + + /// Perform a scan on the table. See [`TableProvider`] for detailed usage information. + /// + /// # Arguments + /// + /// * `provider` - the table provider + /// * `session_config` - session configuration + /// * `projections` - if specified, only a subset of the columns are returned + /// * `filters_serialized` - filters to apply to the scan, which are a + /// [`LogicalExprList`] protobuf message serialized into bytes to pass + /// across the FFI boundary. + /// * `limit` - if specified, limit the number of rows returned pub scan: unsafe extern "C" fn( provider: &Self, session_config: &FFI_SessionConfig, @@ -67,8 +80,12 @@ pub struct FFI_TableProvider { limit: ROption, ) -> FfiFuture>, + /// Return the type of table. See [`TableType`] for options. pub table_type: unsafe extern "C" fn(provider: &Self) -> FFI_TableType, + /// Based upon the input filters, identify which are supported. The filters + /// are a [`LogicalExprList`] protobuf message serialized into bytes to pass + /// across the FFI boundary. 
pub supports_filters_pushdown: Option< unsafe extern "C" fn( provider: &FFI_TableProvider, @@ -77,8 +94,15 @@ pub struct FFI_TableProvider { -> RResult, RString>, >, - pub clone: unsafe extern "C" fn(provider: &Self) -> Self, + /// Used to create a clone on the provider of the execution plan. This should + /// only need to be called by the receiver of the plan. + pub clone: unsafe extern "C" fn(plan: &Self) -> Self, + + /// Release the memory of the private data when it is no longer being used. pub release: unsafe extern "C" fn(arg: &mut Self), + + /// Internal data. This is only to be accessed by the provider of the plan. + /// A [`ForeignExecutionPlan`] should never attempt to access this data. pub private_data: *mut c_void, } diff --git a/datafusion/ffi/src/table_source.rs b/datafusion/ffi/src/table_source.rs index 200b1b94c3fd..a59836622ee6 100644 --- a/datafusion/ffi/src/table_source.rs +++ b/datafusion/ffi/src/table_source.rs @@ -18,7 +18,7 @@ use abi_stable::StableAbi; use datafusion::{datasource::TableType, logical_expr::TableProviderFilterPushDown}; -// TODO Should we just define TableProviderFilterPushDown as repr(C)? +/// FFI safe version of [`TableProviderFilterPushDown`]. #[repr(C)] #[derive(StableAbi)] #[allow(non_camel_case_types)] @@ -56,7 +56,7 @@ impl From<&TableProviderFilterPushDown> for FFI_TableProviderFilterPushDown { } } -// TODO Should we just define FFI_TableType as repr(C)? +/// FFI safe version of [`TableType`]. #[repr(C)] #[allow(non_camel_case_types)] #[derive(Debug, Clone, Copy, PartialEq, Eq, StableAbi)] From bf626f492c7504a7b63ca99fbaccd89060b7046c Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Sun, 27 Oct 2024 10:14:56 -0400 Subject: [PATCH 25/28] Small adjustment to documentation --- datafusion/ffi/src/session_config.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/datafusion/ffi/src/session_config.rs b/datafusion/ffi/src/session_config.rs index f5204b9385f6..aea03cf94e0a 100644 --- a/datafusion/ffi/src/session_config.rs +++ b/datafusion/ffi/src/session_config.rs @@ -144,7 +144,10 @@ impl Drop for FFI_SessionConfig { } } -/// A wrapper struct for accessing [`SessionConfig`] across a FFI boundary +/// A wrapper struct for accessing [`SessionConfig`] across a FFI boundary. +/// The [`SessionConfig`] will be generated from a hash map of the config +/// options in the provider and will be reconstructed on this side of the +/// interface.s pub struct ForeignSessionConfig(pub SessionConfig); impl TryFrom<&FFI_SessionConfig> for ForeignSessionConfig { From bb47819b4c6135e15f8f5029a816aab3389e4f56 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Tue, 29 Oct 2024 20:25:35 -0400 Subject: [PATCH 26/28] Add license text --- datafusion/ffi/src/arrow_wrappers.rs | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/datafusion/ffi/src/arrow_wrappers.rs b/datafusion/ffi/src/arrow_wrappers.rs index 2772e9d19867..c5add8782c51 100644 --- a/datafusion/ffi/src/arrow_wrappers.rs +++ b/datafusion/ffi/src/arrow_wrappers.rs @@ -1,3 +1,20 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + use std::sync::Arc; use abi_stable::StableAbi; From 71ae880092c559089bf6cd9b90b352e8af5b4b1f Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Tue, 29 Oct 2024 20:27:21 -0400 Subject: [PATCH 27/28] Fix unnecessary qualification --- datafusion/ffi/src/execution_plan.rs | 4 ++-- datafusion/ffi/src/table_provider.rs | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/datafusion/ffi/src/execution_plan.rs b/datafusion/ffi/src/execution_plan.rs index 38cf2103bd72..d10eda8990b8 100644 --- a/datafusion/ffi/src/execution_plan.rs +++ b/datafusion/ffi/src/execution_plan.rs @@ -227,7 +227,7 @@ impl ExecutionPlan for ForeignExecutionPlan { self } - fn properties(&self) -> &datafusion::physical_plan::PlanProperties { + fn properties(&self) -> &PlanProperties { &self.properties } @@ -241,7 +241,7 @@ impl ExecutionPlan for ForeignExecutionPlan { fn with_new_children( self: Arc, children: Vec>, - ) -> datafusion::error::Result> { + ) -> Result> { Ok(Arc::new(ForeignExecutionPlan { plan: self.plan.clone(), name: self.name.clone(), diff --git a/datafusion/ffi/src/table_provider.rs b/datafusion/ffi/src/table_provider.rs index 1a94a971e670..011ad96e423d 100644 --- a/datafusion/ffi/src/table_provider.rs +++ b/datafusion/ffi/src/table_provider.rs @@ -348,7 +348,7 @@ impl TableProvider for ForeignTableProvider { match maybe_plan { RResult::ROk(p) => ForeignExecutionPlan::try_from(&p)?, RResult::RErr(_) => { - return Err(datafusion::error::DataFusionError::Internal( + return Err(DataFusionError::Internal( "Unable to perform scan via FFI".to_string(), )) } From 011340c5944ad16ef59d80aa0fc03a2426c13f87 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Wed, 30 Oct 2024 07:09:41 -0400 Subject: [PATCH 28/28] taplo format --- datafusion/ffi/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/ffi/Cargo.toml b/datafusion/ffi/Cargo.toml index 153909e3344d..119747342515 100644 --- a/datafusion/ffi/Cargo.toml +++ b/datafusion/ffi/Cargo.toml @@ -48,4 +48,4 @@ log = { workspace = true } prost = { workspace = true } [dev-dependencies] -tokio = { workspace = true } \ No newline at end of file +tokio = { workspace = true }
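
Putting the pieces of this series together, the intended round trip for a `TableProvider` looks roughly like the unit test in `table_provider.rs`: export a provider through `FFI_TableProvider`, then register the `ForeignTableProvider` wrapper in a `SessionContext` on the other side. The sketch below assumes a Tokio runtime to drive the async calls; the table name, data, and final query are illustrative, while the crate paths and constructor arguments follow the code in this series.

```rust
use std::sync::Arc;

use arrow::array::{ArrayRef, Int32Array, RecordBatch};
use arrow::datatypes::{DataType, Field, Schema};
use datafusion::datasource::MemTable;
use datafusion::error::Result;
use datafusion::prelude::SessionContext;
use datafusion_ffi::table_provider::{FFI_TableProvider, ForeignTableProvider};

async fn table_provider_round_trip() -> Result<()> {
    let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, true)]));
    let batch = RecordBatch::try_new(
        Arc::clone(&schema),
        vec![Arc::new(Int32Array::from(vec![1, 2, 3])) as ArrayRef],
    )?;

    // Provider side: expose a MemTable and opt in to filter pushdown.
    let provider = Arc::new(MemTable::try_new(schema, vec![vec![batch]])?);
    let ffi_provider = FFI_TableProvider::new(provider, true);

    // Consumer side: wrap the FFI struct and register it like any other table.
    let foreign_provider: ForeignTableProvider = (&ffi_provider).into();
    let ctx = SessionContext::new();
    ctx.register_table("t", Arc::new(foreign_provider))?;

    let df = ctx.table("t").await?;
    df.show().await?;

    Ok(())
}
```

Passing `false` as the second argument to `FFI_TableProvider::new` leaves `supports_filters_pushdown` unset, in which case the foreign side reports `TableProviderFilterPushDown::Unsupported` for every filter instead of crossing the FFI boundary.
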
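
The `From`/`TryFrom` conversions introduced in this series follow the same shape for the smaller types. Below is a sketch of the `PlanProperties` round trip based on the unit test in `plan_properties.rs`; the schema, partitioning, and execution mode values are illustrative.

```rust
use std::sync::Arc;

use arrow::datatypes::{DataType, Field, Schema};
use datafusion::error::Result;
use datafusion::physical_expr::EquivalenceProperties;
use datafusion::physical_plan::{ExecutionMode, Partitioning, PlanProperties};
use datafusion_ffi::plan_properties::FFI_PlanProperties;

fn plan_properties_round_trip() -> Result<()> {
    let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, true)]));
    let original = PlanProperties::new(
        EquivalenceProperties::new(schema),
        Partitioning::UnknownPartitioning(3),
        ExecutionMode::Unbounded,
    );

    // Provider side: expose the properties through the stable FFI struct.
    let ffi_props = FFI_PlanProperties::from(&original);

    // Consumer side: rebuild native PlanProperties. Partitioning and ordering
    // cross the boundary as serialized protobuf messages.
    let foreign: PlanProperties = ffi_props.try_into()?;
    assert_eq!(format!("{foreign:?}"), format!("{original:?}"));

    Ok(())
}
```

Because partitioning and ordering travel as `datafusion-proto` messages, the two sides only need to agree on the protobuf definitions rather than on in-memory layouts.
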