Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Duckdb conversion traits and simple arrow impl #2620

Merged
merged 16 commits into from
Mar 10, 2025
407 changes: 373 additions & 34 deletions Cargo.lock

Large diffs are not rendered by default.

3 changes: 3 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ members = [
"vortex-btrblocks",
"vortex-buffer",
"vortex-datafusion",
"vortex-duckdb",
"vortex-dtype",
"vortex-error",
"vortex-expr",
Expand Down Expand Up @@ -79,6 +80,7 @@ datafusion-expr = "45"
datafusion-physical-expr = "45"
datafusion-physical-plan = "45"
divan = { package = "codspeed-divan-compat", version = "2.8.0" }
duckdb = "1.2.0"
dyn-hash = "0.2.0"
enum-iterator = "2.0.0"
exponential-decay-histogram = "=0.1.13"
Expand Down Expand Up @@ -172,6 +174,7 @@ vortex-bytebool = { version = "0.25.2", path = "./encodings/bytebool" }
vortex-datafusion = { version = "0.25.2", path = "./vortex-datafusion" }
vortex-datetime-parts = { version = "0.25.2", path = "./encodings/datetime-parts" }
vortex-dict = { version = "0.25.2", path = "./encodings/dict" }
vortex-duckdb = { version = "0.25.2", path = "./vortex-duckdb" }
vortex-dtype = { version = "0.25.2", path = "./vortex-dtype", default-features = false }
vortex-error = { version = "0.25.2", path = "./vortex-error" }
vortex-expr = { version = "0.25.2", path = "./vortex-expr" }
Expand Down
33 changes: 33 additions & 0 deletions vortex-duckdb/Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
[package]
name = "vortex-duckdb"
version = { workspace = true }
description = "Vortex's duckdb adapter"
homepage = { workspace = true }
repository = { workspace = true }
authors = { workspace = true }
license = { workspace = true }
keywords = { workspace = true }
include = { workspace = true }
edition = { workspace = true }
rust-version = { workspace = true }
categories = { workspace = true }
readme = { workspace = true }

[lib]
name = "vortex_duckdb"
path = "src/lib.rs"
bench = false


[dependencies]
duckdb = { workspace = true, features = ["vtab-full", "bundled"] }
itertools = { workspace = true }
vortex-array = { workspace = true }
vortex-dtype = { workspace = true }
vortex-error = { workspace = true }

[dev-dependencies]

[lints]
workspace = true

Empty file added vortex-duckdb/cbindgen.toml
Empty file.
77 changes: 77 additions & 0 deletions vortex-duckdb/src/convert/array/data_chunk_adaptor.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
use duckdb::core::{ArrayVector, DataChunkHandle, FlatVector, ListVector, StructVector};
use duckdb::vtab::arrow::WritableVector;
use vortex_dtype::FieldNames;

/// A view of a single column of a mutable [`DataChunkHandle`].
///
/// Pairs the chunk with a column index so that the column can be handed to code
/// that expects a [`WritableVector`].
pub struct DataChunkHandleSlice<'a> {
/// The chunk that owns the column being written.
chunk: &'a mut DataChunkHandle,
/// Index of the column inside `chunk` that the vector accessors target.
column_index: usize,
}

/// A wrapper around a [`DataChunkHandle`] that carries the extra information needed to create
/// a vortex array from it.
pub struct NamedDataChunk<'a> {
/// The DuckDB chunk holding the column data.
pub chunk: &'a DataChunkHandle,
/// Optional per-column nullability flags, indexed by column position.
pub nullable: Option<&'a [bool]>,
/// Optional column names, indexed by column position.
pub names: Option<FieldNames>,
}

/// A DuckDB [`FlatVector`] bundled with its length and nullability.
///
/// DuckDB vectors only have a capacity, not a size, so this wrapper exists to allow the
/// creation of a vortex array from a DuckDB vector.
/// Nullability is also included because DuckDB does not store it on the vector; that
/// information lives on the table.
pub struct SizedFlatVector {
/// The underlying DuckDB vector.
pub vector: FlatVector,
/// Whether the resulting vortex array should be nullable.
pub nullable: bool,
/// Number of elements of `vector` to convert.
pub len: usize,
}

impl<'a> DataChunkHandleSlice<'a> {
/// Creates a view over column `column_index` of `chunk`.
///
/// The index is stored as-is; it is not validated against the chunk here.
pub fn new(chunk: &'a mut DataChunkHandle, column_index: usize) -> Self {
Self {
chunk,
column_index,
}
}
}

// Every accessor forwards to the accessor of the same name on the wrapped chunk,
// always passing the column index this slice was created with.
impl WritableVector for DataChunkHandleSlice<'_> {
/// Returns the selected column as an [`ArrayVector`].
fn array_vector(&mut self) -> ArrayVector {
self.chunk.array_vector(self.column_index)
}

/// Returns the selected column as a [`FlatVector`].
fn flat_vector(&mut self) -> FlatVector {
self.chunk.flat_vector(self.column_index)
}

/// Returns the selected column as a [`StructVector`].
fn struct_vector(&mut self) -> StructVector {
self.chunk.struct_vector(self.column_index)
}

/// Returns the selected column as a [`ListVector`].
fn list_vector(&mut self) -> ListVector {
self.chunk.list_vector(self.column_index)
}
}

impl<'a> NamedDataChunk<'a> {
    /// Wraps `chunk` without any column names or nullability information.
    pub fn from_chunk(chunk: &'a DataChunkHandle) -> Self {
        Self {
            names: None,
            nullable: None,
            chunk,
        }
    }

    /// Wraps `chunk` with explicit column `names` but no nullability information.
    pub fn named_chunk(chunk: &'a DataChunkHandle, names: FieldNames) -> Self {
        Self {
            names: Some(names),
            ..Self::from_chunk(chunk)
        }
    }

    /// Wraps `chunk` with both per-column `nullable` flags and column `names`.
    pub fn new(chunk: &'a DataChunkHandle, nullable: &'a [bool], names: FieldNames) -> Self {
        Self {
            nullable: Some(nullable),
            ..Self::named_chunk(chunk, names)
        }
    }
}
161 changes: 161 additions & 0 deletions vortex-duckdb/src/convert/array/mod.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,161 @@
mod data_chunk_adaptor;

use duckdb::core::DataChunkHandle;
use duckdb::vtab::arrow::{
WritableVector, flat_vector_to_arrow_array, write_arrow_array_to_vector,
};
use vortex_array::arrays::StructArray;
use vortex_array::arrow::{FromArrowArray, IntoArrowArray};
use vortex_array::validity::Validity;
use vortex_array::variants::StructArrayTrait;
use vortex_array::{Array, ArrayRef};
use vortex_error::{VortexResult, vortex_err};

use crate::convert::array::data_chunk_adaptor::{
DataChunkHandleSlice, NamedDataChunk, SizedFlatVector,
};

/// Conversion of a value into a DuckDB vector.
pub trait ToDuckDB {
/// Writes `self` into the DuckDB vector exposed by `chunk`.
///
/// Returns an error if the value cannot be written.
fn to_duckdb(&self, chunk: &mut dyn WritableVector) -> VortexResult<()>;
}

/// Writes every field of `struct_array` into the matching column of `chunk`.
///
/// Field `i` of the struct is written to column `i` of the chunk. Once all fields are written,
/// the chunk's length is set to the number of rows in `struct_array`.
///
/// Returns one flag per struct field, in field order, stating whether that field's dtype is
/// nullable.
///
/// # Errors
///
/// Returns the first error produced while writing a field with [`ToDuckDB::to_duckdb`].
pub fn to_duckdb_chunk(
    struct_array: &StructArray,
    chunk: &mut DataChunkHandle,
) -> VortexResult<Vec<bool>> {
    // One entry per *field* (column). Sizing this by `struct_array.len()` (the row count) and
    // indexing it by field position panics whenever the struct has more fields than rows.
    let mut nullable = Vec::with_capacity(struct_array.names().len());
    for (idx, (_name, field)) in struct_array
        .names()
        .iter()
        .zip(struct_array.fields())
        .enumerate()
    {
        field.to_duckdb(&mut DataChunkHandleSlice::new(chunk, idx))?;
        nullable.push(field.dtype().is_nullable());
    }
    chunk.set_len(struct_array.len());
    Ok(nullable)
}

impl ToDuckDB for ArrayRef {
    /// Writes this array into `chunk` by first converting it to its preferred Arrow
    /// representation and then copying that Arrow array into the DuckDB vector.
    ///
    /// # Errors
    ///
    /// Returns an error if the Arrow conversion fails or if DuckDB cannot accept the
    /// resulting Arrow array.
    fn to_duckdb(&self, chunk: &mut dyn WritableVector) -> VortexResult<()> {
        let arrow = self.clone().into_arrow_preferred()?;
        write_arrow_array_to_vector(&arrow, chunk)
            .map_err(|e| vortex_err!("Failed to convert vortex array to duckdb: {}", e))
    }
}

/// Conversion from a DuckDB value of type `V` into a vortex array.
pub trait FromDuckDB<V> {
/// Builds an [`ArrayRef`] from `vector`, or returns an error if the conversion fails.
fn from_duckdb(vector: V) -> VortexResult<ArrayRef>;
}

impl<'a> FromDuckDB<&'a NamedDataChunk<'a>> for ArrayRef {
    /// Converts every column of the wrapped chunk into a vortex array and assembles the
    /// results into a single non-nullable struct array.
    ///
    /// Columns without an explicit nullability flag are treated as nullable, and columns
    /// without an explicit name are named after their position (`"0"`, `"1"`, ...).
    fn from_duckdb(named_chunk: &'a NamedDataChunk<'a>) -> VortexResult<ArrayRef> {
        let chunk = &named_chunk.chunk;
        let row_count = chunk.len();
        let column_count = chunk.num_columns();

        let mut field_names = Vec::with_capacity(column_count);
        let mut field_arrays = Vec::with_capacity(column_count);
        for col in 0..column_count {
            let vector = chunk.flat_vector(col);
            let nullable = match named_chunk.nullable {
                Some(flags) => flags[col],
                None => true,
            };
            let array = ArrayRef::from_duckdb(SizedFlatVector {
                vector,
                nullable,
                len: row_count,
            })?;

            // Use the caller-provided name when there is one, otherwise the column position.
            let name = match &named_chunk.names {
                Some(provided) => provided[col].clone(),
                None => col.to_string().into(),
            };

            field_names.push(name);
            field_arrays.push(array);
        }

        // All top-level structs are non-nullable in duckdb; only inner columns can be nullable.
        StructArray::try_new(
            field_names.into(),
            field_arrays,
            row_count,
            Validity::NonNullable,
        )
        .map(StructArray::into_array)
    }
}

impl FromDuckDB<SizedFlatVector> for ArrayRef {
    /// Converts the first `len` elements of the wrapped DuckDB vector into a vortex array,
    /// using the wrapper's `nullable` flag for the resulting array's nullability.
    ///
    /// # Errors
    ///
    /// Returns an error if DuckDB cannot export the vector as an Arrow array.
    // TODO(joe): going via arrow is slow, make it faster.
    fn from_duckdb(mut sized_vector: SizedFlatVector) -> VortexResult<ArrayRef> {
        let len = sized_vector.len;
        // The failure is reported through the returned error only; library code must not
        // print to stdout.
        let arrow_arr = flat_vector_to_arrow_array(&mut sized_vector.vector, len)
            .map_err(|e| vortex_err!("Failed to convert duckdb array to vortex: {}", e))?;
        Ok(ArrayRef::from_arrow(arrow_arr, sized_vector.nullable))
    }
}

#[cfg(test)]
mod tests {
    use duckdb::core::DataChunkHandle;
    use itertools::Itertools;
    use vortex_array::arrays::{BoolArray, PrimitiveArray, StructArray, VarBinArray};
    use vortex_array::validity::Validity;
    use vortex_array::variants::StructArrayTrait;
    use vortex_array::{Array, ArrayRef, ToCanonical};
    use vortex_dtype::{DType, FieldNames, Nullability};

    use crate::convert::array::data_chunk_adaptor::NamedDataChunk;
    use crate::convert::array::to_duckdb_chunk;
    use crate::{FromDuckDB, ToDuckDBType};

    /// Builds a five-row, non-nullable struct array with an integer column `xs`, a string
    /// column `ys` and a boolean column `zs`.
    fn data() -> ArrayRef {
        let columns = vec![
            PrimitiveArray::from_iter(0..5).into_array(),
            VarBinArray::from_vec(
                vec!["a", "b", "c", "d", "e"],
                DType::Utf8(Nullability::NonNullable),
            )
            .into_array(),
            BoolArray::from_iter([true, true, true, false, false]).into_array(),
        ];

        StructArray::try_new(
            FieldNames::from(["xs".into(), "ys".into(), "zs".into()]),
            columns,
            5,
            Validity::NonNullable,
        )
        .unwrap()
        .to_array()
    }

    /// Round-trips the sample struct through a DuckDB data chunk and checks that names,
    /// lengths and dtype survive.
    #[test]
    fn test_vortex_to_duckdb() {
        let expected = data();
        let expected_struct = expected.to_struct().unwrap();

        // Vortex -> DuckDB.
        let column_types = expected
            .dtype()
            .as_struct()
            .unwrap()
            .fields()
            .map(|f| f.to_duckdb_type().unwrap())
            .collect_vec();
        let mut output_chunk = DataChunkHandle::new(column_types.as_slice());
        let nullable = to_duckdb_chunk(&expected_struct, &mut output_chunk).unwrap();

        // DuckDB -> Vortex.
        let named = NamedDataChunk::new(
            &output_chunk,
            &nullable,
            FieldNames::from(["xs".into(), "ys".into(), "zs".into()]),
        );
        let actual = ArrayRef::from_duckdb(&named).unwrap();
        let actual_struct = actual.to_struct().unwrap();

        assert_eq!(expected_struct.names(), actual_struct.names());
        for field in actual_struct.fields() {
            assert_eq!(field.len(), expected.len());
        }
        assert_eq!(actual.len(), expected.len());
        assert_eq!(actual.dtype(), expected.dtype());
    }
}
5 changes: 5 additions & 0 deletions vortex-duckdb/src/convert/mod.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
mod array;
mod types;

pub use array::{FromDuckDB, ToDuckDB, to_duckdb_chunk};
pub use types::{FromDuckDBType, ToDuckDBType};
67 changes: 67 additions & 0 deletions vortex-duckdb/src/convert/types/from.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
use std::sync::Arc;

use duckdb::core::{LogicalTypeHandle, LogicalTypeId};
use vortex_dtype::Nullability::Nullable;
use vortex_dtype::{DType, Nullability, PType, StructDType};

/// Conversion from a DuckDB type description of type `A` into a vortex type.
pub trait FromDuckDBType<A> {
/// Builds `Self` from `array`, using `nullable` as the nullability of the result.
///
/// The nullability is passed in separately; it is inferred from the `NotNullConstraint`.
fn from_duckdb(array: A, nullable: Nullability) -> Self;
}

impl FromDuckDBType<LogicalTypeHandle> for DType {
/// Converts a DuckDB logical type handle to a `DType` based on the logical type ID.
///
/// `nullable` is applied to the top-level type that is returned. Struct and list children
/// are converted by `from_duckdb_struct` and `from_duckdb_list`.
///
/// # Panics
///
/// Panics via `todo!()` for logical types that are not supported yet (timestamps, dates,
/// times, intervals, hugeint, decimal, enum, map, uuid and union).
fn from_duckdb(type_: LogicalTypeHandle, nullable: Nullability) -> Self {
match type_.id() {
LogicalTypeId::Boolean => DType::Bool(nullable),
// Signed integers.
LogicalTypeId::Tinyint => DType::Primitive(PType::I8, nullable),
LogicalTypeId::Smallint => DType::Primitive(PType::I16, nullable),
LogicalTypeId::Integer => DType::Primitive(PType::I32, nullable),
LogicalTypeId::Bigint => DType::Primitive(PType::I64, nullable),
// Unsigned integers.
LogicalTypeId::UTinyint => DType::Primitive(PType::U8, nullable),
LogicalTypeId::USmallint => DType::Primitive(PType::U16, nullable),
LogicalTypeId::UInteger => DType::Primitive(PType::U32, nullable),
LogicalTypeId::UBigint => DType::Primitive(PType::U64, nullable),
// Floating point.
LogicalTypeId::Float => DType::Primitive(PType::F32, nullable),
LogicalTypeId::Double => DType::Primitive(PType::F64, nullable),
// Variable-length types.
LogicalTypeId::Varchar => DType::Utf8(nullable),
LogicalTypeId::Blob => DType::Binary(nullable),
// Nested types: the children are converted recursively.
LogicalTypeId::Struct => DType::Struct(Arc::new(from_duckdb_struct(type_)), nullable),
LogicalTypeId::List => DType::List(Arc::new(from_duckdb_list(type_)), nullable),
// Not supported yet: hitting any of these panics.
LogicalTypeId::Timestamp
| LogicalTypeId::Date
| LogicalTypeId::Time
| LogicalTypeId::Interval
// Hugeint is an i128
| LogicalTypeId::Hugeint
| LogicalTypeId::Decimal
| LogicalTypeId::TimestampS
| LogicalTypeId::TimestampMs
| LogicalTypeId::TimestampNs
| LogicalTypeId::Enum
| LogicalTypeId::Map
| LogicalTypeId::Uuid
| LogicalTypeId::Union
| LogicalTypeId::TimestampTZ => todo!(),
}
}
}

fn from_duckdb_list(list: LogicalTypeHandle) -> DType {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same here, could consider making this more rustic:
impl From<LogicalTypeHandle>..

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am not sure we can, since I need a nullability argument above, and I would need a dummy struct to control the `From` impl here.

// Note: the zeroth child of a list is the element type
assert_eq!(list.num_children(), 1);
// TODO: is there list element nullability
FromDuckDBType::from_duckdb(list.child(0), Nullable)
}

/// Builds a [`StructDType`] from the children of a DuckDB struct logical type.
///
/// Each child contributes one field, named after the child and typed by converting the
/// child's logical type. Every field is currently marked nullable.
fn from_duckdb_struct(struct_: LogicalTypeHandle) -> StructDType {
    let field_count = struct_.num_children();
    let mut fields = Vec::with_capacity(field_count);
    for idx in 0..field_count {
        let name = struct_.child_name(idx);
        // TODO: is there struct field nullability
        let dtype = DType::from_duckdb(struct_.child(idx), Nullable);
        fields.push((name, dtype));
    }
    fields.into_iter().collect()
}
5 changes: 5 additions & 0 deletions vortex-duckdb/src/convert/types/mod.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
mod from;
mod to;

pub use from::FromDuckDBType;
pub use to::ToDuckDBType;
Loading
Loading