From 3ac512e2277ae149d000b2c230de8005577ee793 Mon Sep 17 00:00:00 2001 From: Brayan Jules Date: Thu, 15 Feb 2024 02:45:42 -0300 Subject: [PATCH 01/14] feat: arrow_cast function as UDF --- datafusion-cli/Cargo.lock | 1 + datafusion/functions/Cargo.toml | 1 + .../expr => functions/src/core}/arrow_cast.rs | 123 ++++++++++++------ datafusion/functions/src/core/mod.rs | 5 +- datafusion/sql/src/expr/function.rs | 8 -- datafusion/sql/src/expr/mod.rs | 1 - datafusion/sql/src/lib.rs | 1 - datafusion/sql/tests/sql_integration.rs | 18 ++- 8 files changed, 103 insertions(+), 55 deletions(-) rename datafusion/{sql/src/expr => functions/src/core}/arrow_cast.rs (90%) diff --git a/datafusion-cli/Cargo.lock b/datafusion-cli/Cargo.lock index 25bb30e5bc56..c11de92f8686 100644 --- a/datafusion-cli/Cargo.lock +++ b/datafusion-cli/Cargo.lock @@ -1231,6 +1231,7 @@ name = "datafusion-functions" version = "35.0.0" dependencies = [ "arrow", + "arrow-schema", "base64", "datafusion-common", "datafusion-execution", diff --git a/datafusion/functions/Cargo.toml b/datafusion/functions/Cargo.toml index 7109261cc78f..704fb896540a 100644 --- a/datafusion/functions/Cargo.toml +++ b/datafusion/functions/Cargo.toml @@ -47,6 +47,7 @@ path = "src/lib.rs" [dependencies] arrow = { workspace = true } +arrow-schema = { workspace = true } base64 = { version = "0.21", optional = true } datafusion-common = { workspace = true } datafusion-execution = { workspace = true } diff --git a/datafusion/sql/src/expr/arrow_cast.rs b/datafusion/functions/src/core/arrow_cast.rs similarity index 90% rename from datafusion/sql/src/expr/arrow_cast.rs rename to datafusion/functions/src/core/arrow_cast.rs index 9a0d61f41c01..0600209eb439 100644 --- a/datafusion/sql/src/expr/arrow_cast.rs +++ b/datafusion/functions/src/core/arrow_cast.rs @@ -19,16 +19,50 @@ //! 
Implementation of the `arrow_cast` function that allows
//! casting to arbitrary arrow types (rather than SQL types)

 use std::{fmt::Display, iter::Peekable, str::Chars, sync::Arc};
+use std::any::Any;

+use arrow::compute::cast;
 use arrow_schema::{DataType, Field, IntervalUnit, TimeUnit};
-use datafusion_common::{
-    plan_datafusion_err, DFSchema, DataFusionError, Result, ScalarValue,
-};
+use datafusion_common::{
+    internal_err, plan_datafusion_err, DataFusionError, Result, ScalarValue,
+};

-use datafusion_common::plan_err;
-use datafusion_expr::{Expr, ExprSchemable};
+use datafusion_expr::{ColumnarValue, ScalarUDFImpl, Signature, Volatility};

-pub const ARROW_CAST_NAME: &str = "arrow_cast";
+#[derive(Debug)]
+pub(super) struct ArrowCastFunc {
+    signature: Signature,
+}
+
+impl ArrowCastFunc {
+    pub fn new() -> Self {
+        Self {
+            signature: Signature::any(2, Volatility::Immutable),
+        }
+    }
+}
+
+impl ScalarUDFImpl for ArrowCastFunc {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn name(&self) -> &str {
+        "arrow_cast"
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
+        parse_data_type(&arg_types[1].to_string())
+    }
+
+    fn invoke(&self, args: &[ColumnarValue]) -> Result<ColumnarValue> {
+        create_arrow_cast(args)
+    }
+}

 /// Create an [`Expr`] that evaluates the `arrow_cast` function
 ///
@@ -52,26 +86,27 @@ pub const ARROW_CAST_NAME: &str = "arrow_cast";
 /// select arrow_cast(column_x, 'Float64')
 /// ```
 /// [`BuiltinScalarFunction`]: datafusion_expr::BuiltinScalarFunction
-pub fn create_arrow_cast(mut args: Vec<Expr>, schema: &DFSchema) -> Result<Expr> {
+fn create_arrow_cast(args: &[ColumnarValue]) -> Result<ColumnarValue> {
     if args.len() != 2 {
-        return plan_err!("arrow_cast needs 2 arguments, {} provided", args.len());
+        return internal_err!("arrow_cast needs 2 arguments, {} provided", args.len());
     }
-    let arg1 = args.pop().unwrap();
-    let arg0 = args.pop().unwrap();
+    let arg1 = &args[1];
+    let arg0 = &args[0];

-    // arg1 must be a string
-    let data_type_string = if let Expr::Literal(ScalarValue::Utf8(Some(v))) = arg1 {
-        v
-    } else {
-        return plan_err!(
-            "arrow_cast requires its second argument to be a constant string, got {arg1}"
-        );
-    };
-
-    // do the actual lookup to the appropriate data type
-    let data_type = parse_data_type(&data_type_string)?;
-
-    arg0.cast_to(&data_type, schema)
+    match (arg0, arg1) {
+        (
+            ColumnarValue::Scalar(arg0),
+            ColumnarValue::Scalar(ScalarValue::Utf8(Some(arg1))),
+        ) => {
+            let data_type = parse_data_type(arg1)?;
+            let val0 = arg0.cast_to(&data_type)?;
+            Ok(ColumnarValue::Scalar(val0))
+        }
+        (
+            ColumnarValue::Array(arg0),
+            ColumnarValue::Scalar(ScalarValue::Utf8(Some(arg1))),
+        ) => {
+            let data_type = parse_data_type(arg1)?;
+            let val0 = cast(arg0, &data_type)?;
+            Ok(ColumnarValue::Array(val0))
+        }
+        (ColumnarValue::Scalar(_arg0), ColumnarValue::Scalar(arg1)) => internal_err!(
+            "arrow_cast requires its second argument to be a constant string, got {arg1}"
+        ),
+        _ => internal_err!(
+            "arrow_cast requires a scalar string as its second argument, got {arg0:?} and {arg1:?}"
+        ),
+    }
 }

 /// Parses `str` into a `DataType`.
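Once merged, the relocated function is exercised like any other scalar UDF. As a minimal end-to-end sketch (not part of the diff; it assumes the `datafusion` and `tokio` crates, with `arrow_cast` present in the default function registry):

use datafusion::error::Result;
use datafusion::prelude::*;

#[tokio::main]
async fn main() -> Result<()> {
    let ctx = SessionContext::new();
    // `arrow_cast` now resolves through the function registry instead of
    // being special-cased inside the SQL planner
    let df = ctx.sql("SELECT arrow_cast(1234, 'Float64')").await?;
    df.show().await?;
    Ok(())
}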
@@ -80,22 +115,8 @@ pub fn create_arrow_cast(mut args: Vec, schema: &DFSchema) -> Result /// impl, and maintains the invariant that /// `parse_data_type(data_type.to_string()) == data_type` /// -/// Example: -/// ``` -/// # use datafusion_sql::parse_data_type; -/// # use arrow_schema::DataType; -/// let display_value = "Int32"; -/// -/// // "Int32" is the Display value of `DataType` -/// assert_eq!(display_value, &format!("{}", DataType::Int32)); -/// -/// // parse_data_type coverts "Int32" back to `DataType`: -/// let data_type = parse_data_type(display_value).unwrap(); -/// assert_eq!(data_type, DataType::Int32); -/// ``` -/// /// Remove if added to arrow: -pub fn parse_data_type(val: &str) -> Result { +fn parse_data_type(val: &str) -> Result { Parser::new(val).parse() } @@ -647,6 +668,7 @@ impl Display for Token { #[cfg(test)] mod test { + use arrow::array::{ArrayRef, Int64Array, Int8Array}; use arrow_schema::{IntervalUnit, TimeUnit}; use super::*; @@ -847,4 +869,29 @@ mod test { println!(" Ok"); } } + + #[test] + fn test_arrow_cast_scalar() -> Result<()>{ + let input_arg0 = ColumnarValue::Scalar(ScalarValue::Int8(Some(100i8))); + let input_arg1 = ColumnarValue::Scalar(ScalarValue::Utf8(Some("Int64".to_string()))); + let result = create_arrow_cast(&[input_arg0,input_arg1])?; + let result = result.into_array(1).expect("Failed to cast values"); + let expected = Arc::new(Int64Array::from(vec![100])) as ArrayRef; + assert_eq!(expected.as_ref(),result.as_ref()); + + Ok(()) + } + + #[test] + fn test_arrow_cast_array() -> Result<()>{ + let input_arg0 = ColumnarValue::Array(Arc::new(Int8Array::from(vec![100,101,102])) as ArrayRef); + let input_arg1 = ColumnarValue::Scalar(ScalarValue::Utf8(Some("Int64".to_string()))); + let result = create_arrow_cast(&[input_arg0,input_arg1])?; + let result = result.into_array(1).expect("Failed to cast values"); + let expected = Arc::new(Int64Array::from(vec![100,101,102])) as ArrayRef; + + assert_eq!(expected.as_ref(),result.as_ref()); + + Ok(()) + } } diff --git a/datafusion/functions/src/core/mod.rs b/datafusion/functions/src/core/mod.rs index 9aab4bd450d1..690daf966c1e 100644 --- a/datafusion/functions/src/core/mod.rs +++ b/datafusion/functions/src/core/mod.rs @@ -18,12 +18,15 @@ //! "core" DataFusion functions mod nullif; +mod arrow_cast; // create UDFs make_udf_function!(nullif::NullIfFunc, NULLIF, nullif); +make_udf_function!(arrow_cast::ArrowCastFunc, ARROW_CAST, arrow_cast); // Export the functions out of this package, both as expr_fn as well as a list of functions export_functions!( - (nullif, arg_1 arg_2, "returns NULL if value1 equals value2; otherwise it returns value1. This can be used to perform the inverse operation of the COALESCE expression.") + (nullif, arg_1 arg_2, "returns NULL if value1 equals value2; otherwise it returns value1. This can be used to perform the inverse operation of the COALESCE expression."), + (arrow_cast, arg_1 arg_2, "returns arg_1 parsed to the `arrow_type` given the second argument. 
This can be used to cast to a specific `arrow_type`.") ); diff --git a/datafusion/sql/src/expr/function.rs b/datafusion/sql/src/expr/function.rs index 3187f26dcc5d..88f36b7642ca 100644 --- a/datafusion/sql/src/expr/function.rs +++ b/datafusion/sql/src/expr/function.rs @@ -32,8 +32,6 @@ use sqlparser::ast::{ }; use std::str::FromStr; -use super::arrow_cast::ARROW_CAST_NAME; - impl<'a, S: ContextProvider> SqlToRel<'a, S> { pub(super) fn sql_function_to_expr( &self, @@ -230,12 +228,6 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { fun, args, distinct, filter, order_by, ))); }; - - // Special case arrow_cast (as its type is dependent on its argument value) - if name == ARROW_CAST_NAME { - let args = self.function_args_to_expr(args, schema, planner_context)?; - return super::arrow_cast::create_arrow_cast(args, schema); - } } // Could not find the relevant function, so return an error diff --git a/datafusion/sql/src/expr/mod.rs b/datafusion/sql/src/expr/mod.rs index ecf510da7bce..c62f2a1b61c8 100644 --- a/datafusion/sql/src/expr/mod.rs +++ b/datafusion/sql/src/expr/mod.rs @@ -15,7 +15,6 @@ // specific language governing permissions and limitations // under the License. -pub(crate) mod arrow_cast; mod binary_op; mod function; mod grouping_set; diff --git a/datafusion/sql/src/lib.rs b/datafusion/sql/src/lib.rs index d805f61397e9..23f9a64d7c81 100644 --- a/datafusion/sql/src/lib.rs +++ b/datafusion/sql/src/lib.rs @@ -40,5 +40,4 @@ pub mod utils; mod values; pub use datafusion_common::{ResolvedTableReference, TableReference}; -pub use expr::arrow_cast::parse_data_type; pub use sqlparser; diff --git a/datafusion/sql/tests/sql_integration.rs b/datafusion/sql/tests/sql_integration.rs index 2124a5224a76..293d55acbb51 100644 --- a/datafusion/sql/tests/sql_integration.rs +++ b/datafusion/sql/tests/sql_integration.rs @@ -2579,7 +2579,7 @@ fn approx_median_window() { fn select_arrow_cast() { let sql = "SELECT arrow_cast(1234, 'Float64'), arrow_cast('foo', 'LargeUtf8')"; let expected = "\ - Projection: CAST(Int64(1234) AS Float64), CAST(Utf8(\"foo\") AS LargeUtf8)\ + Projection: arrow_cast(Int64(1234), Utf8(\"Float64\")), arrow_cast(Utf8(\"foo\"), Utf8(\"LargeUtf8\"))\ \n EmptyRelation"; quick_test(sql, expected); } @@ -2673,11 +2673,17 @@ fn logical_plan_with_dialect_and_options( dialect: &dyn Dialect, options: ParserOptions, ) -> Result { - let context = MockContextProvider::default().with_udf(make_udf( - "nullif", - vec![DataType::Int32, DataType::Int32], - DataType::Int32, - )); + let context = MockContextProvider::default() + .with_udf(make_udf( + "nullif", + vec![DataType::Int32, DataType::Int32], + DataType::Int32, + )) + .with_udf(make_udf( + "arrow_cast", + vec![DataType::Int64, DataType::Utf8], + DataType::Float64, + )); let planner = SqlToRel::new_with_options(&context, options); let result = DFParser::parse_sql_with_dialect(sql, dialect); From 3f0963d020f308aa3168502830ccbe6bf7ba69b7 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Thu, 15 Feb 2024 07:25:47 -0500 Subject: [PATCH 02/14] Improve documentation on how to build `ScalarValue::Struct` and add `ScalarStructBuilder` (#9229) * Improve documentation on how to build `ScalarValue::Struct` and add `ScalarStructBuilder` * Update datafusion/common/src/scalar/struct_builder.rs * Improved docs * update test --------- Co-authored-by: comphead --- .../common/src/{scalar.rs => scalar/mod.rs} | 181 ++++++++++++------ .../common/src/scalar/struct_builder.rs | 152 +++++++++++++++ .../tests/cases/roundtrip_logical_plan.rs | 20 +- 3 files changed, 282 
insertions(+), 71 deletions(-) rename datafusion/common/src/{scalar.rs => scalar/mod.rs} (98%) create mode 100644 datafusion/common/src/scalar/struct_builder.rs diff --git a/datafusion/common/src/scalar.rs b/datafusion/common/src/scalar/mod.rs similarity index 98% rename from datafusion/common/src/scalar.rs rename to datafusion/common/src/scalar/mod.rs index 2395f8acc4d2..29107ab10e7e 100644 --- a/datafusion/common/src/scalar.rs +++ b/datafusion/common/src/scalar/mod.rs @@ -15,7 +15,9 @@ // specific language governing permissions and limitations // under the License. -//! This module provides ScalarValue, an enum that can be used for storage of single elements +//! [`ScalarValue`]: stores single values + +mod struct_builder; use std::borrow::Borrow; use std::cmp::Ordering; @@ -43,24 +45,45 @@ use arrow::{ compute::kernels::cast::{cast_with_options, CastOptions}, datatypes::{ i256, ArrowDictionaryKeyType, ArrowNativeType, ArrowTimestampType, DataType, - Field, Fields, Float32Type, Int16Type, Int32Type, Int64Type, Int8Type, + Field, Float32Type, Int16Type, Int32Type, Int64Type, Int8Type, IntervalDayTimeType, IntervalMonthDayNanoType, IntervalUnit, - IntervalYearMonthType, SchemaBuilder, TimeUnit, TimestampMicrosecondType, + IntervalYearMonthType, TimeUnit, TimestampMicrosecondType, TimestampMillisecondType, TimestampNanosecondType, TimestampSecondType, UInt16Type, UInt32Type, UInt64Type, UInt8Type, DECIMAL128_MAX_PRECISION, }, }; use arrow_array::cast::as_list_array; use arrow_array::{ArrowNativeTypeOp, Scalar}; -use arrow_buffer::NullBuffer; -/// A dynamically typed, nullable single value, (the single-valued counter-part -/// to arrow's [`Array`]) +pub use struct_builder::ScalarStructBuilder; + +/// A dynamically typed, nullable single value. +/// +/// While an arrow [`Array`]) stores one or more values of the same type, in a +/// single column, a `ScalarValue` stores a single value of a single type, the +/// equivalent of 1 row and one column. +/// +/// ```text +/// ┌────────┐ +/// │ value1 │ +/// │ value2 │ ┌────────┐ +/// │ value3 │ │ value2 │ +/// │ ... │ └────────┘ +/// │ valueN │ +/// └────────┘ +/// +/// Array ScalarValue +/// +/// stores multiple, stores a single, +/// possibly null, values of possible null, value +/// the same type +/// ``` /// /// # Performance /// -/// In general, please use arrow [`Array`]s rather than [`ScalarValue`] whenever -/// possible, as it is far more efficient for multiple values. +/// In general, performance will be better using arrow [`Array`]s rather than +/// [`ScalarValue`], as it is far more efficient to process multiple values at +/// once (vecctorized processing). /// /// # Example /// ``` @@ -99,6 +122,66 @@ use arrow_buffer::NullBuffer; /// # } /// ``` /// +/// # Nested Types +/// +/// `List` / `LargeList` / `FixedSizeList` / `Struct` are represented as a +/// single element array of the corresponding type. 
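+///
+/// ## Example: Creating a [`ScalarValue::List`]
+///
+/// A short sketch of the single element list representation described above,
+/// assuming the `new_list` helper that already exists in this module:
+///
+/// ```
+/// # use arrow::datatypes::DataType;
+/// # use datafusion_common::ScalarValue;
+/// // Build a list scalar like [1, 2]: a one element ListArray
+/// let arr = ScalarValue::new_list(
+///     &[ScalarValue::from(1i32), ScalarValue::from(2i32)],
+///     &DataType::Int32,
+/// );
+/// let list = ScalarValue::List(arr);
+/// ```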
+/// +/// ## Example: Creating [`ScalarValue::Struct`] using [`ScalarStructBuilder`] +/// ``` +/// # use std::sync::Arc; +/// # use arrow::datatypes::{DataType, Field}; +/// # use datafusion_common::{ScalarValue, scalar::ScalarStructBuilder}; +/// // Build a struct like: {a: 1, b: "foo"} +/// let field_a = Field::new("a", DataType::Int32, false); +/// let field_b = Field::new("b", DataType::Utf8, false); +/// +/// let s1 = ScalarStructBuilder::new() +/// .with_scalar(field_a, ScalarValue::from(1i32)) +/// .with_scalar(field_b, ScalarValue::from("foo")) +/// .build(); +/// ``` +/// +/// ## Example: Creating a null [`ScalarValue::Struct`] using [`ScalarStructBuilder`] +/// ``` +/// # use std::sync::Arc; +/// # use arrow::datatypes::{DataType, Field}; +/// # use datafusion_common::{ScalarValue, scalar::ScalarStructBuilder}; +/// // Build a struct representing a NULL value +/// let fields = vec![ +/// Field::new("a", DataType::Int32, false), +/// Field::new("b", DataType::Utf8, false), +/// ]; +/// +/// let s1 = ScalarStructBuilder::new_null(fields); +/// ``` +/// +/// ## Example: Creating [`ScalarValue::Struct`] directly +/// ``` +/// # use std::sync::Arc; +/// # use arrow::datatypes::{DataType, Field, Fields}; +/// # use arrow_array::{ArrayRef, Int32Array, StructArray, StringArray}; +/// # use datafusion_common::ScalarValue; +/// // Build a struct like: {a: 1, b: "foo"} +/// // Field description +/// let fields = Fields::from(vec![ +/// Field::new("a", DataType::Int32, false), +/// Field::new("b", DataType::Utf8, false), +/// ]); +/// // one row arrays for each field +/// let arrays: Vec = vec![ +/// Arc::new(Int32Array::from(vec![1])), +/// Arc::new(StringArray::from(vec!["foo"])), +/// ]; +/// // no nulls for this array +/// let nulls = None; +/// let arr = StructArray::new(fields, arrays, nulls); +/// +/// // Create a ScalarValue::Struct directly +/// let s1 = ScalarValue::Struct(Arc::new(arr)); +/// ``` +/// +/// /// # Further Reading /// See [datatypes](https://arrow.apache.org/docs/python/api/datatypes.html) for /// details on datatypes and the [format](https://github.com/apache/arrow/blob/master/format/Schema.fbs#L354-L375) @@ -153,7 +236,8 @@ pub enum ScalarValue { List(Arc), /// The array must be a LargeListArray with length 1. LargeList(Arc), - /// Represents a single element of a [`StructArray`] as an [`ArrayRef`] + /// Represents a single element [`StructArray`] as an [`ArrayRef`]. See + /// [`ScalarValue`] for examples of how to create instances of this type. 
Struct(Arc), /// Date stored as a signed 32bit int days since UNIX epoch 1970-01-01 Date32(Option), @@ -2679,20 +2763,13 @@ impl From> for ScalarValue { /// Wrapper to create ScalarValue::Struct for convenience impl From> for ScalarValue { fn from(value: Vec<(&str, ScalarValue)>) -> Self { - let (fields, scalars): (SchemaBuilder, Vec<_>) = value - .into_iter() - .map(|(name, scalar)| (Field::new(name, scalar.data_type(), false), scalar)) - .unzip(); - - let arrays = scalars + value .into_iter() - .map(|scalar| scalar.to_array().unwrap()) - .collect::>(); - - let fields = fields.finish().fields; - let struct_array = StructArray::try_new(fields, arrays, None).unwrap(); - - Self::Struct(Arc::new(struct_array)) + .fold(ScalarStructBuilder::new(), |builder, (name, value)| { + builder.with_name_and_scalar(name, value) + }) + .build() + .unwrap() } } @@ -2710,27 +2787,6 @@ impl From for ScalarValue { } } -// TODO: Remove this after changing to Scalar -// Wrapper for ScalarValue::Struct that checks the length of the arrays, without nulls -impl From<(Fields, Vec)> for ScalarValue { - fn from((fields, arrays): (Fields, Vec)) -> Self { - Self::from((fields, arrays, None)) - } -} - -// TODO: Remove this after changing to Scalar -// Wrapper for ScalarValue::Struct that checks the length of the arrays -impl From<(Fields, Vec, Option)> for ScalarValue { - fn from( - (fields, arrays, nulls): (Fields, Vec, Option), - ) -> Self { - for arr in arrays.iter() { - assert_eq!(arr.len(), 1); - } - Self::Struct(Arc::new(StructArray::new(fields, arrays, nulls))) - } -} - macro_rules! impl_try_from { ($SCALAR:ident, $NATIVE:ident) => { impl TryFrom for $NATIVE { @@ -3247,6 +3303,7 @@ mod tests { use arrow::datatypes::{ArrowNumericType, ArrowPrimitiveType}; use arrow::util::pretty::pretty_format_columns; use arrow_buffer::Buffer; + use arrow_schema::Fields; use chrono::NaiveDate; use rand::Rng; @@ -3266,31 +3323,33 @@ mod tests { ), ]); - let arrays = vec![boolean as ArrayRef, int as ArrayRef]; - let fields = Fields::from(vec![ - Field::new("b", DataType::Boolean, false), - Field::new("c", DataType::Int32, false), - ]); - let sv = ScalarValue::from((fields, arrays)); + let sv = ScalarStructBuilder::new() + .with_array(Field::new("b", DataType::Boolean, false), boolean) + .with_array(Field::new("c", DataType::Int32, false), int) + .build() + .unwrap(); + let struct_arr = sv.to_array().unwrap(); let actual = as_struct_array(&struct_arr).unwrap(); assert_eq!(actual, &expected); } #[test] - #[should_panic(expected = "assertion `left == right` failed")] + #[should_panic( + expected = "Error building ScalarValue::Struct. 
Expected array with exactly one element, found array with 4 elements" + )] fn test_scalar_value_from_for_struct_should_panic() { - let fields = Fields::from(vec![ - Field::new("bool", DataType::Boolean, false), - Field::new("i32", DataType::Int32, false), - ]); - - let arrays = vec![ - Arc::new(BooleanArray::from(vec![false, true, false, false])) as ArrayRef, - Arc::new(Int32Array::from(vec![42, 28, 19, 31])), - ]; - - let _ = ScalarValue::from((fields, arrays)); + let _ = ScalarStructBuilder::new() + .with_array( + Field::new("bool", DataType::Boolean, false), + Arc::new(BooleanArray::from(vec![false, true, false, false])), + ) + .with_array( + Field::new("i32", DataType::Int32, false), + Arc::new(Int32Array::from(vec![42, 28, 19, 31])), + ) + .build() + .unwrap(); } #[test] diff --git a/datafusion/common/src/scalar/struct_builder.rs b/datafusion/common/src/scalar/struct_builder.rs new file mode 100644 index 000000000000..926e10041751 --- /dev/null +++ b/datafusion/common/src/scalar/struct_builder.rs @@ -0,0 +1,152 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! [`ScalarStructBuilder`] for building [`ScalarValue::Struct`] + +use crate::error::_internal_err; +use crate::{DataFusionError, Result, ScalarValue}; +use arrow::array::{ArrayRef, StructArray}; +use arrow::datatypes::{DataType, FieldRef, Fields}; +use arrow_schema::Field; +use std::sync::Arc; + +/// Builder for [`ScalarValue::Struct`]. +/// +/// See examples on [`ScalarValue`] +#[derive(Debug, Default)] +pub struct ScalarStructBuilder { + fields: Vec, + arrays: Vec, +} + +impl ScalarStructBuilder { + /// Create a new `ScalarStructBuilder` + pub fn new() -> Self { + Self::default() + } + + /// Return a new [`ScalarValue::Struct`] with the specified fields and a + /// single null value + pub fn new_null(fields: impl IntoFields) -> ScalarValue { + DataType::Struct(fields.into()).try_into().unwrap() + } + + /// Add the specified field and [`ArrayRef`] to the struct. + /// + /// Note the array should have a single row. + pub fn with_array(mut self, field: impl IntoFieldRef, value: ArrayRef) -> Self { + self.fields.push(field.into_field_ref()); + self.arrays.push(value); + self + } + + /// Add the specified field and `ScalarValue` to the struct. + pub fn with_scalar(self, field: impl IntoFieldRef, value: ScalarValue) -> Self { + // valid scalar value should not fail + let array = value.to_array().unwrap(); + self.with_array(field, array) + } + + /// Add a field with the specified name and value to the struct. 
+    /// The field is created with the specified data type and as non-nullable.
+    pub fn with_name_and_scalar(self, name: &str, value: ScalarValue) -> Self {
+        let field = Field::new(name, value.data_type(), false);
+        self.with_scalar(field, value)
+    }
+
+    /// Return a [`ScalarValue::Struct`] with the fields and values added so far
+    ///
+    /// # Errors
+    ///
+    /// If the [`StructArray`] cannot be created (for example if there is a
+    /// mismatch between field types and arrays) or the arrays do not have
+    /// exactly one element.
+    pub fn build(self) -> Result<ScalarValue> {
+        let Self { fields, arrays } = self;
+
+        for array in &arrays {
+            if array.len() != 1 {
+                return _internal_err!(
+                    "Error building ScalarValue::Struct. \
+                    Expected array with exactly one element, found array with {} elements",
+                    array.len()
+                );
+            }
+        }
+
+        let struct_array = StructArray::try_new(Fields::from(fields), arrays, None)?;
+        Ok(ScalarValue::Struct(Arc::new(struct_array)))
+    }
+}
+
+/// Trait for converting a type into a [`FieldRef`]
+///
+/// Used to avoid having to call `clone()` on a `FieldRef` when adding a field to
+/// a `ScalarStructBuilder`.
+///
+/// TODO potentially upstream this to arrow-rs so that we can
+/// use `impl Into<FieldRef>` instead
+pub trait IntoFieldRef {
+    fn into_field_ref(self) -> FieldRef;
+}
+
+impl IntoFieldRef for FieldRef {
+    fn into_field_ref(self) -> FieldRef {
+        self
+    }
+}
+
+impl IntoFieldRef for &FieldRef {
+    fn into_field_ref(self) -> FieldRef {
+        self.clone()
+    }
+}
+
+impl IntoFieldRef for Field {
+    fn into_field_ref(self) -> FieldRef {
+        FieldRef::new(self)
+    }
+}
+
+/// Trait for converting a type into a [`Fields`]
+///
+/// This avoids having to call `clone()` on an Arc'd `Fields` when adding
+/// a field to a `ScalarStructBuilder`
+///
+/// TODO potentially upstream this to arrow-rs so that we can
+/// use `impl Into<Fields>` instead
+pub trait IntoFields {
+    fn into(self) -> Fields;
+}
+
+impl IntoFields for Fields {
+    fn into(self) -> Fields {
+        self
+    }
+}
+
+impl IntoFields for &Fields {
+    fn into(self) -> Fields {
+        self.clone()
+    }
+}
+
+impl IntoFields for Vec<Field> {
+    fn into(self) -> Fields {
+        Fields::from(self)
+    }
+}
diff --git a/datafusion/proto/tests/cases/roundtrip_logical_plan.rs b/datafusion/proto/tests/cases/roundtrip_logical_plan.rs
index b6d288da2c3e..68a318b5a6d5 100644
--- a/datafusion/proto/tests/cases/roundtrip_logical_plan.rs
+++ b/datafusion/proto/tests/cases/roundtrip_logical_plan.rs
@@ -21,7 +21,6 @@ use std::fmt::{self, Debug, Formatter};
 use std::sync::Arc;
 
 use arrow::array::{ArrayRef, FixedSizeListArray};
-use arrow::array::{BooleanArray, Int32Array};
 use arrow::csv::WriterBuilder;
 use arrow::datatypes::{
     DataType, Field, Fields, Int32Type, IntervalDayTimeType, IntervalMonthDayNanoType,
@@ -42,6 +41,7 @@ use datafusion_common::file_options::csv_writer::CsvWriterOptions;
 use datafusion_common::file_options::parquet_writer::ParquetWriterOptions;
 use datafusion_common::file_options::StatementOptions;
 use datafusion_common::parsers::CompressionTypeVariant;
+use datafusion_common::scalar::ScalarStructBuilder;
 use datafusion_common::{internal_err, not_impl_err, plan_err, FileTypeWriterOptions};
 use datafusion_common::{DFField, DFSchema, DFSchemaRef, DataFusionError, ScalarValue};
 use datafusion_common::{FileType, Result};
@@ -932,17 +932,17 @@ fn round_trip_scalar_values() {
         ScalarValue::Binary(None),
         ScalarValue::LargeBinary(Some(b"bar".to_vec())),
         ScalarValue::LargeBinary(None),
-        ScalarValue::from((
-            vec![
+        ScalarStructBuilder::new()
+            .with_scalar(
+                Field::new("a",
DataType::Int32, true), + ScalarValue::from(23i32), + ) + .with_scalar( Field::new("b", DataType::Boolean, false), - ] - .into(), - vec![ - Arc::new(Int32Array::from(vec![Some(23)])) as ArrayRef, - Arc::new(BooleanArray::from(vec![Some(false)])) as ArrayRef, - ], - )), + ScalarValue::from(false), + ) + .build() + .unwrap(), ScalarValue::try_from(&DataType::Struct(Fields::from(vec![ Field::new("a", DataType::Int32, true), Field::new("b", DataType::Boolean, false), From 85be1bc5538661b58ca90fb1e042d8d840c9f693 Mon Sep 17 00:00:00 2001 From: Mustafa Akur <106137913+mustafasrepo@users.noreply.github.com> Date: Thu, 15 Feb 2024 17:07:01 +0300 Subject: [PATCH 03/14] Minor: improve Display of output ordering of `StreamTableExec` (#9225) * Initial commit * Update plan --- .../core/src/datasource/physical_plan/mod.rs | 23 ++------------- datafusion/physical-plan/src/display.rs | 28 ++++++++++++++++++- datafusion/physical-plan/src/streaming.rs | 17 +++-------- datafusion/sqllogictest/test_files/window.slt | 2 +- 4 files changed, 34 insertions(+), 36 deletions(-) diff --git a/datafusion/core/src/datasource/physical_plan/mod.rs b/datafusion/core/src/datasource/physical_plan/mod.rs index 11eb9e7867bb..d6546539993b 100644 --- a/datafusion/core/src/datasource/physical_plan/mod.rs +++ b/datafusion/core/src/datasource/physical_plan/mod.rs @@ -59,7 +59,7 @@ use crate::{ listing::{FileRange, PartitionedFile}, object_store::ObjectStoreUrl, }, - physical_plan::display::{OutputOrderingDisplay, ProjectSchemaDisplay}, + physical_plan::display::{display_orderings, ProjectSchemaDisplay}, }; use arrow::{ @@ -129,26 +129,7 @@ impl DisplayAs for FileScanConfig { write!(f, ", limit={limit}")?; } - if let Some(ordering) = orderings.first() { - if !ordering.is_empty() { - let start = if orderings.len() == 1 { - ", output_ordering=" - } else { - ", output_orderings=[" - }; - write!(f, "{}", start)?; - for (idx, ordering) in - orderings.iter().enumerate().filter(|(_, o)| !o.is_empty()) - { - match idx { - 0 => write!(f, "{}", OutputOrderingDisplay(ordering))?, - _ => write!(f, ", {}", OutputOrderingDisplay(ordering))?, - } - } - let end = if orderings.len() == 1 { "" } else { "]" }; - write!(f, "{}", end)?; - } - } + display_orderings(f, &orderings)?; Ok(()) } diff --git a/datafusion/physical-plan/src/display.rs b/datafusion/physical-plan/src/display.rs index 19c2847b09dc..ff106dceb974 100644 --- a/datafusion/physical-plan/src/display.rs +++ b/datafusion/physical-plan/src/display.rs @@ -19,12 +19,13 @@ //! 
Implementation of physical plan display. See
//!
[`crate::displayable`] for examples of how to format use std::fmt; +use std::fmt::Formatter; use super::{accept, ExecutionPlan, ExecutionPlanVisitor}; use arrow_schema::SchemaRef; use datafusion_common::display::{GraphvizBuilder, PlanType, StringifiedPlan}; -use datafusion_physical_expr::PhysicalSortExpr; +use datafusion_physical_expr::{LexOrdering, PhysicalSortExpr}; /// Options for controlling how each [`ExecutionPlan`] should format itself #[derive(Debug, Clone, Copy)] @@ -437,6 +438,31 @@ impl<'a> fmt::Display for OutputOrderingDisplay<'a> { } } +pub fn display_orderings(f: &mut Formatter, orderings: &[LexOrdering]) -> fmt::Result { + if let Some(ordering) = orderings.first() { + if !ordering.is_empty() { + let start = if orderings.len() == 1 { + ", output_ordering=" + } else { + ", output_orderings=[" + }; + write!(f, "{}", start)?; + for (idx, ordering) in + orderings.iter().enumerate().filter(|(_, o)| !o.is_empty()) + { + match idx { + 0 => write!(f, "{}", OutputOrderingDisplay(ordering))?, + _ => write!(f, ", {}", OutputOrderingDisplay(ordering))?, + } + } + let end = if orderings.len() == 1 { "" } else { "]" }; + write!(f, "{}", end)?; + } + } + + Ok(()) +} + #[cfg(test)] mod tests { use std::fmt::Write; diff --git a/datafusion/physical-plan/src/streaming.rs b/datafusion/physical-plan/src/streaming.rs index 59819c6921fb..897682092831 100644 --- a/datafusion/physical-plan/src/streaming.rs +++ b/datafusion/physical-plan/src/streaming.rs @@ -21,7 +21,7 @@ use std::any::Any; use std::sync::Arc; use super::{DisplayAs, DisplayFormatType}; -use crate::display::{OutputOrderingDisplay, ProjectSchemaDisplay}; +use crate::display::{display_orderings, ProjectSchemaDisplay}; use crate::stream::RecordBatchStreamAdapter; use crate::{ExecutionPlan, Partitioning, SendableRecordBatchStream}; @@ -149,18 +149,9 @@ impl DisplayAs for StreamingTableExec { write!(f, ", infinite_source=true")?; } - self.projected_output_ordering - .first() - .map_or(Ok(()), |ordering| { - if !ordering.is_empty() { - write!( - f, - ", output_ordering={}", - OutputOrderingDisplay(ordering) - )?; - } - Ok(()) - }) + display_orderings(f, &self.projected_output_ordering)?; + + Ok(()) } } } diff --git a/datafusion/sqllogictest/test_files/window.slt b/datafusion/sqllogictest/test_files/window.slt index fa4445d4cd4c..5a610c16bc7f 100644 --- a/datafusion/sqllogictest/test_files/window.slt +++ b/datafusion/sqllogictest/test_files/window.slt @@ -3582,7 +3582,7 @@ SortPreservingMergeExec: [c@3 ASC NULLS LAST] ------CoalesceBatchesExec: target_batch_size=4096 --------RepartitionExec: partitioning=Hash([d@4], 2), input_partitions=2, preserve_order=true, sort_exprs=a@1 ASC NULLS LAST,b@2 ASC NULLS LAST ----------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1 -------------StreamingTableExec: partition_sizes=1, projection=[a0, a, b, c, d], infinite_source=true, output_ordering=[a@1 ASC NULLS LAST, b@2 ASC NULLS LAST] +------------StreamingTableExec: partition_sizes=1, projection=[a0, a, b, c, d], infinite_source=true, output_orderings=[[a@1 ASC NULLS LAST, b@2 ASC NULLS LAST], [c@3 ASC NULLS LAST]] # CTAS with NTILE function statement ok From 92d9274e6a2c677fe07938e531be4b45532a89a9 Mon Sep 17 00:00:00 2001 From: Junhao Liu Date: Thu, 15 Feb 2024 10:28:05 -0600 Subject: [PATCH 04/14] Support compute return types from argument values (not just their DataTypes) (#8985) * ScalarValue return types from argument values * change file name * try using ?Sized * use Ok * move method default impl outside trait * Use type 
trait for ExprSchemable * fix nit * Proposed Return Type from Expr suggestions (#1) * Improve return_type_from_args * Rework example * Update datafusion/core/tests/user_defined/user_defined_scalar_functions.rs --------- Co-authored-by: Junhao Liu * Apply suggestions from code review Co-authored-by: Alex Huang * Fix tests + clippy * rework types to use dyn trait * fmt * docs * Apply suggestions from code review Co-authored-by: Jeffrey Vo * Add docs explaining what happens when both `return_type` and `return_type_from_exprs` are called * clippy * fix doc -- comedy of errors --------- Co-authored-by: Andrew Lamb Co-authored-by: Alex Huang Co-authored-by: Jeffrey Vo --- .../user_defined_scalar_functions.rs | 142 +++++++++++++++++- datafusion/expr/src/expr_schema.rs | 40 ++--- datafusion/expr/src/udf.rs | 69 ++++++++- .../optimizer/src/analyzer/type_coercion.rs | 16 +- datafusion/physical-expr/src/planner.rs | 14 +- datafusion/physical-expr/src/udf.rs | 17 +-- 6 files changed, 245 insertions(+), 53 deletions(-) diff --git a/datafusion/core/tests/user_defined/user_defined_scalar_functions.rs b/datafusion/core/tests/user_defined/user_defined_scalar_functions.rs index a86c76b9b6dd..9812789740f7 100644 --- a/datafusion/core/tests/user_defined/user_defined_scalar_functions.rs +++ b/datafusion/core/tests/user_defined/user_defined_scalar_functions.rs @@ -22,12 +22,16 @@ use arrow_schema::{DataType, Field, Schema}; use datafusion::prelude::*; use datafusion::{execution::registry::FunctionRegistry, test_util}; use datafusion_common::cast::as_float64_array; -use datafusion_common::{assert_batches_eq, cast::as_int32_array, Result, ScalarValue}; +use datafusion_common::{ + assert_batches_eq, assert_batches_sorted_eq, cast::as_int32_array, not_impl_err, + plan_err, DataFusionError, ExprSchema, Result, ScalarValue, +}; use datafusion_expr::{ - create_udaf, create_udf, Accumulator, ColumnarValue, LogicalPlanBuilder, ScalarUDF, - ScalarUDFImpl, Signature, Volatility, + create_udaf, create_udf, Accumulator, ColumnarValue, ExprSchemable, + LogicalPlanBuilder, ScalarUDF, ScalarUDFImpl, Signature, Volatility, }; use rand::{thread_rng, Rng}; +use std::any::Any; use std::iter; use std::sync::Arc; @@ -494,6 +498,127 @@ async fn test_user_defined_functions_zero_argument() -> Result<()> { Ok(()) } +#[derive(Debug)] +struct TakeUDF { + signature: Signature, +} + +impl TakeUDF { + fn new() -> Self { + Self { + signature: Signature::any(3, Volatility::Immutable), + } + } +} + +/// Implement a ScalarUDFImpl whose return type is a function of the input values +impl ScalarUDFImpl for TakeUDF { + fn as_any(&self) -> &dyn Any { + self + } + fn name(&self) -> &str { + "take" + } + fn signature(&self) -> &Signature { + &self.signature + } + fn return_type(&self, _args: &[DataType]) -> Result { + not_impl_err!("Not called because the return_type_from_exprs is implemented") + } + + /// This function returns the type of the first or second argument based on + /// the third argument: + /// + /// 1. If the third argument is '0', return the type of the first argument + /// 2. 
If the third argument is '1', return the type of the second argument + fn return_type_from_exprs( + &self, + arg_exprs: &[Expr], + schema: &dyn ExprSchema, + ) -> Result { + if arg_exprs.len() != 3 { + return plan_err!("Expected 3 arguments, got {}.", arg_exprs.len()); + } + + let take_idx = if let Some(Expr::Literal(ScalarValue::Int64(Some(idx)))) = + arg_exprs.get(2) + { + if *idx == 0 || *idx == 1 { + *idx as usize + } else { + return plan_err!("The third argument must be 0 or 1, got: {idx}"); + } + } else { + return plan_err!( + "The third argument must be a literal of type int64, but got {:?}", + arg_exprs.get(2) + ); + }; + + arg_exprs.get(take_idx).unwrap().get_type(schema) + } + + // The actual implementation + fn invoke(&self, args: &[ColumnarValue]) -> Result { + let take_idx = match &args[2] { + ColumnarValue::Scalar(ScalarValue::Int64(Some(v))) if v < &2 => *v as usize, + _ => unreachable!(), + }; + match &args[take_idx] { + ColumnarValue::Array(array) => Ok(ColumnarValue::Array(array.clone())), + ColumnarValue::Scalar(_) => unimplemented!(), + } + } +} + +#[tokio::test] +async fn verify_udf_return_type() -> Result<()> { + // Create a new ScalarUDF from the implementation + let take = ScalarUDF::from(TakeUDF::new()); + + // SELECT + // take(smallint_col, double_col, 0) as take0, + // take(smallint_col, double_col, 1) as take1 + // FROM alltypes_plain; + let exprs = vec![ + take.call(vec![col("smallint_col"), col("double_col"), lit(0_i64)]) + .alias("take0"), + take.call(vec![col("smallint_col"), col("double_col"), lit(1_i64)]) + .alias("take1"), + ]; + + let ctx = SessionContext::new(); + register_alltypes_parquet(&ctx).await?; + + let df = ctx.table("alltypes_plain").await?.select(exprs)?; + + let schema = df.schema(); + + // The output schema should be + // * type of column smallint_col (int32) + // * type of column double_col (float64) + assert_eq!(schema.field(0).data_type(), &DataType::Int32); + assert_eq!(schema.field(1).data_type(), &DataType::Float64); + + let expected = [ + "+-------+-------+", + "| take0 | take1 |", + "+-------+-------+", + "| 0 | 0.0 |", + "| 0 | 0.0 |", + "| 0 | 0.0 |", + "| 0 | 0.0 |", + "| 1 | 10.1 |", + "| 1 | 10.1 |", + "| 1 | 10.1 |", + "| 1 | 10.1 |", + "+-------+-------+", + ]; + assert_batches_sorted_eq!(&expected, &df.collect().await?); + + Ok(()) +} + fn create_udf_context() -> SessionContext { let ctx = SessionContext::new(); // register a custom UDF @@ -531,6 +656,17 @@ async fn register_aggregate_csv(ctx: &SessionContext) -> Result<()> { Ok(()) } +async fn register_alltypes_parquet(ctx: &SessionContext) -> Result<()> { + let testdata = datafusion::test_util::parquet_test_data(); + ctx.register_parquet( + "alltypes_plain", + &format!("{testdata}/alltypes_plain.parquet"), + ParquetReadOptions::default(), + ) + .await?; + Ok(()) +} + /// Execute SQL and return results as a RecordBatch async fn plan_and_collect(ctx: &SessionContext, sql: &str) -> Result> { ctx.sql(sql).await?.collect().await diff --git a/datafusion/expr/src/expr_schema.rs b/datafusion/expr/src/expr_schema.rs index 517d7a35f70a..491b4a852261 100644 --- a/datafusion/expr/src/expr_schema.rs +++ b/datafusion/expr/src/expr_schema.rs @@ -28,8 +28,8 @@ use crate::{utils, LogicalPlan, Projection, Subquery}; use arrow::compute::can_cast_types; use arrow::datatypes::{DataType, Field}; use datafusion_common::{ - internal_err, plan_datafusion_err, plan_err, Column, DFField, DFSchema, - DataFusionError, ExprSchema, Result, + internal_err, plan_datafusion_err, plan_err, Column, DFField, 
DataFusionError, + ExprSchema, Result, }; use std::collections::HashMap; use std::sync::Arc; @@ -37,26 +37,28 @@ use std::sync::Arc; /// trait to allow expr to typable with respect to a schema pub trait ExprSchemable { /// given a schema, return the type of the expr - fn get_type(&self, schema: &S) -> Result; + fn get_type(&self, schema: &dyn ExprSchema) -> Result; /// given a schema, return the nullability of the expr - fn nullable(&self, input_schema: &S) -> Result; + fn nullable(&self, input_schema: &dyn ExprSchema) -> Result; /// given a schema, return the expr's optional metadata - fn metadata(&self, schema: &S) -> Result>; + fn metadata(&self, schema: &dyn ExprSchema) -> Result>; /// convert to a field with respect to a schema - fn to_field(&self, input_schema: &DFSchema) -> Result; + fn to_field(&self, input_schema: &dyn ExprSchema) -> Result; /// cast to a type with respect to a schema - fn cast_to(self, cast_to_type: &DataType, schema: &S) -> Result; + fn cast_to(self, cast_to_type: &DataType, schema: &dyn ExprSchema) -> Result; } impl ExprSchemable for Expr { /// Returns the [arrow::datatypes::DataType] of the expression /// based on [ExprSchema] /// - /// Note: [DFSchema] implements [ExprSchema]. + /// Note: [`DFSchema`] implements [ExprSchema]. + /// + /// [`DFSchema`]: datafusion_common::DFSchema /// /// # Examples /// @@ -90,7 +92,7 @@ impl ExprSchemable for Expr { /// expression refers to a column that does not exist in the /// schema, or when the expression is incorrectly typed /// (e.g. `[utf8] + [bool]`). - fn get_type(&self, schema: &S) -> Result { + fn get_type(&self, schema: &dyn ExprSchema) -> Result { match self { Expr::Alias(Alias { expr, name, .. }) => match &**expr { Expr::Placeholder(Placeholder { data_type, .. }) => match &data_type { @@ -136,7 +138,7 @@ impl ExprSchemable for Expr { fun.return_type(&arg_data_types) } ScalarFunctionDefinition::UDF(fun) => { - Ok(fun.return_type(&arg_data_types)?) + Ok(fun.return_type_from_exprs(args, schema)?) } ScalarFunctionDefinition::Name(_) => { internal_err!("Function `Expr` with name should be resolved.") @@ -213,14 +215,16 @@ impl ExprSchemable for Expr { /// Returns the nullability of the expression based on [ExprSchema]. /// - /// Note: [DFSchema] implements [ExprSchema]. + /// Note: [`DFSchema`] implements [ExprSchema]. + /// + /// [`DFSchema`]: datafusion_common::DFSchema /// /// # Errors /// /// This function errors when it is not possible to compute its /// nullability. This happens when the expression refers to a /// column that does not exist in the schema. - fn nullable(&self, input_schema: &S) -> Result { + fn nullable(&self, input_schema: &dyn ExprSchema) -> Result { match self { Expr::Alias(Alias { expr, .. }) | Expr::Not(expr) @@ -327,7 +331,7 @@ impl ExprSchemable for Expr { } } - fn metadata(&self, schema: &S) -> Result> { + fn metadata(&self, schema: &dyn ExprSchema) -> Result> { match self { Expr::Column(c) => Ok(schema.metadata(c)?.clone()), Expr::Alias(Alias { expr, .. 
}) => expr.metadata(schema), @@ -339,7 +343,7 @@ impl ExprSchemable for Expr { /// /// So for example, a projected expression `col(c1) + col(c2)` is /// placed in an output field **named** col("c1 + c2") - fn to_field(&self, input_schema: &DFSchema) -> Result { + fn to_field(&self, input_schema: &dyn ExprSchema) -> Result { match self { Expr::Column(c) => Ok(DFField::new( c.relation.clone(), @@ -370,7 +374,7 @@ impl ExprSchemable for Expr { /// /// This function errors when it is impossible to cast the /// expression to the target [arrow::datatypes::DataType]. - fn cast_to(self, cast_to_type: &DataType, schema: &S) -> Result { + fn cast_to(self, cast_to_type: &DataType, schema: &dyn ExprSchema) -> Result { let this_type = self.get_type(schema)?; if this_type == *cast_to_type { return Ok(self); @@ -394,10 +398,10 @@ impl ExprSchemable for Expr { } /// return the schema [`Field`] for the type referenced by `get_indexed_field` -fn field_for_index( +fn field_for_index( expr: &Expr, field: &GetFieldAccess, - schema: &S, + schema: &dyn ExprSchema, ) -> Result { let expr_dt = expr.get_type(schema)?; match field { @@ -457,7 +461,7 @@ mod tests { use super::*; use crate::{col, lit}; use arrow::datatypes::{DataType, Fields}; - use datafusion_common::{Column, ScalarValue, TableReference}; + use datafusion_common::{Column, DFSchema, ScalarValue, TableReference}; macro_rules! test_is_expr_nullable { ($EXPR_TYPE:ident) => {{ diff --git a/datafusion/expr/src/udf.rs b/datafusion/expr/src/udf.rs index 3017e1ec0271..5b5d92a628c2 100644 --- a/datafusion/expr/src/udf.rs +++ b/datafusion/expr/src/udf.rs @@ -17,12 +17,13 @@ //! [`ScalarUDF`]: Scalar User Defined Functions +use crate::ExprSchemable; use crate::{ ColumnarValue, Expr, FuncMonotonicity, ReturnTypeFunction, ScalarFunctionImplementation, Signature, }; use arrow::datatypes::DataType; -use datafusion_common::Result; +use datafusion_common::{ExprSchema, Result}; use std::any::Any; use std::fmt; use std::fmt::Debug; @@ -110,7 +111,7 @@ impl ScalarUDF { /// /// If you implement [`ScalarUDFImpl`] directly you should return aliases directly. pub fn with_aliases(self, aliases: impl IntoIterator) -> Self { - Self::new_from_impl(AliasedScalarUDFImpl::new(self, aliases)) + Self::new_from_impl(AliasedScalarUDFImpl::new(self.inner.clone(), aliases)) } /// Returns a [`Expr`] logical expression to call this UDF with specified @@ -146,10 +147,17 @@ impl ScalarUDF { } /// The datatype this function returns given the input argument input types. + /// This function is used when the input arguments are [`Expr`]s. /// - /// See [`ScalarUDFImpl::return_type`] for more details. - pub fn return_type(&self, args: &[DataType]) -> Result { - self.inner.return_type(args) + /// + /// See [`ScalarUDFImpl::return_type_from_exprs`] for more details. + pub fn return_type_from_exprs( + &self, + args: &[Expr], + schema: &dyn ExprSchema, + ) -> Result { + // If the implementation provides a return_type_from_exprs, use it + self.inner.return_type_from_exprs(args, schema) } /// Invoke the function on `args`, returning the appropriate result. @@ -246,9 +254,54 @@ pub trait ScalarUDFImpl: Debug + Send + Sync { fn signature(&self) -> &Signature; /// What [`DataType`] will be returned by this function, given the types of - /// the arguments + /// the arguments. + /// + /// # Notes + /// + /// If you provide an implementation for [`Self::return_type_from_exprs`], + /// DataFusion will not call `return_type` (this function). 
In this case it + /// is recommended to return [`DataFusionError::Internal`]. + /// + /// [`DataFusionError::Internal`]: datafusion_common::DataFusionError::Internal fn return_type(&self, arg_types: &[DataType]) -> Result; + /// What [`DataType`] will be returned by this function, given the + /// arguments? + /// + /// Note most UDFs should implement [`Self::return_type`] and not this + /// function. The output type for most functions only depends on the types + /// of their inputs (e.g. `sqrt(f32)` is always `f32`). + /// + /// By default, this function calls [`Self::return_type`] with the + /// types of each argument. + /// + /// This method can be overridden for functions that return different + /// *types* based on the *values* of their arguments. + /// + /// For example, the following two function calls get the same argument + /// types (something and a `Utf8` string) but return different types based + /// on the value of the second argument: + /// + /// * `arrow_cast(x, 'Int16')` --> `Int16` + /// * `arrow_cast(x, 'Float32')` --> `Float32` + /// + /// # Notes: + /// + /// This function must consistently return the same type for the same + /// logical input even if the input is simplified (e.g. it must return the same + /// value for `('foo' | 'bar')` as it does for ('foobar'). + fn return_type_from_exprs( + &self, + args: &[Expr], + schema: &dyn ExprSchema, + ) -> Result { + let arg_types = args + .iter() + .map(|arg| arg.get_type(schema)) + .collect::>>()?; + self.return_type(&arg_types) + } + /// Invoke the function on `args`, returning the appropriate result /// /// The function will be invoked passed with the slice of [`ColumnarValue`] @@ -290,13 +343,13 @@ pub trait ScalarUDFImpl: Debug + Send + Sync { /// implement [`ScalarUDFImpl`], which supports aliases, directly if possible. 
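///
/// A sketch of how this type is reached through [`ScalarUDF::with_aliases`],
/// assuming a hypothetical `MyUdf` type implementing [`ScalarUDFImpl`]:
///
/// ```ignore
/// // `my_other_name` resolves to the same implementation as the primary name
/// let udf = ScalarUDF::from(MyUdf::new()).with_aliases(["my_other_name"]);
/// ```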
#[derive(Debug)] struct AliasedScalarUDFImpl { - inner: ScalarUDF, + inner: Arc, aliases: Vec, } impl AliasedScalarUDFImpl { pub fn new( - inner: ScalarUDF, + inner: Arc, new_aliases: impl IntoIterator, ) -> Self { let mut aliases = inner.aliases().to_vec(); diff --git a/datafusion/optimizer/src/analyzer/type_coercion.rs b/datafusion/optimizer/src/analyzer/type_coercion.rs index 662e0fc7c258..fba77047dd74 100644 --- a/datafusion/optimizer/src/analyzer/type_coercion.rs +++ b/datafusion/optimizer/src/analyzer/type_coercion.rs @@ -681,17 +681,17 @@ fn coerce_case_expression(case: Case, schema: &DFSchemaRef) -> Result { let case_type = case .expr .as_ref() - .map(|expr| expr.get_type(&schema)) + .map(|expr| expr.get_type(schema)) .transpose()?; let then_types = case .when_then_expr .iter() - .map(|(_when, then)| then.get_type(&schema)) + .map(|(_when, then)| then.get_type(schema)) .collect::>>()?; let else_type = case .else_expr .as_ref() - .map(|expr| expr.get_type(&schema)) + .map(|expr| expr.get_type(schema)) .transpose()?; // find common coercible types @@ -701,7 +701,7 @@ fn coerce_case_expression(case: Case, schema: &DFSchemaRef) -> Result { let when_types = case .when_then_expr .iter() - .map(|(when, _then)| when.get_type(&schema)) + .map(|(when, _then)| when.get_type(schema)) .collect::>>()?; let coerced_type = get_coerce_type_for_case_expression(&when_types, Some(case_type)); @@ -727,7 +727,7 @@ fn coerce_case_expression(case: Case, schema: &DFSchemaRef) -> Result { let case_expr = case .expr .zip(case_when_coerce_type.as_ref()) - .map(|(case_expr, coercible_type)| case_expr.cast_to(coercible_type, &schema)) + .map(|(case_expr, coercible_type)| case_expr.cast_to(coercible_type, schema)) .transpose()? .map(Box::new); let when_then = case @@ -735,7 +735,7 @@ fn coerce_case_expression(case: Case, schema: &DFSchemaRef) -> Result { .into_iter() .map(|(when, then)| { let when_type = case_when_coerce_type.as_ref().unwrap_or(&DataType::Boolean); - let when = when.cast_to(when_type, &schema).map_err(|e| { + let when = when.cast_to(when_type, schema).map_err(|e| { DataFusionError::Context( format!( "WHEN expressions in CASE couldn't be \ @@ -744,13 +744,13 @@ fn coerce_case_expression(case: Case, schema: &DFSchemaRef) -> Result { Box::new(e), ) })?; - let then = then.cast_to(&then_else_coerce_type, &schema)?; + let then = then.cast_to(&then_else_coerce_type, schema)?; Ok((Box::new(when), Box::new(then))) }) .collect::>>()?; let else_expr = case .else_expr - .map(|expr| expr.cast_to(&then_else_coerce_type, &schema)) + .map(|expr| expr.cast_to(&then_else_coerce_type, schema)) .transpose()? 
.map(Box::new); diff --git a/datafusion/physical-expr/src/planner.rs b/datafusion/physical-expr/src/planner.rs index 6408af5cda99..b8491aea2d6f 100644 --- a/datafusion/physical-expr/src/planner.rs +++ b/datafusion/physical-expr/src/planner.rs @@ -272,11 +272,15 @@ pub fn create_physical_expr( execution_props, ) } - ScalarFunctionDefinition::UDF(fun) => udf::create_physical_expr( - fun.clone().as_ref(), - &physical_args, - input_schema, - ), + ScalarFunctionDefinition::UDF(fun) => { + let return_type = fun.return_type_from_exprs(args, input_dfschema)?; + + udf::create_physical_expr( + fun.clone().as_ref(), + &physical_args, + return_type, + ) + } ScalarFunctionDefinition::Name(_) => { internal_err!("Function `Expr` with name should be resolved.") } diff --git a/datafusion/physical-expr/src/udf.rs b/datafusion/physical-expr/src/udf.rs index e0117fecb4e8..d9c7c9e5c2a6 100644 --- a/datafusion/physical-expr/src/udf.rs +++ b/datafusion/physical-expr/src/udf.rs @@ -17,28 +17,24 @@ //! UDF support use crate::{PhysicalExpr, ScalarFunctionExpr}; -use arrow::datatypes::Schema; +use arrow_schema::DataType; use datafusion_common::Result; pub use datafusion_expr::ScalarUDF; use std::sync::Arc; /// Create a physical expression of the UDF. -/// This function errors when `args`' can't be coerced to a valid argument type of the UDF. +/// +/// Arguments: pub fn create_physical_expr( fun: &ScalarUDF, input_phy_exprs: &[Arc], - input_schema: &Schema, + return_type: DataType, ) -> Result> { - let input_exprs_types = input_phy_exprs - .iter() - .map(|e| e.data_type(input_schema)) - .collect::>>()?; - Ok(Arc::new(ScalarFunctionExpr::new( fun.name(), fun.fun(), input_phy_exprs.to_vec(), - fun.return_type(&input_exprs_types)?, + return_type, fun.monotonicity()?, fun.signature().type_signature.supports_zero_argument(), ))) @@ -46,7 +42,6 @@ pub fn create_physical_expr( #[cfg(test)] mod tests { - use arrow::datatypes::Schema; use arrow_schema::DataType; use datafusion_common::Result; use datafusion_expr::{ @@ -102,7 +97,7 @@ mod tests { // create and register the udf let udf = ScalarUDF::from(TestScalarUDF::new()); - let p_expr = create_physical_expr(&udf, &[], &Schema::empty())?; + let p_expr = create_physical_expr(&udf, &[], DataType::Float64)?; assert_eq!( p_expr From 8aaea5d6eb8e918bb876b3b2374e9f847087ce6d Mon Sep 17 00:00:00 2001 From: Matthew Turner Date: Thu, 15 Feb 2024 11:30:59 -0500 Subject: [PATCH 05/14] Dont call multiunzip when no stats (#9220) * Dont call multiunzip when no stats * Update docstring --- datafusion/core/benches/sql_planner.rs | 2 +- datafusion/core/src/datasource/listing/table.rs | 9 +++++++-- datafusion/core/src/datasource/statistics.rs | 10 ++++++++-- 3 files changed, 16 insertions(+), 5 deletions(-) diff --git a/datafusion/core/benches/sql_planner.rs b/datafusion/core/benches/sql_planner.rs index 4615d0a0f55c..6f54054530da 100644 --- a/datafusion/core/benches/sql_planner.rs +++ b/datafusion/core/benches/sql_planner.rs @@ -234,7 +234,7 @@ fn criterion_benchmark(c: &mut Criterion) { let sql = std::fs::read_to_string(format!("../../benchmarks/queries/{}.sql", q)) .unwrap(); c.bench_function(&format!("physical_plan_tpch_{}", q), |b| { - b.iter(|| logical_plan(&ctx, &sql)) + b.iter(|| physical_plan(&ctx, &sql)) }); } diff --git a/datafusion/core/src/datasource/listing/table.rs b/datafusion/core/src/datasource/listing/table.rs index 094b26bfbd99..56e64f556c12 100644 --- a/datafusion/core/src/datasource/listing/table.rs +++ b/datafusion/core/src/datasource/listing/table.rs @@ -880,8 +880,13 
@@ impl ListingTable {
             .boxed()
             .buffered(ctx.config_options().execution.meta_fetch_concurrency);
 
-        let (files, statistics) =
-            get_statistics_with_limit(files, self.schema(), limit).await?;
+        let (files, statistics) = get_statistics_with_limit(
+            files,
+            self.schema(),
+            limit,
+            self.options.collect_stat,
+        )
+        .await?;
 
         Ok((
             split_files(files, self.options.target_partitions),
diff --git a/datafusion/core/src/datasource/statistics.rs b/datafusion/core/src/datasource/statistics.rs
index 73896f8eb7c1..c67227f966a2 100644
--- a/datafusion/core/src/datasource/statistics.rs
+++ b/datafusion/core/src/datasource/statistics.rs
@@ -29,12 +29,15 @@ use itertools::izip;
 use itertools::multiunzip;
 
 /// Get all files as well as the file level summary statistics (no statistic for partition columns).
-/// If the optional `limit` is provided, includes only sufficient files.
-/// Needed to read up to `limit` number of rows.
+/// If the optional `limit` is provided, includes only sufficient files needed to read up
+/// to `limit` rows. `collect_stats` is passed down from the configuration parameter on
+/// `ListingTable`. If it is false, we only construct bare statistics and skip a potentially
+/// expensive call to `multiunzip` for constructing file level summary statistics.
 pub async fn get_statistics_with_limit(
     all_files: impl Stream<Item = Result<(PartitionedFile, Statistics)>>,
     file_schema: SchemaRef,
     limit: Option<usize>,
+    collect_stats: bool,
 ) -> Result<(Vec<PartitionedFile>, Statistics)> {
     let mut result_files = vec![];
     // These statistics can be calculated as long as at least one file provides
@@ -78,6 +81,9 @@
     while let Some(current) = all_files.next().await {
         let (file, file_stats) = current?;
         result_files.push(file);
+        if !collect_stats {
+            continue;
+        }
 
         // We accumulate the number of rows, total byte size and null
         // counts across all the files in question. If any file does not
         // provide any information or provides an inexact value, we demote
If any file does not From e4f4031899e94e92262c7ad946075976786fdcf9 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Thu, 15 Feb 2024 14:07:46 -0800 Subject: [PATCH 06/14] Use setup-macos-aarch64-builder for aarch64 CI pipeline (#9242) --- .github/workflows/rust.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 027134a3c5b9..154ac9766e0d 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -322,7 +322,7 @@ jobs: with: submodules: true - name: Setup Rust toolchain - uses: ./.github/actions/setup-macos-builder + uses: ./.github/actions/setup-macos-aarch64-builder - name: Run tests (excluding doctests) shell: bash run: | From 40353fe44425eb55d7cb8fb9814b65c6db0ece8d Mon Sep 17 00:00:00 2001 From: Jonah Gao Date: Fri, 16 Feb 2024 19:25:10 +0800 Subject: [PATCH 07/14] GROUP-BY prioritizes input columns in case of ambiguity (#9228) * GROUP-BY prioritizes input columns in case of ambiguity * Update datafusion/sqllogictest/test_files/aggregate.slt Co-authored-by: Andrew Lamb * Update datafusion/sqllogictest/test_files/aggregate.slt Co-authored-by: Andrew Lamb --------- Co-authored-by: Andrew Lamb --- datafusion/sql/src/select.rs | 7 +++++-- .../sqllogictest/test_files/aggregate.slt | 20 +++++++++++++++++++ 2 files changed, 25 insertions(+), 2 deletions(-) diff --git a/datafusion/sql/src/select.rs b/datafusion/sql/src/select.rs index 7862715e5f1d..f47720d4674a 100644 --- a/datafusion/sql/src/select.rs +++ b/datafusion/sql/src/select.rs @@ -88,8 +88,11 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { // having and group by clause may reference aliases defined in select projection let projected_plan = self.project(base_plan.clone(), select_exprs.clone())?; - let mut combined_schema = (**projected_plan.schema()).clone(); - combined_schema.merge(base_plan.schema()); + // Place the fields of the base plan at the front so that when there are references + // with the same name, the fields of the base plan will be searched first. 
+ // See https://github.com/apache/arrow-datafusion/issues/9162 + let mut combined_schema = base_plan.schema().as_ref().clone(); + combined_schema.merge(projected_plan.schema()); // this alias map is resolved and looked up in both having exprs and group by exprs let alias_map = extract_aliases(&select_exprs); diff --git a/datafusion/sqllogictest/test_files/aggregate.slt b/datafusion/sqllogictest/test_files/aggregate.slt index 136fb39c673e..f50134e63509 100644 --- a/datafusion/sqllogictest/test_files/aggregate.slt +++ b/datafusion/sqllogictest/test_files/aggregate.slt @@ -3179,3 +3179,23 @@ NULL statement ok DROP TABLE t; + +# Test for the case when the column name is ambiguous +statement ok +CREATE TABLE t(a BIGINT) AS VALUES(1), (2), (3); + +# The column name referenced by GROUP-BY is ambiguous, prefer the column in base plan +query I +SELECT 0 as "t.a" FROM t GROUP BY t.a; +---- +0 +0 +0 + +# The column name referenced by HAVING is ambiguous, prefer the column in the base plan +query I +SELECT 0 AS "t.a" FROM t HAVING MAX(t.a) = 0; +---- + +statement ok +DROP TABLE t; \ No newline at end of file From e07a79c3a595599f1055b06ad9d04ac58af828fc Mon Sep 17 00:00:00 2001 From: Cancai Cai <77189278+caicancai@users.noreply.github.com> Date: Fri, 16 Feb 2024 19:45:40 +0800 Subject: [PATCH 08/14] chore: improve catalog test in mod.rs (#9244) --- datafusion/core/src/catalog/mod.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/datafusion/core/src/catalog/mod.rs b/datafusion/core/src/catalog/mod.rs index 6eba43f7df79..a05a480bef44 100644 --- a/datafusion/core/src/catalog/mod.rs +++ b/datafusion/core/src/catalog/mod.rs @@ -321,7 +321,7 @@ mod tests { } } - let schema = Arc::new(MemorySchemaProvider::new()) as _; + let schema = Arc::new(MemorySchemaProvider::new()) as Arc; let catalog = Arc::new(TestProvider {}); match catalog.register_schema("foo", schema) { @@ -353,7 +353,7 @@ mod tests { let cat = Arc::new(MemoryCatalogProvider::new()) as Arc; let schema = Arc::new(MemorySchemaProvider::new()) as Arc; - cat.register_schema("foo", schema.clone()).unwrap(); + cat.register_schema("foo", schema).unwrap(); assert!(cat.deregister_schema("foo", false).unwrap().is_some()); } From 14cb9627184bc1680cac8965533ee8f0b06973b2 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 16 Feb 2024 08:06:59 -0500 Subject: [PATCH 09/14] Add example for `ScalarStructBuilder::new_null`, fix display for `null` `ScalarValue::Struct` (#9238) * Minor: Add example for `ScalarStructBuilder::new_null` * Fix null display * fix docs * tweak --- datafusion/common/src/scalar/mod.rs | 61 ++++++++++++++++++- .../common/src/scalar/struct_builder.rs | 36 ++++++++++- 2 files changed, 94 insertions(+), 3 deletions(-) diff --git a/datafusion/common/src/scalar/mod.rs b/datafusion/common/src/scalar/mod.rs index 29107ab10e7e..7e53415090e0 100644 --- a/datafusion/common/src/scalar/mod.rs +++ b/datafusion/common/src/scalar/mod.rs @@ -83,7 +83,7 @@ pub use struct_builder::ScalarStructBuilder; /// /// In general, performance will be better using arrow [`Array`]s rather than /// [`ScalarValue`], as it is far more efficient to process multiple values at -/// once (vecctorized processing). +/// once (vectorized processing). 
/// /// # Example /// ``` @@ -3103,6 +3103,11 @@ impl fmt::Display for ScalarValue { // ScalarValue Struct should always have a single element assert_eq!(struct_arr.len(), 1); + if struct_arr.null_count() == struct_arr.len() { + write!(f, "NULL")?; + return Ok(()); + } + let columns = struct_arr.columns(); let fields = struct_arr.fields(); let nulls = struct_arr.nulls(); @@ -3298,6 +3303,7 @@ mod tests { as_string_array, as_struct_array, as_uint32_array, as_uint64_array, }; + use crate::assert_batches_eq; use arrow::buffer::OffsetBuffer; use arrow::compute::{is_null, kernels}; use arrow::datatypes::{ArrowNumericType, ArrowPrimitiveType}; @@ -5690,6 +5696,59 @@ mod tests { check_array(array); } + #[test] + fn test_struct_display() { + let field_a = Field::new("a", DataType::Int32, true); + let field_b = Field::new("b", DataType::Utf8, true); + + let s = ScalarStructBuilder::new() + .with_scalar(field_a, ScalarValue::from(1i32)) + .with_scalar(field_b, ScalarValue::Utf8(None)) + .build() + .unwrap(); + + assert_eq!(s.to_string(), "{a:1,b:}"); + + let ScalarValue::Struct(arr) = s else { + panic!("Expected struct"); + }; + + //verify compared to arrow display + let batch = RecordBatch::try_from_iter(vec![("s", arr as _)]).unwrap(); + let expected = [ + "+-------------+", + "| s |", + "+-------------+", + "| {a: 1, b: } |", + "+-------------+", + ]; + assert_batches_eq!(&expected, &[batch]); + } + + #[test] + fn test_struct_display_null() { + let fields = vec![Field::new("a", DataType::Int32, false)]; + let s = ScalarStructBuilder::new_null(fields); + assert_eq!(s.to_string(), "NULL"); + + let ScalarValue::Struct(arr) = s else { + panic!("Expected struct"); + }; + + //verify compared to arrow display + let batch = RecordBatch::try_from_iter(vec![("s", arr as _)]).unwrap(); + + #[rustfmt::skip] + let expected = [ + "+---+", + "| s |", + "+---+", + "| |", + "+---+", + ]; + assert_batches_eq!(&expected, &[batch]); + } + #[test] fn test_build_timestamp_millisecond_list() { let values = vec![ScalarValue::TimestampMillisecond(Some(1), None)]; diff --git a/datafusion/common/src/scalar/struct_builder.rs b/datafusion/common/src/scalar/struct_builder.rs index 926e10041751..1192757e890b 100644 --- a/datafusion/common/src/scalar/struct_builder.rs +++ b/datafusion/common/src/scalar/struct_builder.rs @@ -39,8 +39,40 @@ impl ScalarStructBuilder { Self::default() } - /// Return a new [`ScalarValue::Struct`] with the specified fields and a - /// single null value + /// Return a new [`ScalarValue::Struct`] with a single `null` value. + /// + /// Note this is different from a struct where each of the specified fields + /// are null (e.g. 
`{a: NULL}`) + /// + /// # Example + /// + /// ```rust + /// # use arrow::datatypes::{DataType, Field}; + /// # use datafusion_common::scalar::ScalarStructBuilder; + /// let fields = vec![ + /// Field::new("a", DataType::Int32, false), + /// ]; + /// let sv = ScalarStructBuilder::new_null(fields); + /// // Note this is `NULL`, not `{a: NULL}` + /// assert_eq!(format!("{sv}"), "NULL"); + ///``` + /// + /// To create a struct where the *fields* are null, use `Self::new()` and + /// pass null values for each field: + /// + /// ```rust + /// # use arrow::datatypes::{DataType, Field}; + /// # use datafusion_common::scalar::{ScalarStructBuilder, ScalarValue}; + /// // make a nullable field + /// let field = Field::new("a", DataType::Int32, true); + /// // add a null value for the "a" field + /// let sv = ScalarStructBuilder::new() + /// .with_scalar(field, ScalarValue::Int32(None)) + /// .build() + /// .unwrap(); + /// // value is not null, but field is + /// assert_eq!(format!("{sv}"), "{a:}"); + /// ``` pub fn new_null(fields: impl IntoFields) -> ScalarValue { DataType::Struct(fields.into()).try_into().unwrap() } From 441a4356b889edde0802ec1b394f1f10c18283c3 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Fri, 16 Feb 2024 09:52:46 -0700 Subject: [PATCH 10/14] Prepare 36.0.0-rc1 (#9251) --- .gitignore | 4 + Cargo.toml | 28 +-- benchmarks/Cargo.toml | 8 +- datafusion-cli/Cargo.lock | 137 ++++++++------- datafusion-cli/Cargo.toml | 4 +- datafusion/CHANGELOG.md | 1 + datafusion/core/Cargo.toml | 6 +- datafusion/optimizer/Cargo.toml | 4 +- datafusion/proto/Cargo.toml | 2 +- datafusion/sqllogictest/Cargo.toml | 2 +- dev/changelog/36.0.0.md | 264 +++++++++++++++++++++++++++++ dev/update_datafusion_versions.py | 21 ++- docs/Cargo.toml | 2 +- docs/source/user-guide/configs.md | 2 +- 14 files changed, 382 insertions(+), 103 deletions(-) create mode 100644 dev/changelog/36.0.0.md diff --git a/.gitignore b/.gitignore index 203455e4a796..05479fd0f07d 100644 --- a/.gitignore +++ b/.gitignore @@ -107,3 +107,7 @@ datafusion/sqllogictests/test_files/tpch/data/* # Scratch temp dir for sqllogictests datafusion/sqllogictest/test_files/scratch* + +# rat +filtered_rat.txt +rat.txt diff --git a/Cargo.toml b/Cargo.toml index 2e2a0103973d..31c0005d582d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -29,7 +29,7 @@ license = "Apache-2.0" readme = "README.md" repository = "https://github.com/apache/arrow-datafusion" rust-version = "1.72" -version = "35.0.0" +version = "36.0.0" [workspace.dependencies] arrow = { version = "50.0.0", features = ["prettyprint"] } @@ -46,19 +46,19 @@ bytes = "1.4" chrono = { version = "0.4.34", default-features = false } ctor = "0.2.0" dashmap = "5.4.0" -datafusion = { path = "datafusion/core", version = "35.0.0" } -datafusion-common = { path = "datafusion/common", version = "35.0.0" } -datafusion-execution = { path = "datafusion/execution", version = "35.0.0" } -datafusion-expr = { path = "datafusion/expr", version = "35.0.0" } -datafusion-functions = { path = "datafusion/functions", version = "35.0.0" } -datafusion-functions-array = { path = "datafusion/functions-array", version = "35.0.0" } -datafusion-optimizer = { path = "datafusion/optimizer", version = "35.0.0" } -datafusion-physical-expr = { path = "datafusion/physical-expr", version = "35.0.0" } -datafusion-physical-plan = { path = "datafusion/physical-plan", version = "35.0.0" } -datafusion-proto = { path = "datafusion/proto", version = "35.0.0" } -datafusion-sql = { path = "datafusion/sql", version = "35.0.0" } 
-datafusion-sqllogictest = { path = "datafusion/sqllogictest", version = "35.0.0" } -datafusion-substrait = { path = "datafusion/substrait", version = "35.0.0" } +datafusion = { path = "datafusion/core", version = "36.0.0" } +datafusion-common = { path = "datafusion/common", version = "36.0.0" } +datafusion-execution = { path = "datafusion/execution", version = "36.0.0" } +datafusion-expr = { path = "datafusion/expr", version = "36.0.0" } +datafusion-functions = { path = "datafusion/functions", version = "36.0.0" } +datafusion-functions-array = { path = "datafusion/functions-array", version = "36.0.0" } +datafusion-optimizer = { path = "datafusion/optimizer", version = "36.0.0" } +datafusion-physical-expr = { path = "datafusion/physical-expr", version = "36.0.0" } +datafusion-physical-plan = { path = "datafusion/physical-plan", version = "36.0.0" } +datafusion-proto = { path = "datafusion/proto", version = "36.0.0" } +datafusion-sql = { path = "datafusion/sql", version = "36.0.0" } +datafusion-sqllogictest = { path = "datafusion/sqllogictest", version = "36.0.0" } +datafusion-substrait = { path = "datafusion/substrait", version = "36.0.0" } doc-comment = "0.3" env_logger = "0.11" futures = "0.3" diff --git a/benchmarks/Cargo.toml b/benchmarks/Cargo.toml index 3ee547410744..90ff83bd53d7 100644 --- a/benchmarks/Cargo.toml +++ b/benchmarks/Cargo.toml @@ -18,7 +18,7 @@ [package] name = "datafusion-benchmarks" description = "DataFusion Benchmarks" -version = "35.0.0" +version = "36.0.0" edition = { workspace = true } authors = ["Apache Arrow "] homepage = "https://github.com/apache/arrow-datafusion" @@ -33,8 +33,8 @@ snmalloc = ["snmalloc-rs"] [dependencies] arrow = { workspace = true } -datafusion = { path = "../datafusion/core", version = "35.0.0" } -datafusion-common = { path = "../datafusion/common", version = "35.0.0" } +datafusion = { path = "../datafusion/core", version = "36.0.0" } +datafusion-common = { path = "../datafusion/common", version = "36.0.0" } env_logger = { workspace = true } futures = { workspace = true } log = { workspace = true } @@ -49,4 +49,4 @@ test-utils = { path = "../test-utils/", version = "0.1.0" } tokio = { workspace = true, features = ["rt-multi-thread", "parking_lot"] } [dev-dependencies] -datafusion-proto = { path = "../datafusion/proto", version = "35.0.0" } +datafusion-proto = { path = "../datafusion/proto", version = "36.0.0" } diff --git a/datafusion-cli/Cargo.lock b/datafusion-cli/Cargo.lock index 25bb30e5bc56..69456446f52b 100644 --- a/datafusion-cli/Cargo.lock +++ b/datafusion-cli/Cargo.lock @@ -25,9 +25,9 @@ checksum = "aae1277d39aeec15cb388266ecc24b11c80469deae6067e17a1a7aa9e5c1f234" [[package]] name = "ahash" -version = "0.8.7" +version = "0.8.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "77c3a9648d43b9cd48db467b3f87fdd6e146bcc88ab0180006cef2179fe11d01" +checksum = "42cd52102d3df161c77a887b608d7a4897d7cc112886a9537b738a887a03aaff" dependencies = [ "cfg-if", "const-random", @@ -270,7 +270,7 @@ dependencies = [ "arrow-schema", "chrono", "half", - "indexmap 2.2.2", + "indexmap 2.2.3", "lexical-core", "num", "serde", @@ -384,7 +384,7 @@ checksum = "c980ee35e870bd1a4d2c8294d4c04d0499e67bca1e4b5cefcc693c2fa00caea9" dependencies = [ "proc-macro2", "quote", - "syn 2.0.48", + "syn 2.0.49", ] [[package]] @@ -802,9 +802,9 @@ dependencies = [ [[package]] name = "bumpalo" -version = "3.14.0" +version = "3.15.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"7f30e7476521f6f8af1a1c4c0b8cc94f0bee37d91763d0ca2665f299b6cd8aec" +checksum = "d32a994c2b3ca201d9b263612a374263f05e7adde37c4707f693dcd375076d1f" [[package]] name = "byteorder" @@ -880,9 +880,9 @@ dependencies = [ [[package]] name = "chrono-tz" -version = "0.8.5" +version = "0.8.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "91d7b79e99bfaa0d47da0687c43aa3b7381938a62ad3a6498599039321f660b7" +checksum = "d59ae0466b83e838b81a54256c39d5d7c20b9d7daa10510a242d9b75abd5936e" dependencies = [ "chrono", "chrono-tz-build", @@ -1023,9 +1023,9 @@ dependencies = [ [[package]] name = "crc32fast" -version = "1.3.2" +version = "1.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b540bd8bc810d3885c6ea91e2018302f68baba2129ab3e88f32389ee9370880d" +checksum = "b3855a8a784b474f333699ef2bbca9db2c4a1f6d9088a90a2d25b1eb53111eaa" dependencies = [ "cfg-if", ] @@ -1074,7 +1074,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "30d2b3721e861707777e3195b0158f950ae6dc4a27e4d02ff9f67e3eb3de199e" dependencies = [ "quote", - "syn 2.0.48", + "syn 2.0.49", ] [[package]] @@ -1098,7 +1098,7 @@ dependencies = [ [[package]] name = "datafusion" -version = "35.0.0" +version = "36.0.0" dependencies = [ "ahash", "apache-avro", @@ -1126,7 +1126,7 @@ dependencies = [ "glob", "half", "hashbrown 0.14.3", - "indexmap 2.2.2", + "indexmap 2.2.3", "itertools", "log", "num-traits", @@ -1148,7 +1148,7 @@ dependencies = [ [[package]] name = "datafusion-cli" -version = "35.0.0" +version = "36.0.0" dependencies = [ "arrow", "assert_cmd", @@ -1176,7 +1176,7 @@ dependencies = [ [[package]] name = "datafusion-common" -version = "35.0.0" +version = "36.0.0" dependencies = [ "ahash", "apache-avro", @@ -1195,7 +1195,7 @@ dependencies = [ [[package]] name = "datafusion-execution" -version = "35.0.0" +version = "36.0.0" dependencies = [ "arrow", "chrono", @@ -1214,7 +1214,7 @@ dependencies = [ [[package]] name = "datafusion-expr" -version = "35.0.0" +version = "36.0.0" dependencies = [ "ahash", "arrow", @@ -1228,7 +1228,7 @@ dependencies = [ [[package]] name = "datafusion-functions" -version = "35.0.0" +version = "36.0.0" dependencies = [ "arrow", "base64", @@ -1241,7 +1241,7 @@ dependencies = [ [[package]] name = "datafusion-functions-array" -version = "35.0.0" +version = "36.0.0" dependencies = [ "arrow", "datafusion-common", @@ -1253,7 +1253,7 @@ dependencies = [ [[package]] name = "datafusion-optimizer" -version = "35.0.0" +version = "36.0.0" dependencies = [ "arrow", "async-trait", @@ -1269,7 +1269,7 @@ dependencies = [ [[package]] name = "datafusion-physical-expr" -version = "35.0.0" +version = "36.0.0" dependencies = [ "ahash", "arrow", @@ -1288,7 +1288,7 @@ dependencies = [ "half", "hashbrown 0.14.3", "hex", - "indexmap 2.2.2", + "indexmap 2.2.3", "itertools", "log", "md-5", @@ -1303,7 +1303,7 @@ dependencies = [ [[package]] name = "datafusion-physical-plan" -version = "35.0.0" +version = "36.0.0" dependencies = [ "ahash", "arrow", @@ -1319,7 +1319,7 @@ dependencies = [ "futures", "half", "hashbrown 0.14.3", - "indexmap 2.2.2", + "indexmap 2.2.3", "itertools", "log", "once_cell", @@ -1332,7 +1332,7 @@ dependencies = [ [[package]] name = "datafusion-sql" -version = "35.0.0" +version = "36.0.0" dependencies = [ "arrow", "arrow-schema", @@ -1417,9 +1417,9 @@ checksum = "fea41bba32d969b513997752735605054bc0dfa92b4c56bf1189f2e174be7a10" [[package]] name = "either" -version = "1.9.0" +version = "1.10.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "a26ae43d7bcc3b814de94796a5e736d4029efb0ee900c12e2d54c993ad1a1e07" +checksum = "11157ac094ffbdde99aa67b23417ebdd801842852b500e395a45a9c0aac03e4a" [[package]] name = "encoding_rs" @@ -1607,7 +1607,7 @@ checksum = "87750cf4b7a4c0625b1529e4c543c2182106e4dedc60a2a6455e00d212c489ac" dependencies = [ "proc-macro2", "quote", - "syn 2.0.48", + "syn 2.0.49", ] [[package]] @@ -1691,7 +1691,7 @@ dependencies = [ "futures-sink", "futures-util", "http", - "indexmap 2.2.2", + "indexmap 2.2.3", "slab", "tokio", "tokio-util", @@ -1751,9 +1751,9 @@ dependencies = [ [[package]] name = "hermit-abi" -version = "0.3.5" +version = "0.3.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d0c62115964e08cb8039170eb33c1d0e2388a256930279edca206fff675f82c3" +checksum = "bd5256b483761cd23699d0da46cc6fd2ee3be420bbe6d020ae4a091e70b7e9fd" [[package]] name = "hex" @@ -1908,9 +1908,9 @@ dependencies = [ [[package]] name = "indexmap" -version = "2.2.2" +version = "2.2.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "824b2ae422412366ba479e8111fd301f7b5faece8149317bb81925979a53f520" +checksum = "233cf39063f058ea2caae4091bf4a3ef70a653afbc026f5c4a4135d114e3c177" dependencies = [ "equivalent", "hashbrown 0.14.3", @@ -1954,9 +1954,9 @@ checksum = "b1a46d1a171d865aa5f83f92695765caa047a9b4cbae2cbf37dbd613a793fd4c" [[package]] name = "jobserver" -version = "0.1.27" +version = "0.1.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8c37f63953c4c63420ed5fd3d6d398c719489b9f872b9fa683262f8edd363c7d" +checksum = "ab46a6e9526ddef3ae7f787c06f0f2600639ba80ea3eade3d8e670a2230f51d6" dependencies = [ "libc", ] @@ -2258,19 +2258,18 @@ checksum = "51d515d32fb182ee37cda2ccdcb92950d6a3c2893aa280e540671c2cd0f3b1d9" [[package]] name = "num-integer" -version = "0.1.45" +version = "0.1.46" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "225d3389fb3509a24c93f5c29eb6bde2586b98d9f016636dff58d7c6f7569cd9" +checksum = "7969661fd2958a5cb096e56c8e1ad0444ac2bbcd0061bd28660485a44879858f" dependencies = [ - "autocfg", "num-traits", ] [[package]] name = "num-iter" -version = "0.1.43" +version = "0.1.44" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7d03e6c028c5dc5cac6e2dec0efda81fc887605bb3d884578bb6d6bf7514e252" +checksum = "d869c01cc0c455284163fd0092f1f93835385ccab5a98a0dcc497b2f8bf055a9" dependencies = [ "autocfg", "num-integer", @@ -2291,9 +2290,9 @@ dependencies = [ [[package]] name = "num-traits" -version = "0.2.17" +version = "0.2.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "39e3200413f237f41ab11ad6d161bc7239c84dcb631773ccd7de3dfe4b5c267c" +checksum = "da0df0e5185db44f69b44f26786fe401b6c293d1907744beaa7fa62b2e5a517a" dependencies = [ "autocfg", "libm", @@ -2305,7 +2304,7 @@ version = "1.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4161fcb6d602d4d2081af7c3a45852d875a03dd337a6bfdd6e06407b61342a43" dependencies = [ - "hermit-abi 0.3.5", + "hermit-abi 0.3.6", "libc", ] @@ -2338,7 +2337,7 @@ dependencies = [ "rand", "reqwest", "ring 0.17.7", - "rustls-pemfile 2.0.0", + "rustls-pemfile 2.1.0", "serde", "serde_json", "snafu", @@ -2467,7 +2466,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e1d3afd2628e69da2be385eb6f2fd57c8ac7977ceeff6dc166ff1657b0e386a9" dependencies = [ "fixedbitset", - "indexmap 2.2.2", + "indexmap 2.2.3", ] 
[[package]] @@ -2525,7 +2524,7 @@ checksum = "266c042b60c9c76b8d53061e52b2e0d1116abc57cefc8c5cd671619a56ac3690" dependencies = [ "proc-macro2", "quote", - "syn 2.0.48", + "syn 2.0.49", ] [[package]] @@ -2542,9 +2541,9 @@ checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" [[package]] name = "pkg-config" -version = "0.3.29" +version = "0.3.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2900ede94e305130c13ddd391e0ab7cbaeb783945ae07a279c268cb05109c6cb" +checksum = "d231b230927b5e4ad203db57bbcbee2802f6bce620b1e4a9024a07d94e2907ec" [[package]] name = "powerfmt" @@ -2920,9 +2919,9 @@ dependencies = [ [[package]] name = "rustls-pemfile" -version = "2.0.0" +version = "2.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "35e4980fa29e4c4b212ffb3db068a564cbf560e51d3944b7c88bd8bf5bec64f4" +checksum = "3c333bb734fcdedcea57de1602543590f545f127dc8b533324318fd492c5c70b" dependencies = [ "base64", "rustls-pki-types", @@ -2930,9 +2929,9 @@ dependencies = [ [[package]] name = "rustls-pki-types" -version = "1.2.0" +version = "1.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0a716eb65e3158e90e17cd93d855216e27bde02745ab842f2cab4a39dba1bacf" +checksum = "048a63e5b3ac996d78d402940b5fa47973d2d080c6c6fffa1d0f19c4445310b7" [[package]] name = "rustls-webpki" @@ -3065,7 +3064,7 @@ checksum = "33c85360c95e7d137454dc81d9a4ed2b8efd8fbe19cee57357b32b9771fccb67" dependencies = [ "proc-macro2", "quote", - "syn 2.0.48", + "syn 2.0.49", ] [[package]] @@ -3200,7 +3199,7 @@ checksum = "01b2e185515564f15375f593fb966b5718bc624ba77fe49fa4616ad619690554" dependencies = [ "proc-macro2", "quote", - "syn 2.0.48", + "syn 2.0.49", ] [[package]] @@ -3246,7 +3245,7 @@ dependencies = [ "proc-macro2", "quote", "rustversion", - "syn 2.0.48", + "syn 2.0.49", ] [[package]] @@ -3259,7 +3258,7 @@ dependencies = [ "proc-macro2", "quote", "rustversion", - "syn 2.0.48", + "syn 2.0.49", ] [[package]] @@ -3281,9 +3280,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.48" +version = "2.0.49" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0f3531638e407dfc0814761abb7c00a5b54992b849452a0646b7f65c9f770f3f" +checksum = "915aea9e586f80826ee59f8453c1101f9d1c4b3964cd2460185ee8e299ada496" dependencies = [ "proc-macro2", "quote", @@ -3352,22 +3351,22 @@ checksum = "222a222a5bfe1bba4a77b45ec488a741b3cb8872e5e499451fd7d0129c9c7c3d" [[package]] name = "thiserror" -version = "1.0.56" +version = "1.0.57" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d54378c645627613241d077a3a79db965db602882668f9136ac42af9ecb730ad" +checksum = "1e45bcbe8ed29775f228095caf2cd67af7a4ccf756ebff23a306bf3e8b47b24b" dependencies = [ "thiserror-impl", ] [[package]] name = "thiserror-impl" -version = "1.0.56" +version = "1.0.57" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fa0faa943b50f3db30a20aa7e265dbc66076993efed8463e8de414e5d06d3471" +checksum = "a953cb265bef375dae3de6663da4d3804eee9682ea80d8e2542529b73c531c81" dependencies = [ "proc-macro2", "quote", - "syn 2.0.48", + "syn 2.0.49", ] [[package]] @@ -3462,7 +3461,7 @@ checksum = "5b8a1e28f2deaa14e508979454cb3a223b10b938b45af148bc0986de36f1923b" dependencies = [ "proc-macro2", "quote", - "syn 2.0.48", + "syn 2.0.49", ] [[package]] @@ -3559,7 +3558,7 @@ checksum = "34704c8d6ebcbc939824180af020566b01a7c01f80641264eba0999f6c2b6be7" dependencies = [ "proc-macro2", "quote", - "syn 2.0.48", + "syn 2.0.49", ] 
[[package]] @@ -3604,7 +3603,7 @@ checksum = "f03ca4cb38206e2bef0700092660bb74d696f808514dae47fa1467cbfe26e96e" dependencies = [ "proc-macro2", "quote", - "syn 2.0.48", + "syn 2.0.49", ] [[package]] @@ -3758,7 +3757,7 @@ dependencies = [ "once_cell", "proc-macro2", "quote", - "syn 2.0.48", + "syn 2.0.49", "wasm-bindgen-shared", ] @@ -3792,7 +3791,7 @@ checksum = "642f325be6301eb8107a83d12a8ac6c1e1c54345a7ef1a9261962dfefda09e66" dependencies = [ "proc-macro2", "quote", - "syn 2.0.48", + "syn 2.0.49", "wasm-bindgen-backend", "wasm-bindgen-shared", ] @@ -4050,7 +4049,7 @@ checksum = "9ce1b18ccd8e73a9321186f97e46f9f04b778851177567b1975109d26a08d2a6" dependencies = [ "proc-macro2", "quote", - "syn 2.0.48", + "syn 2.0.49", ] [[package]] diff --git a/datafusion-cli/Cargo.toml b/datafusion-cli/Cargo.toml index e40aa6107c7d..45e7b740bf87 100644 --- a/datafusion-cli/Cargo.toml +++ b/datafusion-cli/Cargo.toml @@ -18,7 +18,7 @@ [package] name = "datafusion-cli" description = "Command Line Client for DataFusion query engine." -version = "35.0.0" +version = "36.0.0" authors = ["Apache Arrow "] edition = "2021" keywords = ["arrow", "datafusion", "query", "sql"] @@ -35,7 +35,7 @@ async-trait = "0.1.41" aws-config = "0.55" aws-credential-types = "0.55" clap = { version = "3", features = ["derive", "cargo"] } -datafusion = { path = "../datafusion/core", version = "35.0.0", features = ["avro", "crypto_expressions", "encoding_expressions", "parquet", "regex_expressions", "unicode_expressions", "compression"] } +datafusion = { path = "../datafusion/core", version = "36.0.0", features = ["avro", "crypto_expressions", "encoding_expressions", "parquet", "regex_expressions", "unicode_expressions", "compression"] } datafusion-common = { path = "../datafusion/common" } dirs = "4.0.0" env_logger = "0.9" diff --git a/datafusion/CHANGELOG.md b/datafusion/CHANGELOG.md index ae9da0e865e9..2d09782a3982 100644 --- a/datafusion/CHANGELOG.md +++ b/datafusion/CHANGELOG.md @@ -19,6 +19,7 @@ # Changelog +- [36.0.0](../dev/changelog/36.0.0.md) - [35.0.0](../dev/changelog/35.0.0.md) - [34.0.0](../dev/changelog/34.0.0.md) - [33.0.0](../dev/changelog/33.0.0.md) diff --git a/datafusion/core/Cargo.toml b/datafusion/core/Cargo.toml index 38d3012a5ac7..09718791d0f7 100644 --- a/datafusion/core/Cargo.toml +++ b/datafusion/core/Cargo.toml @@ -67,13 +67,13 @@ bytes = { workspace = true } bzip2 = { version = "0.4.3", optional = true } chrono = { workspace = true } dashmap = { workspace = true } -datafusion-common = { path = "../common", version = "35.0.0", features = ["object_store"], default-features = false } +datafusion-common = { path = "../common", version = "36.0.0", features = ["object_store"], default-features = false } datafusion-execution = { workspace = true } datafusion-expr = { workspace = true } datafusion-functions = { workspace = true } datafusion-functions-array = { workspace = true, optional = true } -datafusion-optimizer = { path = "../optimizer", version = "35.0.0", default-features = false } -datafusion-physical-expr = { path = "../physical-expr", version = "35.0.0", default-features = false } +datafusion-optimizer = { path = "../optimizer", version = "36.0.0", default-features = false } +datafusion-physical-expr = { path = "../physical-expr", version = "36.0.0", default-features = false } datafusion-physical-plan = { workspace = true } datafusion-sql = { workspace = true } flate2 = { version = "1.0.24", optional = true } diff --git a/datafusion/optimizer/Cargo.toml b/datafusion/optimizer/Cargo.toml index 
e4e9660f93b4..cac46eda9871 100644 --- a/datafusion/optimizer/Cargo.toml +++ b/datafusion/optimizer/Cargo.toml @@ -44,7 +44,7 @@ async-trait = { workspace = true } chrono = { workspace = true } datafusion-common = { workspace = true } datafusion-expr = { workspace = true } -datafusion-physical-expr = { path = "../physical-expr", version = "35.0.0", default-features = false } +datafusion-physical-expr = { path = "../physical-expr", version = "36.0.0", default-features = false } hashbrown = { version = "0.14", features = ["raw"] } itertools = { workspace = true } log = { workspace = true } @@ -52,5 +52,5 @@ regex-syntax = "0.8.0" [dev-dependencies] ctor = { workspace = true } -datafusion-sql = { path = "../sql", version = "35.0.0" } +datafusion-sql = { path = "../sql", version = "36.0.0" } env_logger = "0.11.0" diff --git a/datafusion/proto/Cargo.toml b/datafusion/proto/Cargo.toml index a1032d28a2e6..59a9129c6d7a 100644 --- a/datafusion/proto/Cargo.toml +++ b/datafusion/proto/Cargo.toml @@ -44,7 +44,7 @@ parquet = ["datafusion/parquet", "datafusion-common/parquet"] [dependencies] arrow = { workspace = true } chrono = { workspace = true } -datafusion = { path = "../core", version = "35.0.0" } +datafusion = { path = "../core", version = "36.0.0" } datafusion-common = { workspace = true } datafusion-expr = { workspace = true } object_store = { workspace = true } diff --git a/datafusion/sqllogictest/Cargo.toml b/datafusion/sqllogictest/Cargo.toml index 4f8d1b6ac403..7c54d6bf355b 100644 --- a/datafusion/sqllogictest/Cargo.toml +++ b/datafusion/sqllogictest/Cargo.toml @@ -37,7 +37,7 @@ bigdecimal = { workspace = true } bytes = { version = "1.4.0", optional = true } chrono = { workspace = true, optional = true } clap = { version = "4.4.8", features = ["derive", "env"] } -datafusion = { path = "../core", version = "35.0.0" } +datafusion = { path = "../core", version = "36.0.0" } datafusion-common = { workspace = true } futures = { version = "0.3.28" } half = { workspace = true } diff --git a/dev/changelog/36.0.0.md b/dev/changelog/36.0.0.md new file mode 100644 index 000000000000..86f6a380ceb0 --- /dev/null +++ b/dev/changelog/36.0.0.md @@ -0,0 +1,264 @@ + + +## [36.0.0](https://github.com/apache/arrow-datafusion/tree/36.0.0) (2024-02-16) + +[Full Changelog](https://github.com/apache/arrow-datafusion/compare/35.0.0...36.0.0) + +**Breaking changes:** + +- Deprecate make_scalar_function [#8878](https://github.com/apache/arrow-datafusion/pull/8878) (viirya) +- Change `Accumulator::evaluate` and `Accumulator::state` to take `&mut self` [#8925](https://github.com/apache/arrow-datafusion/pull/8925) (alamb) +- Rename `CatalogList` to `CatalogProviderList` [#9002](https://github.com/apache/arrow-datafusion/pull/9002) (comphead) +- Remove some recursive cloning from logical planning [#9050](https://github.com/apache/arrow-datafusion/pull/9050) (ozankabak) +- Support `FixedSizeList` type coercion [#8902](https://github.com/apache/arrow-datafusion/pull/8902) (Weijun-H) +- Add `ColumnarValue::values_to_arrays`, deprecate `columnar_values_to_array` [#9114](https://github.com/apache/arrow-datafusion/pull/9114) (alamb) + +**Performance related:** + +- Minor: Add new Extended ClickBench benchmark queries [#8950](https://github.com/apache/arrow-datafusion/pull/8950) (alamb) + +**Implemented enhancements:** + +- feat: support `stride` in `array_slice`, change indexes to be`1` based [#8829](https://github.com/apache/arrow-datafusion/pull/8829) (Weijun-H) +- feat: emitting partial join results in `HashJoinStream` 
[#8020](https://github.com/apache/arrow-datafusion/pull/8020) (korowa) +- feat:implement sql style 'ends_with' and 'instr' string function [#8862](https://github.com/apache/arrow-datafusion/pull/8862) (zy-kkk) +- feat: Support parquet bloom filter pruning for decimal128 [#8930](https://github.com/apache/arrow-datafusion/pull/8930) (Ted-Jiang) +- feat: Disable client console highlight by default [#9013](https://github.com/apache/arrow-datafusion/pull/9013) (comphead) +- feat: support the ergonomics of getting list slice with stride [#8946](https://github.com/apache/arrow-datafusion/pull/8946) (Weijun-H) +- feat: Parallel Arrow file format reading [#8897](https://github.com/apache/arrow-datafusion/pull/8897) (my-vegetable-has-exploded) +- feat: support array_reverse [#9023](https://github.com/apache/arrow-datafusion/pull/9023) (Weijun-H) +- feat: issue #8969 adding position function [#8988](https://github.com/apache/arrow-datafusion/pull/8988) (Lordworms) +- feat: support `LargeList` in `flatten` [#9110](https://github.com/apache/arrow-datafusion/pull/9110) (Weijun-H) +- feat: improve `make_date` performance [#9112](https://github.com/apache/arrow-datafusion/pull/9112) (r3stl355) +- feat: add github action to self-assign the issue [#9132](https://github.com/apache/arrow-datafusion/pull/9132) (r3stl355) +- feat: add ability to query the remote http(s) location directly in datafusion-cli [#9150](https://github.com/apache/arrow-datafusion/pull/9150) (r3stl355) +- feat: implement select directly from s3 and gcs locations in datafusion-cli [#9199](https://github.com/apache/arrow-datafusion/pull/9199) (r3stl355) +- feat: support block gzip for streams [#9175](https://github.com/apache/arrow-datafusion/pull/9175) (tshauck) + +**Fixed bugs:** + +- fix: recursive initialize method [#8937](https://github.com/apache/arrow-datafusion/pull/8937) (waynexia) +- fix: common_subexpr_eliminate rule should not apply to short-circuit expression [#8928](https://github.com/apache/arrow-datafusion/pull/8928) (haohuaijin) +- fix: issue #8922 make row group test more readable [#8941](https://github.com/apache/arrow-datafusion/pull/8941) (Lordworms) +- fix: allow placeholders to be substituted when coercible [#8977](https://github.com/apache/arrow-datafusion/pull/8977) (kallisti-dev) +- fix: unambiguously truncate time in date_trunc function [#9068](https://github.com/apache/arrow-datafusion/pull/9068) (mhilton) +- fix: schema metadata retrieval when listing parquet table [#9134](https://github.com/apache/arrow-datafusion/pull/9134) (brayanjuls) + +**Documentation updates:** + +- Prepare 35.0.0-rc1 [#8924](https://github.com/apache/arrow-datafusion/pull/8924) (andygrove) +- Update project links [#8954](https://github.com/apache/arrow-datafusion/pull/8954) (comphead) +- Document parallelism and thread scheduling in the architecture guide [#8986](https://github.com/apache/arrow-datafusion/pull/8986) (alamb) +- chore: fix license badge in README [#9008](https://github.com/apache/arrow-datafusion/pull/9008) (suyanhanx) +- docs: fix array_position docs [#9003](https://github.com/apache/arrow-datafusion/pull/9003) (tshauck) +- Docs: improve contributor guide to explain how to work with tickets [#8999](https://github.com/apache/arrow-datafusion/pull/8999) (alamb) +- Document minimum required rust version [#9071](https://github.com/apache/arrow-datafusion/pull/9071) (comphead) +- Minor: Add ParadeDB to the list of users [#9018](https://github.com/apache/arrow-datafusion/pull/9018) (alamb) +- Update minimum rust version to 
1.72 [#8997](https://github.com/apache/arrow-datafusion/pull/8997) (alamb) +- docs: add docs and example showing how to get the expression data type [#9118](https://github.com/apache/arrow-datafusion/pull/9118) (r3stl355) +- chore: Fix incorrect comment in substrait consumer [#9123](https://github.com/apache/arrow-datafusion/pull/9123) (caicancai) +- Minor: Fix Self referential links in readme [#9119](https://github.com/apache/arrow-datafusion/pull/9119) (alamb) +- Examples link in catalogs.rs leads to a 404 [#9194](https://github.com/apache/arrow-datafusion/pull/9194) (Omega359) +- Create `datafusion-functions-array` crate and move `ArrayToString` function into it [#9113](https://github.com/apache/arrow-datafusion/pull/9113) (alamb) + +**Merged pull requests:** + +- Add hash_join_single_partition_threshold_rows config [#8720](https://github.com/apache/arrow-datafusion/pull/8720) (maruschin) +- Prepare 35.0.0-rc1 [#8924](https://github.com/apache/arrow-datafusion/pull/8924) (andygrove) +- feat: support `stride` in `array_slice`, change indexes to be`1` based [#8829](https://github.com/apache/arrow-datafusion/pull/8829) (Weijun-H) +- fix: recursive initialize method [#8937](https://github.com/apache/arrow-datafusion/pull/8937) (waynexia) +- Fix expr partial ord test [#8908](https://github.com/apache/arrow-datafusion/pull/8908) (mustafasrepo) +- Simplify windows builtin functions return type [#8920](https://github.com/apache/arrow-datafusion/pull/8920) (comphead) +- Fix handling of nested leaf columns in parallel parquet writer [#8923](https://github.com/apache/arrow-datafusion/pull/8923) (devinjdangelo) +- feat: emitting partial join results in `HashJoinStream` [#8020](https://github.com/apache/arrow-datafusion/pull/8020) (korowa) +- fix: common_subexpr_eliminate rule should not apply to short-circuit expression [#8928](https://github.com/apache/arrow-datafusion/pull/8928) (haohuaijin) +- Support GroupsAccumulator accumulator for udaf [#8892](https://github.com/apache/arrow-datafusion/pull/8892) (guojidan) +- test: Port tests in `partitioned_csv.rs` to sqllogictest [#8919](https://github.com/apache/arrow-datafusion/pull/8919) (simicd) +- [CI] Fix RUSTFLAGS [#8929](https://github.com/apache/arrow-datafusion/pull/8929) (Jefffrey) +- Minor: Update datafusion-cli README to explain why it is not in the w… [#8938](https://github.com/apache/arrow-datafusion/pull/8938) (alamb) +- Add syntax highlight to datafusion-cli [#8918](https://github.com/apache/arrow-datafusion/pull/8918) (trungda) +- Update substrait requirement from 0.22.1 to 0.23.0 [#8943](https://github.com/apache/arrow-datafusion/pull/8943) (dependabot[bot]) +- Deprecate make_scalar_function [#8878](https://github.com/apache/arrow-datafusion/pull/8878) (viirya) +- Update project links [#8954](https://github.com/apache/arrow-datafusion/pull/8954) (comphead) +- fix: issue #8922 make row group test more readable [#8941](https://github.com/apache/arrow-datafusion/pull/8941) (Lordworms) +- feat:implement sql style 'ends_with' and 'instr' string function [#8862](https://github.com/apache/arrow-datafusion/pull/8862) (zy-kkk) +- [MINOR]: Extract aggregate topk function to `aggregate_topk.slt` [#8948](https://github.com/apache/arrow-datafusion/pull/8948) (mustafasrepo) +- Combine multiple `IN` lists in `ExprSimplifier` [#8949](https://github.com/apache/arrow-datafusion/pull/8949) (jayzhan211) +- Fix clippy failures: error: use of deprecated function `functions::make_scalar_function [#8972](https://github.com/apache/arrow-datafusion/pull/8972) 
(alamb) +- feat: Support parquet bloom filter pruning for decimal128 [#8930](https://github.com/apache/arrow-datafusion/pull/8930) (Ted-Jiang) +- [MINOR]: Update create_window_expr to refer only input schema [#8945](https://github.com/apache/arrow-datafusion/pull/8945) (mustafasrepo) +- Don't error in simplify_expressions rule [#8957](https://github.com/apache/arrow-datafusion/pull/8957) (haohuaijin) +- Use .zip to avoid unwrap [#8956](https://github.com/apache/arrow-datafusion/pull/8956) (Luv-Ray) +- Change `Accumulator::evaluate` and `Accumulator::state` to take `&mut self` [#8925](https://github.com/apache/arrow-datafusion/pull/8925) (alamb) +- Enhance simplifier by adding Canonicalize [#8780](https://github.com/apache/arrow-datafusion/pull/8780) (yyy1000) +- Find the correct fields when using page filter on `struct` fields in parquet [#8848](https://github.com/apache/arrow-datafusion/pull/8848) (manoj-inukolunu) +- fix: allow placeholders to be substituted when coercible [#8977](https://github.com/apache/arrow-datafusion/pull/8977) (kallisti-dev) +- Minor: improve CatalogProvider documentation with rationale and info about remote catalogs [#8968](https://github.com/apache/arrow-datafusion/pull/8968) (alamb) +- Improve to_timestamp docs [#8981](https://github.com/apache/arrow-datafusion/pull/8981) (Omega359) +- Add helper function for processing scalar function input [#8962](https://github.com/apache/arrow-datafusion/pull/8962) (viirya) +- Fix optimize projections bug [#8960](https://github.com/apache/arrow-datafusion/pull/8960) (mustafasrepo) +- NOT operator not return internal error when args are not boolean value [#8982](https://github.com/apache/arrow-datafusion/pull/8982) (guojidan) +- Minor: Add new Extended ClickBench benchmark queries [#8950](https://github.com/apache/arrow-datafusion/pull/8950) (alamb) +- Minor: Add comments to MSRV CI check to help if it fails [#8995](https://github.com/apache/arrow-datafusion/pull/8995) (alamb) +- Minor: Document memory management design on `MemoryPool` [#8966](https://github.com/apache/arrow-datafusion/pull/8966) (alamb) +- Fix LEAD/LAG window functions when default value null [#8989](https://github.com/apache/arrow-datafusion/pull/8989) (comphead) +- Optimize MIN/MAX when relation is empty [#8940](https://github.com/apache/arrow-datafusion/pull/8940) (viirya) +- [task #8203] Port tests in joins.rs to sqllogictest [#8996](https://github.com/apache/arrow-datafusion/pull/8996) (Tangruilin) +- [task #8213]Port tests in select.rs to sqllogictest [#8967](https://github.com/apache/arrow-datafusion/pull/8967) (Tangruilin) +- test: Port (last) `repartition.rs` query to sqllogictest [#8936](https://github.com/apache/arrow-datafusion/pull/8936) (simicd) +- Update to sqlparser `0.42.0` [#9000](https://github.com/apache/arrow-datafusion/pull/9000) (alamb) +- [MINOR]: Fix Optimize Projections Bug [#8992](https://github.com/apache/arrow-datafusion/pull/8992) (mustafasrepo) +- Make Topk aggregate tests deterministic [#8998](https://github.com/apache/arrow-datafusion/pull/8998) (mustafasrepo) +- Add support for Postgres LIKE operators [#8894](https://github.com/apache/arrow-datafusion/pull/8894) (gruuya) +- bug: Datafusion doesn't respect case sensitive table references [#8964](https://github.com/apache/arrow-datafusion/pull/8964) (xhwhis) +- Document parallelism and thread scheduling in the architecture guide [#8986](https://github.com/apache/arrow-datafusion/pull/8986) (alamb) +- Fix None Projections in Projection Pushdown 
[#9005](https://github.com/apache/arrow-datafusion/pull/9005) (berkaysynnada) +- Lead and Lag window functions should support default value with datatype other than Int64 [#9001](https://github.com/apache/arrow-datafusion/pull/9001) (viirya) +- chore: fix license badge in README [#9008](https://github.com/apache/arrow-datafusion/pull/9008) (suyanhanx) +- Minor: fix: #9010 - Optimizer schema change assert error is incorrect [#9012](https://github.com/apache/arrow-datafusion/pull/9012) (curtisleefulton) +- docs: fix array_position docs [#9003](https://github.com/apache/arrow-datafusion/pull/9003) (tshauck) +- Rename `CatalogList` to `CatalogProviderList` [#9002](https://github.com/apache/arrow-datafusion/pull/9002) (comphead) +- Safeguard against potential inexact row count being smaller than exact null count [#9007](https://github.com/apache/arrow-datafusion/pull/9007) (gruuya) +- Recursive CTEs: Stage 3 - add execution support [#8840](https://github.com/apache/arrow-datafusion/pull/8840) (matthewgapp) +- sqllogictest: move the creation of the nan_table from Rust to slt [#9022](https://github.com/apache/arrow-datafusion/pull/9022) (jonahgao) +- TreeNode refactor code deduplication: Part 3 [#8817](https://github.com/apache/arrow-datafusion/pull/8817) (ozankabak) +- feat: Disable client console highlight by default [#9013](https://github.com/apache/arrow-datafusion/pull/9013) (comphead) +- [task #8917] Implement information_schema.schemata [#8993](https://github.com/apache/arrow-datafusion/pull/8993) (Tangruilin) +- Properly encode STRING_AGG, NTH_VALUE in physical plan protobufs [#9027](https://github.com/apache/arrow-datafusion/pull/9027) (scsmithr) +- [task #8201] Port tests in expr.rs to sqllogictest, finish the left c… [#9014](https://github.com/apache/arrow-datafusion/pull/9014) (Tangruilin) +- Fix the clippy error of use of deprecated method [#9034](https://github.com/apache/arrow-datafusion/pull/9034) (viirya) +- feat: support the ergonomics of getting list slice with stride [#8946](https://github.com/apache/arrow-datafusion/pull/8946) (Weijun-H) +- Cache common referred expression at the window input [#9009](https://github.com/apache/arrow-datafusion/pull/9009) (mustafasrepo) +- Optimize `COUNT( DISTINCT ...)` for strings (up to 9x faster) [#8849](https://github.com/apache/arrow-datafusion/pull/8849) (jayzhan211) +- feat: Parallel Arrow file format reading [#8897](https://github.com/apache/arrow-datafusion/pull/8897) (my-vegetable-has-exploded) +- Change remove from swap to shift in index map [#9049](https://github.com/apache/arrow-datafusion/pull/9049) (mustafasrepo) +- Relax join keys constraint from Column to any physical expression for physical join operators [#8991](https://github.com/apache/arrow-datafusion/pull/8991) (viirya) +- Minor: Improve memory helper trait documentation [#9025](https://github.com/apache/arrow-datafusion/pull/9025) (alamb) +- Docs: improve contributor guide to explain how to work with tickets [#8999](https://github.com/apache/arrow-datafusion/pull/8999) (alamb) +- fix issue where upper and lower functions only work correctly on ascii character [#9054](https://github.com/apache/arrow-datafusion/pull/9054) (Omega359) +- Minor: small updates to bench.sh [#9035](https://github.com/apache/arrow-datafusion/pull/9035) (kmitchener) +- Chore: explicitly list out all Expr types in TypeCoercionRewriter::mutate [#9038](https://github.com/apache/arrow-datafusion/pull/9038) (guojidan) +- Minor: improve scalar functions document 
[#9029](https://github.com/apache/arrow-datafusion/pull/9029) (Weijun-H) +- [MINOR] Alter a SHJ test for relaxing "on" condition [#9065](https://github.com/apache/arrow-datafusion/pull/9065) (metesynnada) +- Remove some recursive cloning from logical planning [#9050](https://github.com/apache/arrow-datafusion/pull/9050) (ozankabak) +- minor: remove useless macro [#8979](https://github.com/apache/arrow-datafusion/pull/8979) (jackwener) +- Causality Analysis for Builtin Window Functions [#9048](https://github.com/apache/arrow-datafusion/pull/9048) (mustafasrepo) +- Minor: add doc examples for RawTableAllocExt [#9059](https://github.com/apache/arrow-datafusion/pull/9059) (alamb) +- Update substrait requirement from 0.23.0 to 0.24.0 [#9067](https://github.com/apache/arrow-datafusion/pull/9067) (dependabot[bot]) +- Remove single_file_output option from FileSinkConfig and Copy statement [#9041](https://github.com/apache/arrow-datafusion/pull/9041) (yyy1000) +- Add a make_date function [#9040](https://github.com/apache/arrow-datafusion/pull/9040) (Omega359) +- Speedup `DFSchema::merge` using HashSet indices [#9020](https://github.com/apache/arrow-datafusion/pull/9020) (simonvandel) +- Document minimum required rust version [#9071](https://github.com/apache/arrow-datafusion/pull/9071) (comphead) +- Return proper number of expressions for nth_value_agg [#9044](https://github.com/apache/arrow-datafusion/pull/9044) (mustafasrepo) +- ScalarUDF with zero arguments should be provided with one null array as parameter [#9031](https://github.com/apache/arrow-datafusion/pull/9031) (viirya) +- Update strum requirement from 0.25.0 to 0.26.1 [#9046](https://github.com/apache/arrow-datafusion/pull/9046) (dependabot[bot]) +- Create `datafusion-functions` crate, extract encode and decode to [#8705](https://github.com/apache/arrow-datafusion/pull/8705) (alamb) +- Add documentation for streaming usecase [#9070](https://github.com/apache/arrow-datafusion/pull/9070) (mustafasrepo) +- fix: unambiguously truncate time in date_trunc function [#9068](https://github.com/apache/arrow-datafusion/pull/9068) (mhilton) +- feat: support array_reverse [#9023](https://github.com/apache/arrow-datafusion/pull/9023) (Weijun-H) +- prettier to_timestamp_invoke [#9078](https://github.com/apache/arrow-datafusion/pull/9078) (Tangruilin) +- Handle invalid types for negation [#9066](https://github.com/apache/arrow-datafusion/pull/9066) (trungda) +- Minor: reduce unwraps in datetime_expressions.rs [#9072](https://github.com/apache/arrow-datafusion/pull/9072) (alamb) +- Remove custom doubling strategy + add examples to `VecAllocEx` [#9058](https://github.com/apache/arrow-datafusion/pull/9058) (alamb) +- Split physical_plan_tpch into separate benchmarks [#9043](https://github.com/apache/arrow-datafusion/pull/9043) (simonvandel) +- Minor: Add ParadeDB to the list of users [#9018](https://github.com/apache/arrow-datafusion/pull/9018) (alamb) +- [MINOR]: Add check for unnecessary projection [#9079](https://github.com/apache/arrow-datafusion/pull/9079) (mustafasrepo) +- chore(placeholder): update error message and add tests [#9073](https://github.com/apache/arrow-datafusion/pull/9073) (appletreeisyellow) +- refer to #8781, convert the internal_err! in datetime_expression.rs to exec_err! 
[#9083](https://github.com/apache/arrow-datafusion/pull/9083) (Tangruilin) +- Add benchmarks for to_timestamp and make_date functions [#9086](https://github.com/apache/arrow-datafusion/pull/9086) (Omega359) +- chore: Clarify ParadeDB branding [#9088](https://github.com/apache/arrow-datafusion/pull/9088) (philippemnoel) +- doc: Add example how to include latest datafusion [#9076](https://github.com/apache/arrow-datafusion/pull/9076) (comphead) +- Update minimum rust version to 1.72 [#8997](https://github.com/apache/arrow-datafusion/pull/8997) (alamb) +- Fix typo in an error message [#9099](https://github.com/apache/arrow-datafusion/pull/9099) (AdamGS) +- Update InfluxDB links in Known Users section of documentation [#9092](https://github.com/apache/arrow-datafusion/pull/9092) (alamb) +- Support `FixedSizeList` type coercion [#8902](https://github.com/apache/arrow-datafusion/pull/8902) (Weijun-H) +- Improve Canonicalize API [#8983](https://github.com/apache/arrow-datafusion/pull/8983) (alamb) +- Update env_logger requirement from 0.10 to 0.11 [#8944](https://github.com/apache/arrow-datafusion/pull/8944) (dependabot[bot]) +- Split count_distinct.rs into separate modules [#9087](https://github.com/apache/arrow-datafusion/pull/9087) (alamb) +- Fix update_expr for projection pushdown [#9096](https://github.com/apache/arrow-datafusion/pull/9096) (viirya) +- Improve `InListSImplifier` -- add test, commend and avoid clones [#8971](https://github.com/apache/arrow-datafusion/pull/8971) (alamb) +- feat: issue #8969 adding position function [#8988](https://github.com/apache/arrow-datafusion/pull/8988) (Lordworms) +- Cleanup regex_expressions.rs to remove \_regexp_match function [#9107](https://github.com/apache/arrow-datafusion/pull/9107) (Omega359) +- Unnest with single expression [#9069](https://github.com/apache/arrow-datafusion/pull/9069) (jayzhan211) +- Minor: improve GroupsAccumulator and Accumulator documentation [#8963](https://github.com/apache/arrow-datafusion/pull/8963) (alamb) +- move InList related simplify to one place [#9037](https://github.com/apache/arrow-datafusion/pull/9037) (guojidan) +- docs: add docs and example showing how to get the expression data type [#9118](https://github.com/apache/arrow-datafusion/pull/9118) (r3stl355) +- Add http(s) support to the command line [#8753](https://github.com/apache/arrow-datafusion/pull/8753) (kcolford) +- Remove External Table Backwards Compatibility Options [#9105](https://github.com/apache/arrow-datafusion/pull/9105) (yyy1000) +- feat: support `LargeList` in `flatten` [#9110](https://github.com/apache/arrow-datafusion/pull/9110) (Weijun-H) +- feat: improve `make_date` performance [#9112](https://github.com/apache/arrow-datafusion/pull/9112) (r3stl355) +- Refactor min/max value update in Parquet statistics [#9120](https://github.com/apache/arrow-datafusion/pull/9120) (Weijun-H) +- chore: Fix incorrect comment in substrait consumer [#9123](https://github.com/apache/arrow-datafusion/pull/9123) (caicancai) +- Minor: Fix Self referential links in readme [#9119](https://github.com/apache/arrow-datafusion/pull/9119) (alamb) +- Add `ColumnarValue::values_to_arrays`, deprecate `columnar_values_to_array` [#9114](https://github.com/apache/arrow-datafusion/pull/9114) (alamb) +- Support Copy with Remote Object Stores in datafusion-cli [#9064](https://github.com/apache/arrow-datafusion/pull/9064) (manoj-inukolunu) +- Fix Dockerfile min rust version to 1.72 [#9135](https://github.com/apache/arrow-datafusion/pull/9135) (alamb) +- fix: schema metadata 
retrieval when listing parquet table [#9134](https://github.com/apache/arrow-datafusion/pull/9134) (brayanjuls) +- Update parse_protobuf_file_scan_config to remove any partition columns from the file_schema in FileScanConfig [#9126](https://github.com/apache/arrow-datafusion/pull/9126) (bcmcmill) +- feat: add github action to self-assign the issue [#9132](https://github.com/apache/arrow-datafusion/pull/9132) (r3stl355) +- Fix NULL values in FixedSizeList creation [#9141](https://github.com/apache/arrow-datafusion/pull/9141) (Weijun-H) +- Add `FunctionRegistry::register_udaf` and `FunctionRegistry::register_udwf` [#9075](https://github.com/apache/arrow-datafusion/pull/9075) (alamb) +- Change ScalarValue::Struct to ArrayRef [#7893](https://github.com/apache/arrow-datafusion/pull/7893) (jayzhan211) +- Support join filter for `SortMergeJoin` [#9080](https://github.com/apache/arrow-datafusion/pull/9080) (viirya) +- Typo in docstring [#9149](https://github.com/apache/arrow-datafusion/pull/9149) (tv42) +- RecordBatchReceiverStreamBuilder: don't stringify errors [#9155](https://github.com/apache/arrow-datafusion/pull/9155) (tv42) +- port position test to scalar [#9128](https://github.com/apache/arrow-datafusion/pull/9128) (Lordworms) +- Minor: Improve `DataFrame` docs, add examples [#9159](https://github.com/apache/arrow-datafusion/pull/9159) (alamb) +- feat: add ability to query the remote http(s) location directly in datafusion-cli [#9150](https://github.com/apache/arrow-datafusion/pull/9150) (r3stl355) +- Add `regexp_like, improve docs and examples for `regexp_match` [#9137](https://github.com/apache/arrow-datafusion/pull/9137) (Omega359) +- Partial Sort Plan Implementation [#9125](https://github.com/apache/arrow-datafusion/pull/9125) (ahmetenis) +- Update tonic requirement from 0.10 to 0.11 [#9176](https://github.com/apache/arrow-datafusion/pull/9176) (dependabot[bot]) +- minor: fix error message function naming [#9168](https://github.com/apache/arrow-datafusion/pull/9168) (comphead) +- Minor: Update `DataFrame::write_table` docs [#9169](https://github.com/apache/arrow-datafusion/pull/9169) (alamb) +- Improve PhysicalExpr documentation [#9180](https://github.com/apache/arrow-datafusion/pull/9180) (alamb) +- Fix sphinx warnings [#9142](https://github.com/apache/arrow-datafusion/pull/9142) (ongchi) +- Use concat to simplify Nested Scalar creation [#9174](https://github.com/apache/arrow-datafusion/pull/9174) (jayzhan211) +- Minor: Remove unecessary map_err [#9186](https://github.com/apache/arrow-datafusion/pull/9186) (alamb) +- Add example of using `PruningPredicate` to datafusion-examples [#9183](https://github.com/apache/arrow-datafusion/pull/9183) (alamb) +- Use prep_null_mask_filter to handle nulls in selection mask [#9163](https://github.com/apache/arrow-datafusion/pull/9163) (viirya) +- [Document] Adding UDF by impl ScalarUDFImpl [#9172](https://github.com/apache/arrow-datafusion/pull/9172) (yyy1000) +- Docs: Extend `PruningPredicate` with background and implementation info [#9184](https://github.com/apache/arrow-datafusion/pull/9184) (alamb) +- chore: make tokio a workspace dependency [#9187](https://github.com/apache/arrow-datafusion/pull/9187) (PsiACE) +- Examples link in catalogs.rs leads to a 404 [#9194](https://github.com/apache/arrow-datafusion/pull/9194) (Omega359) +- Add test pipeline for Mac aarch64 [#9191](https://github.com/apache/arrow-datafusion/pull/9191) (viirya) +- Add string aggregate grouping fuzz test, add `MemTable::with_sort_exprs` 
[#9190](https://github.com/apache/arrow-datafusion/pull/9190) (alamb) +- Create `datafusion-functions-array` crate and move `ArrayToString` function into it [#9113](https://github.com/apache/arrow-datafusion/pull/9113) (alamb) +- Add constant expression support to equivalence properties [#9198](https://github.com/apache/arrow-datafusion/pull/9198) (mustafasrepo) +- chore: update tpch-docker docker repository [#9204](https://github.com/apache/arrow-datafusion/pull/9204) (pmcgleenon) +- feat: implement select directly from s3 and gcs locations in datafusion-cli [#9199](https://github.com/apache/arrow-datafusion/pull/9199) (r3stl355) +- MINOR: Add "fs" feature to "tokio", fix "features" typo. [#9210](https://github.com/apache/arrow-datafusion/pull/9210) (mustafasrepo) +- Add `to_char` function implementation using chrono formats [#9181](https://github.com/apache/arrow-datafusion/pull/9181) (Omega359) +- Add `SessionContext::read_batches` [#9197](https://github.com/apache/arrow-datafusion/pull/9197) (Lordworms) +- feat: support block gzip for streams [#9175](https://github.com/apache/arrow-datafusion/pull/9175) (tshauck) +- chore(pruning): Support `IS NOT NULL` predicates in `PruningPredicate` [#9208](https://github.com/apache/arrow-datafusion/pull/9208) (appletreeisyellow) +- Add cargo audit CI [#9182](https://github.com/apache/arrow-datafusion/pull/9182) (ongchi) +- Move `nullif` and `isnan` to datafusion-functions [#9216](https://github.com/apache/arrow-datafusion/pull/9216) (alamb) +- Bugfix - Projection Removal Conditions [#9215](https://github.com/apache/arrow-datafusion/pull/9215) (berkaysynnada) +- Partitioning fixes [#9207](https://github.com/apache/arrow-datafusion/pull/9207) (esheppa) +- Return an error when a column does not exist in window function [#9202](https://github.com/apache/arrow-datafusion/pull/9202) (PhVHoang) +- Revert "chore(pruning): Support `IS NOT NULL` predicates in `PruningPredicate` (#9208)" [#9232](https://github.com/apache/arrow-datafusion/pull/9232) (appletreeisyellow) +- Improve documentation on how to build `ScalarValue::Struct` and add `ScalarStructBuilder` [#9229](https://github.com/apache/arrow-datafusion/pull/9229) (alamb) +- Minor: improve Display of output ordering of `StreamTableExec` [#9225](https://github.com/apache/arrow-datafusion/pull/9225) (mustafasrepo) +- Support compute return types from argument values (not just their DataTypes) [#8985](https://github.com/apache/arrow-datafusion/pull/8985) (yyy1000) +- Dont call multiunzip when no stats [#9220](https://github.com/apache/arrow-datafusion/pull/9220) (matthewmturner) +- Use setup-macos-aarch64-builder for aarch64 CI pipeline [#9242](https://github.com/apache/arrow-datafusion/pull/9242) (viirya) +- GROUP-BY prioritizes input columns in case of ambiguity [#9228](https://github.com/apache/arrow-datafusion/pull/9228) (jonahgao) +- Minor: chore: improve catalog test in mod.rs [#9244](https://github.com/apache/arrow-datafusion/pull/9244) (caicancai) +- Add example for `ScalarStructBuilder::new_null`, fix display for `null` `ScalarValue::Struct` [#9238](https://github.com/apache/arrow-datafusion/pull/9238) (alamb) diff --git a/dev/update_datafusion_versions.py b/dev/update_datafusion_versions.py index 19701b813671..12b0a90d4ab6 100755 --- a/dev/update_datafusion_versions.py +++ b/dev/update_datafusion_versions.py @@ -28,20 +28,22 @@ import tomlkit crates = { - 'datafusion': 'datafusion/core/Cargo.toml', - 'datafusion-cli': 'datafusion-cli/Cargo.toml', 'datafusion-common': 
'datafusion/common/Cargo.toml', - 'datafusion-expr': 'datafusion/expr/Cargo.toml', + 'datafusion': 'datafusion/core/Cargo.toml', 'datafusion-execution': 'datafusion/execution/Cargo.toml', + 'datafusion-expr': 'datafusion/expr/Cargo.toml', + 'datafusion-functions': 'datafusion/functions/Cargo.toml', + 'datafusion-functions-array': 'datafusion/functions-array/Cargo.toml', 'datafusion-optimizer': 'datafusion/optimizer/Cargo.toml', 'datafusion-physical-expr': 'datafusion/physical-expr/Cargo.toml', 'datafusion-physical-plan': 'datafusion/physical-plan/Cargo.toml', 'datafusion-proto': 'datafusion/proto/Cargo.toml', - 'datafusion-substrait': 'datafusion/substrait/Cargo.toml', 'datafusion-sql': 'datafusion/sql/Cargo.toml', 'datafusion-sqllogictest': 'datafusion/sqllogictest/Cargo.toml', + 'datafusion-substrait': 'datafusion/substrait/Cargo.toml', 'datafusion-wasmtest': 'datafusion/wasmtest/Cargo.toml', 'datafusion-benchmarks': 'benchmarks/Cargo.toml', + 'datafusion-cli': 'datafusion-cli/Cargo.toml', 'datafusion-examples': 'datafusion-examples/Cargo.toml', 'datafusion-docs': 'docs/Cargo.toml', } @@ -55,9 +57,18 @@ def update_workspace_version(new_version: str): doc = tomlkit.parse(data) pkg = doc.get('workspace').get('package') - print('workspace pacakge', pkg) + print('workspace package', pkg) pkg['version'] = new_version + doc = tomlkit.parse(data) + + for crate in crates.keys(): + df_dep = doc.get('workspace').get('dependencies', {}).get(crate) + # skip crates that pin datafusion using git hash + if df_dep is not None and df_dep.get('version') is not None: + print(f'updating {crate} dependency in {cargo_toml}') + df_dep['version'] = new_version + with open(cargo_toml, 'w') as f: f.write(tomlkit.dumps(doc)) diff --git a/docs/Cargo.toml b/docs/Cargo.toml index 7eecd11df80b..39f4520ff7b2 100644 --- a/docs/Cargo.toml +++ b/docs/Cargo.toml @@ -29,4 +29,4 @@ authors = { workspace = true } rust-version = { workspace = true } [dependencies] -datafusion = { path = "../datafusion/core", version = "35.0.0", default-features = false } +datafusion = { path = "../datafusion/core", version = "36.0.0", default-features = false } diff --git a/docs/source/user-guide/configs.md b/docs/source/user-guide/configs.md index 8b039102d4d7..081eb44230f9 100644 --- a/docs/source/user-guide/configs.md +++ b/docs/source/user-guide/configs.md @@ -64,7 +64,7 @@ Environment variables are read during `SessionConfig` initialisation so they mus | datafusion.execution.parquet.statistics_enabled | NULL | Sets if statistics are enabled for any column Valid values are: "none", "chunk", and "page" These values are not case sensitive. If NULL, uses default parquet writer setting | | datafusion.execution.parquet.max_statistics_size | NULL | Sets max statistics size for any column. If NULL, uses default parquet writer setting | | datafusion.execution.parquet.max_row_group_size | 1048576 | Target maximum number of rows in each row group (defaults to 1M rows). Writing larger row groups requires more memory to write, but can get better compression and be faster to read. 
| -| datafusion.execution.parquet.created_by | datafusion version 35.0.0 | Sets "created by" property | +| datafusion.execution.parquet.created_by | datafusion version 36.0.0 | Sets "created by" property | | datafusion.execution.parquet.column_index_truncate_length | NULL | Sets column index truncate length | | datafusion.execution.parquet.data_page_row_count_limit | 18446744073709551615 | Sets best effort maximum number of rows in data page | | datafusion.execution.parquet.encoding | NULL | Sets default encoding for any column Valid values are: plain, plain_dictionary, rle, bit_packed, delta_binary_packed, delta_length_byte_array, delta_byte_array, rle_dictionary, and byte_stream_split. These values are not case sensitive. If NULL, uses default parquet writer setting | From bb00b63f97e85928ecdd61bd832f001ea316b965 Mon Sep 17 00:00:00 2001 From: Lordworms <48054792+Lordworms@users.noreply.github.com> Date: Sat, 17 Feb 2024 05:11:49 -0600 Subject: [PATCH 11/14] fix: issue #9130 substitute redundant columns when doing cross join (#9154) * fix: issue #9130 substitute redundant columns when doing cross join * add test * fix:bugs * optimize code * optimize code * deleting extra debug info * adding test and optimized code * fix test * fix test --- datafusion-cli/src/exec.rs | 1 - datafusion/expr/src/logical_plan/builder.rs | 47 +++++++++++++- datafusion/expr/src/logical_plan/plan.rs | 6 +- datafusion/sql/src/select.rs | 1 - .../same_column_name_cross_join.slt | 64 +++++++++++++++++++ 5 files changed, 115 insertions(+), 4 deletions(-) create mode 100644 datafusion/sqllogictest/test_files/same_column_name_cross_join.slt diff --git a/datafusion-cli/src/exec.rs b/datafusion-cli/src/exec.rs index 6ca8dfe927a3..d1e4a5b3638e 100644 --- a/datafusion-cli/src/exec.rs +++ b/datafusion-cli/src/exec.rs @@ -228,7 +228,6 @@ async fn exec_and_print( | LogicalPlan::DescribeTable(_) | LogicalPlan::Analyze(_) ); - let df = ctx.execute_logical_plan(plan).await?; let physical_plan = df.create_physical_plan().await?; diff --git a/datafusion/expr/src/logical_plan/builder.rs b/datafusion/expr/src/logical_plan/builder.rs index 68499f09d22b..39df96d61f45 100644 --- a/datafusion/expr/src/logical_plan/builder.rs +++ b/datafusion/expr/src/logical_plan/builder.rs @@ -1124,7 +1124,27 @@ impl LogicalPlanBuilder { )?)) } } - +pub fn change_redundant_column(fields: Vec<DFField>) -> Vec<DFField> { + let mut name_map = HashMap::new(); + fields + .into_iter() + .map(|field| { + let counter = name_map.entry(field.name().to_string()).or_insert(0); + *counter += 1; + if *counter > 1 { + let new_name = format!("{}:{}", field.name(), *counter - 1); + DFField::new( + field.qualifier().cloned(), + &new_name, + field.data_type().clone(), + field.is_nullable(), + ) + } else { + field + } + }) + .collect() +} /// Creates a schema for a join operation. 
/// The fields from the left side are first pub fn build_join_schema( @@ -1237,6 +1257,7 @@ pub(crate) fn validate_unique_names<'a>( expressions: impl IntoIterator<Item = &'a Expr>, ) -> Result<()> { let mut unique_names = HashMap::new(); + expressions.into_iter().enumerate().try_for_each(|(position, expr)| { let name = expr.display_name()?; match unique_names.get(&name) { @@ -1375,6 +1396,7 @@ pub fn project( .push(columnize_expr(normalize_col(e, &plan)?, input_schema)), } } + validate_unique_names("Projections", projected_expr.iter())?; Projection::try_new(projected_expr, Arc::new(plan)).map(LogicalPlan::Projection) @@ -2076,4 +2098,27 @@ mod tests { Ok(()) } + #[test] + fn test_change_redundant_column() -> Result<()> { + let t1_field_1 = DFField::new_unqualified("a", DataType::Int32, false); + let t2_field_1 = DFField::new_unqualified("a", DataType::Int32, false); + let t2_field_3 = DFField::new_unqualified("a", DataType::Int32, false); + let t1_field_2 = DFField::new_unqualified("b", DataType::Int32, false); + let t2_field_2 = DFField::new_unqualified("b", DataType::Int32, false); + + let field_vec = vec![t1_field_1, t2_field_1, t1_field_2, t2_field_2, t2_field_3]; + let remove_redundant = change_redundant_column(field_vec); + + assert_eq!( + remove_redundant, + vec![ + DFField::new_unqualified("a", DataType::Int32, false), + DFField::new_unqualified("a:1", DataType::Int32, false), + DFField::new_unqualified("b", DataType::Int32, false), + DFField::new_unqualified("b:1", DataType::Int32, false), + DFField::new_unqualified("a:2", DataType::Int32, false), + ] + ); + Ok(()) + } } diff --git a/datafusion/expr/src/logical_plan/plan.rs b/datafusion/expr/src/logical_plan/plan.rs index b7d75dc0ae80..ba768cf3c6d6 100644 --- a/datafusion/expr/src/logical_plan/plan.rs +++ b/datafusion/expr/src/logical_plan/plan.rs @@ -24,6 +24,7 @@ use std::sync::Arc; use super::dml::CopyTo; use super::DdlStatement; +use crate::builder::change_redundant_column; use crate::dml::CopyOptions; use crate::expr::{ Alias, Exists, InSubquery, Placeholder, Sort as SortExpr, WindowFunction, @@ -1891,7 +1892,9 @@ impl SubqueryAlias { alias: impl Into<OwnedTableReference>, ) -> Result<Self> { let alias = alias.into(); - let schema: Schema = plan.schema().as_ref().clone().into(); + let fields = change_redundant_column(plan.schema().fields().clone()); + let meta_data = plan.schema().as_ref().metadata().clone(); + let schema: Schema = DFSchema::new_with_metadata(fields, meta_data)?.into(); // Since schema is the same, other than qualifier, we can use existing // functional dependencies: let func_dependencies = plan.schema().functional_dependencies().clone(); @@ -2181,6 +2184,7 @@ impl TableScan { df_schema.with_functional_dependencies(func_dependencies) })?; let projected_schema = Arc::new(projected_schema); + Ok(Self { table_name, source: table_source, diff --git a/datafusion/sql/src/select.rs b/datafusion/sql/src/select.rs index f47720d4674a..b5b6ada48fb0 100644 --- a/datafusion/sql/src/select.rs +++ b/datafusion/sql/src/select.rs @@ -77,7 +77,6 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { // handle named windows before processing the projection expression check_conflicting_windows(&select.named_window)?; match_window_definitions(&mut select.projection, &select.named_window)?; - // process the SELECT expressions, with wildcards expanded. 
let select_exprs = self.prepare_select_exprs( &base_plan, diff --git a/datafusion/sqllogictest/test_files/same_column_name_cross_join.slt b/datafusion/sqllogictest/test_files/same_column_name_cross_join.slt new file mode 100644 index 000000000000..de004267c938 --- /dev/null +++ b/datafusion/sqllogictest/test_files/same_column_name_cross_join.slt @@ -0,0 +1,64 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + + +# prepare the tables + +statement ok +create table t1 (a int, b int); + +statement ok +create table t2 (a int, b int); + +statement ok +create table t3 (a int, b int); + +statement ok +insert into t1 values (1, 2); + +statement ok +insert into t2 values (3, 4); + +statement ok +insert into t3 values (5, 6); + +query IIIIII +select * from (t1 cross join t2) as t cross join t3; +------- +---- +1 2 3 4 5 6 + + + +query IIIIIIII +select * from (t1 cross join t2) as t cross join (t2 cross join t3) +------- +---- +1 2 3 4 3 4 5 6 + + +query IIIIIIIIIIII +select * from (t1 cross join t2) as t cross join (t2 cross join t3) cross join (t1 cross join t3) as tt +-------- +---- +1 2 3 4 3 4 5 6 1 2 5 6 + +query IIIIIIIIIIIIIIII +select * from (t1 cross join t2) as t cross join (t2 cross join t3) cross join (t1 cross join t3) as tt cross join (t2 cross join t3) as ttt; +-------- +---- +1 2 3 4 3 4 5 6 1 2 5 6 3 4 5 6 From d4357ec42b7759402a60020266ca64508f8fa99a Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Sat, 17 Feb 2024 11:15:43 -0800 Subject: [PATCH 12/14] Add more doc to InputOrderMode (#9255) --- datafusion/physical-plan/src/ordering.rs | 11 +++++++---- datafusion/physical-plan/src/windows/mod.rs | 8 ++++---- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/datafusion/physical-plan/src/ordering.rs b/datafusion/physical-plan/src/ordering.rs index 047f89eef193..8b596b9cb23e 100644 --- a/datafusion/physical-plan/src/ordering.rs +++ b/datafusion/physical-plan/src/ordering.rs @@ -28,11 +28,14 @@ /// - A `PARTITION BY a, b` or a `PARTITION BY b, a` can use `Sorted` mode. /// /// ## Aggregations -/// - A `GROUP BY b` clause can use `Linear` mode. -/// - A `GROUP BY a, c` or a `GROUP BY BY c, a` can use -/// `PartiallySorted([0])` or `PartiallySorted([1])` modes, respectively. +/// - A `GROUP BY b` clause can use `Linear` mode, as the only one permutation `[b]` +/// cannot satisfy the existing ordering. +/// - A `GROUP BY a, c` or a `GROUP BY c, a` can use +/// `PartiallySorted([0])` or `PartiallySorted([1])` modes, respectively, as +/// the permutation `[a]` satisfies the existing ordering. /// (The vector stores the index of `a` in the respective PARTITION BY expression.) -/// - A `GROUP BY a, b` or a `GROUP BY b, a` can use `Sorted` mode. 
+/// - A `GROUP BY a, b` or a `GROUP BY b, a` can use `Sorted` mode, as the +/// full permutation `[a, b]` satisfies the existing ordering. /// /// Note these are the same examples as above, but with `GROUP BY` instead of /// `PARTITION BY` to make the examples easier to read. diff --git a/datafusion/physical-plan/src/windows/mod.rs b/datafusion/physical-plan/src/windows/mod.rs index 01818405b810..693d20e90a66 100644 --- a/datafusion/physical-plan/src/windows/mod.rs +++ b/datafusion/physical-plan/src/windows/mod.rs @@ -329,11 +329,11 @@ pub(crate) fn calc_requirements< (!sort_reqs.is_empty()).then_some(sort_reqs) } -/// This function calculates the indices such that when partition by expressions reordered with this indices +/// This function calculates the indices such that when partition by expressions reordered with the indices /// resulting expressions define a preset for existing ordering. -// For instance, if input is ordered by a, b, c and PARTITION BY b, a is used -// This vector will be [1, 0]. It means that when we iterate b,a columns with the order [1, 0] -// resulting vector (a, b) is a preset of the existing ordering (a, b, c). +/// For instance, if input is ordered by a, b, c and PARTITION BY b, a is used, +/// this vector will be [1, 0]. It means that when we iterate b, a columns with the order [1, 0] +/// resulting vector (a, b) is a preset of the existing ordering (a, b, c). pub(crate) fn get_ordered_partition_by_indices( partition_by_exprs: &[Arc<dyn PhysicalExpr>], input: &Arc<dyn ExecutionPlan>, From f47f298623b058624f24b6188093463ef2f70087 Mon Sep 17 00:00:00 2001 From: Brayan Jules Date: Thu, 15 Feb 2024 02:45:42 -0300 Subject: [PATCH 13/14] feat: arrow_cast function as UDF --- datafusion-cli/Cargo.lock | 1 + datafusion/functions/Cargo.toml | 1 + .../expr => functions/src/core}/arrow_cast.rs | 123 ++++++++++++------ datafusion/functions/src/core/mod.rs | 5 +- datafusion/sql/src/expr/function.rs | 8 -- datafusion/sql/src/expr/mod.rs | 1 - datafusion/sql/src/lib.rs | 1 - datafusion/sql/tests/sql_integration.rs | 18 ++- 8 files changed, 103 insertions(+), 55 deletions(-) rename datafusion/{sql/src/expr => functions/src/core}/arrow_cast.rs (90%) diff --git a/datafusion-cli/Cargo.lock b/datafusion-cli/Cargo.lock index 69456446f52b..b02e5d714f41 100644 --- a/datafusion-cli/Cargo.lock +++ b/datafusion-cli/Cargo.lock @@ -1231,6 +1231,7 @@ name = "datafusion-functions" version = "36.0.0" dependencies = [ "arrow", + "arrow-schema", "base64", "datafusion-common", "datafusion-execution", diff --git a/datafusion/functions/Cargo.toml b/datafusion/functions/Cargo.toml index 7109261cc78f..704fb896540a 100644 --- a/datafusion/functions/Cargo.toml +++ b/datafusion/functions/Cargo.toml @@ -47,6 +47,7 @@ path = "src/lib.rs" [dependencies] arrow = { workspace = true } +arrow-schema = { workspace = true } base64 = { version = "0.21", optional = true } datafusion-common = { workspace = true } datafusion-execution = { workspace = true } diff --git a/datafusion/sql/src/expr/arrow_cast.rs b/datafusion/functions/src/core/arrow_cast.rs similarity index 90% rename from datafusion/sql/src/expr/arrow_cast.rs rename to datafusion/functions/src/core/arrow_cast.rs index 9a0d61f41c01..0600209eb439 100644 --- a/datafusion/sql/src/expr/arrow_cast.rs +++ b/datafusion/functions/src/core/arrow_cast.rs @@ -19,16 +19,50 @@ //! 
casting to arbitrary arrow types (rather than SQL types) use std::{fmt::Display, iter::Peekable, str::Chars, sync::Arc}; +use std::any::Any; +use arrow::compute::cast; use arrow_schema::{DataType, Field, IntervalUnit, TimeUnit}; -use datafusion_common::{ - plan_datafusion_err, DFSchema, DataFusionError, Result, ScalarValue, -}; +use datafusion_common::{plan_datafusion_err, DataFusionError, Result, ScalarValue, internal_err}; -use datafusion_common::plan_err; -use datafusion_expr::{Expr, ExprSchemable}; +use datafusion_expr::{ColumnarValue, ScalarUDFImpl, Signature, Volatility}; -pub const ARROW_CAST_NAME: &str = "arrow_cast"; +#[derive(Debug)] +pub(super) struct ArrowCastFunc { + signature: Signature, +} + +impl ArrowCastFunc { + pub fn new() -> Self{ + Self { + signature: + Signature::any(2, Volatility::Immutable) + } + } + +} + +impl ScalarUDFImpl for ArrowCastFunc { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "arrow_cast" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> { + parse_data_type(&arg_types[1].to_string()) + } + + fn invoke(&self, args: &[ColumnarValue]) -> Result<ColumnarValue> { + create_arrow_cast(args) + } +} /// Create an [`Expr`] that evaluates the `arrow_cast` function /// @@ -52,26 +86,27 @@ pub const ARROW_CAST_NAME: &str = "arrow_cast"; /// select arrow_cast(column_x, 'Float64') /// ``` /// [`BuiltinScalarFunction`]: datafusion_expr::BuiltinScalarFunction -pub fn create_arrow_cast(mut args: Vec<Expr>, schema: &DFSchema) -> Result<Expr> { +fn create_arrow_cast(args: &[ColumnarValue]) -> Result<ColumnarValue> { if args.len() != 2 { - return plan_err!("arrow_cast needs 2 arguments, {} provided", args.len()); + return internal_err!("arrow_cast needs 2 arguments, {} provided", args.len()); } - let arg1 = args.pop().unwrap(); - let arg0 = args.pop().unwrap(); + let arg1 = &args[1]; + let arg0 = &args[0]; - // arg1 must be a string - let data_type_string = if let Expr::Literal(ScalarValue::Utf8(Some(v))) = arg1 { - v - } else { - return plan_err!( - "arrow_cast requires its second argument to be a constant string, got {arg1}" - ); - }; - - // do the actual lookup to the appropriate data type - let data_type = parse_data_type(&data_type_string)?; - - arg0.cast_to(&data_type, schema) + match (arg0, arg1) { + (ColumnarValue::Scalar(arg0),ColumnarValue::Scalar(ScalarValue::Utf8(arg1))) =>{ + let data_type = parse_data_type( arg1.clone().unwrap().as_str())?; + let val0 = arg0.cast_to(&data_type).unwrap(); + Ok(ColumnarValue::Scalar(val0)) + } + (ColumnarValue::Array(arg0),ColumnarValue::Scalar(ScalarValue::Utf8(arg1))) =>{ + let data_type = parse_data_type( arg1.clone().unwrap().as_str())?; + let val0 = cast(&arg0,&data_type)?; + Ok(ColumnarValue::Array(val0)) + } + (ColumnarValue::Scalar(_arg0), ColumnarValue::Scalar(arg1)) => internal_err!("arrow_cast requires its second argument to be a constant string, got {arg1}"), + _ => internal_err!("arrow_cast requires two scalar value as input. got {:?} and {:?}",arg0,arg1) + } } /// Parses `str` into a `DataType`. 
@@ -80,22 +115,8 @@ pub fn create_arrow_cast(mut args: Vec<Expr>, schema: &DFSchema) -> Result<Expr> /// impl, and maintains the invariant that /// `parse_data_type(data_type.to_string()) == data_type` /// -/// Example: -/// ``` -/// # use datafusion_sql::parse_data_type; -/// # use arrow_schema::DataType; -/// let display_value = "Int32"; -/// -/// // "Int32" is the Display value of `DataType` -/// assert_eq!(display_value, &format!("{}", DataType::Int32)); -/// -/// // parse_data_type coverts "Int32" back to `DataType`: -/// let data_type = parse_data_type(display_value).unwrap(); -/// assert_eq!(data_type, DataType::Int32); -/// ``` -/// /// Remove if added to arrow: -pub fn parse_data_type(val: &str) -> Result<DataType> { +fn parse_data_type(val: &str) -> Result<DataType> { Parser::new(val).parse() } @@ -647,6 +668,7 @@ impl Display for Token { #[cfg(test)] mod test { + use arrow::array::{ArrayRef, Int64Array, Int8Array}; use arrow_schema::{IntervalUnit, TimeUnit}; use super::*; @@ -847,4 +869,29 @@ mod test { println!(" Ok"); } } + + #[test] + fn test_arrow_cast_scalar() -> Result<()>{ + let input_arg0 = ColumnarValue::Scalar(ScalarValue::Int8(Some(100i8))); + let input_arg1 = ColumnarValue::Scalar(ScalarValue::Utf8(Some("Int64".to_string()))); + let result = create_arrow_cast(&[input_arg0,input_arg1])?; + let result = result.into_array(1).expect("Failed to cast values"); + let expected = Arc::new(Int64Array::from(vec![100])) as ArrayRef; + assert_eq!(expected.as_ref(),result.as_ref()); + + Ok(()) + } + + #[test] + fn test_arrow_cast_array() -> Result<()>{ + let input_arg0 = ColumnarValue::Array(Arc::new(Int8Array::from(vec![100,101,102])) as ArrayRef); + let input_arg1 = ColumnarValue::Scalar(ScalarValue::Utf8(Some("Int64".to_string()))); + let result = create_arrow_cast(&[input_arg0,input_arg1])?; + let result = result.into_array(1).expect("Failed to cast values"); + let expected = Arc::new(Int64Array::from(vec![100,101,102])) as ArrayRef; + + assert_eq!(expected.as_ref(),result.as_ref()); + + Ok(()) + } } diff --git a/datafusion/functions/src/core/mod.rs b/datafusion/functions/src/core/mod.rs index 9aab4bd450d1..690daf966c1e 100644 --- a/datafusion/functions/src/core/mod.rs +++ b/datafusion/functions/src/core/mod.rs @@ -18,12 +18,15 @@ //! "core" DataFusion functions mod nullif; +mod arrow_cast; // create UDFs make_udf_function!(nullif::NullIfFunc, NULLIF, nullif); +make_udf_function!(arrow_cast::ArrowCastFunc, ARROW_CAST, arrow_cast); // Export the functions out of this package, both as expr_fn as well as a list of functions export_functions!( - (nullif, arg_1 arg_2, "returns NULL if value1 equals value2; otherwise it returns value1. This can be used to perform the inverse operation of the COALESCE expression.") + (nullif, arg_1 arg_2, "returns NULL if value1 equals value2; otherwise it returns value1. This can be used to perform the inverse operation of the COALESCE expression."), + (arrow_cast, arg_1 arg_2, "returns arg_1 parsed to the `arrow_type` given the second argument. 
This can be used to cast to a specific `arrow_type`.") ); diff --git a/datafusion/sql/src/expr/function.rs b/datafusion/sql/src/expr/function.rs index 64b8d6957d2b..25530bd39f6c 100644 --- a/datafusion/sql/src/expr/function.rs +++ b/datafusion/sql/src/expr/function.rs @@ -32,8 +32,6 @@ use sqlparser::ast::{ }; use std::str::FromStr; -use super::arrow_cast::ARROW_CAST_NAME; - impl<'a, S: ContextProvider> SqlToRel<'a, S> { pub(super) fn sql_function_to_expr( &self, @@ -234,12 +232,6 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { fun, args, distinct, filter, order_by, ))); }; - - // Special case arrow_cast (as its type is dependent on its argument value) - if name == ARROW_CAST_NAME { - let args = self.function_args_to_expr(args, schema, planner_context)?; - return super::arrow_cast::create_arrow_cast(args, schema); - } } // Could not find the relevant function, so return an error diff --git a/datafusion/sql/src/expr/mod.rs b/datafusion/sql/src/expr/mod.rs index ecf510da7bce..c62f2a1b61c8 100644 --- a/datafusion/sql/src/expr/mod.rs +++ b/datafusion/sql/src/expr/mod.rs @@ -15,7 +15,6 @@ // specific language governing permissions and limitations // under the License. -pub(crate) mod arrow_cast; mod binary_op; mod function; mod grouping_set; diff --git a/datafusion/sql/src/lib.rs b/datafusion/sql/src/lib.rs index d805f61397e9..23f9a64d7c81 100644 --- a/datafusion/sql/src/lib.rs +++ b/datafusion/sql/src/lib.rs @@ -40,5 +40,4 @@ pub mod utils; mod values; pub use datafusion_common::{ResolvedTableReference, TableReference}; -pub use expr::arrow_cast::parse_data_type; pub use sqlparser; diff --git a/datafusion/sql/tests/sql_integration.rs b/datafusion/sql/tests/sql_integration.rs index 55551d1d25a3..0a67e962376c 100644 --- a/datafusion/sql/tests/sql_integration.rs +++ b/datafusion/sql/tests/sql_integration.rs @@ -2579,7 +2579,7 @@ fn approx_median_window() { fn select_arrow_cast() { let sql = "SELECT arrow_cast(1234, 'Float64'), arrow_cast('foo', 'LargeUtf8')"; let expected = "\ - Projection: CAST(Int64(1234) AS Float64), CAST(Utf8(\"foo\") AS LargeUtf8)\ + Projection: arrow_cast(Int64(1234), Utf8(\"Float64\")), arrow_cast(Utf8(\"foo\"), Utf8(\"LargeUtf8\"))\ \n EmptyRelation"; quick_test(sql, expected); } @@ -2673,11 +2673,17 @@ fn logical_plan_with_dialect_and_options( dialect: &dyn Dialect, options: ParserOptions, ) -> Result<LogicalPlan> { - let context = MockContextProvider::default().with_udf(make_udf( - "nullif", - vec![DataType::Int32, DataType::Int32], - DataType::Int32, - )); + let context = MockContextProvider::default() + .with_udf(make_udf( + "nullif", + vec![DataType::Int32, DataType::Int32], + DataType::Int32, + )) + .with_udf(make_udf( + "arrow_cast", + vec![DataType::Int64, DataType::Utf8], + DataType::Float64, + )); + let planner = SqlToRel::new_with_options(&context, options); let result = DFParser::parse_sql_with_dialect(sql, dialect); From 20a9154a9538bbe44907bd842830d7bd93c2c36c Mon Sep 17 00:00:00 2001 From: Brayan Jules Date: Sun, 18 Feb 2024 23:50:55 -0300 Subject: [PATCH 14/14] feat: implemented return type exprs in ArrowCastFunc which allows arrow_cast to be resolved when added to an expression. 
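The return type of `arrow_cast` depends on the *value* of its second argument (the type-name string), not just on the argument `DataType`s, so `return_type` alone cannot resolve it during planning; `return_type_from_exprs` can inspect the literal argument instead. A standalone sketch of the idea follows; the `DataType` and `Expr` enums and the simplified signature here are illustrative stand-ins, not the real Arrow/DataFusion types:

```rust
// Illustrative sketch: the target type is only recoverable at plan time
// when the second argument is a constant string such as 'Float64'.
#[derive(Debug, Clone, PartialEq)]
enum DataType { Int64, Float64, Utf8 }

#[derive(Debug, Clone)]
enum Expr {
    Literal(Option<String>), // a (possibly NULL) string literal
    Column(String),          // value unknown until execution
}

fn return_type_from_exprs(args: &[Expr]) -> Result<DataType, String> {
    match args.get(1) {
        // A constant string names the target type, so it can be parsed now.
        Some(Expr::Literal(Some(name))) => match name.as_str() {
            "Int64" => Ok(DataType::Int64),
            "Float64" => Ok(DataType::Float64),
            "Utf8" => Ok(DataType::Utf8),
            other => Err(format!("unrecognized type name {other}")),
        },
        // Anything else leaves the return type unknowable at plan time.
        _ => Err("second argument must be a constant string".to_string()),
    }
}

fn main() {
    let args = [
        Expr::Column("x".to_string()),
        Expr::Literal(Some("Float64".to_string())),
    ];
    assert_eq!(return_type_from_exprs(&args), Ok(DataType::Float64));
}
```

At execution time `invoke` still re-parses the same literal from its `ColumnarValue` arguments; planning has merely guaranteed that the two agree.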
--- .../user_defined/user_defined_aggregates.rs | 10 +++--- datafusion/functions/src/core/arrow_cast.rs | 32 +++++++++++++++---- .../sqllogictest/test_files/arrow_typeof.slt | 10 +++--- .../sqllogictest/test_files/clickbench.slt | 4 +-- datafusion/sqllogictest/test_files/expr.slt | 2 +- 5 files changed, 38 insertions(+), 20 deletions(-) diff --git a/datafusion/core/tests/user_defined/user_defined_aggregates.rs b/datafusion/core/tests/user_defined/user_defined_aggregates.rs index 0b29ad10d670..76e743589ff7 100644 --- a/datafusion/core/tests/user_defined/user_defined_aggregates.rs +++ b/datafusion/core/tests/user_defined/user_defined_aggregates.rs @@ -185,11 +185,11 @@ async fn test_udaf_shadows_builtin_fn() { // compute with builtin `sum` aggregator let expected = [ - "+-------------+", - "| SUM(t.time) |", - "+-------------+", - "| 19000 |", - "+-------------+", + "+---------------------------------------+", + "| SUM(arrow_cast(t.time,Utf8(\"Int64\"))) |", + "+---------------------------------------+", + "| 19000 |", + "+---------------------------------------+", ]; assert_batches_eq!(expected, &execute(&ctx, sql).await.unwrap()); diff --git a/datafusion/functions/src/core/arrow_cast.rs b/datafusion/functions/src/core/arrow_cast.rs index 0600209eb439..dcefd9c38a50 100644 --- a/datafusion/functions/src/core/arrow_cast.rs +++ b/datafusion/functions/src/core/arrow_cast.rs @@ -23,9 +23,9 @@ use std::any::Any; use arrow::compute::cast; use arrow_schema::{DataType, Field, IntervalUnit, TimeUnit}; -use datafusion_common::{plan_datafusion_err, DataFusionError, Result, ScalarValue, internal_err}; +use datafusion_common::{plan_datafusion_err, DataFusionError, Result, ScalarValue, ExprSchema, plan_err}; -use datafusion_expr::{ColumnarValue, ScalarUDFImpl, Signature, Volatility}; +use datafusion_expr::{ColumnarValue, Expr, ScalarUDFImpl, Signature, Volatility}; #[derive(Debug)] pub(super) struct ArrowCastFunc { @@ -59,6 +59,21 @@ impl ScalarUDFImpl for ArrowCastFunc { parse_data_type(&arg_types[1].to_string()) } + fn return_type_from_exprs(&self, args: &[Expr], _schema: &dyn ExprSchema) -> Result<DataType> { + if args.len() != 2 { + return plan_err!("arrow_cast needs 2 arguments, {} provided", args.len()); + } + let arg1 = args.get(1); + + match arg1 { + Some(Expr::Literal(ScalarValue::Utf8(arg1))) => { + let data_type = parse_data_type( arg1.clone().unwrap().as_str())?; + Ok(data_type) + } + _ => plan_err!("arrow_cast requires its second argument to be a constant string, got {:?}",arg1.unwrap().to_string()) + } + } + fn invoke(&self, args: &[ColumnarValue]) -> Result<ColumnarValue> { create_arrow_cast(args) } @@ -88,7 +103,7 @@ impl ScalarUDFImpl for ArrowCastFunc { /// [`BuiltinScalarFunction`]: datafusion_expr::BuiltinScalarFunction fn create_arrow_cast(args: &[ColumnarValue]) -> Result<ColumnarValue> { if args.len() != 2 { - return internal_err!("arrow_cast needs 2 arguments, {} provided", args.len()); + return plan_err!("arrow_cast needs 2 arguments, {} provided", args.len()); } let arg1 = &args[1]; let arg0 = &args[0]; @@ -96,16 +111,19 @@ fn create_arrow_cast(args: &[ColumnarValue]) -> Result<ColumnarValue> { match (arg0, arg1) { (ColumnarValue::Scalar(arg0),ColumnarValue::Scalar(ScalarValue::Utf8(arg1))) =>{ let data_type = parse_data_type( arg1.clone().unwrap().as_str())?; - let val0 = arg0.cast_to(&data_type).unwrap(); - Ok(ColumnarValue::Scalar(val0)) + let val0 = match arg0.cast_to(&data_type) { + Ok(val0) => Ok(ColumnarValue::Scalar(val0)), + Err(error) => plan_err!("{:?}",error.to_string()), + }; + val0 } 
(ColumnarValue::Array(arg0),ColumnarValue::Scalar(ScalarValue::Utf8(arg1))) =>{ let data_type = parse_data_type( arg1.clone().unwrap().as_str())?; let val0 = cast(&arg0,&data_type)?; Ok(ColumnarValue::Array(val0)) } - (ColumnarValue::Scalar(_arg0), ColumnarValue::Scalar(arg1)) => internal_err!("arrow_cast requires its second argument to be a constant string, got {arg1}"), - _ => internal_err!("arrow_cast requires two scalar value as input. got {:?} and {:?}",arg0,arg1) + (ColumnarValue::Scalar(_arg0), ColumnarValue::Scalar(arg1)) => plan_err!("arrow_cast requires its second argument to be a constant string, got {arg1}"), + _ => plan_err!("arrow_cast requires two scalar value as input. got {:?} and {:?}",arg0,arg1) } } diff --git a/datafusion/sqllogictest/test_files/arrow_typeof.slt b/datafusion/sqllogictest/test_files/arrow_typeof.slt index 8e2a091423da..f77397c987a7 100644 --- a/datafusion/sqllogictest/test_files/arrow_typeof.slt +++ b/datafusion/sqllogictest/test_files/arrow_typeof.slt @@ -95,7 +95,7 @@ SELECT arrow_cast('1', 'Int16') query error Error during planning: arrow_cast needs 2 arguments, 1 provided SELECT arrow_cast('1') -query error Error during planning: arrow_cast requires its second argument to be a constant string, got Int64\(43\) +query error Error during planning: arrow_cast requires its second argument to be a constant string, got "Int64\(43\)" SELECT arrow_cast('1', 43) query error Error unrecognized word: unknown @@ -315,7 +315,7 @@ select arrow_cast(interval '30 minutes', 'Duration(Second)'); ---- 0 days 0 hours 30 mins 0 secs -query error DataFusion error: Error during planning: Cannot automatically convert Utf8 to Duration\(Second\) +query error DataFusion error: Error during planning: "Arrow error: Cast error: Casting from Utf8 to Duration\(Second\) not supported" select arrow_cast('30 minutes', 'Duration(Second)'); @@ -336,7 +336,7 @@ select arrow_cast(timestamp '2000-01-01T00:00:00Z', 'Timestamp(Nanosecond, Some( ---- 2000-01-01T00:00:00+08:00 -statement error Arrow error: Parser error: Invalid timezone "\+25:00": '\+25:00' is not a valid timezone +statement error DataFusion error: Error during planning: "Arrow error: Parser error: Invalid timezone \\"\+25:00\\": '\+25:00' is not a valid timezone" select arrow_cast(timestamp '2000-01-01T00:00:00', 'Timestamp(Nanosecond, Some( "+25:00" ))'); @@ -405,7 +405,7 @@ select arrow_cast([1], 'FixedSizeList(1, Int64)'); ---- [1] -query error DataFusion error: Arrow error: Cast error: Cannot cast to FixedSizeList\(4\): value at index 0 has length 3 +query error DataFusion error: Error during planning: "Arrow error: Cast error: Cannot cast to FixedSizeList\(4\): value at index 0 has length 3" select arrow_cast(make_array(1, 2, 3), 'FixedSizeList(4, Int64)'); query ? @@ -421,4 +421,4 @@ FixedSizeList(Field { name: "item", data_type: Int64, nullable: true, dict_id: 0 query ? 
select arrow_cast([1, 2, 3], 'FixedSizeList(3, Int64)'); ---- -[1, 2, 3] \ No newline at end of file +[1, 2, 3] diff --git a/datafusion/sqllogictest/test_files/clickbench.slt b/datafusion/sqllogictest/test_files/clickbench.slt index b61bee670811..c2dba435263d 100644 --- a/datafusion/sqllogictest/test_files/clickbench.slt +++ b/datafusion/sqllogictest/test_files/clickbench.slt @@ -274,5 +274,5 @@ query PI SELECT DATE_TRUNC('minute', to_timestamp_seconds("EventTime")) AS M, COUNT(*) AS PageViews FROM hits WHERE "CounterID" = 62 AND "EventDate"::INT::DATE >= '2013-07-14' AND "EventDate"::INT::DATE <= '2013-07-15' AND "IsRefresh" = 0 AND "DontCountHits" = 0 GROUP BY DATE_TRUNC('minute', to_timestamp_seconds("EventTime")) ORDER BY DATE_TRUNC('minute', M) LIMIT 10 OFFSET 1000; ---- -query -drop table hits; \ No newline at end of file +statement ok +drop table hits; diff --git a/datafusion/sqllogictest/test_files/expr.slt b/datafusion/sqllogictest/test_files/expr.slt index 9e4e3aa8185d..9697c32c5f88 100644 --- a/datafusion/sqllogictest/test_files/expr.slt +++ b/datafusion/sqllogictest/test_files/expr.slt @@ -63,7 +63,7 @@ SELECT NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL # test_array_cast_invalid_timezone_will_panic -statement error Parser error: Invalid timezone "Foo": 'Foo' is not a valid timezone +statement error DataFusion error: Error during planning: "Arrow error: Parser error: Invalid timezone \\"Foo\\": 'Foo' is not a valid timezone" SELECT arrow_cast('2021-01-02T03:04:00', 'Timestamp(Nanosecond, Some("Foo"))') # test_array_index
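With the series applied, the end-to-end behavior can be exercised from the public API. A minimal sketch, assuming a binary crate with `datafusion` and `tokio` (features `rt-multi-thread` and `macros`) as dependencies; the exact table formatting of the output may differ:

```rust
use datafusion::error::Result;
use datafusion::prelude::SessionContext;

#[tokio::main]
async fn main() -> Result<()> {
    let ctx = SessionContext::new();
    // `arrow_cast` now resolves during planning like any other scalar UDF,
    // casting its first argument to the arrow type named by the second.
    let df = ctx
        .sql("SELECT arrow_cast(1234, 'Float64'), arrow_cast('foo', 'LargeUtf8')")
        .await?;
    df.show().await?;
    Ok(())
}
```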