From 5e4ab592ea59f61c609cf0346f2c7b38f5792f20 Mon Sep 17 00:00:00 2001 From: Athul T R Date: Thu, 12 Sep 2024 21:37:19 +0530 Subject: [PATCH] Added array_any_value function (#12329) * Added array_any_value function * Added test cases --- datafusion/functions-nested/src/extract.rs | 122 ++++++++++++++++- datafusion/functions-nested/src/lib.rs | 2 + .../tests/cases/roundtrip_logical_plan.rs | 6 + datafusion/sqllogictest/test_files/array.slt | 127 ++++++++++++++++++ docs/source/user-guide/expressions.md | 1 + .../source/user-guide/sql/scalar_functions.md | 30 +++++ 6 files changed, 287 insertions(+), 1 deletion(-) diff --git a/datafusion/functions-nested/src/extract.rs b/datafusion/functions-nested/src/extract.rs index b9e82f371369..7dfc736b76d3 100644 --- a/datafusion/functions-nested/src/extract.rs +++ b/datafusion/functions-nested/src/extract.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -//! [`ScalarUDFImpl`] definitions for array_element, array_slice, array_pop_front and array_pop_back functions. +//! [`ScalarUDFImpl`] definitions for array_element, array_slice, array_pop_front, array_pop_back, and array_any_value functions. use arrow::array::Array; use arrow::array::ArrayRef; @@ -69,6 +69,14 @@ make_udf_expr_and_func!( array_pop_back_udf ); +make_udf_expr_and_func!( + ArrayAnyValue, + array_any_value, + array, + "returns the first non-null element in the array.", + array_any_value_udf +); + #[derive(Debug)] pub(super) struct ArrayElement { signature: Signature, @@ -687,3 +695,115 @@ where ); general_array_slice::(array, &from_array, &to_array, None) } + +#[derive(Debug)] +pub(super) struct ArrayAnyValue { + signature: Signature, + aliases: Vec, +} + +impl ArrayAnyValue { + pub fn new() -> Self { + Self { + signature: Signature::array(Volatility::Immutable), + aliases: vec![String::from("list_any_value")], + } + } +} + +impl ScalarUDFImpl for ArrayAnyValue { + fn as_any(&self) -> &dyn Any { + self + } + fn name(&self) -> &str { + "array_any_value" + } + fn signature(&self) -> &Signature { + &self.signature + } + fn return_type(&self, arg_types: &[DataType]) -> Result { + match &arg_types[0] { + List(field) + | LargeList(field) + | FixedSizeList(field, _) => Ok(field.data_type().clone()), + _ => plan_err!( + "array_any_value can only accept List, LargeList or FixedSizeList as the argument" + ), + } + } + + fn invoke(&self, args: &[ColumnarValue]) -> Result { + make_scalar_function(array_any_value_inner)(args) + } + fn aliases(&self) -> &[String] { + &self.aliases + } +} + +fn array_any_value_inner(args: &[ArrayRef]) -> Result { + if args.len() != 1 { + return exec_err!("array_any_value expects one argument"); + } + + match &args[0].data_type() { + List(_) => { + let array = as_list_array(&args[0])?; + general_array_any_value::(array) + } + LargeList(_) => { + let array = as_large_list_array(&args[0])?; + general_array_any_value::(array) + } + data_type => exec_err!("array_any_value does not support type: {:?}", data_type), + } +} + +fn general_array_any_value( + array: &GenericListArray, +) -> Result +where + i64: TryInto, +{ + let values = array.values(); + let original_data = values.to_data(); + let capacity = Capacities::Array(array.len()); + + let mut mutable = + MutableArrayData::with_capacities(vec![&original_data], true, capacity); + + for (row_index, offset_window) in array.offsets().windows(2).enumerate() { + let start = offset_window[0]; + let end = offset_window[1]; + let len = end - start; + + // array is null + if len == O::usize_as(0) { + mutable.extend_nulls(1); + continue; + } + + let row_value = array.value(row_index); + match row_value.nulls() { + Some(row_nulls_buffer) => { + // nulls are present in the array so try to take the first valid element + if let Some(first_non_null_index) = + row_nulls_buffer.valid_indices().next() + { + let index = start.as_usize() + first_non_null_index; + mutable.extend(0, index, index + 1) + } else { + // all the elements in the array are null + mutable.extend_nulls(1); + } + } + None => { + // no nulls are present in the array so take the first element + let index = start.as_usize(); + mutable.extend(0, index, index + 1); + } + } + } + + let data = mutable.freeze(); + Ok(arrow::array::make_array(data)) +} diff --git a/datafusion/functions-nested/src/lib.rs b/datafusion/functions-nested/src/lib.rs index b548cf6db8b1..301ddb36fc56 100644 --- a/datafusion/functions-nested/src/lib.rs +++ b/datafusion/functions-nested/src/lib.rs @@ -79,6 +79,7 @@ pub mod expr_fn { pub use super::distance::array_distance; pub use super::empty::array_empty; pub use super::except::array_except; + pub use super::extract::array_any_value; pub use super::extract::array_element; pub use super::extract::array_pop_back; pub use super::extract::array_pop_front; @@ -128,6 +129,7 @@ pub fn all_default_nested_functions() -> Vec> { extract::array_pop_back_udf(), extract::array_pop_front_udf(), extract::array_slice_udf(), + extract::array_any_value_udf(), make_array::make_array_udf(), array_has::array_has_udf(), array_has::array_has_all_udf(), diff --git a/datafusion/proto/tests/cases/roundtrip_logical_plan.rs b/datafusion/proto/tests/cases/roundtrip_logical_plan.rs index c550847250a0..1ff39e9e65b7 100644 --- a/datafusion/proto/tests/cases/roundtrip_logical_plan.rs +++ b/datafusion/proto/tests/cases/roundtrip_logical_plan.rs @@ -841,6 +841,12 @@ async fn roundtrip_expr_api() -> Result<()> { ), array_pop_front(make_array(vec![lit(1), lit(2), lit(3)])), array_pop_back(make_array(vec![lit(1), lit(2), lit(3)])), + array_any_value(make_array(vec![ + lit(ScalarValue::Null), + lit(1), + lit(2), + lit(3), + ])), array_reverse(make_array(vec![lit(1), lit(2), lit(3)])), array_position( make_array(vec![lit(1), lit(2), lit(3), lit(4)]), diff --git a/datafusion/sqllogictest/test_files/array.slt b/datafusion/sqllogictest/test_files/array.slt index f92bcfdf3421..72b057b44473 100644 --- a/datafusion/sqllogictest/test_files/array.slt +++ b/datafusion/sqllogictest/test_files/array.slt @@ -2014,6 +2014,133 @@ select array_slice(a, -1, 2, 1), array_slice(a, -1, 2), query error DataFusion error: Error during planning: Error during planning: array_slice does not support zero arguments select array_slice(); +## array_any_value (aliases: list_any_value) + +# Testing with empty arguments should result in an error +query error +select array_any_value(); + +# Testing with non-array arguments should result in an error +query error +select array_any_value(1), array_any_value('a'), array_any_value(NULL); + +# array_any_value scalar function #1 (with null and non-null elements) + +query ITII +select array_any_value(make_array(NULL, 1, 2, 3, 4, 5)), array_any_value(make_array(NULL, 'h', 'e', 'l', 'l', 'o')), array_any_value(make_array(NULL, NULL)), array_any_value(make_array(NULL, NULL, 1, 2, 3)); +---- +1 h NULL 1 + +query ITITI +select array_any_value(arrow_cast(make_array(NULL, 1, 2, 3, 4, 5), 'LargeList(Int64)')), array_any_value(arrow_cast(make_array(NULL, 'h', 'e', 'l', 'l', 'o'), 'LargeList(Utf8)')), array_any_value(arrow_cast(make_array(NULL, NULL), 'LargeList(Int64)')), array_any_value(arrow_cast(make_array(NULL, NULL), 'LargeList(Utf8)')), array_any_value(arrow_cast(make_array(NULL, NULL, 1, 2, 3), 'LargeList(Int64)'));; +---- +1 h NULL NULL 1 + +query ITITI +select array_any_value(arrow_cast(make_array(NULL, 1, 2, 3, 4, 5), 'FixedSizeList(6, Int64)')), array_any_value(arrow_cast(make_array(NULL, 'h', 'e', 'l', 'l', 'o'), 'FixedSizeList(6, Utf8)')), array_any_value(arrow_cast(make_array(NULL, NULL), 'FixedSizeList(2, Int64)')), array_any_value(arrow_cast(make_array(NULL, NULL), 'FixedSizeList(2, Utf8)')), array_any_value(arrow_cast(make_array(NULL, NULL, 1, 2, 3, 4), 'FixedSizeList(6, Int64)')); +---- +1 h NULL NULL 1 + +# array_any_value scalar function #2 (with nested array) + +query ? +select array_any_value(make_array(NULL, make_array(NULL, 1, 2, 3, 4, 5), make_array(NULL, 6, 7, 8, 9, 10))); +---- +[, 1, 2, 3, 4, 5] + +query ? +select array_any_value(arrow_cast(make_array(NULL, make_array(NULL, 1, 2, 3, 4, 5), make_array(NULL, 6, 7, 8, 9, 10)), 'LargeList(List(Int64))')); +---- +[, 1, 2, 3, 4, 5] + +query ? +select array_any_value(arrow_cast(make_array(NULL, make_array(NULL, 1, 2, 3, 4, 5), make_array(NULL, 6, 7, 8, 9, 10)), 'FixedSizeList(3, List(Int64))')); +---- +[, 1, 2, 3, 4, 5] + +# array_any_value scalar function #3 (using function alias `list_any_value`) +query IT +select list_any_value(make_array(NULL, 1, 2, 3, 4, 5)), list_any_value(make_array(NULL, 'h', 'e', 'l', 'l', 'o')); +---- +1 h + +query IT +select list_any_value(arrow_cast(make_array(NULL, 1, 2, 3, 4, 5), 'LargeList(Int64)')), list_any_value(arrow_cast(make_array(NULL, 'h', 'e', 'l', 'l', 'o'), 'LargeList(Utf8)')); +---- +1 h + +query IT +select list_any_value(arrow_cast(make_array(NULL, 1, 2, 3, 4, 5), 'FixedSizeList(6, Int64)')), list_any_value(arrow_cast(make_array(NULL, 'h', 'e', 'l', 'l', 'o'), 'FixedSizeList(6, Utf8)')); +---- +1 h + +# array_any_value with columns + +query I +select array_any_value(column1) from slices; +---- +2 +11 +21 +31 +NULL +41 +51 + +query I +select array_any_value(arrow_cast(column1, 'LargeList(Int64)')) from slices; +---- +2 +11 +21 +31 +NULL +41 +51 + +query I +select array_any_value(column1) from fixed_slices; +---- +2 +11 +21 +31 +41 +51 + +# array_any_value with columns and scalars + +query II +select array_any_value(make_array(NULL, 1, 2, 3, 4, 5)), array_any_value(column1) from slices; +---- +1 2 +1 11 +1 21 +1 31 +1 NULL +1 41 +1 51 + +query II +select array_any_value(arrow_cast(make_array(NULL, 1, 2, 3, 4, 5), 'LargeList(Int64)')), array_any_value(arrow_cast(column1, 'LargeList(Int64)')) from slices; +---- +1 2 +1 11 +1 21 +1 31 +1 NULL +1 41 +1 51 + +query II +select array_any_value(make_array(NULL, 1, 2, 3, 4, 5)), array_any_value(column1) from fixed_slices; +---- +1 2 +1 11 +1 21 +1 31 +1 41 +1 51 # make_array with nulls query ??????? diff --git a/docs/source/user-guide/expressions.md b/docs/source/user-guide/expressions.md index ad5a9cb75152..c8f0ffbec701 100644 --- a/docs/source/user-guide/expressions.md +++ b/docs/source/user-guide/expressions.md @@ -211,6 +211,7 @@ select log(-1), log(0), sqrt(-1); | Syntax | Description | | ---------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| array_any_value(array) | Returns the first non-null element in the array. `array_any_value([NULL, 1, 2, 3]) -> 1` | | array_append(array, element) | Appends an element to the end of an array. `array_append([1, 2, 3], 4) -> [1, 2, 3, 4]` | | array_concat(array[, ..., array_n]) | Concatenates arrays. `array_concat([1, 2, 3], [4, 5, 6]) -> [1, 2, 3, 4, 5, 6]` | | array_has(array, element) | Returns true if the array contains the element `array_has([1,2,3], 1) -> true` | diff --git a/docs/source/user-guide/sql/scalar_functions.md b/docs/source/user-guide/sql/scalar_functions.md index 6a2181b8db40..636a33100ed0 100644 --- a/docs/source/user-guide/sql/scalar_functions.md +++ b/docs/source/user-guide/sql/scalar_functions.md @@ -2085,6 +2085,7 @@ to_unixtime(expression[, ..., format_n]) ## Array Functions +- [array_any_value](#array_any_value) - [array_append](#array_append) - [array_sort](#array_sort) - [array_cat](#array_cat) @@ -2129,6 +2130,7 @@ to_unixtime(expression[, ..., format_n]) - [empty](#empty) - [flatten](#flatten) - [generate_series](#generate_series) +- [list_any_value] (#list_any_value) - [list_append](#list_append) - [list_sort](#list_sort) - [list_cat](#list_cat) @@ -2173,6 +2175,30 @@ to_unixtime(expression[, ..., format_n]) - [unnest](#unnest) - [range](#range) +### `array_any_value` + +Returns the first non-null element in the array. + +``` +array_any_value(array) +``` + +#### Arguments + +- **array**: Array expression. + Can be a constant, column, or function, and any combination of array operators. + +#### Example + +``` +> select array_any_value([NULL, 1, 2, 3]); ++--------------------------------------------------------------+ +| array_any_value(List([NULL,1,2,3])) | ++--------------------------------------------------------------+ +| 1 | ++--------------------------------------------------------------+ +``` + ### `array_append` Appends an element to the end of an array. @@ -3238,6 +3264,10 @@ generate_series(start, stop, step) +------------------------------------+ ``` +### `list_any_value` + +_Alias of [array_any_value](#array_any_value)._ + ### `list_append` _Alias of [array_append](#array_append)._