From 3ffeb52c1c9891e63fcb17db41283d7299af6f18 Mon Sep 17 00:00:00 2001 From: Ruixiang Tan Date: Thu, 24 Aug 2023 00:35:42 +0800 Subject: [PATCH] feat: support `array_pop_back` function (#7348) --- datafusion/expr/src/built_in_function.rs | 6 + datafusion/expr/src/expr_fn.rs | 9 + .../physical-expr/src/array_expressions.rs | 161 ++++++++++++++++++ datafusion/physical-expr/src/functions.rs | 4 +- datafusion/proto/proto/datafusion.proto | 1 + datafusion/proto/src/generated/pbjson.rs | 3 + datafusion/proto/src/generated/prost.rs | 3 + .../proto/src/logical_plan/from_proto.rs | 6 +- datafusion/proto/src/logical_plan/to_proto.rs | 1 + datafusion/sqllogictest/test_files/array.slt | 75 ++++++++ docs/source/user-guide/expressions.md | 1 + .../source/user-guide/sql/scalar_functions.md | 25 +++ 12 files changed, 293 insertions(+), 2 deletions(-) diff --git a/datafusion/expr/src/built_in_function.rs b/datafusion/expr/src/built_in_function.rs index e97269e474a2..de046cde8991 100644 --- a/datafusion/expr/src/built_in_function.rs +++ b/datafusion/expr/src/built_in_function.rs @@ -134,6 +134,8 @@ pub enum BuiltinScalarFunction { ArrayHasAll, /// array_has_any ArrayHasAny, + /// array_pop_back + ArrayPopBack, /// array_dims ArrayDims, /// array_element @@ -370,6 +372,7 @@ impl BuiltinScalarFunction { BuiltinScalarFunction::ArrayElement => Volatility::Immutable, BuiltinScalarFunction::ArrayLength => Volatility::Immutable, BuiltinScalarFunction::ArrayNdims => Volatility::Immutable, + BuiltinScalarFunction::ArrayPopBack => Volatility::Immutable, BuiltinScalarFunction::ArrayPosition => Volatility::Immutable, BuiltinScalarFunction::ArrayPositions => Volatility::Immutable, BuiltinScalarFunction::ArrayPrepend => Volatility::Immutable, @@ -552,6 +555,7 @@ impl BuiltinScalarFunction { }, BuiltinScalarFunction::ArrayLength => Ok(UInt64), BuiltinScalarFunction::ArrayNdims => Ok(UInt64), + BuiltinScalarFunction::ArrayPopBack => Ok(input_expr_types[0].clone()), 
BuiltinScalarFunction::ArrayPosition => Ok(UInt64), BuiltinScalarFunction::ArrayPositions => { Ok(List(Arc::new(Field::new("item", UInt64, true)))) @@ -823,6 +827,7 @@ impl BuiltinScalarFunction { // for now, the list is small, as we do not have many built-in functions. match self { BuiltinScalarFunction::ArrayAppend => Signature::any(2, self.volatility()), + BuiltinScalarFunction::ArrayPopBack => Signature::any(1, self.volatility()), BuiltinScalarFunction::ArrayConcat => { Signature::variadic_any(self.volatility()) } @@ -1333,6 +1338,7 @@ fn aliases(func: &BuiltinScalarFunction) -> &'static [&'static str] { } BuiltinScalarFunction::ArrayLength => &["array_length", "list_length"], BuiltinScalarFunction::ArrayNdims => &["array_ndims", "list_ndims"], + BuiltinScalarFunction::ArrayPopBack => &["array_pop_back", "list_pop_back"], BuiltinScalarFunction::ArrayPosition => &[ "array_position", "list_position", diff --git a/datafusion/expr/src/expr_fn.rs b/datafusion/expr/src/expr_fn.rs index e3fd5ceb206b..a7f7d9b6e691 100644 --- a/datafusion/expr/src/expr_fn.rs +++ b/datafusion/expr/src/expr_fn.rs @@ -545,6 +545,14 @@ scalar_expr!( array element, "appends an element to the end of an array." ); + +scalar_expr!( + ArrayPopBack, + array_pop_back, + array, + "returns the array without the last element." 
+); + nary_scalar_expr!(ArrayConcat, array_concat, "concatenates arrays."); scalar_expr!( ArrayHas, @@ -1087,6 +1095,7 @@ mod test { test_scalar_expr!(FromUnixtime, from_unixtime, unixtime); test_scalar_expr!(ArrayAppend, array_append, array, element); + test_scalar_expr!(ArrayPopBack, array_pop_back, array); test_unary_scalar_expr!(ArrayDims, array_dims); test_scalar_expr!(ArrayLength, array_length, array, dimension); test_unary_scalar_expr!(ArrayNdims, array_ndims); diff --git a/datafusion/physical-expr/src/array_expressions.rs b/datafusion/physical-expr/src/array_expressions.rs index 97d7ee4610ce..98b14fdbc391 100644 --- a/datafusion/physical-expr/src/array_expressions.rs +++ b/datafusion/physical-expr/src/array_expressions.rs @@ -599,6 +599,22 @@ pub fn array_slice(args: &[ArrayRef]) -> Result<ArrayRef> { define_array_slice(list_array, key, extra_key, false) } +pub fn array_pop_back(args: &[ArrayRef]) -> Result<ArrayRef> { + let list_array = as_list_array(&args[0])?; + let key = vec![0; list_array.len()]; + let extra_key: Vec<_> = list_array + .iter() + .map(|x| x.map_or(0, |arr| arr.len() as i64 - 1)) + .collect(); + + define_array_slice( + list_array, + &Int64Array::from(key), + &Int64Array::from(extra_key), + false, + ) +} + macro_rules! 
append { ($ARRAY:expr, $ELEMENT:expr, $ARRAY_TYPE:ident) => {{ let mut offsets: Vec<i32> = vec![0]; @@ -2005,6 +2021,151 @@ mod tests { ); } + #[test] + fn test_array_pop_back() { + // array_pop_back([1, 2, 3, 4]) = [1, 2, 3] + let list_array = return_array().into_array(1); + let arr = array_pop_back(&[list_array]) + .expect("failed to initialize function array_pop_back"); + let result = + as_list_array(&arr).expect("failed to initialize function array_pop_back"); + assert_eq!( + &[1, 2, 3], + result + .value(0) + .as_any() + .downcast_ref::<Int64Array>() + .unwrap() + .values() + ); + + // array_pop_back([1, 2, 3]) = [1, 2] + let list_array = Arc::new(result.clone()); + let arr = array_pop_back(&[list_array]) + .expect("failed to initialize function array_pop_back"); + let result = + as_list_array(&arr).expect("failed to initialize function array_pop_back"); + assert_eq!( + &[1, 2], + result + .value(0) + .as_any() + .downcast_ref::<Int64Array>() + .unwrap() + .values() + ); + + // array_pop_back([1, 2]) = [1] + let list_array = Arc::new(result.clone()); + let arr = array_pop_back(&[list_array]) + .expect("failed to initialize function array_pop_back"); + let result = + as_list_array(&arr).expect("failed to initialize function array_pop_back"); + assert_eq!( + &[1], + result + .value(0) + .as_any() + .downcast_ref::<Int64Array>() + .unwrap() + .values() + ); + + // array_pop_back([1]) = [] + let list_array = Arc::new(result.clone()); + let arr = array_pop_back(&[list_array]) + .expect("failed to initialize function array_pop_back"); + let result = + as_list_array(&arr).expect("failed to initialize function array_pop_back"); + assert_eq!( + &[], + result + .value(0) + .as_any() + .downcast_ref::<Int64Array>() + .unwrap() + .values() + ); + // array_pop_back([]) = [] + let list_array = Arc::new(result.clone()); + let arr = array_pop_back(&[list_array]) + .expect("failed to initialize function array_pop_back"); + let result = + as_list_array(&arr).expect("failed to initialize function array_pop_back"); + assert_eq!( + 
&[], + result + .value(0) + .as_any() + .downcast_ref::<Int64Array>() + .unwrap() + .values() + ); + + // array_pop_back([1, NULL, 3, NULL]) = [1, NULL, 3] + let list_array = return_array_with_nulls().into_array(1); + let arr = array_pop_back(&[list_array]) + .expect("failed to initialize function array_pop_back"); + let result = + as_list_array(&arr).expect("failed to initialize function array_pop_back"); + assert_eq!(3, result.values().len()); + assert_eq!( + &[false, true, false], + &[ + result.values().is_null(0), + result.values().is_null(1), + result.values().is_null(2) + ] + ); + } + #[test] + fn test_nested_array_pop_back() { + // array_pop_back([[1, 2, 3, 4], [5, 6, 7, 8]]) = [[1, 2, 3, 4]] + let list_array = return_nested_array().into_array(1); + let arr = array_pop_back(&[list_array]) + .expect("failed to initialize function array_pop_back"); + let result = + as_list_array(&arr).expect("failed to initialize function array_pop_back"); + assert_eq!( + &[1, 2, 3, 4], + result + .value(0) + .as_any() + .downcast_ref::<ListArray>() + .unwrap() + .value(0) + .as_any() + .downcast_ref::<Int64Array>() + .unwrap() + .values() + ); + + // array_pop_back([[1, 2, 3, 4]]) = [] + let list_array = Arc::new(result.clone()); + let arr = array_pop_back(&[list_array]) + .expect("failed to initialize function array_pop_back"); + let result = + as_list_array(&arr).expect("failed to initialize function array_pop_back"); + assert!(result + .value(0) + .as_any() + .downcast_ref::<ListArray>() + .unwrap() + .is_empty()); + // array_pop_back([]) = [] + let list_array = Arc::new(result.clone()); + let arr = array_pop_back(&[list_array]) + .expect("failed to initialize function array_pop_back"); + let result = + as_list_array(&arr).expect("failed to initialize function array_pop_back"); + assert!(result + .value(0) + .as_any() + .downcast_ref::<ListArray>() + .unwrap() + .is_empty()); + } + + #[test] fn test_array_slice() { // array_slice([1, 2, 3, 4], 1, 3) = [1, 2, 3] diff --git a/datafusion/physical-expr/src/functions.rs 
b/datafusion/physical-expr/src/functions.rs index 2d6dbfdf52c3..420e40e9c48b 100644 --- a/datafusion/physical-expr/src/functions.rs +++ b/datafusion/physical-expr/src/functions.rs @@ -449,10 +449,12 @@ pub fn create_physical_fun( BuiltinScalarFunction::Flatten => { Arc::new(|args| make_scalar_function(array_expressions::flatten)(args)) } - BuiltinScalarFunction::ArrayNdims => { Arc::new(|args| make_scalar_function(array_expressions::array_ndims)(args)) } + BuiltinScalarFunction::ArrayPopBack => { + Arc::new(|args| make_scalar_function(array_expressions::array_pop_back)(args)) + } BuiltinScalarFunction::ArrayPosition => { Arc::new(|args| make_scalar_function(array_expressions::array_position)(args)) } diff --git a/datafusion/proto/proto/datafusion.proto b/datafusion/proto/proto/datafusion.proto index f31a593ad5fe..a0148c8af077 100644 --- a/datafusion/proto/proto/datafusion.proto +++ b/datafusion/proto/proto/datafusion.proto @@ -598,6 +598,7 @@ enum ScalarFunction { Isnan = 113; Iszero = 114; ArrayEmpty = 115; + ArrayPopBack = 116; } message ScalarFunctionNode { diff --git a/datafusion/proto/src/generated/pbjson.rs b/datafusion/proto/src/generated/pbjson.rs index 7d1a18349c14..948516117428 100644 --- a/datafusion/proto/src/generated/pbjson.rs +++ b/datafusion/proto/src/generated/pbjson.rs @@ -18948,6 +18948,7 @@ impl serde::Serialize for ScalarFunction { Self::Isnan => "Isnan", Self::Iszero => "Iszero", Self::ArrayEmpty => "ArrayEmpty", + Self::ArrayPopBack => "ArrayPopBack", }; serializer.serialize_str(variant) } @@ -19075,6 +19076,7 @@ impl<'de> serde::Deserialize<'de> for ScalarFunction { "Isnan", "Iszero", "ArrayEmpty", + "ArrayPopBack", ]; struct GeneratedVisitor; @@ -19233,6 +19235,7 @@ impl<'de> serde::Deserialize<'de> for ScalarFunction { "Isnan" => Ok(ScalarFunction::Isnan), "Iszero" => Ok(ScalarFunction::Iszero), "ArrayEmpty" => Ok(ScalarFunction::ArrayEmpty), + "ArrayPopBack" => Ok(ScalarFunction::ArrayPopBack), _ => 
Err(serde::de::Error::unknown_variant(value, FIELDS)), } } diff --git a/datafusion/proto/src/generated/prost.rs b/datafusion/proto/src/generated/prost.rs index fc55b7e23af2..22a4beadb826 100644 --- a/datafusion/proto/src/generated/prost.rs +++ b/datafusion/proto/src/generated/prost.rs @@ -2378,6 +2378,7 @@ pub enum ScalarFunction { Isnan = 113, Iszero = 114, ArrayEmpty = 115, + ArrayPopBack = 116, } impl ScalarFunction { /// String value of the enum field names used in the ProtoBuf definition. @@ -2502,6 +2503,7 @@ impl ScalarFunction { ScalarFunction::Isnan => "Isnan", ScalarFunction::Iszero => "Iszero", ScalarFunction::ArrayEmpty => "ArrayEmpty", + ScalarFunction::ArrayPopBack => "ArrayPopBack", } } /// Creates an enum from field names used in the ProtoBuf definition. @@ -2623,6 +2625,7 @@ impl ScalarFunction { "Isnan" => Some(Self::Isnan), "Iszero" => Some(Self::Iszero), "ArrayEmpty" => Some(Self::ArrayEmpty), + "ArrayPopBack" => Some(Self::ArrayPopBack), _ => None, } } diff --git a/datafusion/proto/src/logical_plan/from_proto.rs b/datafusion/proto/src/logical_plan/from_proto.rs index c5ab0c25f628..7fbf2ff52c0d 100644 --- a/datafusion/proto/src/logical_plan/from_proto.rs +++ b/datafusion/proto/src/logical_plan/from_proto.rs @@ -59,7 +59,7 @@ use datafusion_expr::{ WindowFrameUnits, }; use datafusion_expr::{ - array_empty, + array_empty, array_pop_back, expr::{Alias, Placeholder}, }; use std::sync::Arc; @@ -464,6 +464,7 @@ impl From<&protobuf::ScalarFunction> for BuiltinScalarFunction { ScalarFunction::Flatten => Self::Flatten, ScalarFunction::ArrayLength => Self::ArrayLength, ScalarFunction::ArrayNdims => Self::ArrayNdims, + ScalarFunction::ArrayPopBack => Self::ArrayPopBack, ScalarFunction::ArrayPosition => Self::ArrayPosition, ScalarFunction::ArrayPositions => Self::ArrayPositions, ScalarFunction::ArrayPrepend => Self::ArrayPrepend, @@ -1272,6 +1273,9 @@ pub fn parse_expr( parse_expr(&args[0], registry)?, parse_expr(&args[1], registry)?, )), + 
ScalarFunction::ArrayPopBack => { + Ok(array_pop_back(parse_expr(&args[0], registry)?)) + } ScalarFunction::ArrayPrepend => Ok(array_prepend( parse_expr(&args[0], registry)?, parse_expr(&args[1], registry)?, diff --git a/datafusion/proto/src/logical_plan/to_proto.rs b/datafusion/proto/src/logical_plan/to_proto.rs index 82df53af92c3..b90e3b0e4b0e 100644 --- a/datafusion/proto/src/logical_plan/to_proto.rs +++ b/datafusion/proto/src/logical_plan/to_proto.rs @@ -1460,6 +1460,7 @@ impl TryFrom<&BuiltinScalarFunction> for protobuf::ScalarFunction { BuiltinScalarFunction::Flatten => Self::Flatten, BuiltinScalarFunction::ArrayLength => Self::ArrayLength, BuiltinScalarFunction::ArrayNdims => Self::ArrayNdims, + BuiltinScalarFunction::ArrayPopBack => Self::ArrayPopBack, BuiltinScalarFunction::ArrayPosition => Self::ArrayPosition, BuiltinScalarFunction::ArrayPositions => Self::ArrayPositions, BuiltinScalarFunction::ArrayPrepend => Self::ArrayPrepend, diff --git a/datafusion/sqllogictest/test_files/array.slt b/datafusion/sqllogictest/test_files/array.slt index a4969b1e2067..f54c2f71718c 100644 --- a/datafusion/sqllogictest/test_files/array.slt +++ b/datafusion/sqllogictest/test_files/array.slt @@ -79,6 +79,17 @@ AS VALUES (make_array(51, 52, NULL, 54, 55, 56, 57, 58, 59, 60), 5, NULL) ; +statement ok +CREATE TABLE arrayspop +AS VALUES + (make_array(1, 2, NULL)), + (make_array(3, 4, 5, NULL)), + (make_array(6, 7, 8, NULL, 9)), + (make_array(NULL, NULL, 100)), + (NULL), + (make_array(NULL, 10, 11, 12)) +; + statement ok CREATE TABLE nested_arrays AS VALUES @@ -687,6 +698,67 @@ NULL 23 NULL 43 5 NULL +## array_pop_back (aliases: `list_pop_back`) + +# array_pop_back scalar function #1 +query ?? +select array_pop_back(make_array(1, 2, 3, 4, 5)), array_pop_back(make_array('h', 'e', 'l', 'l', 'o')); +---- +[1, 2, 3, 4] [h, e, l, l] + +# array_pop_back scalar function #2 (after array_pop_back, array is empty) +query ? 
+select array_pop_back(make_array(1)); +---- +[] + +# array_pop_back scalar function #3 (array_pop_back the empty array) +query ? +select array_pop_back(array_pop_back(make_array(1))); +---- +[] + +# array_pop_back scalar function #4 (array_pop_back the arrays which have NULL) +query ?? +select array_pop_back(make_array(1, 2, 3, 4, NULL)), array_pop_back(make_array(NULL, 'e', 'l', NULL, 'o')); +---- +[1, 2, 3, 4] [, e, l, ] + +# array_pop_back scalar function #5 (array_pop_back the nested arrays) +query ? +select array_pop_back(make_array(make_array(1, 2, 3), make_array(2, 9, 1), make_array(7, 8, 9), make_array(1, 2, 3), make_array(1, 7, 4), make_array(4, 5, 6))); +---- +[[1, 2, 3], [2, 9, 1], [7, 8, 9], [1, 2, 3], [1, 7, 4]] + +# array_pop_back scalar function #6 (array_pop_back the nested arrays with NULL) +query ? +select array_pop_back(make_array(make_array(1, 2, 3), make_array(2, 9, 1), make_array(7, 8, 9), make_array(1, 2, 3), make_array(1, 7, 4), NULL)); +---- +[[1, 2, 3], [2, 9, 1], [7, 8, 9], [1, 2, 3], [1, 7, 4]] + +# array_pop_back scalar function #7 (array_pop_back the nested arrays with NULL) +query ? +select array_pop_back(make_array(make_array(1, 2, 3), make_array(2, 9, 1), make_array(7, 8, 9), NULL, make_array(1, 7, 4))); +---- +[[1, 2, 3], [2, 9, 1], [7, 8, 9], ] + +# array_pop_back scalar function #8 (after array_pop_back, nested array is empty) +query ? +select array_pop_back(make_array(make_array(1, 2, 3))); +---- +[] + +# array_pop_back with columns +query ? 
+select array_pop_back(column1) from arrayspop; +---- +[1, 2] +[3, 4, 5] +[6, 7, 8, ] +[, ] +[] +[, 10, 11] + ## array_slice (aliases: list_slice) # array_slice scalar function #1 (with positive indexes) @@ -2430,6 +2502,9 @@ drop table arrays; statement ok drop table slices; +statement ok +drop table arrayspop; + statement ok drop table arrays_values; diff --git a/docs/source/user-guide/expressions.md b/docs/source/user-guide/expressions.md index d8dfa7af5310..a481e525fe14 100644 --- a/docs/source/user-guide/expressions.md +++ b/docs/source/user-guide/expressions.md @@ -194,6 +194,7 @@ Unlike to some databases the math functions in Datafusion works the same way as | flatten(array) | Converts an array of arrays to a flat array `flatten([[1], [2, 3], [4, 5, 6]]) -> [1, 2, 3, 4, 5, 6]` | | array_length(array, dimension) | Returns the length of the array dimension. `array_length([1, 2, 3, 4, 5]) -> 5` | | array_ndims(array) | Returns the number of dimensions of the array. `array_ndims([[1, 2, 3], [4, 5, 6]]) -> 2` | +| array_pop_back(array) | Returns the array without the last element. `array_pop_back([1, 2, 3]) -> [1, 2]` | | array_position(array, element) | Searches for an element in the array, returns first occurrence. `array_position([1, 2, 2, 3, 4], 2) -> 2` | | array_positions(array, element) | Searches for an element in the array, returns all occurrences. `array_positions([1, 2, 2, 3, 4], 2) -> [2, 3]` | | array_prepend(array, element) | Prepends an element to the beginning of an array. 
`array_prepend(1, [2, 3, 4]) -> [1, 2, 3, 4]` | diff --git a/docs/source/user-guide/sql/scalar_functions.md b/docs/source/user-guide/sql/scalar_functions.md index c6e2f5ddd828..fc6293850239 100644 --- a/docs/source/user-guide/sql/scalar_functions.md +++ b/docs/source/user-guide/sql/scalar_functions.md @@ -1481,6 +1481,7 @@ from_unixtime(expression) - [array_length](#array_length) - [array_ndims](#array_ndims) - [array_prepend](#array_prepend) +- [array_pop_back](#array_pop_back) - [array_position](#array_position) - [array_positions](#array_positions) - [array_push_back](#array_push_back) @@ -1830,6 +1831,30 @@ array_prepend(element, array) - list_prepend - list_push_front +### `array_pop_back` + +Returns the array without the last element. + +``` +array_pop_back(array) +``` + +#### Arguments + +- **array**: Array expression. + Can be a constant, column, or function, and any combination of array operators. + +#### Example + +``` +❯ select array_pop_back([1, 2, 3]); ++-------------------------------+ +| array_pop_back(List([1,2,3])) | ++-------------------------------+ +| [1, 2] | ++-------------------------------+ +``` + ### `array_position` Returns a string with an input string repeated a specified number.