From 7831deecee348e6839d48f6dbc2a13d067757317 Mon Sep 17 00:00:00 2001 From: kf zheng <100595273+kev1n8@users.noreply.github.com> Date: Tue, 13 Aug 2024 00:18:10 +0800 Subject: [PATCH 1/6] add stringview support for RIGHT --- datafusion/functions/src/unicode/right.rs | 65 ++++++++++++++++++++--- 1 file changed, 58 insertions(+), 7 deletions(-) diff --git a/datafusion/functions/src/unicode/right.rs b/datafusion/functions/src/unicode/right.rs index 20cbbe020ff1..9875cc4e8a62 100644 --- a/datafusion/functions/src/unicode/right.rs +++ b/datafusion/functions/src/unicode/right.rs @@ -19,17 +19,18 @@ use std::any::Any; use std::cmp::{max, Ordering}; use std::sync::Arc; -use arrow::array::{ArrayRef, GenericStringArray, OffsetSizeTrait}; +use arrow::array::{Array, ArrayRef, GenericStringArray, OffsetSizeTrait}; use arrow::datatypes::DataType; -use datafusion_common::cast::{as_generic_string_array, as_int64_array}; +use crate::utils::{make_scalar_function, utf8_to_str_type}; +use datafusion_common::cast::{ + as_generic_string_array, as_int64_array, as_string_view_array, +}; use datafusion_common::exec_err; use datafusion_common::Result; use datafusion_expr::TypeSignature::Exact; use datafusion_expr::{ColumnarValue, ScalarUDFImpl, Signature, Volatility}; -use crate::utils::{make_scalar_function, utf8_to_str_type}; - #[derive(Debug)] pub struct RightFunc { signature: Signature, @@ -46,7 +47,11 @@ impl RightFunc { use DataType::*; Self { signature: Signature::one_of( - vec![Exact(vec![Utf8, Int64]), Exact(vec![LargeUtf8, Int64])], + vec![ + Exact(vec![Utf8View, Int64]), + Exact(vec![Utf8, Int64]), + Exact(vec![LargeUtf8, Int64]), + ], Volatility::Immutable, ), } @@ -72,9 +77,14 @@ impl ScalarUDFImpl for RightFunc { fn invoke(&self, args: &[ColumnarValue]) -> Result { match args[0].data_type() { - DataType::Utf8 => make_scalar_function(right::, vec![])(args), + DataType::Utf8 | DataType::Utf8View => { + make_scalar_function(right::, vec![])(args) + } DataType::LargeUtf8 => make_scalar_function(right::, vec![])(args), - other => exec_err!("Unsupported data type {other:?} for function right"), + other => exec_err!( + "Unsupported data type {other:?} for function right,\ + expected Utf8View, Utf8 or LargeUtf8." + ), } } } @@ -83,6 +93,47 @@ impl ScalarUDFImpl for RightFunc { /// right('abcde', 2) = 'de' /// The implementation uses UTF-8 code points as characters pub fn right(args: &[ArrayRef]) -> Result { + if args[0].data_type() == &DataType::Utf8View { + string_view_right(args) + } else { + string_right::(args) + } +} + +// Currently the return type can only be Utf8 or LargeUtf8, to reach fully support, we need +// to edit the `get_optimal_return_type` in utils.rs to make the udfs be able to return Utf8View +// See https://github.com/apache/datafusion/issues/11790#issuecomment-2283777166 +fn string_view_right(args: &[ArrayRef]) -> Result { + let string_array = as_string_view_array(&args[0])?; + let n_array = as_int64_array(&args[1])?; + + let result = string_array + .iter() + .zip(n_array.iter()) + .map(|(string, n)| match (string, n) { + (Some(string), Some(n)) => match n.cmp(&0) { + Ordering::Less => Some( + string + .chars() + .skip(n.unsigned_abs() as usize) + .collect::(), + ), + Ordering::Equal => Some("".to_string()), + Ordering::Greater => Some( + string + .chars() + .skip(max(string.chars().count() as i64 - n, 0) as usize) + .collect::(), + ), + }, + _ => None, + }) + .collect::>(); // the return type of Utf8View is currently Utf8 + + Ok(Arc::new(result) as ArrayRef) +} + +fn string_right(args: &[ArrayRef]) -> Result { let string_array = as_generic_string_array::(&args[0])?; let n_array = as_int64_array(&args[1])?; From 42b8d6ea831a8abd0549abbe80a63798769fe046 Mon Sep 17 00:00:00 2001 From: kf zheng <100595273+kev1n8@users.noreply.github.com> Date: Tue, 13 Aug 2024 00:23:05 +0800 Subject: [PATCH 2/6] add tests of stringview support for RIGHT --- .../sqllogictest/test_files/string_view.slt | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/datafusion/sqllogictest/test_files/string_view.slt b/datafusion/sqllogictest/test_files/string_view.slt index fcd71b7f7e94..4f1119ccea6b 100644 --- a/datafusion/sqllogictest/test_files/string_view.slt +++ b/datafusion/sqllogictest/test_files/string_view.slt @@ -809,16 +809,28 @@ logical_plan 03)----TableScan: test projection=[column1_utf8view] ## Ensure no casts for RIGHT -## TODO file ticket query TT EXPLAIN SELECT RIGHT(column1_utf8view, 3) as c2 FROM test; ---- logical_plan -01)Projection: right(CAST(test.column1_utf8view AS Utf8), Int64(3)) AS c2 +01)Projection: right(test.column1_utf8view, Int64(3)) AS c2 02)--TableScan: test projection=[column1_utf8view] +# Test outputs of RIGHT +query TTT +SELECT + RIGHT(column1_utf8view, 3) as c1, + RIGHT(column1_utf8view, 0) as c2, + RIGHT(column1_utf8view, -3) as c3 +FROM test; +---- +rew (empty) rew +eng (empty) ngpeng +ael (empty) hael +NULL NULL NULL + ## Ensure no casts for RPAD ## TODO file ticket query TT From 6506cabf35668c95b49883c5540df553e26f4d7d Mon Sep 17 00:00:00 2001 From: kf zheng <100595273+kev1n8@users.noreply.github.com> Date: Tue, 13 Aug 2024 11:54:39 +0800 Subject: [PATCH 3/6] combine functions by ArrayAccessor and ArrayIter --- datafusion/functions/src/unicode/right.rs | 54 ++++++----------------- 1 file changed, 14 insertions(+), 40 deletions(-) diff --git a/datafusion/functions/src/unicode/right.rs b/datafusion/functions/src/unicode/right.rs index 9875cc4e8a62..b9ebc6532df6 100644 --- a/datafusion/functions/src/unicode/right.rs +++ b/datafusion/functions/src/unicode/right.rs @@ -19,7 +19,7 @@ use std::any::Any; use std::cmp::{max, Ordering}; use std::sync::Arc; -use arrow::array::{Array, ArrayRef, GenericStringArray, OffsetSizeTrait}; +use arrow::array::{ArrayAccessor, ArrayIter, ArrayRef, GenericStringArray, Int64Array, OffsetSizeTrait}; use arrow::datatypes::DataType; use crate::utils::{make_scalar_function, utf8_to_str_type}; @@ -93,52 +93,26 @@ impl ScalarUDFImpl for RightFunc { /// right('abcde', 2) = 'de' /// The implementation uses UTF-8 code points as characters pub fn right(args: &[ArrayRef]) -> Result { + let n_array = as_int64_array(&args[1])?; if args[0].data_type() == &DataType::Utf8View { - string_view_right(args) + // string_view_right(args) + let string_array = as_string_view_array(&args[0])?; + right_impl::(&mut string_array.iter(), n_array) } else { - string_right::(args) + // string_right::(args) + let string_array = &as_generic_string_array::(&args[0])?; + right_impl::(&mut string_array.iter(), n_array) } } // Currently the return type can only be Utf8 or LargeUtf8, to reach fully support, we need // to edit the `get_optimal_return_type` in utils.rs to make the udfs be able to return Utf8View // See https://github.com/apache/datafusion/issues/11790#issuecomment-2283777166 -fn string_view_right(args: &[ArrayRef]) -> Result { - let string_array = as_string_view_array(&args[0])?; - let n_array = as_int64_array(&args[1])?; - - let result = string_array - .iter() - .zip(n_array.iter()) - .map(|(string, n)| match (string, n) { - (Some(string), Some(n)) => match n.cmp(&0) { - Ordering::Less => Some( - string - .chars() - .skip(n.unsigned_abs() as usize) - .collect::(), - ), - Ordering::Equal => Some("".to_string()), - Ordering::Greater => Some( - string - .chars() - .skip(max(string.chars().count() as i64 - n, 0) as usize) - .collect::(), - ), - }, - _ => None, - }) - .collect::>(); // the return type of Utf8View is currently Utf8 - - Ok(Arc::new(result) as ArrayRef) -} - -fn string_right(args: &[ArrayRef]) -> Result { - let string_array = as_generic_string_array::(&args[0])?; - let n_array = as_int64_array(&args[1])?; - - let result = string_array - .iter() +fn right_impl<'a, T: OffsetSizeTrait, V: ArrayAccessor>( + string_array_iter: &mut ArrayIter, + n_array: &Int64Array, +) -> Result { + let result = string_array_iter .zip(n_array.iter()) .map(|(string, n)| match (string, n) { (Some(string), Some(n)) => match n.cmp(&0) { @@ -165,7 +139,7 @@ fn string_right(args: &[ArrayRef]) -> Result { #[cfg(test)] mod tests { - use arrow::array::{Array, StringArray}; + use arrow::array::StringArray; use arrow::datatypes::DataType::Utf8; use datafusion_common::{Result, ScalarValue}; From 630de25713788f1021c48ef7b6a0b99fba915e19 Mon Sep 17 00:00:00 2001 From: kf zheng <100595273+kev1n8@users.noreply.github.com> Date: Tue, 13 Aug 2024 11:57:57 +0800 Subject: [PATCH 4/6] fix fmt --- datafusion/functions/src/unicode/right.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/datafusion/functions/src/unicode/right.rs b/datafusion/functions/src/unicode/right.rs index b9ebc6532df6..2b38d43a482d 100644 --- a/datafusion/functions/src/unicode/right.rs +++ b/datafusion/functions/src/unicode/right.rs @@ -19,7 +19,9 @@ use std::any::Any; use std::cmp::{max, Ordering}; use std::sync::Arc; -use arrow::array::{ArrayAccessor, ArrayIter, ArrayRef, GenericStringArray, Int64Array, OffsetSizeTrait}; +use arrow::array::{ + ArrayAccessor, ArrayIter, ArrayRef, GenericStringArray, Int64Array, OffsetSizeTrait, +}; use arrow::datatypes::DataType; use crate::utils::{make_scalar_function, utf8_to_str_type}; From 67395149a55ea60de1a7cd087cbe678bcdbc6804 Mon Sep 17 00:00:00 2001 From: kf zheng <100595273+kev1n8@users.noreply.github.com> Date: Tue, 13 Aug 2024 12:01:54 +0800 Subject: [PATCH 5/6] fix clippy --- datafusion/functions/src/unicode/right.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/datafusion/functions/src/unicode/right.rs b/datafusion/functions/src/unicode/right.rs index 2b38d43a482d..f3fd5d05c7b1 100644 --- a/datafusion/functions/src/unicode/right.rs +++ b/datafusion/functions/src/unicode/right.rs @@ -20,7 +20,7 @@ use std::cmp::{max, Ordering}; use std::sync::Arc; use arrow::array::{ - ArrayAccessor, ArrayIter, ArrayRef, GenericStringArray, Int64Array, OffsetSizeTrait, + Array, ArrayAccessor, ArrayIter, ArrayRef, GenericStringArray, Int64Array, OffsetSizeTrait, }; use arrow::datatypes::DataType; @@ -141,7 +141,7 @@ fn right_impl<'a, T: OffsetSizeTrait, V: ArrayAccessor>( #[cfg(test)] mod tests { - use arrow::array::StringArray; + use arrow::array::{Array, StringArray}; use arrow::datatypes::DataType::Utf8; use datafusion_common::{Result, ScalarValue}; From 459124b250a9cbd4184570729557ce57136c8bea Mon Sep 17 00:00:00 2001 From: kf zheng <100595273+kev1n8@users.noreply.github.com> Date: Tue, 13 Aug 2024 12:50:26 +0800 Subject: [PATCH 6/6] fix fmt --- datafusion/functions/src/unicode/right.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/datafusion/functions/src/unicode/right.rs b/datafusion/functions/src/unicode/right.rs index f3fd5d05c7b1..9d542bb2c006 100644 --- a/datafusion/functions/src/unicode/right.rs +++ b/datafusion/functions/src/unicode/right.rs @@ -20,7 +20,8 @@ use std::cmp::{max, Ordering}; use std::sync::Arc; use arrow::array::{ - Array, ArrayAccessor, ArrayIter, ArrayRef, GenericStringArray, Int64Array, OffsetSizeTrait, + Array, ArrayAccessor, ArrayIter, ArrayRef, GenericStringArray, Int64Array, + OffsetSizeTrait, }; use arrow::datatypes::DataType;