From 1087ace137548ded02c3fae77d8f9461d9045ecf Mon Sep 17 00:00:00 2001 From: Devan Date: Tue, 13 Aug 2024 09:37:31 -0500 Subject: [PATCH 1/9] feat/11953: Support StringView for TRANSLATE() fn Signed-off-by: Devan --- datafusion/functions/src/unicode/translate.rs | 6 +++++- datafusion/sqllogictest/test_files/string_view.slt | 3 +-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/datafusion/functions/src/unicode/translate.rs b/datafusion/functions/src/unicode/translate.rs index 5f64d8875bf5..93054c13b9ad 100644 --- a/datafusion/functions/src/unicode/translate.rs +++ b/datafusion/functions/src/unicode/translate.rs @@ -46,7 +46,10 @@ impl TranslateFunc { use DataType::*; Self { signature: Signature::one_of( - vec![Exact(vec![Utf8, Utf8, Utf8])], + vec![ + Exact(vec![Utf8View, Utf8, Utf8]), + Exact(vec![Utf8, Utf8, Utf8]) + ], Volatility::Immutable, ), } @@ -72,6 +75,7 @@ impl ScalarUDFImpl for TranslateFunc { fn invoke(&self, args: &[ColumnarValue]) -> Result { match args[0].data_type() { + DataType::Utf8View => make_scalar_function(translate::, vec![])(args), DataType::Utf8 => make_scalar_function(translate::, vec![])(args), DataType::LargeUtf8 => make_scalar_function(translate::, vec![])(args), other => { diff --git a/datafusion/sqllogictest/test_files/string_view.slt b/datafusion/sqllogictest/test_files/string_view.slt index fcd71b7f7e94..6731e06cc2da 100644 --- a/datafusion/sqllogictest/test_files/string_view.slt +++ b/datafusion/sqllogictest/test_files/string_view.slt @@ -895,14 +895,13 @@ logical_plan 02)--TableScan: test projection=[column1_utf8view, column2_utf8view] ## Ensure no casts for TRANSLATE -## TODO file ticket query TT EXPLAIN SELECT TRANSLATE(column1_utf8view, 'foo', 'bar') as c FROM test; ---- logical_plan -01)Projection: translate(CAST(test.column1_utf8view AS Utf8), Utf8("foo"), Utf8("bar")) AS c +01)Projection: translate(test.column1_utf8view, Utf8("foo"), Utf8("bar")) AS c 02)--TableScan: test projection=[column1_utf8view] ## Ensure no casts for FIND_IN_SET From 6fd2fa3ccf509ad86ed3bb118b4c9092ec3ee7aa Mon Sep 17 00:00:00 2001 From: Devan Date: Tue, 13 Aug 2024 13:41:36 -0500 Subject: [PATCH 2/9] formatting Signed-off-by: Devan --- datafusion/functions/src/unicode/translate.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/functions/src/unicode/translate.rs b/datafusion/functions/src/unicode/translate.rs index 93054c13b9ad..ff41b0a26c4c 100644 --- a/datafusion/functions/src/unicode/translate.rs +++ b/datafusion/functions/src/unicode/translate.rs @@ -48,7 +48,7 @@ impl TranslateFunc { signature: Signature::one_of( vec![ Exact(vec![Utf8View, Utf8, Utf8]), - Exact(vec![Utf8, Utf8, Utf8]) + Exact(vec![Utf8, Utf8, Utf8]), ], Volatility::Immutable, ), From d1cc4a9c60f2323f8800a8957e72082012019119 Mon Sep 17 00:00:00 2001 From: Devan Date: Tue, 13 Aug 2024 14:31:59 -0500 Subject: [PATCH 3/9] fixes internal error for GenericByteArray cast Signed-off-by: Devan --- datafusion/functions/src/unicode/translate.rs | 72 +++++++++++++------ 1 file changed, 50 insertions(+), 22 deletions(-) diff --git a/datafusion/functions/src/unicode/translate.rs b/datafusion/functions/src/unicode/translate.rs index ff41b0a26c4c..53021bea6505 100644 --- a/datafusion/functions/src/unicode/translate.rs +++ b/datafusion/functions/src/unicode/translate.rs @@ -18,18 +18,19 @@ use std::any::Any; use std::sync::Arc; -use arrow::array::{ArrayRef, GenericStringArray, OffsetSizeTrait}; -use arrow::datatypes::DataType; +use arrow::array::{ + ArrayAccessor, ArrayIter, ArrayRef, ArrowPrimitiveType, AsArray, OffsetSizeTrait, + StringArray, +}; +use arrow::datatypes::{DataType, Int32Type, Int64Type}; use hashbrown::HashMap; use unicode_segmentation::UnicodeSegmentation; -use datafusion_common::cast::as_generic_string_array; +use crate::utils::{make_scalar_function, utf8_to_str_type}; use datafusion_common::{exec_err, Result}; use datafusion_expr::TypeSignature::Exact; use datafusion_expr::{ColumnarValue, ScalarUDFImpl, Signature, Volatility}; -use crate::utils::{make_scalar_function, utf8_to_str_type}; - #[derive(Debug)] pub struct TranslateFunc { signature: Signature, @@ -74,28 +75,55 @@ impl ScalarUDFImpl for TranslateFunc { } fn invoke(&self, args: &[ColumnarValue]) -> Result { - match args[0].data_type() { - DataType::Utf8View => make_scalar_function(translate::, vec![])(args), - DataType::Utf8 => make_scalar_function(translate::, vec![])(args), - DataType::LargeUtf8 => make_scalar_function(translate::, vec![])(args), - other => { - exec_err!("Unsupported data type {other:?} for function translate") - } + make_scalar_function(invoke_translate, vec![])(args) + } +} + +fn invoke_translate(args: &[ArrayRef]) -> Result { + match args[0].data_type() { + DataType::Utf8View => { + let string_array = args[0].as_string_view(); + let from_array = args[1].as_string::(); + let to_array = args[1].as_string::(); + translate::(string_array, from_array, to_array) + } + DataType::Utf8 => { + let string_array = args[0].as_string::(); + let from_array = args[1].as_string::(); + let to_array = args[1].as_string::(); + translate::(string_array, from_array, to_array) + } + DataType::LargeUtf8 => { + let string_array = args[0].as_string::(); + let from_array = args[1].as_string::(); + let to_array = args[1].as_string::(); + translate::(string_array, from_array, to_array) + } + other => { + exec_err!("Unsupported data type {other:?} for function translate") } } } /// Replaces each character in string that matches a character in the from set with the corresponding character in the to set. If from is longer than to, occurrences of the extra characters in from are deleted. /// translate('12345', '143', 'ax') = 'a2x5' -fn translate(args: &[ArrayRef]) -> Result { - let string_array = as_generic_string_array::(&args[0])?; - let from_array = as_generic_string_array::(&args[1])?; - let to_array = as_generic_string_array::(&args[2])?; - - let result = string_array - .iter() - .zip(from_array.iter()) - .zip(to_array.iter()) +fn translate<'a, T: ArrowPrimitiveType, V, B>( + string_array: V, + from_array: B, + to_array: B, +) -> Result +where + V: ArrayAccessor, + B: ArrayAccessor, + T::Native: OffsetSizeTrait, +{ + let string_array_iter = ArrayIter::new(string_array); + let from_array_iter = ArrayIter::new(from_array); + let to_array_iter = ArrayIter::new(to_array); + + let result = string_array_iter + .zip(from_array_iter) + .zip(to_array_iter) .map(|((string, from), to)| match (string, from, to) { (Some(string), Some(from), Some(to)) => { // create a hashmap of [char, index] to change from O(n) to O(1) for from list @@ -124,7 +152,7 @@ fn translate(args: &[ArrayRef]) -> Result { } _ => None, }) - .collect::>(); + .collect::(); Ok(Arc::new(result) as ArrayRef) } From b97ccb48445e4984a8844610d9174e27e2473fc9 Mon Sep 17 00:00:00 2001 From: Devan Date: Tue, 13 Aug 2024 14:57:46 -0500 Subject: [PATCH 4/9] adds additional TRANSLATE test Signed-off-by: Devan --- datafusion/sqllogictest/test_files/string_view.slt | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/datafusion/sqllogictest/test_files/string_view.slt b/datafusion/sqllogictest/test_files/string_view.slt index 6731e06cc2da..cd569e70869d 100644 --- a/datafusion/sqllogictest/test_files/string_view.slt +++ b/datafusion/sqllogictest/test_files/string_view.slt @@ -425,6 +425,18 @@ logical_plan 01)Projection: starts_with(test.column1_utf8view, Utf8View("äöüß")) AS c1, starts_with(test.column1_utf8view, Utf8View("")) AS c2, starts_with(test.column1_utf8view, Utf8View(NULL)) AS c3, starts_with(Utf8View(NULL), test.column1_utf8view) AS c4 02)--TableScan: test projection=[column1_utf8view] +### Test TRANSLATE + +query T +SELECT + TRANSLATE(column1_utf8view, 'foo', 'bar') as c +FROM test; +---- +Andrew +Xiangpeng +Raphael +NULL + ### Initcap query TT From 3915e2e085401988eb7138c4147f0b9d9a379831 Mon Sep 17 00:00:00 2001 From: Devan Date: Tue, 13 Aug 2024 14:59:25 -0500 Subject: [PATCH 5/9] adds additional TRANSLATE test Signed-off-by: Devan --- datafusion/sqllogictest/test_files/string_view.slt | 1 + 1 file changed, 1 insertion(+) diff --git a/datafusion/sqllogictest/test_files/string_view.slt b/datafusion/sqllogictest/test_files/string_view.slt index cd569e70869d..7f6b41a7bde0 100644 --- a/datafusion/sqllogictest/test_files/string_view.slt +++ b/datafusion/sqllogictest/test_files/string_view.slt @@ -427,6 +427,7 @@ logical_plan ### Test TRANSLATE +# Should run TRANSLATE using utf8view column successfully query T SELECT TRANSLATE(column1_utf8view, 'foo', 'bar') as c From 57f31614b0e43c35a0266705c2a201ebd89cd4c7 Mon Sep 17 00:00:00 2001 From: Devan Date: Tue, 13 Aug 2024 15:32:33 -0500 Subject: [PATCH 6/9] rm unnecessary generic Signed-off-by: Devan --- datafusion/functions/src/unicode/translate.rs | 20 ++++++------------- 1 file changed, 6 insertions(+), 14 deletions(-) diff --git a/datafusion/functions/src/unicode/translate.rs b/datafusion/functions/src/unicode/translate.rs index 53021bea6505..53f774cd858d 100644 --- a/datafusion/functions/src/unicode/translate.rs +++ b/datafusion/functions/src/unicode/translate.rs @@ -18,11 +18,8 @@ use std::any::Any; use std::sync::Arc; -use arrow::array::{ - ArrayAccessor, ArrayIter, ArrayRef, ArrowPrimitiveType, AsArray, OffsetSizeTrait, - StringArray, -}; -use arrow::datatypes::{DataType, Int32Type, Int64Type}; +use arrow::array::{ArrayAccessor, ArrayIter, ArrayRef, AsArray, StringArray}; +use arrow::datatypes::DataType; use hashbrown::HashMap; use unicode_segmentation::UnicodeSegmentation; @@ -85,19 +82,19 @@ fn invoke_translate(args: &[ArrayRef]) -> Result { let string_array = args[0].as_string_view(); let from_array = args[1].as_string::(); let to_array = args[1].as_string::(); - translate::(string_array, from_array, to_array) + translate::<_, _>(string_array, from_array, to_array) } DataType::Utf8 => { let string_array = args[0].as_string::(); let from_array = args[1].as_string::(); let to_array = args[1].as_string::(); - translate::(string_array, from_array, to_array) + translate::<_, _>(string_array, from_array, to_array) } DataType::LargeUtf8 => { let string_array = args[0].as_string::(); let from_array = args[1].as_string::(); let to_array = args[1].as_string::(); - translate::(string_array, from_array, to_array) + translate::<_, _>(string_array, from_array, to_array) } other => { exec_err!("Unsupported data type {other:?} for function translate") @@ -107,15 +104,10 @@ fn invoke_translate(args: &[ArrayRef]) -> Result { /// Replaces each character in string that matches a character in the from set with the corresponding character in the to set. If from is longer than to, occurrences of the extra characters in from are deleted. /// translate('12345', '143', 'ax') = 'a2x5' -fn translate<'a, T: ArrowPrimitiveType, V, B>( - string_array: V, - from_array: B, - to_array: B, -) -> Result +fn translate<'a, V, B>(string_array: V, from_array: B, to_array: B) -> Result where V: ArrayAccessor, B: ArrayAccessor, - T::Native: OffsetSizeTrait, { let string_array_iter = ArrayIter::new(string_array); let from_array_iter = ArrayIter::new(from_array); From a0c78de926220cc598af7060f697d45db6ba4d59 Mon Sep 17 00:00:00 2001 From: Devan Date: Tue, 13 Aug 2024 22:01:06 -0500 Subject: [PATCH 7/9] cleanup + fix typo Signed-off-by: Devan --- datafusion/functions/src/unicode/translate.rs | 36 +++++++++++-------- 1 file changed, 22 insertions(+), 14 deletions(-) diff --git a/datafusion/functions/src/unicode/translate.rs b/datafusion/functions/src/unicode/translate.rs index 53f774cd858d..6a3fc257b807 100644 --- a/datafusion/functions/src/unicode/translate.rs +++ b/datafusion/functions/src/unicode/translate.rs @@ -18,12 +18,15 @@ use std::any::Any; use std::sync::Arc; -use arrow::array::{ArrayAccessor, ArrayIter, ArrayRef, AsArray, StringArray}; +use arrow::array::{ + ArrayAccessor, ArrayRef, AsArray, GenericStringArray, OffsetSizeTrait, +}; use arrow::datatypes::DataType; use hashbrown::HashMap; use unicode_segmentation::UnicodeSegmentation; use crate::utils::{make_scalar_function, utf8_to_str_type}; +use datafusion_common::cast::as_generic_string_array; use datafusion_common::{exec_err, Result}; use datafusion_expr::TypeSignature::Exact; use datafusion_expr::{ColumnarValue, ScalarUDFImpl, Signature, Volatility}; @@ -81,20 +84,20 @@ fn invoke_translate(args: &[ArrayRef]) -> Result { DataType::Utf8View => { let string_array = args[0].as_string_view(); let from_array = args[1].as_string::(); - let to_array = args[1].as_string::(); - translate::<_, _>(string_array, from_array, to_array) + let to_array = args[2].as_string::(); + translate::(string_array, from_array, to_array) } DataType::Utf8 => { let string_array = args[0].as_string::(); let from_array = args[1].as_string::(); - let to_array = args[1].as_string::(); - translate::<_, _>(string_array, from_array, to_array) + let to_array = args[2].as_string::(); + translate::(string_array, from_array, to_array) } DataType::LargeUtf8 => { let string_array = args[0].as_string::(); let from_array = args[1].as_string::(); - let to_array = args[1].as_string::(); - translate::<_, _>(string_array, from_array, to_array) + let to_array = args[2].as_string::(); + translate::(string_array, from_array, to_array) } other => { exec_err!("Unsupported data type {other:?} for function translate") @@ -104,18 +107,23 @@ fn invoke_translate(args: &[ArrayRef]) -> Result { /// Replaces each character in string that matches a character in the from set with the corresponding character in the to set. If from is longer than to, occurrences of the extra characters in from are deleted. /// translate('12345', '143', 'ax') = 'a2x5' -fn translate<'a, V, B>(string_array: V, from_array: B, to_array: B) -> Result +fn translate<'a, T: OffsetSizeTrait, V, B>( + string_array: V, + from_array: B, + to_array: B, +) -> Result where V: ArrayAccessor, B: ArrayAccessor, { - let string_array_iter = ArrayIter::new(string_array); - let from_array_iter = ArrayIter::new(from_array); - let to_array_iter = ArrayIter::new(to_array); + let string_array_iter = as_generic_string_array::(&string_array)?; + let from_array_iter = as_generic_string_array::(&from_array)?; + let to_array_iter = as_generic_string_array::(&to_array)?; let result = string_array_iter - .zip(from_array_iter) - .zip(to_array_iter) + .iter() + .zip(from_array_iter.iter()) + .zip(to_array_iter.iter()) .map(|((string, from), to)| match (string, from, to) { (Some(string), Some(from), Some(to)) => { // create a hashmap of [char, index] to change from O(n) to O(1) for from list @@ -144,7 +152,7 @@ where } _ => None, }) - .collect::(); + .collect::>(); Ok(Arc::new(result) as ArrayRef) } From 6b97d7beb4f1b9264268ec0b1f9ecbe60430ea11 Mon Sep 17 00:00:00 2001 From: Devan Date: Tue, 13 Aug 2024 22:04:46 -0500 Subject: [PATCH 8/9] cleanup + fix typo Signed-off-by: Devan --- datafusion/functions/src/unicode/translate.rs | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/datafusion/functions/src/unicode/translate.rs b/datafusion/functions/src/unicode/translate.rs index 6a3fc257b807..a42b9c6cb857 100644 --- a/datafusion/functions/src/unicode/translate.rs +++ b/datafusion/functions/src/unicode/translate.rs @@ -19,14 +19,13 @@ use std::any::Any; use std::sync::Arc; use arrow::array::{ - ArrayAccessor, ArrayRef, AsArray, GenericStringArray, OffsetSizeTrait, + ArrayAccessor, ArrayIter, ArrayRef, AsArray, GenericStringArray, OffsetSizeTrait, }; use arrow::datatypes::DataType; use hashbrown::HashMap; use unicode_segmentation::UnicodeSegmentation; use crate::utils::{make_scalar_function, utf8_to_str_type}; -use datafusion_common::cast::as_generic_string_array; use datafusion_common::{exec_err, Result}; use datafusion_expr::TypeSignature::Exact; use datafusion_expr::{ColumnarValue, ScalarUDFImpl, Signature, Volatility}; @@ -116,14 +115,13 @@ where V: ArrayAccessor, B: ArrayAccessor, { - let string_array_iter = as_generic_string_array::(&string_array)?; - let from_array_iter = as_generic_string_array::(&from_array)?; - let to_array_iter = as_generic_string_array::(&to_array)?; + let string_array_iter = ArrayIter::new(string_array); + let from_array_iter = ArrayIter::new(from_array); + let to_array_iter = ArrayIter::new(to_array); let result = string_array_iter - .iter() - .zip(from_array_iter.iter()) - .zip(to_array_iter.iter()) + .zip(from_array_iter) + .zip(to_array_iter) .map(|((string, from), to)| match (string, from, to) { (Some(string), Some(from), Some(to)) => { // create a hashmap of [char, index] to change from O(n) to O(1) for from list From 960594d943dcbca9cb276a8219b291f156f141c0 Mon Sep 17 00:00:00 2001 From: Devan Date: Wed, 14 Aug 2024 08:35:17 -0500 Subject: [PATCH 9/9] adds some additional testing to sqllogictests for TRANSLATE string_view Signed-off-by: Devan --- .../sqllogictest/test_files/string_view.slt | 24 +++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/datafusion/sqllogictest/test_files/string_view.slt b/datafusion/sqllogictest/test_files/string_view.slt index 7f6b41a7bde0..2bfc0978ba5d 100644 --- a/datafusion/sqllogictest/test_files/string_view.slt +++ b/datafusion/sqllogictest/test_files/string_view.slt @@ -438,6 +438,30 @@ Xiangpeng Raphael NULL +# Should run TRANSLATE using utf8 column successfully +query T +SELECT + TRANSLATE(column1_utf8, 'foo', 'bar') as c +FROM test; +---- +Andrew +Xiangpeng +Raphael +NULL + +# Should run TRANSLATE using large_utf8 column successfully +query T +SELECT + TRANSLATE(column1_large_utf8, 'foo', 'bar') as c +FROM test; +---- +Andrew +Xiangpeng +Raphael +NULL + + + ### Initcap query TT