diff --git a/datafusion/expr/src/type_coercion/functions.rs b/datafusion/expr/src/type_coercion/functions.rs index 6836713d8016..be83e5bb7f41 100644 --- a/datafusion/expr/src/type_coercion/functions.rs +++ b/datafusion/expr/src/type_coercion/functions.rs @@ -547,8 +547,12 @@ fn get_valid_types( let logical_type: NativeType = current_type.into(); let target_logical_type = target_type.native(); if can_coerce_to(&logical_type, target_logical_type) { - let target_type = - target_logical_type.default_cast_for(current_type)?; + let target_type = if &logical_type == target_logical_type { + current_type.clone() + } else { + target_logical_type.default_cast_for(current_type)? + }; + // let target_type = target_logical_type.default_cast_for(current_type)?; new_types.push(target_type); } } diff --git a/datafusion/functions/src/string/repeat.rs b/datafusion/functions/src/string/repeat.rs index 249ce15d6dbe..ea8038d73ffb 100644 --- a/datafusion/functions/src/string/repeat.rs +++ b/datafusion/functions/src/string/repeat.rs @@ -24,8 +24,8 @@ use arrow::array::{ ArrayRef, AsArray, GenericStringArray, GenericStringBuilder, Int64Array, OffsetSizeTrait, StringViewArray, }; -use arrow::datatypes::DataType; -use arrow::datatypes::DataType::{LargeUtf8, Utf8, Utf8View}; +use arrow::datatypes::DataType::{Dictionary, LargeUtf8, Utf8, Utf8View}; +use arrow::datatypes::{DataType, Int32Type}; use datafusion_common::cast::as_int64_array; use datafusion_common::types::{logical_int64, logical_string}; use datafusion_common::{exec_err, Result}; @@ -125,6 +125,22 @@ fn repeat(args: &[ArrayRef]) -> Result { let string_array = args[0].as_string::(); repeat_impl::>(string_array, number_array) } + Dictionary(k, v) => match &**v { + Utf8 => { + let dict_array = match &**k { + DataType::Int32 => Ok(args[0].as_dictionary::()), + _ => exec_err!("Unsupported Dictionary key type {k:?}"), + } + .unwrap(); + let string_array = dict_array + .values() + .as_any() + .downcast_ref::>() + .unwrap(); + repeat_impl::>(string_array, number_array) + } + _ => exec_err!("Unsupported type {v:?}"), + }, other => exec_err!( "Unsupported data type {other:?} for function repeat. \ Expected Utf8, Utf8View or LargeUtf8." diff --git a/datafusion/functions/src/unicode/left.rs b/datafusion/functions/src/unicode/left.rs index a6c2b9768f0b..ad7cd8c29eea 100644 --- a/datafusion/functions/src/unicode/left.rs +++ b/datafusion/functions/src/unicode/left.rs @@ -20,19 +20,19 @@ use std::cmp::Ordering; use std::sync::{Arc, OnceLock}; use arrow::array::{ - Array, ArrayAccessor, ArrayIter, ArrayRef, GenericStringArray, Int64Array, - OffsetSizeTrait, + as_dictionary_array, Array, ArrayAccessor, ArrayIter, ArrayRef, GenericStringArray, + Int64Array, OffsetSizeTrait, }; -use arrow::datatypes::DataType; +use arrow::datatypes::{DataType, Int32Type}; use crate::utils::{make_scalar_function, utf8_to_str_type}; use datafusion_common::cast::{ as_generic_string_array, as_int64_array, as_string_view_array, }; use datafusion_common::exec_err; +use datafusion_common::types::{logical_int64, logical_string}; use datafusion_common::Result; use datafusion_expr::scalar_doc_sections::DOC_SECTION_STRING; -use datafusion_expr::TypeSignature::Exact; use datafusion_expr::{ ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility, }; @@ -50,14 +50,9 @@ impl Default for LeftFunc { impl LeftFunc { pub fn new() -> Self { - use DataType::*; Self { - signature: Signature::one_of( - vec![ - Exact(vec![Utf8View, Int64]), - Exact(vec![Utf8, Int64]), - Exact(vec![LargeUtf8, Int64]), - ], + signature: Signature::coercible( + vec![logical_string(), logical_int64()], Volatility::Immutable, ), } @@ -87,6 +82,10 @@ impl ScalarUDFImpl for LeftFunc { make_scalar_function(left::, vec![])(args) } DataType::LargeUtf8 => make_scalar_function(left::, vec![])(args), + DataType::Dictionary(_, v) => match &*v { + DataType::Utf8 => make_scalar_function(left::, vec![])(args), + _ => exec_err!("Unsupported data type {v:?}"), + }, other => exec_err!( "Unsupported data type {other:?} for function left,\ expected Utf8View, Utf8 or LargeUtf8." @@ -132,9 +131,26 @@ pub fn left(args: &[ArrayRef]) -> Result { if args[0].data_type() == &DataType::Utf8View { let string_array = as_string_view_array(&args[0])?; left_impl::(string_array, n_array) - } else { + } else if args[0].data_type() == &DataType::Utf8 + || args[0].data_type() == &DataType::LargeUtf8 + { let string_array = as_generic_string_array::(&args[0])?; left_impl::(string_array, n_array) + } else { + let dict_array = match &args[0].data_type() { + DataType::Dictionary(k, _) => match **k { + DataType::Int32 => Ok(as_dictionary_array::(&args[0])), + _ => exec_err!("Unsupported Dictionary key type {k:?}"), + }, + _ => exec_err!("Unsupported type {:?}", &args[0].data_type()), + } + .unwrap(); + let string_array = dict_array + .values() + .as_any() + .downcast_ref::>() + .unwrap(); + left_impl::(string_array, n_array) } } diff --git a/datafusion/functions/src/unicode/lpad.rs b/datafusion/functions/src/unicode/lpad.rs index a639bcedcd1f..46b56c36321c 100644 --- a/datafusion/functions/src/unicode/lpad.rs +++ b/datafusion/functions/src/unicode/lpad.rs @@ -24,6 +24,7 @@ use arrow::array::{ OffsetSizeTrait, StringViewArray, }; use arrow::datatypes::DataType; +use datafusion_common::types::{logical_int64, logical_string}; use unicode_segmentation::UnicodeSegmentation; use DataType::{LargeUtf8, Utf8, Utf8View}; @@ -32,7 +33,7 @@ use crate::utils::{make_scalar_function, utf8_to_str_type}; use datafusion_common::cast::as_int64_array; use datafusion_common::{exec_err, Result}; use datafusion_expr::scalar_doc_sections::DOC_SECTION_STRING; -use datafusion_expr::TypeSignature::Exact; +use datafusion_expr::TypeSignature; use datafusion_expr::{ ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility, }; @@ -50,22 +51,15 @@ impl Default for LPadFunc { impl LPadFunc { pub fn new() -> Self { - use DataType::*; Self { signature: Signature::one_of( vec![ - Exact(vec![Utf8View, Int64]), - Exact(vec![Utf8View, Int64, Utf8View]), - Exact(vec![Utf8View, Int64, Utf8]), - Exact(vec![Utf8View, Int64, LargeUtf8]), - Exact(vec![Utf8, Int64]), - Exact(vec![Utf8, Int64, Utf8View]), - Exact(vec![Utf8, Int64, Utf8]), - Exact(vec![Utf8, Int64, LargeUtf8]), - Exact(vec![LargeUtf8, Int64]), - Exact(vec![LargeUtf8, Int64, Utf8View]), - Exact(vec![LargeUtf8, Int64, Utf8]), - Exact(vec![LargeUtf8, Int64, LargeUtf8]), + TypeSignature::Coercible(vec![logical_string(), logical_int64()]), + TypeSignature::Coercible(vec![ + logical_string(), + logical_int64(), + logical_string(), + ]), ], Volatility::Immutable, ), diff --git a/datafusion/functions/src/unicode/right.rs b/datafusion/functions/src/unicode/right.rs index ab3b7ba1a27e..aedb2eac8e3e 100644 --- a/datafusion/functions/src/unicode/right.rs +++ b/datafusion/functions/src/unicode/right.rs @@ -20,19 +20,18 @@ use std::cmp::{max, Ordering}; use std::sync::{Arc, OnceLock}; use arrow::array::{ - Array, ArrayAccessor, ArrayIter, ArrayRef, GenericStringArray, Int64Array, - OffsetSizeTrait, + as_dictionary_array, Array, ArrayAccessor, ArrayIter, ArrayRef, AsArray, GenericStringArray, Int64Array, OffsetSizeTrait }; -use arrow::datatypes::DataType; +use arrow::datatypes::{DataType, Int32Type}; use crate::utils::{make_scalar_function, utf8_to_str_type}; use datafusion_common::cast::{ as_generic_string_array, as_int64_array, as_string_view_array, }; use datafusion_common::exec_err; +use datafusion_common::types::{logical_int64, logical_string}; use datafusion_common::Result; use datafusion_expr::scalar_doc_sections::DOC_SECTION_STRING; -use datafusion_expr::TypeSignature::Exact; use datafusion_expr::{ ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility, }; @@ -50,14 +49,9 @@ impl Default for RightFunc { impl RightFunc { pub fn new() -> Self { - use DataType::*; Self { - signature: Signature::one_of( - vec![ - Exact(vec![Utf8View, Int64]), - Exact(vec![Utf8, Int64]), - Exact(vec![LargeUtf8, Int64]), - ], + signature: Signature::coercible( + vec![logical_string(), logical_int64()], Volatility::Immutable, ), } @@ -87,6 +81,10 @@ impl ScalarUDFImpl for RightFunc { make_scalar_function(right::, vec![])(args) } DataType::LargeUtf8 => make_scalar_function(right::, vec![])(args), + DataType::Dictionary(_, v) => match &*v { + DataType::Utf8 => make_scalar_function(right::, vec![])(args), + _ => exec_err!("Unsupported data type {v:?}"), + }, other => exec_err!( "Unsupported data type {other:?} for function right,\ expected Utf8View, Utf8 or LargeUtf8." @@ -132,10 +130,27 @@ pub fn right(args: &[ArrayRef]) -> Result { // string_view_right(args) let string_array = as_string_view_array(&args[0])?; right_impl::(&mut string_array.iter(), n_array) - } else { + } else if args[0].data_type() == &DataType::Utf8 + || args[0].data_type() == &DataType::LargeUtf8 + { // string_right::(args) let string_array = &as_generic_string_array::(&args[0])?; right_impl::(&mut string_array.iter(), n_array) + } else { + let dict_array = match &args[0].data_type() { + DataType::Dictionary(k, _) => match **k { + DataType::Int32 => Ok(as_dictionary_array::(&args[0])), + _ => exec_err!("Unsupported Dictionary key type {k:?}"), + }, + _ => exec_err!("Unsupported type {:?}", &args[0].data_type()), + } + .unwrap(); + let string_array = dict_array + .values() + .as_any() + .downcast_ref::>() + .unwrap(); + right_impl::(&mut string_array.iter(), n_array) } } diff --git a/datafusion/functions/src/unicode/rpad.rs b/datafusion/functions/src/unicode/rpad.rs index bd9d625105e9..13704798c037 100644 --- a/datafusion/functions/src/unicode/rpad.rs +++ b/datafusion/functions/src/unicode/rpad.rs @@ -21,12 +21,13 @@ use arrow::array::{ ArrayRef, AsArray, GenericStringArray, GenericStringBuilder, Int64Array, OffsetSizeTrait, StringViewArray, }; -use arrow::datatypes::DataType; +use arrow::datatypes::{DataType, Int32Type}; use datafusion_common::cast::as_int64_array; +use datafusion_common::types::{logical_int64, logical_string}; use datafusion_common::DataFusionError; use datafusion_common::{exec_err, Result}; use datafusion_expr::scalar_doc_sections::DOC_SECTION_STRING; -use datafusion_expr::TypeSignature::Exact; +use datafusion_expr::TypeSignature; use datafusion_expr::{ ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility, }; @@ -34,7 +35,7 @@ use std::any::Any; use std::fmt::Write; use std::sync::{Arc, OnceLock}; use unicode_segmentation::UnicodeSegmentation; -use DataType::{LargeUtf8, Utf8, Utf8View}; +use DataType::{Dictionary, LargeUtf8, Utf8, Utf8View}; #[derive(Debug)] pub struct RPadFunc { @@ -49,22 +50,15 @@ impl Default for RPadFunc { impl RPadFunc { pub fn new() -> Self { - use DataType::*; Self { signature: Signature::one_of( vec![ - Exact(vec![Utf8View, Int64]), - Exact(vec![Utf8View, Int64, Utf8View]), - Exact(vec![Utf8View, Int64, Utf8]), - Exact(vec![Utf8View, Int64, LargeUtf8]), - Exact(vec![Utf8, Int64]), - Exact(vec![Utf8, Int64, Utf8View]), - Exact(vec![Utf8, Int64, Utf8]), - Exact(vec![Utf8, Int64, LargeUtf8]), - Exact(vec![LargeUtf8, Int64]), - Exact(vec![LargeUtf8, Int64, Utf8View]), - Exact(vec![LargeUtf8, Int64, Utf8]), - Exact(vec![LargeUtf8, Int64, LargeUtf8]), + TypeSignature::Coercible(vec![logical_string(), logical_int64()]), + TypeSignature::Coercible(vec![ + logical_string(), + logical_int64(), + logical_string(), + ]), ], Volatility::Immutable, ), @@ -111,6 +105,14 @@ impl ScalarUDFImpl for RPadFunc { (3, LargeUtf8, Some(Utf8 | Utf8View)) => { make_scalar_function(rpad::, vec![])(args) } + (3, Dictionary(_, v), Some(Utf8 | Utf8View)) => match &*v { + Utf8 => make_scalar_function(rpad::, vec![])(args), + _ => exec_err!("Unsupported data type {v:?}"), + }, + (3, Dictionary(_, v), Some(LargeUtf8)) => match &*v { + Utf8 => make_scalar_function(rpad::, vec![])(args), + _ => exec_err!("Unsupported data type {v:?}"), + }, (_, _, _) => { exec_err!("Unsupported combination of data types for function rpad") } @@ -197,6 +199,39 @@ pub fn rpad( length_array, Some(args[2].as_string_view()), ), + (3, Dictionary(k, _), pad_str) => { + let dict_array = match &**k { + DataType::Int32 => Ok(args[0].as_dictionary::()), + _ => exec_err!("Unsupported Dictionary key type {k:?}"), + } + .unwrap(); + let string_array = dict_array + .values() + .as_any() + .downcast_ref::>() + .unwrap(); + match pad_str { + Some(Utf8 | LargeUtf8) => rpad_impl::< + &GenericStringArray, + &GenericStringArray, + StringArrayLen, + >( + string_array, + length_array, + Some(args[2].as_string::()), + ), + Some(Utf8View) => { + rpad_impl::< + &GenericStringArray, + &StringViewArray, + StringArrayLen, + >( + string_array, length_array, Some(args[2].as_string_view()) + ) + } + _ => exec_err!("Unsupported type {pad_str:?}"), + } + } (_, _, _) => rpad_impl::< &GenericStringArray, &GenericStringArray, diff --git a/datafusion/sqllogictest/test_files/jctest.slt b/datafusion/sqllogictest/test_files/jctest.slt new file mode 100644 index 000000000000..7e8ee80ab8a4 --- /dev/null +++ b/datafusion/sqllogictest/test_files/jctest.slt @@ -0,0 +1,49 @@ +# -------------------------------------- +# String_view specific tests +# -------------------------------------- +statement ok +create table test_source as values + ('Andrew', 'X', 'datafusion📊🔥', '🔥'), + ('Xiangpeng', 'Xiangpeng', 'datafusion数据融合', 'datafusion数据融合'), + ('Raphael', 'R', 'datafusionДатаФусион', 'аФус'), + ('under_score', 'un_____core', 'un iść core', 'chrząszcz na łące w 東京都'), + ('percent', 'p%t', 'pan Tadeusz ma iść w kąt', 'Pan Tadeusz ma frunąć stąd w kąt'), + ('', '%', '', ''), + ('%', '\%', '', ''), + ('_', '\_', '', ''), + (NULL, '%', NULL, NULL), + (NULL, 'R', NULL, '🔥'); + +# -------------------------------------- +# Setup test tables with different physical string types +# and repeat tests in `string_query.slt.part` +# -------------------------------------- +statement ok +create table test_basic_operator as +select + arrow_cast(column1, 'Utf8') as ascii_1, + arrow_cast(column2, 'Utf8') as ascii_2, + arrow_cast(column3, 'Utf8') as unicode_1, + arrow_cast(column4, 'Utf8') as unicode_2 +from test_source; + +query TTTTTT +SELECT + RIGHT(ascii_1, 3), + RIGHT(ascii_1, 0), + RIGHT(ascii_1, -3), + RIGHT(unicode_1, 3), + RIGHT(unicode_1, 0), + RIGHT(unicode_1, -3) +FROM test_basic_operator; +---- +rew (empty) rew n📊🔥 (empty) afusion📊🔥 +eng (empty) ngpeng 据融合 (empty) afusion数据融合 +ael (empty) hael ион (empty) afusionДатаФусион +ore (empty) er_score ore (empty) iść core +ent (empty) cent kąt (empty) Tadeusz ma iść w kąt +(empty) (empty) (empty) (empty) (empty) (empty) +% (empty) (empty) (empty) (empty) (empty) +_ (empty) (empty) (empty) (empty) (empty) +NULL NULL NULL NULL NULL NULL +NULL NULL NULL NULL NULL NULL \ No newline at end of file