diff --git a/datafusion/expr-common/src/type_coercion/binary.rs b/datafusion/expr-common/src/type_coercion/binary.rs index 3617f56905a9..d1144ed40759 100644 --- a/datafusion/expr-common/src/type_coercion/binary.rs +++ b/datafusion/expr-common/src/type_coercion/binary.rs @@ -984,6 +984,26 @@ fn string_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option } } +/// This will be deprecated when binary operators native support +/// for Utf8View (use `string_coercion` instead). +fn regex_comparison_string_coercion( + lhs_type: &DataType, + rhs_type: &DataType, +) -> Option { + use arrow::datatypes::DataType::*; + match (lhs_type, rhs_type) { + // If Utf8View is in any side, we coerce to Utf8. + (Utf8View, Utf8View | Utf8 | LargeUtf8) | (Utf8 | LargeUtf8, Utf8View) => { + Some(Utf8) + } + // Then, if LargeUtf8 is in any side, we coerce to LargeUtf8. + (LargeUtf8, Utf8 | LargeUtf8) | (Utf8, LargeUtf8) => Some(LargeUtf8), + // Utf8 coerces to Utf8 + (Utf8, Utf8) => Some(Utf8), + _ => None, + } +} + fn numeric_string_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option { use arrow::datatypes::DataType::*; match (lhs_type, rhs_type) { @@ -1072,10 +1092,10 @@ fn regex_null_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option Option { - string_coercion(lhs_type, rhs_type) + regex_comparison_string_coercion(lhs_type, rhs_type) .or_else(|| dictionary_comparison_coercion(lhs_type, rhs_type, false)) .or_else(|| regex_null_coercion(lhs_type, rhs_type)) } diff --git a/datafusion/functions/src/regex/regexpmatch.rs b/datafusion/functions/src/regex/regexpmatch.rs index 764acd7de757..bf40eff11d30 100644 --- a/datafusion/functions/src/regex/regexpmatch.rs +++ b/datafusion/functions/src/regex/regexpmatch.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -//! Regx expressions +//! Regex expressions use arrow::array::{Array, ArrayRef, OffsetSizeTrait}; use arrow::compute::kernels::regexp; use arrow::datatypes::DataType; @@ -49,6 +49,10 @@ impl RegexpMatchFunc { Self { signature: Signature::one_of( vec![ + // Planner attempts coercion to the target type starting with the most preferred candidate. + // For example, given input `(Utf8View, Utf8)`, it first tries coercing to `(Utf8, Utf8)`. + // If that fails, it proceeds to `(LargeUtf8, Utf8)`. + // TODO: Native support Utf8View for regexp_match. Exact(vec![Utf8, Utf8]), Exact(vec![LargeUtf8, Utf8]), Exact(vec![Utf8, Utf8, Utf8]), diff --git a/datafusion/optimizer/src/unwrap_cast_in_comparison.rs b/datafusion/optimizer/src/unwrap_cast_in_comparison.rs index 318badc08823..2118ae96a198 100644 --- a/datafusion/optimizer/src/unwrap_cast_in_comparison.rs +++ b/datafusion/optimizer/src/unwrap_cast_in_comparison.rs @@ -159,20 +159,26 @@ impl TreeNodeRewriter for UnwrapCastExprRewriter { expr: right_expr, .. }), ) => { - // if the left_lit_value can be casted to the type of expr + // if the left_lit_value can be cast to the type of expr // we need to unwrap the cast for cast/try_cast expr, and add cast to the literal let Ok(expr_type) = right_expr.get_type(&self.schema) else { return Ok(Transformed::no(expr)); }; - let Some(value) = - try_cast_literal_to_type(left_lit_value, &expr_type) - else { - return Ok(Transformed::no(expr)); - }; - **left = lit(value); - // unwrap the cast/try_cast for the right expr - **right = mem::take(right_expr); - Ok(Transformed::yes(expr)) + match expr_type { + // https://github.com/apache/datafusion/issues/12180 + DataType::Utf8View => Ok(Transformed::no(expr)), + _ => { + let Some(value) = + try_cast_literal_to_type(left_lit_value, &expr_type) + else { + return Ok(Transformed::no(expr)); + }; + **left = lit(value); + // unwrap the cast/try_cast for the right expr + **right = mem::take(right_expr); + Ok(Transformed::yes(expr)) + } + } } ( Expr::TryCast(TryCast { @@ -183,20 +189,26 @@ impl TreeNodeRewriter for UnwrapCastExprRewriter { }), Expr::Literal(right_lit_value), ) => { - // if the right_lit_value can be casted to the type of expr + // if the right_lit_value can be cast to the type of expr // we need to unwrap the cast for cast/try_cast expr, and add cast to the literal let Ok(expr_type) = left_expr.get_type(&self.schema) else { return Ok(Transformed::no(expr)); }; - let Some(value) = - try_cast_literal_to_type(right_lit_value, &expr_type) - else { - return Ok(Transformed::no(expr)); - }; - // unwrap the cast/try_cast for the left expr - **left = mem::take(left_expr); - **right = lit(value); - Ok(Transformed::yes(expr)) + match expr_type { + // https://github.com/apache/datafusion/issues/12180 + DataType::Utf8View => Ok(Transformed::no(expr)), + _ => { + let Some(value) = + try_cast_literal_to_type(right_lit_value, &expr_type) + else { + return Ok(Transformed::no(expr)); + }; + // unwrap the cast/try_cast for the left expr + **left = mem::take(left_expr); + **right = lit(value); + Ok(Transformed::yes(expr)) + } + } } _ => Ok(Transformed::no(expr)), } diff --git a/datafusion/physical-expr/src/expressions/binary.rs b/datafusion/physical-expr/src/expressions/binary.rs index 08c133d7193a..7500c3e2651d 100644 --- a/datafusion/physical-expr/src/expressions/binary.rs +++ b/datafusion/physical-expr/src/expressions/binary.rs @@ -148,12 +148,12 @@ fn boolean_op( macro_rules! binary_string_array_flag_op { ($LEFT:expr, $RIGHT:expr, $OP:ident, $NOT:expr, $FLAG:expr) => {{ match $LEFT.data_type() { - DataType::Utf8 => { + DataType::Utf8View | DataType::Utf8 => { compute_utf8_flag_op!($LEFT, $RIGHT, $OP, StringArray, $NOT, $FLAG) - } + }, DataType::LargeUtf8 => { compute_utf8_flag_op!($LEFT, $RIGHT, $OP, LargeStringArray, $NOT, $FLAG) - } + }, other => internal_err!( "Data type {:?} not supported for binary_string_array_flag_op operation '{}' on string array", other, stringify!($OP) @@ -190,12 +190,12 @@ macro_rules! compute_utf8_flag_op { macro_rules! binary_string_array_flag_op_scalar { ($LEFT:expr, $RIGHT:expr, $OP:ident, $NOT:expr, $FLAG:expr) => {{ let result: Result> = match $LEFT.data_type() { - DataType::Utf8 => { + DataType::Utf8View | DataType::Utf8 => { compute_utf8_flag_op_scalar!($LEFT, $RIGHT, $OP, StringArray, $NOT, $FLAG) - } + }, DataType::LargeUtf8 => { compute_utf8_flag_op_scalar!($LEFT, $RIGHT, $OP, LargeStringArray, $NOT, $FLAG) - } + }, other => internal_err!( "Data type {:?} not supported for binary_string_array_flag_op_scalar operation '{}' on string array", other, stringify!($OP) @@ -937,6 +937,54 @@ mod tests { DataType::Boolean, [true, false], ); + test_coercion!( + StringViewArray, + DataType::Utf8View, + vec!["abc"; 5], + StringArray, + DataType::Utf8, + vec!["^a", "^A", "(b|d)", "(B|D)", "^(b|c)"], + Operator::RegexMatch, + BooleanArray, + DataType::Boolean, + [true, false, true, false, false], + ); + test_coercion!( + StringViewArray, + DataType::Utf8View, + vec!["abc"; 5], + StringArray, + DataType::Utf8, + vec!["^a", "^A", "(b|d)", "(B|D)", "^(b|c)"], + Operator::RegexIMatch, + BooleanArray, + DataType::Boolean, + [true, true, true, true, false], + ); + test_coercion!( + StringArray, + DataType::Utf8, + vec!["abc"; 5], + StringViewArray, + DataType::Utf8View, + vec!["^a", "^A", "(b|d)", "(B|D)", "^(b|c)"], + Operator::RegexNotMatch, + BooleanArray, + DataType::Boolean, + [false, true, false, true, true], + ); + test_coercion!( + StringArray, + DataType::Utf8, + vec!["abc"; 5], + StringViewArray, + DataType::Utf8View, + vec!["^a", "^A", "(b|d)", "(B|D)", "^(b|c)"], + Operator::RegexNotIMatch, + BooleanArray, + DataType::Boolean, + [false, false, false, false, true], + ); test_coercion!( StringArray, DataType::Utf8, diff --git a/datafusion/sqllogictest/test_files/string_view.slt b/datafusion/sqllogictest/test_files/string_view.slt index eb625e530b66..f478ba47aa4c 100644 --- a/datafusion/sqllogictest/test_files/string_view.slt +++ b/datafusion/sqllogictest/test_files/string_view.slt @@ -1104,7 +1104,6 @@ Rap (empty) Raph NULL NULL NULL ## Ensure no casts for RPAD -## TODO file ticket query TT EXPLAIN SELECT RPAD(column1_utf8view, 1) as c1, @@ -1134,7 +1133,6 @@ logical_plan 02)--TableScan: test projection=[column1_utf8view, column2_utf8view] ## Ensure no casts for SPLIT_PART -## TODO file ticket query TT EXPLAIN SELECT SPLIT_PART(column1_utf8view, 'f', 1) as c1, @@ -1146,7 +1144,6 @@ logical_plan 02)--TableScan: test projection=[column1_utf8view] ## Ensure no casts for STRPOS -## TODO file ticket query TT EXPLAIN SELECT STRPOS(column1_utf8view, 'f') as c, @@ -1158,7 +1155,6 @@ logical_plan 02)--TableScan: test projection=[column1_utf8view, column2_utf8view] ## Ensure no casts for SUBSTR -## TODO file ticket query TT EXPLAIN SELECT SUBSTR(column1_utf8view, 1) as c, @@ -1289,6 +1285,88 @@ XiangpengXiangpeng XiangpengXiangpeng XiangpengXiangpeng RaphaelRaphael RaphaelRaphael RaphaelRaphael NULL NULL NULL +## Ensure no casts for binary operators +## TODO: https://github.com/apache/datafusion/issues/12180 +# `~` operator (regex match) +query TT +EXPLAIN SELECT + column1_utf8view ~ 'an' AS c1 +FROM test; +---- +logical_plan +01)Projection: CAST(test.column1_utf8view AS Utf8) LIKE Utf8("%an%") AS c1 +02)--TableScan: test projection=[column1_utf8view] + +query B +SELECT + column1_utf8view ~ 'an' AS c1 +FROM test; +---- +false +true +false +NULL + +# `~*` operator (regex match case-insensitive) +query TT +EXPLAIN SELECT + column1_utf8view ~* '^a.{3}e' AS c1 +FROM test; +---- +logical_plan +01)Projection: CAST(test.column1_utf8view AS Utf8) ~* Utf8("^a.{3}e") AS c1 +02)--TableScan: test projection=[column1_utf8view] + +query B +SELECT + column1_utf8view ~* '^a.{3}e' AS c1 +FROM test; +---- +true +false +false +NULL + +# `!~~` operator (not like match) +query TT +EXPLAIN SELECT + column1_utf8view !~~ 'xia_g%g' AS c1 +FROM test; +---- +logical_plan +01)Projection: CAST(test.column1_utf8view AS Utf8) !~~ Utf8("xia_g%g") AS c1 +02)--TableScan: test projection=[column1_utf8view] + +query B +SELECT + column1_utf8view !~~ 'xia_g%g' AS c1 +FROM test; +---- +true +true +true +NULL + +# `!~~*` operator (not like match case-insensitive) +query TT +EXPLAIN SELECT + column1_utf8view !~~* 'xia_g%g' AS c1 +FROM test; +---- +logical_plan +01)Projection: CAST(test.column1_utf8view AS Utf8) !~~* Utf8("xia_g%g") AS c1 +02)--TableScan: test projection=[column1_utf8view] + +query B +SELECT + column1_utf8view !~~* 'xia_g%g' AS c1 +FROM test; +---- +true +false +true +NULL + statement ok drop table test;