Skip to content

Commit

Permalink
Support StringView for binary operators (#12212)
Browse files Browse the repository at this point in the history
* Support StringView for binary operators

Signed-off-by: Tai Le Manh <[email protected]>

* Fix cargo fmt

* Add and update unit tests

Signed-off-by: Tai Le Manh <[email protected]>

* Fix clippy warning

---------

Signed-off-by: Tai Le Manh <[email protected]>
  • Loading branch information
tlm365 authored Sep 9, 2024
1 parent 383bca3 commit 6316849
Show file tree
Hide file tree
Showing 5 changed files with 195 additions and 33 deletions.
24 changes: 22 additions & 2 deletions datafusion/expr-common/src/type_coercion/binary.rs
Original file line number Diff line number Diff line change
Expand Up @@ -984,6 +984,26 @@ fn string_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option<DataType>
}
}

/// This will be deprecated when binary operators native support
/// for Utf8View (use `string_coercion` instead).
fn regex_comparison_string_coercion(
lhs_type: &DataType,
rhs_type: &DataType,
) -> Option<DataType> {
use arrow::datatypes::DataType::*;
match (lhs_type, rhs_type) {
// If Utf8View is in any side, we coerce to Utf8.
(Utf8View, Utf8View | Utf8 | LargeUtf8) | (Utf8 | LargeUtf8, Utf8View) => {
Some(Utf8)
}
// Then, if LargeUtf8 is in any side, we coerce to LargeUtf8.
(LargeUtf8, Utf8 | LargeUtf8) | (Utf8, LargeUtf8) => Some(LargeUtf8),
// Utf8 coerces to Utf8
(Utf8, Utf8) => Some(Utf8),
_ => None,
}
}

fn numeric_string_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option<DataType> {
use arrow::datatypes::DataType::*;
match (lhs_type, rhs_type) {
Expand Down Expand Up @@ -1072,10 +1092,10 @@ fn regex_null_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option<DataT
}
}

/// coercion rules for regular expression comparison operations.
/// Coercion rules for regular expression comparison operations.
/// This is a union of string coercion rules and dictionary coercion rules
pub fn regex_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option<DataType> {
string_coercion(lhs_type, rhs_type)
regex_comparison_string_coercion(lhs_type, rhs_type)
.or_else(|| dictionary_comparison_coercion(lhs_type, rhs_type, false))
.or_else(|| regex_null_coercion(lhs_type, rhs_type))
}
Expand Down
6 changes: 5 additions & 1 deletion datafusion/functions/src/regex/regexpmatch.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
// specific language governing permissions and limitations
// under the License.

//! Regx expressions
//! Regex expressions
use arrow::array::{Array, ArrayRef, OffsetSizeTrait};
use arrow::compute::kernels::regexp;
use arrow::datatypes::DataType;
Expand Down Expand Up @@ -49,6 +49,10 @@ impl RegexpMatchFunc {
Self {
signature: Signature::one_of(
vec![
// Planner attempts coercion to the target type starting with the most preferred candidate.
// For example, given input `(Utf8View, Utf8)`, it first tries coercing to `(Utf8, Utf8)`.
// If that fails, it proceeds to `(LargeUtf8, Utf8)`.
// TODO: Native support Utf8View for regexp_match.
Exact(vec![Utf8, Utf8]),
Exact(vec![LargeUtf8, Utf8]),
Exact(vec![Utf8, Utf8, Utf8]),
Expand Down
52 changes: 32 additions & 20 deletions datafusion/optimizer/src/unwrap_cast_in_comparison.rs
Original file line number Diff line number Diff line change
Expand Up @@ -159,20 +159,26 @@ impl TreeNodeRewriter for UnwrapCastExprRewriter {
expr: right_expr, ..
}),
) => {
// if the left_lit_value can be casted to the type of expr
// if the left_lit_value can be cast to the type of expr
// we need to unwrap the cast for cast/try_cast expr, and add cast to the literal
let Ok(expr_type) = right_expr.get_type(&self.schema) else {
return Ok(Transformed::no(expr));
};
let Some(value) =
try_cast_literal_to_type(left_lit_value, &expr_type)
else {
return Ok(Transformed::no(expr));
};
**left = lit(value);
// unwrap the cast/try_cast for the right expr
**right = mem::take(right_expr);
Ok(Transformed::yes(expr))
match expr_type {
// https://github.com/apache/datafusion/issues/12180
DataType::Utf8View => Ok(Transformed::no(expr)),
_ => {
let Some(value) =
try_cast_literal_to_type(left_lit_value, &expr_type)
else {
return Ok(Transformed::no(expr));
};
**left = lit(value);
// unwrap the cast/try_cast for the right expr
**right = mem::take(right_expr);
Ok(Transformed::yes(expr))
}
}
}
(
Expr::TryCast(TryCast {
Expand All @@ -183,20 +189,26 @@ impl TreeNodeRewriter for UnwrapCastExprRewriter {
}),
Expr::Literal(right_lit_value),
) => {
// if the right_lit_value can be casted to the type of expr
// if the right_lit_value can be cast to the type of expr
// we need to unwrap the cast for cast/try_cast expr, and add cast to the literal
let Ok(expr_type) = left_expr.get_type(&self.schema) else {
return Ok(Transformed::no(expr));
};
let Some(value) =
try_cast_literal_to_type(right_lit_value, &expr_type)
else {
return Ok(Transformed::no(expr));
};
// unwrap the cast/try_cast for the left expr
**left = mem::take(left_expr);
**right = lit(value);
Ok(Transformed::yes(expr))
match expr_type {
// https://github.com/apache/datafusion/issues/12180
DataType::Utf8View => Ok(Transformed::no(expr)),
_ => {
let Some(value) =
try_cast_literal_to_type(right_lit_value, &expr_type)
else {
return Ok(Transformed::no(expr));
};
// unwrap the cast/try_cast for the left expr
**left = mem::take(left_expr);
**right = lit(value);
Ok(Transformed::yes(expr))
}
}
}
_ => Ok(Transformed::no(expr)),
}
Expand Down
60 changes: 54 additions & 6 deletions datafusion/physical-expr/src/expressions/binary.rs
Original file line number Diff line number Diff line change
Expand Up @@ -148,12 +148,12 @@ fn boolean_op(
macro_rules! binary_string_array_flag_op {
($LEFT:expr, $RIGHT:expr, $OP:ident, $NOT:expr, $FLAG:expr) => {{
match $LEFT.data_type() {
DataType::Utf8 => {
DataType::Utf8View | DataType::Utf8 => {
compute_utf8_flag_op!($LEFT, $RIGHT, $OP, StringArray, $NOT, $FLAG)
}
},
DataType::LargeUtf8 => {
compute_utf8_flag_op!($LEFT, $RIGHT, $OP, LargeStringArray, $NOT, $FLAG)
}
},
other => internal_err!(
"Data type {:?} not supported for binary_string_array_flag_op operation '{}' on string array",
other, stringify!($OP)
Expand Down Expand Up @@ -190,12 +190,12 @@ macro_rules! compute_utf8_flag_op {
macro_rules! binary_string_array_flag_op_scalar {
($LEFT:expr, $RIGHT:expr, $OP:ident, $NOT:expr, $FLAG:expr) => {{
let result: Result<Arc<dyn Array>> = match $LEFT.data_type() {
DataType::Utf8 => {
DataType::Utf8View | DataType::Utf8 => {
compute_utf8_flag_op_scalar!($LEFT, $RIGHT, $OP, StringArray, $NOT, $FLAG)
}
},
DataType::LargeUtf8 => {
compute_utf8_flag_op_scalar!($LEFT, $RIGHT, $OP, LargeStringArray, $NOT, $FLAG)
}
},
other => internal_err!(
"Data type {:?} not supported for binary_string_array_flag_op_scalar operation '{}' on string array",
other, stringify!($OP)
Expand Down Expand Up @@ -937,6 +937,54 @@ mod tests {
DataType::Boolean,
[true, false],
);
test_coercion!(
StringViewArray,
DataType::Utf8View,
vec!["abc"; 5],
StringArray,
DataType::Utf8,
vec!["^a", "^A", "(b|d)", "(B|D)", "^(b|c)"],
Operator::RegexMatch,
BooleanArray,
DataType::Boolean,
[true, false, true, false, false],
);
test_coercion!(
StringViewArray,
DataType::Utf8View,
vec!["abc"; 5],
StringArray,
DataType::Utf8,
vec!["^a", "^A", "(b|d)", "(B|D)", "^(b|c)"],
Operator::RegexIMatch,
BooleanArray,
DataType::Boolean,
[true, true, true, true, false],
);
test_coercion!(
StringArray,
DataType::Utf8,
vec!["abc"; 5],
StringViewArray,
DataType::Utf8View,
vec!["^a", "^A", "(b|d)", "(B|D)", "^(b|c)"],
Operator::RegexNotMatch,
BooleanArray,
DataType::Boolean,
[false, true, false, true, true],
);
test_coercion!(
StringArray,
DataType::Utf8,
vec!["abc"; 5],
StringViewArray,
DataType::Utf8View,
vec!["^a", "^A", "(b|d)", "(B|D)", "^(b|c)"],
Operator::RegexNotIMatch,
BooleanArray,
DataType::Boolean,
[false, false, false, false, true],
);
test_coercion!(
StringArray,
DataType::Utf8,
Expand Down
86 changes: 82 additions & 4 deletions datafusion/sqllogictest/test_files/string_view.slt
Original file line number Diff line number Diff line change
Expand Up @@ -1104,7 +1104,6 @@ Rap (empty) Raph
NULL NULL NULL

## Ensure no casts for RPAD
## TODO file ticket
query TT
EXPLAIN SELECT
RPAD(column1_utf8view, 1) as c1,
Expand Down Expand Up @@ -1134,7 +1133,6 @@ logical_plan
02)--TableScan: test projection=[column1_utf8view, column2_utf8view]

## Ensure no casts for SPLIT_PART
## TODO file ticket
query TT
EXPLAIN SELECT
SPLIT_PART(column1_utf8view, 'f', 1) as c1,
Expand All @@ -1146,7 +1144,6 @@ logical_plan
02)--TableScan: test projection=[column1_utf8view]

## Ensure no casts for STRPOS
## TODO file ticket
query TT
EXPLAIN SELECT
STRPOS(column1_utf8view, 'f') as c,
Expand All @@ -1158,7 +1155,6 @@ logical_plan
02)--TableScan: test projection=[column1_utf8view, column2_utf8view]

## Ensure no casts for SUBSTR
## TODO file ticket
query TT
EXPLAIN SELECT
SUBSTR(column1_utf8view, 1) as c,
Expand Down Expand Up @@ -1289,6 +1285,88 @@ XiangpengXiangpeng XiangpengXiangpeng XiangpengXiangpeng
RaphaelRaphael RaphaelRaphael RaphaelRaphael
NULL NULL NULL

## Ensure no casts for binary operators
## TODO: https://github.com/apache/datafusion/issues/12180
# `~` operator (regex match)
query TT
EXPLAIN SELECT
column1_utf8view ~ 'an' AS c1
FROM test;
----
logical_plan
01)Projection: CAST(test.column1_utf8view AS Utf8) LIKE Utf8("%an%") AS c1
02)--TableScan: test projection=[column1_utf8view]

query B
SELECT
column1_utf8view ~ 'an' AS c1
FROM test;
----
false
true
false
NULL

# `~*` operator (regex match case-insensitive)
query TT
EXPLAIN SELECT
column1_utf8view ~* '^a.{3}e' AS c1
FROM test;
----
logical_plan
01)Projection: CAST(test.column1_utf8view AS Utf8) ~* Utf8("^a.{3}e") AS c1
02)--TableScan: test projection=[column1_utf8view]

query B
SELECT
column1_utf8view ~* '^a.{3}e' AS c1
FROM test;
----
true
false
false
NULL

# `!~~` operator (not like match)
query TT
EXPLAIN SELECT
column1_utf8view !~~ 'xia_g%g' AS c1
FROM test;
----
logical_plan
01)Projection: CAST(test.column1_utf8view AS Utf8) !~~ Utf8("xia_g%g") AS c1
02)--TableScan: test projection=[column1_utf8view]

query B
SELECT
column1_utf8view !~~ 'xia_g%g' AS c1
FROM test;
----
true
true
true
NULL

# `!~~*` operator (not like match case-insensitive)
query TT
EXPLAIN SELECT
column1_utf8view !~~* 'xia_g%g' AS c1
FROM test;
----
logical_plan
01)Projection: CAST(test.column1_utf8view AS Utf8) !~~* Utf8("xia_g%g") AS c1
02)--TableScan: test projection=[column1_utf8view]

query B
SELECT
column1_utf8view !~~* 'xia_g%g' AS c1
FROM test;
----
true
false
true
NULL

statement ok
drop table test;

Expand Down

0 comments on commit 6316849

Please sign in to comment.