Skip to content

feat: Native support utf8view for regex string operators #15275

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Mar 18, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
148 changes: 127 additions & 21 deletions datafusion/expr-common/src/type_coercion/binary.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1177,26 +1177,6 @@ pub fn string_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option<DataT
}
}

/// This will be deprecated when binary operators gain native support
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

outstanding ❤️

/// for Utf8View (use `string_coercion` instead).
fn regex_comparison_string_coercion(
lhs_type: &DataType,
rhs_type: &DataType,
) -> Option<DataType> {
use arrow::datatypes::DataType::*;
match (lhs_type, rhs_type) {
// If Utf8View is in any side, we coerce to Utf8.
(Utf8View, Utf8View | Utf8 | LargeUtf8) | (Utf8 | LargeUtf8, Utf8View) => {
Some(Utf8)
}
// Then, if LargeUtf8 is in any side, we coerce to LargeUtf8.
(LargeUtf8, Utf8 | LargeUtf8) | (Utf8, LargeUtf8) => Some(LargeUtf8),
// Utf8 coerces to Utf8
(Utf8, Utf8) => Some(Utf8),
_ => None,
}
}

fn numeric_string_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option<DataType> {
use arrow::datatypes::DataType::*;
match (lhs_type, rhs_type) {
Expand Down Expand Up @@ -1327,7 +1307,7 @@ fn regex_null_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option<DataT
/// Coercion rules for regular expression comparison operations.
/// This is a union of string coercion rules, dictionary coercion rules, and regex null coercion rules
pub fn regex_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option<DataType> {
regex_comparison_string_coercion(lhs_type, rhs_type)
string_coercion(lhs_type, rhs_type)
.or_else(|| dictionary_comparison_coercion(lhs_type, rhs_type, false))
.or_else(|| regex_null_coercion(lhs_type, rhs_type))
}
Expand Down Expand Up @@ -1802,42 +1782,168 @@ mod tests {
Operator::RegexMatch,
DataType::Utf8
);
test_coercion_binary_rule!(
DataType::Utf8,
DataType::Utf8View,
Operator::RegexMatch,
DataType::Utf8View
);
test_coercion_binary_rule!(
DataType::Utf8View,
DataType::Utf8,
Operator::RegexMatch,
DataType::Utf8View
);
test_coercion_binary_rule!(
DataType::Utf8View,
DataType::Utf8View,
Operator::RegexMatch,
DataType::Utf8View
);
test_coercion_binary_rule!(
DataType::Utf8,
DataType::Utf8,
Operator::RegexNotMatch,
DataType::Utf8
);
test_coercion_binary_rule!(
DataType::Utf8View,
DataType::Utf8,
Operator::RegexNotMatch,
DataType::Utf8View
);
test_coercion_binary_rule!(
DataType::Utf8,
DataType::Utf8View,
Operator::RegexNotMatch,
DataType::Utf8View
);
test_coercion_binary_rule!(
DataType::Utf8View,
DataType::Utf8View,
Operator::RegexNotMatch,
DataType::Utf8View
);
test_coercion_binary_rule!(
DataType::Utf8,
DataType::Utf8,
Operator::RegexNotIMatch,
DataType::Utf8
);
test_coercion_binary_rule!(
DataType::Utf8View,
DataType::Utf8,
Operator::RegexNotIMatch,
DataType::Utf8View
);
test_coercion_binary_rule!(
DataType::Utf8,
DataType::Utf8View,
Operator::RegexNotIMatch,
DataType::Utf8View
);
test_coercion_binary_rule!(
DataType::Utf8View,
DataType::Utf8View,
Operator::RegexNotIMatch,
DataType::Utf8View
);
test_coercion_binary_rule!(
DataType::Dictionary(DataType::Int32.into(), DataType::Utf8.into()),
DataType::Utf8,
Operator::RegexMatch,
DataType::Utf8
);
test_coercion_binary_rule!(
DataType::Dictionary(DataType::Int32.into(), DataType::Utf8.into()),
DataType::Utf8View,
Operator::RegexMatch,
DataType::Utf8View
);
test_coercion_binary_rule!(
DataType::Dictionary(DataType::Int32.into(), DataType::Utf8View.into()),
DataType::Utf8,
Operator::RegexMatch,
DataType::Utf8View
);
test_coercion_binary_rule!(
DataType::Dictionary(DataType::Int32.into(), DataType::Utf8View.into()),
DataType::Utf8View,
Operator::RegexMatch,
DataType::Utf8View
);
test_coercion_binary_rule!(
DataType::Dictionary(DataType::Int32.into(), DataType::Utf8.into()),
DataType::Utf8,
Operator::RegexIMatch,
DataType::Utf8
);
test_coercion_binary_rule!(
DataType::Dictionary(DataType::Int32.into(), DataType::Utf8View.into()),
DataType::Utf8,
Operator::RegexIMatch,
DataType::Utf8View
);
test_coercion_binary_rule!(
DataType::Dictionary(DataType::Int32.into(), DataType::Utf8.into()),
DataType::Utf8View,
Operator::RegexIMatch,
DataType::Utf8View
);
test_coercion_binary_rule!(
DataType::Dictionary(DataType::Int32.into(), DataType::Utf8View.into()),
DataType::Utf8View,
Operator::RegexIMatch,
DataType::Utf8View
);
test_coercion_binary_rule!(
DataType::Dictionary(DataType::Int32.into(), DataType::Utf8.into()),
DataType::Utf8,
Operator::RegexNotMatch,
DataType::Utf8
);
test_coercion_binary_rule!(
DataType::Dictionary(DataType::Int32.into(), DataType::Utf8.into()),
DataType::Utf8View,
Operator::RegexNotMatch,
DataType::Utf8View
);
test_coercion_binary_rule!(
DataType::Dictionary(DataType::Int32.into(), DataType::Utf8View.into()),
DataType::Utf8,
Operator::RegexNotMatch,
DataType::Utf8View
);
test_coercion_binary_rule!(
DataType::Dictionary(DataType::Int32.into(), DataType::Utf8.into()),
DataType::Utf8View,
Operator::RegexNotMatch,
DataType::Utf8View
);
test_coercion_binary_rule!(
DataType::Dictionary(DataType::Int32.into(), DataType::Utf8.into()),
DataType::Utf8,
Operator::RegexNotIMatch,
DataType::Utf8
);
test_coercion_binary_rule!(
DataType::Dictionary(DataType::Int32.into(), DataType::Utf8View.into()),
DataType::Utf8,
Operator::RegexNotIMatch,
DataType::Utf8View
);
test_coercion_binary_rule!(
DataType::Dictionary(DataType::Int32.into(), DataType::Utf8.into()),
DataType::Utf8View,
Operator::RegexNotIMatch,
DataType::Utf8View
);
test_coercion_binary_rule!(
DataType::Dictionary(DataType::Int32.into(), DataType::Utf8View.into()),
DataType::Utf8View,
Operator::RegexNotIMatch,
DataType::Utf8View
);
test_coercion_binary_rule!(
DataType::Int16,
DataType::Int64,
Expand Down
66 changes: 63 additions & 3 deletions datafusion/physical-expr/src/expressions/binary.rs
Original file line number Diff line number Diff line change
Expand Up @@ -168,9 +168,12 @@ fn boolean_op(
macro_rules! binary_string_array_flag_op {
($LEFT:expr, $RIGHT:expr, $OP:ident, $NOT:expr, $FLAG:expr) => {{
match $LEFT.data_type() {
DataType::Utf8View | DataType::Utf8 => {
DataType::Utf8 => {
compute_utf8_flag_op!($LEFT, $RIGHT, $OP, StringArray, $NOT, $FLAG)
},
DataType::Utf8View => {
compute_utf8view_flag_op!($LEFT, $RIGHT, $OP, StringViewArray, $NOT, $FLAG)
}
DataType::LargeUtf8 => {
compute_utf8_flag_op!($LEFT, $RIGHT, $OP, LargeStringArray, $NOT, $FLAG)
},
Expand Down Expand Up @@ -207,22 +210,51 @@ macro_rules! compute_utf8_flag_op {
}};
}

/// Invoke a compute kernel on a pair of string arrays, optionally passing a
/// case-insensitive flag.
///
/// * `$LEFT` / `$RIGHT` - array expressions; both are downcast to `$ARRAYTYPE`
///   and the macro panics (via `expect`) if either downcast fails.
/// * `$OP` - kernel, called as `$OP(left, right, Option<&flags>)`.
/// * `$ARRAYTYPE` - concrete array type used for the downcasts and for the
///   flags array.
/// * `$NOT` - when true, the kernel result is negated with `not`.
/// * `$FLAG` - when true, a flags array holding `"i"` once per element of the
///   left input is passed to the kernel; otherwise no flags are passed.
///
/// Evaluates to `Ok(Arc::new(result))`. Errors from the kernel and from `not`
/// are propagated with `?`, so the macro must be expanded inside a function
/// returning a compatible `Result`.
macro_rules! compute_utf8view_flag_op {
    ($LEFT:expr, $RIGHT:expr, $OP:ident, $ARRAYTYPE:ident, $NOT:expr, $FLAG:expr) => {{
        let ll = $LEFT
            .as_any()
            .downcast_ref::<$ARRAYTYPE>()
            .expect("compute_utf8view_flag_op failed to downcast array");
        let rr = $RIGHT
            .as_any()
            .downcast_ref::<$ARRAYTYPE>()
            .expect("compute_utf8view_flag_op failed to downcast array");

        // The kernel takes its flags as an array, so the single "i" flag is
        // repeated once per element of the left input.
        let flag = if $FLAG {
            Some($ARRAYTYPE::from(vec!["i"; ll.len()]))
        } else {
            None
        };
        let mut array = $OP(ll, rr, flag.as_ref())?;
        if $NOT {
            // Propagate a negation failure instead of panicking, matching how
            // the kernel call above reports its errors.
            array = not(&array)?;
        }
        Ok(Arc::new(array))
    }};
}

macro_rules! binary_string_array_flag_op_scalar {
($LEFT:ident, $RIGHT:expr, $OP:ident, $NOT:expr, $FLAG:expr) => {{
// This macro is slightly different from binary_string_array_flag_op because, when comparing with a scalar value,
// the query can be optimized in such a way that operands will be dicts, so we need to support it here
let result: Result<Arc<dyn Array>> = match $LEFT.data_type() {
DataType::Utf8View | DataType::Utf8 => {
DataType::Utf8 => {
compute_utf8_flag_op_scalar!($LEFT, $RIGHT, $OP, StringArray, $NOT, $FLAG)
},
DataType::Utf8View => {
compute_utf8view_flag_op_scalar!($LEFT, $RIGHT, $OP, StringViewArray, $NOT, $FLAG)
}
DataType::LargeUtf8 => {
compute_utf8_flag_op_scalar!($LEFT, $RIGHT, $OP, LargeStringArray, $NOT, $FLAG)
},
DataType::Dictionary(_, _) => {
let values = $LEFT.as_any_dictionary().values();

match values.data_type() {
DataType::Utf8View | DataType::Utf8 => compute_utf8_flag_op_scalar!(values, $RIGHT, $OP, StringArray, $NOT, $FLAG),
DataType::Utf8 => compute_utf8_flag_op_scalar!(values, $RIGHT, $OP, StringArray, $NOT, $FLAG),
DataType::Utf8View => compute_utf8view_flag_op_scalar!(values, $RIGHT, $OP, StringViewArray, $NOT, $FLAG),
DataType::LargeUtf8 => compute_utf8_flag_op_scalar!(values, $RIGHT, $OP, LargeStringArray, $NOT, $FLAG),
other => internal_err!(
"Data type {:?} not supported as a dictionary value type for binary_string_array_flag_op_scalar operation '{}' on string array",
Expand Down Expand Up @@ -276,6 +308,34 @@ macro_rules! compute_utf8_flag_op_scalar {
}};
}

/// Invoke a compute kernel on a string array and a scalar string value,
/// optionally passing a case-insensitive flag.
///
/// * `$LEFT` - array expression; downcast to `$ARRAYTYPE`, and the macro
///   panics (via `expect`) if the downcast fails.
/// * `$RIGHT` - scalar whose `try_as_str()` must yield `Some(Some(&str))`;
///   a null literal or a non-string value makes the enclosing function return
///   an internal error.
/// * `$OP` - base kernel name; the kernel actually called is `<$OP>_scalar`,
///   with the identifier built by `paste`.
/// * `$NOT` - when true, the kernel result is negated with `not`.
/// * `$FLAG` - when true, the flag `"i"` is passed to the kernel; otherwise
///   no flag is passed.
///
/// Evaluates to `Ok(Arc::new(result))`. Errors from the kernel and from `not`
/// are propagated with `?`, so the macro must be expanded inside a function
/// returning a compatible `Result`.
macro_rules! compute_utf8view_flag_op_scalar {
    ($LEFT:expr, $RIGHT:expr, $OP:ident, $ARRAYTYPE:ident, $NOT:expr, $FLAG:expr) => {{
        let ll = $LEFT
            .as_any()
            .downcast_ref::<$ARRAYTYPE>()
            .expect("compute_utf8view_flag_op_scalar failed to downcast array");

        let string_value = match $RIGHT.try_as_str() {
            Some(Some(string_value)) => string_value,
            // null literal or non string
            _ => return internal_err!(
                "compute_utf8view_flag_op_scalar failed to cast literal value {} for operation '{}'",
                $RIGHT, stringify!($OP)
            )
        };

        let flag = $FLAG.then_some("i");
        let mut array =
            paste::expr! {[<$OP _scalar>]}(ll, &string_value, flag)?;
        if $NOT {
            // Propagate a negation failure instead of panicking, matching how
            // the kernel call above reports its errors.
            array = not(&array)?;
        }

        Ok(Arc::new(array))
    }};
}

impl PhysicalExpr for BinaryExpr {
/// Return a reference to Any that can be used for downcasting
fn as_any(&self) -> &dyn Any {
Expand Down
8 changes: 4 additions & 4 deletions datafusion/sqllogictest/test_files/string/string_view.slt
Original file line number Diff line number Diff line change
Expand Up @@ -1100,7 +1100,7 @@ EXPLAIN SELECT
FROM test;
----
logical_plan
01)Projection: CAST(test.column1_utf8view AS Utf8) LIKE Utf8("%an%") AS c1
01)Projection: test.column1_utf8view ~ Utf8View("an") AS c1
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

looks good

02)--TableScan: test projection=[column1_utf8view]

# `~*` operator (regex match case-insensitive)
Expand All @@ -1110,7 +1110,7 @@ EXPLAIN SELECT
FROM test;
----
logical_plan
01)Projection: CAST(test.column1_utf8view AS Utf8) ~* Utf8("^a.{3}e") AS c1
01)Projection: test.column1_utf8view ~* Utf8View("^a.{3}e") AS c1
02)--TableScan: test projection=[column1_utf8view]

# `!~~` operator (not like match)
Expand All @@ -1120,7 +1120,7 @@ EXPLAIN SELECT
FROM test;
----
logical_plan
01)Projection: CAST(test.column1_utf8view AS Utf8) !~~ Utf8("xia_g%g") AS c1
01)Projection: test.column1_utf8view !~~ Utf8View("xia_g%g") AS c1
02)--TableScan: test projection=[column1_utf8view]

# `!~~*` operator (not like match case-insensitive)
Expand All @@ -1130,7 +1130,7 @@ EXPLAIN SELECT
FROM test;
----
logical_plan
01)Projection: CAST(test.column1_utf8view AS Utf8) !~~* Utf8("xia_g%g") AS c1
01)Projection: test.column1_utf8view !~~* Utf8View("xia_g%g") AS c1
02)--TableScan: test projection=[column1_utf8view]

# coercions between stringview and date types
Expand Down