Skip to content

Commit 8fda4a6

Browse files
authored
feat(optimizer): handle partial anchored regex cases and improve doc (#10977)
Signed-off-by: Ruihang Xia <[email protected]>
1 parent 5cb1917 commit 8fda4a6

File tree

2 files changed

+67
-13
lines changed

2 files changed

+67
-13
lines changed

datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs

+9-3
Original file line numberDiff line numberDiff line change
@@ -2730,11 +2730,10 @@ mod tests {
27302730
// unsupported cases
27312731
assert_no_change(regex_match(col("c1"), lit("foo.*")));
27322732
assert_no_change(regex_match(col("c1"), lit("(foo)")));
2733-
assert_no_change(regex_match(col("c1"), lit("^foo")));
2734-
assert_no_change(regex_match(col("c1"), lit("foo$")));
27352733
assert_no_change(regex_match(col("c1"), lit("%")));
27362734
assert_no_change(regex_match(col("c1"), lit("_")));
27372735
assert_no_change(regex_match(col("c1"), lit("f%o")));
2736+
assert_no_change(regex_match(col("c1"), lit("^f%o")));
27382737
assert_no_change(regex_match(col("c1"), lit("f_o")));
27392738

27402739
// empty cases
@@ -2827,13 +2826,20 @@ mod tests {
28272826
assert_no_change(regex_match(col("c1"), lit("(foo|ba_r)*")));
28282827
assert_no_change(regex_match(col("c1"), lit("(fo_o|ba_r)*")));
28292828
assert_no_change(regex_match(col("c1"), lit("^(foo|bar)*")));
2830-
assert_no_change(regex_match(col("c1"), lit("^foo|bar$")));
28312829
assert_no_change(regex_match(col("c1"), lit("^(foo)(bar)$")));
28322830
assert_no_change(regex_match(col("c1"), lit("^")));
28332831
assert_no_change(regex_match(col("c1"), lit("$")));
28342832
assert_no_change(regex_match(col("c1"), lit("$^")));
28352833
assert_no_change(regex_match(col("c1"), lit("$foo^")));
28362834

2835+
// regular expressions that match a partial literal
2836+
assert_change(regex_match(col("c1"), lit("^foo")), like(col("c1"), "foo%"));
2837+
assert_change(regex_match(col("c1"), lit("foo$")), like(col("c1"), "%foo"));
2838+
assert_change(
2839+
regex_match(col("c1"), lit("^foo|bar$")),
2840+
like(col("c1"), "foo%").or(like(col("c1"), "%bar")),
2841+
);
2842+
28372843
// OR-chain
28382844
assert_change(
28392845
regex_match(col("c1"), lit("foo|bar|baz")),

datafusion/optimizer/src/simplify_expressions/regex.rs

+58-10
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,19 @@ use regex_syntax::hir::{Capture, Hir, HirKind, Literal, Look};
2222
/// Maximum number of regex alternations (`foo|bar|...`) that will be expanded into multiple `LIKE` expressions.
2323
const MAX_REGEX_ALTERNATIONS_EXPANSION: usize = 4;
2424

25+
/// Tries to convert a regexp expression to a `LIKE` or `Eq`/`NotEq` expression.
26+
///
27+
/// This function also validates the regex pattern and will return an error if the
28+
/// pattern is invalid.
29+
///
30+
/// Typical cases this function can simplify:
31+
/// - empty regex pattern to `LIKE '%'`
32+
/// - literal regex patterns to `LIKE '%foo%'`
33+
/// - fully anchored regex patterns (e.g. `^foo$`) to `= 'foo'`
34+
/// - partially anchored regex patterns (e.g. `^foo`) to `LIKE 'foo%'`
35+
/// - combinations (alternatives) of the above, which are joined with `OR` or `AND`
36+
///
37+
/// Dev note: unit tests of this function are in `expr_simplifier.rs`, case `test_simplify_regex`.
2538
pub fn simplify_regex_expr(
2639
left: Box<Expr>,
2740
op: Operator,
@@ -53,13 +66,15 @@ pub fn simplify_regex_expr(
5366
}
5467
}
5568

56-
// leave untouched if optimization didn't work
69+
// Leave untouched if optimization didn't work
5770
Ok(Expr::BinaryExpr(BinaryExpr { left, op, right }))
5871
}
5972

6073
#[derive(Debug)]
6174
struct OperatorMode {
75+
/// Negative match.
6276
not: bool,
77+
/// Ignore case (`true` for case-insensitive).
6378
i: bool,
6479
}
6580

@@ -80,6 +95,7 @@ impl OperatorMode {
8095
Self { not, i }
8196
}
8297

98+
/// Creates a [`LIKE`](Expr::Like) expression from the given `LIKE` pattern.
8399
fn expr(&self, expr: Box<Expr>, pattern: String) -> Expr {
84100
let like = Like {
85101
negated: self.not,
@@ -92,6 +108,7 @@ impl OperatorMode {
92108
Expr::Like(like)
93109
}
94110

111+
/// Creates an [`Expr::BinaryExpr`] of "`left` = `right`" or "`left` != `right`".
95112
fn expr_matches_literal(&self, left: Box<Expr>, right: Box<Expr>) -> Expr {
96113
let op = if self.not {
97114
Operator::NotEq
@@ -118,7 +135,7 @@ fn collect_concat_to_like_string(parts: &[Hir]) -> Option<String> {
118135
Some(s)
119136
}
120137

121-
/// returns a str represented by `Literal` if it contains a valid utf8
138+
/// Returns a str represented by `Literal` if it contains a valid utf8
122139
/// sequence and is safe for like (has no '%' and '_')
123140
fn like_str_from_literal(l: &Literal) -> Option<&str> {
124141
// if not utf8, no good
@@ -131,7 +148,7 @@ fn like_str_from_literal(l: &Literal) -> Option<&str> {
131148
}
132149
}
133150

134-
/// returns a str represented by `Literal` if it contains a valid utf8
151+
/// Returns a str represented by `Literal` if it contains a valid utf8
135152
fn str_from_literal(l: &Literal) -> Option<&str> {
136153
// if not utf8, no good
137154
let s = std::str::from_utf8(&l.0).ok()?;
@@ -143,7 +160,7 @@ fn is_safe_for_like(c: char) -> bool {
143160
(c != '%') && (c != '_')
144161
}
145162

146-
/// returns true if the elements in a `Concat` pattern are:
163+
/// Returns true if the elements in a `Concat` pattern are:
147164
/// - `[Look::Start, Look::End]`
148165
/// - `[Look::Start, Literal(_), Look::End]`
149166
fn is_anchored_literal(v: &[Hir]) -> bool {
@@ -157,10 +174,9 @@ fn is_anchored_literal(v: &[Hir]) -> bool {
157174
v.last().expect("length checked"),
158175
);
159176
if !matches!(first_last,
160-
(s, e) if s.kind() == &HirKind::Look(Look::Start)
177+
(s, e) if s.kind() == &HirKind::Look(Look::Start)
161178
&& e.kind() == &HirKind::Look(Look::End)
162-
)
163-
{
179+
) {
164180
return false;
165181
}
166182

@@ -170,7 +186,7 @@ fn is_anchored_literal(v: &[Hir]) -> bool {
170186
.all(|h| matches!(h.kind(), HirKind::Literal(_)))
171187
}
172188

173-
/// returns true if the elements in a `Concat` pattern are:
189+
/// Returns true if the elements in a `Concat` pattern are:
174190
/// - `[Look::Start, Capture(Alternation(Literals...)), Look::End]`
175191
fn is_anchored_capture(v: &[Hir]) -> bool {
176192
if v.len() != 3
@@ -197,7 +213,33 @@ fn is_anchored_capture(v: &[Hir]) -> bool {
197213
true
198214
}
199215

200-
/// extracts a string literal expression assuming that [`is_anchored_literal`]
216+
/// Returns the `LIKE` pattern if the `Concat` pattern is partially anchored:
217+
/// - `[Look::Start, Literal(_)]`
218+
/// - `[Literal(_), Look::End]`
219+
/// Fully anchored patterns are handled by [`anchored_literal_to_expr`].
220+
fn partial_anchored_literal_to_like(v: &[Hir]) -> Option<String> {
221+
if v.len() != 2 {
222+
return None;
223+
}
224+
225+
let (lit, match_begin) = match (&v[0].kind(), &v[1].kind()) {
226+
(HirKind::Look(Look::Start), HirKind::Literal(l)) => {
227+
(like_str_from_literal(l)?, true)
228+
}
229+
(HirKind::Literal(l), HirKind::Look(Look::End)) => {
230+
(like_str_from_literal(l)?, false)
231+
}
232+
_ => return None,
233+
};
234+
235+
if match_begin {
236+
Some(format!("{}%", lit))
237+
} else {
238+
Some(format!("%{}", lit))
239+
}
240+
}
241+
242+
/// Extracts a string literal expression assuming that [`is_anchored_literal`]
201243
/// returned true.
202244
fn anchored_literal_to_expr(v: &[Hir]) -> Option<Expr> {
203245
match v.len() {
@@ -246,6 +288,7 @@ fn anchored_alternation_to_exprs(v: &[Hir]) -> Option<Vec<Expr>> {
246288
None
247289
}
248290

291+
/// Tries to lower (transform) a simple regex pattern to a LIKE expression.
249292
fn lower_simple(mode: &OperatorMode, left: &Expr, hir: &Hir) -> Option<Expr> {
250293
match hir.kind() {
251294
HirKind::Empty => {
@@ -265,7 +308,9 @@ fn lower_simple(mode: &OperatorMode, left: &Expr, hir: &Hir) -> Option<Expr> {
265308
.map(|right| left.clone().in_list(right, mode.not));
266309
}
267310
HirKind::Concat(inner) => {
268-
if let Some(pattern) = collect_concat_to_like_string(inner) {
311+
if let Some(pattern) = partial_anchored_literal_to_like(inner)
312+
.or(collect_concat_to_like_string(inner))
313+
{
269314
return Some(mode.expr(Box::new(left.clone()), pattern));
270315
}
271316
}
@@ -274,6 +319,9 @@ fn lower_simple(mode: &OperatorMode, left: &Expr, hir: &Hir) -> Option<Expr> {
274319
None
275320
}
276321

322+
/// Calls [`lower_simple`] for each alternative and combines the results with `or` or `and`
323+
/// based on [`OperatorMode`]. Any failed attempt to lower an alternative makes this
324+
/// function return `None`.
277325
fn lower_alt(mode: &OperatorMode, left: &Expr, alts: &[Hir]) -> Option<Expr> {
278326
let mut accu: Option<Expr> = None;
279327

0 commit comments

Comments
 (0)