Skip to content

Commit ce80864

Browse files
committed
Implement native StringView support for the contains function
Signed-off-by: Tai Le Manh <[email protected]>
1 parent 45dd141 commit ce80864

File tree

5 files changed

+135
-109
lines changed

5 files changed

+135
-109
lines changed

datafusion/functions/Cargo.toml

+1-1
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@ encoding_expressions = ["base64", "hex"]
5252
# enable math functions
5353
math_expressions = []
5454
# enable regular expressions
55-
regex_expressions = ["regex"]
55+
regex_expressions = ["regex", "string_expressions"]
5656
# enable string functions
5757
string_expressions = ["regex", "uuid"]
5858
# enable unicode functions

datafusion/functions/src/regex/mod.rs

+1
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717

1818
//! "regex" DataFusion functions
1919
20+
pub mod regexp_common;
2021
pub mod regexplike;
2122
pub mod regexpmatch;
2223
pub mod regexpreplace;
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,121 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
//! Common utilities for implementing regex functions
19+
20+
use crate::string::common::StringArrayType;
21+
22+
use arrow::array::{Array, ArrayDataBuilder, BooleanArray};
23+
use arrow::datatypes::DataType;
24+
use arrow_buffer::{BooleanBufferBuilder, NullBuffer};
25+
use datafusion_common::DataFusionError;
26+
use regex::Regex;
27+
28+
use std::collections::HashMap;
29+
30+
#[cfg(doc)]
31+
use arrow::array::{LargeStringArray, StringArray, StringViewArray};
32+
/// Perform SQL `array ~ regex_array` operation on
33+
/// [`StringArray`] / [`LargeStringArray`] / [`StringViewArray`].
34+
///
35+
/// If `regex_array` element has an empty value, the corresponding result value is always true.
36+
///
37+
/// `flags_array` are optional [`StringArray`] / [`LargeStringArray`] / [`StringViewArray`] flag,
38+
/// which allow special search modes, such as case-insensitive and multi-line mode.
39+
/// See the documentation [here](https://docs.rs/regex/1.5.4/regex/#grouping-and-flags)
40+
/// for more information.
41+
///
42+
/// It is inspired / copied from `regexp_is_match_utf8` [arrow-rs].
43+
///
44+
/// [arrow-rs]: https://github.com/apache/arrow-rs/blob/8c956a9f9ab26c14072740cce64c2b99cb039b13/arrow-string/src/regexp.rs#L31-L37
45+
pub fn regexp_is_match_utf8<'a, S1, S2, S3>(
46+
array: &'a S1,
47+
regex_array: &'a S2,
48+
flags_array: Option<&'a S3>,
49+
) -> datafusion_common::Result<BooleanArray, DataFusionError>
50+
where
51+
&'a S1: StringArrayType<'a>,
52+
&'a S2: StringArrayType<'a>,
53+
&'a S3: StringArrayType<'a>,
54+
{
55+
if array.len() != regex_array.len() {
56+
return Err(DataFusionError::Execution(
57+
"Cannot perform comparison operation on arrays of different length"
58+
.to_string(),
59+
));
60+
}
61+
62+
let nulls = NullBuffer::union(array.nulls(), regex_array.nulls());
63+
64+
let mut patterns: HashMap<String, Regex> = HashMap::new();
65+
let mut result = BooleanBufferBuilder::new(array.len());
66+
67+
let complete_pattern = match flags_array {
68+
Some(flags) => Box::new(regex_array.iter().zip(flags.iter()).map(
69+
|(pattern, flags)| {
70+
pattern.map(|pattern| match flags {
71+
Some(flag) => format!("(?{flag}){pattern}"),
72+
None => pattern.to_string(),
73+
})
74+
},
75+
)) as Box<dyn Iterator<Item = Option<String>>>,
76+
None => Box::new(
77+
regex_array
78+
.iter()
79+
.map(|pattern| pattern.map(|pattern| pattern.to_string())),
80+
),
81+
};
82+
83+
array
84+
.iter()
85+
.zip(complete_pattern)
86+
.map(|(value, pattern)| {
87+
match (value, pattern) {
88+
(Some(_), Some(pattern)) if pattern == *"" => {
89+
result.append(true);
90+
}
91+
(Some(value), Some(pattern)) => {
92+
let existing_pattern = patterns.get(&pattern);
93+
let re = match existing_pattern {
94+
Some(re) => re,
95+
None => {
96+
let re = Regex::new(pattern.as_str()).map_err(|e| {
97+
DataFusionError::Execution(format!(
98+
"Regular expression did not compile: {e:?}"
99+
))
100+
})?;
101+
patterns.entry(pattern).or_insert(re)
102+
}
103+
};
104+
result.append(re.is_match(value));
105+
}
106+
_ => result.append(false),
107+
}
108+
Ok(())
109+
})
110+
.collect::<datafusion_common::Result<Vec<()>, DataFusionError>>()?;
111+
112+
let data = unsafe {
113+
ArrayDataBuilder::new(DataType::Boolean)
114+
.len(array.len())
115+
.buffers(vec![result.into()])
116+
.nulls(nulls)
117+
.build_unchecked()
118+
};
119+
120+
Ok(BooleanArray::from(data))
121+
}

datafusion/functions/src/string/common.rs

+2-98
Original file line numberDiff line numberDiff line change
@@ -17,23 +17,20 @@
1717

1818
//! Common utilities for implementing string functions
1919
20-
use std::collections::HashMap;
2120
use std::fmt::{Display, Formatter};
2221
use std::sync::Arc;
2322

2423
use arrow::array::{
2524
new_null_array, Array, ArrayAccessor, ArrayDataBuilder, ArrayIter, ArrayRef,
26-
BooleanArray, GenericStringArray, GenericStringBuilder, OffsetSizeTrait, StringArray,
25+
GenericStringArray, GenericStringBuilder, OffsetSizeTrait, StringArray,
2726
StringBuilder, StringViewArray,
2827
};
2928
use arrow::buffer::{Buffer, MutableBuffer, NullBuffer};
3029
use arrow::datatypes::DataType;
31-
use arrow_buffer::BooleanBufferBuilder;
3230
use datafusion_common::cast::{as_generic_string_array, as_string_view_array};
31+
use datafusion_common::Result;
3332
use datafusion_common::{exec_err, ScalarValue};
34-
use datafusion_common::{DataFusionError, Result};
3533
use datafusion_expr::ColumnarValue;
36-
use regex::Regex;
3734

3835
pub(crate) enum TrimType {
3936
Left,
@@ -481,96 +478,3 @@ where
481478
GenericStringArray::<O>::new_unchecked(offsets, values, nulls)
482479
}))
483480
}
484-
485-
#[cfg(doc)]
486-
use arrow::array::LargeStringArray;
487-
/// Perform SQL `array ~ regex_array` operation on
488-
/// [`StringArray`] / [`LargeStringArray`] / [`StringViewArray`].
489-
///
490-
/// If `regex_array` element has an empty value, the corresponding result value is always true.
491-
///
492-
/// `flags_array` are optional [`StringArray`] / [`LargeStringArray`] / [`StringViewArray`] flag,
493-
/// which allow special search modes, such as case-insensitive and multi-line mode.
494-
/// See the documentation [here](https://docs.rs/regex/1.5.4/regex/#grouping-and-flags)
495-
/// for more information.
496-
///
497-
/// It is inspired / copied from `regexp_is_match_utf8` [arrow-rs].
498-
///
499-
/// [arrow-rs]: https://github.com/apache/arrow-rs/blob/8c956a9f9ab26c14072740cce64c2b99cb039b13/arrow-string/src/regexp.rs#L31-L37
500-
pub fn regexp_is_match<'a, S1, S2, S3>(
501-
array: &'a S1,
502-
regex_array: &'a S2,
503-
flags_array: Option<&'a S3>,
504-
) -> Result<BooleanArray, DataFusionError>
505-
where
506-
&'a S1: StringArrayType<'a>,
507-
&'a S2: StringArrayType<'a>,
508-
&'a S3: StringArrayType<'a>,
509-
{
510-
if array.len() != regex_array.len() {
511-
return Err(DataFusionError::Execution(
512-
"Cannot perform comparison operation on arrays of different length"
513-
.to_string(),
514-
));
515-
}
516-
517-
let nulls = NullBuffer::union(array.nulls(), regex_array.nulls());
518-
519-
let mut patterns: HashMap<String, Regex> = HashMap::new();
520-
let mut result = BooleanBufferBuilder::new(array.len());
521-
522-
let complete_pattern = match flags_array {
523-
Some(flags) => Box::new(regex_array.iter().zip(flags.iter()).map(
524-
|(pattern, flags)| {
525-
pattern.map(|pattern| match flags {
526-
Some(flag) => format!("(?{flag}){pattern}"),
527-
None => pattern.to_string(),
528-
})
529-
},
530-
)) as Box<dyn Iterator<Item = Option<String>>>,
531-
None => Box::new(
532-
regex_array
533-
.iter()
534-
.map(|pattern| pattern.map(|pattern| pattern.to_string())),
535-
),
536-
};
537-
538-
array
539-
.iter()
540-
.zip(complete_pattern)
541-
.map(|(value, pattern)| {
542-
match (value, pattern) {
543-
(Some(_), Some(pattern)) if pattern == *"" => {
544-
result.append(true);
545-
}
546-
(Some(value), Some(pattern)) => {
547-
let existing_pattern = patterns.get(&pattern);
548-
let re = match existing_pattern {
549-
Some(re) => re,
550-
None => {
551-
let re = Regex::new(pattern.as_str()).map_err(|e| {
552-
DataFusionError::Execution(format!(
553-
"Regular expression did not compile: {e:?}"
554-
))
555-
})?;
556-
patterns.entry(pattern).or_insert(re)
557-
}
558-
};
559-
result.append(re.is_match(value));
560-
}
561-
_ => result.append(false),
562-
}
563-
Ok(())
564-
})
565-
.collect::<Result<Vec<()>, DataFusionError>>()?;
566-
567-
let data = unsafe {
568-
ArrayDataBuilder::new(DataType::Boolean)
569-
.len(array.len())
570-
.buffers(vec![result.into()])
571-
.nulls(nulls)
572-
.build_unchecked()
573-
};
574-
575-
Ok(BooleanArray::from(data))
576-
}

datafusion/functions/src/string/contains.rs

+10-10
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
// specific language governing permissions and limitations
1616
// under the License.
1717

18-
use crate::string::common::regexp_is_match;
18+
use crate::regex::regexp_common::regexp_is_match_utf8;
1919
use crate::utils::make_scalar_function;
2020

2121
use arrow::array::{Array, ArrayRef, AsArray, GenericStringArray, StringViewArray};
@@ -92,7 +92,7 @@ pub fn contains(args: &[ArrayRef]) -> Result<ArrayRef, DataFusionError> {
9292
(Utf8View, Utf8View) => {
9393
let mod_str = args[0].as_string_view();
9494
let match_str = args[1].as_string_view();
95-
let res = regexp_is_match::<
95+
let res = regexp_is_match_utf8::<
9696
StringViewArray,
9797
StringViewArray,
9898
GenericStringArray<i32>,
@@ -103,7 +103,7 @@ pub fn contains(args: &[ArrayRef]) -> Result<ArrayRef, DataFusionError> {
103103
(Utf8View, Utf8) => {
104104
let mod_str = args[0].as_string_view();
105105
let match_str = args[1].as_string::<i32>();
106-
let res = regexp_is_match::<
106+
let res = regexp_is_match_utf8::<
107107
StringViewArray,
108108
GenericStringArray<i32>,
109109
GenericStringArray<i32>,
@@ -114,7 +114,7 @@ pub fn contains(args: &[ArrayRef]) -> Result<ArrayRef, DataFusionError> {
114114
(Utf8View, LargeUtf8) => {
115115
let mod_str = args[0].as_string_view();
116116
let match_str = args[1].as_string::<i64>();
117-
let res = regexp_is_match::<
117+
let res = regexp_is_match_utf8::<
118118
StringViewArray,
119119
GenericStringArray<i64>,
120120
GenericStringArray<i32>,
@@ -125,7 +125,7 @@ pub fn contains(args: &[ArrayRef]) -> Result<ArrayRef, DataFusionError> {
125125
(Utf8, Utf8View) => {
126126
let mod_str = args[0].as_string::<i32>();
127127
let match_str = args[1].as_string_view();
128-
let res = regexp_is_match::<
128+
let res = regexp_is_match_utf8::<
129129
GenericStringArray<i32>,
130130
StringViewArray,
131131
GenericStringArray<i32>,
@@ -136,7 +136,7 @@ pub fn contains(args: &[ArrayRef]) -> Result<ArrayRef, DataFusionError> {
136136
(Utf8, Utf8) => {
137137
let mod_str = args[0].as_string::<i32>();
138138
let match_str = args[1].as_string::<i32>();
139-
let res = regexp_is_match::<
139+
let res = regexp_is_match_utf8::<
140140
GenericStringArray<i32>,
141141
GenericStringArray<i32>,
142142
GenericStringArray<i32>,
@@ -147,7 +147,7 @@ pub fn contains(args: &[ArrayRef]) -> Result<ArrayRef, DataFusionError> {
147147
(Utf8, LargeUtf8) => {
148148
let mod_str = args[0].as_string::<i32>();
149149
let match_str = args[1].as_string::<i64>();
150-
let res = regexp_is_match::<
150+
let res = regexp_is_match_utf8::<
151151
GenericStringArray<i32>,
152152
GenericStringArray<i64>,
153153
GenericStringArray<i32>,
@@ -158,7 +158,7 @@ pub fn contains(args: &[ArrayRef]) -> Result<ArrayRef, DataFusionError> {
158158
(LargeUtf8, Utf8View) => {
159159
let mod_str = args[0].as_string::<i64>();
160160
let match_str = args[1].as_string_view();
161-
let res = regexp_is_match::<
161+
let res = regexp_is_match_utf8::<
162162
GenericStringArray<i64>,
163163
StringViewArray,
164164
GenericStringArray<i32>,
@@ -169,7 +169,7 @@ pub fn contains(args: &[ArrayRef]) -> Result<ArrayRef, DataFusionError> {
169169
(LargeUtf8, Utf8) => {
170170
let mod_str = args[0].as_string::<i64>();
171171
let match_str = args[1].as_string::<i32>();
172-
let res = regexp_is_match::<
172+
let res = regexp_is_match_utf8::<
173173
GenericStringArray<i64>,
174174
GenericStringArray<i32>,
175175
GenericStringArray<i32>,
@@ -180,7 +180,7 @@ pub fn contains(args: &[ArrayRef]) -> Result<ArrayRef, DataFusionError> {
180180
(LargeUtf8, LargeUtf8) => {
181181
let mod_str = args[0].as_string::<i64>();
182182
let match_str = args[1].as_string::<i64>();
183-
let res = regexp_is_match::<
183+
let res = regexp_is_match_utf8::<
184184
GenericStringArray<i64>,
185185
GenericStringArray<i64>,
186186
GenericStringArray<i32>,

0 commit comments

Comments
 (0)