Skip to content

Commit bd48262

Browse files
thinh2 and alamb authored
implement utf8_view for replace (#12004)
* draft implement utf8_view for replace
* add function signature
* Add sql test
* move macro util to replace function
* remove unused import
* rust format
* change return type from utf8view to utf8

---------

Co-authored-by: Andrew Lamb <[email protected]>
1 parent 7c5a8eb commit bd48262

File tree

3 files changed

+106
-9
lines changed

3 files changed

+106
-9
lines changed

datafusion/functions/src/string/replace.rs

Lines changed: 82 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -18,10 +18,10 @@
1818
use std::any::Any;
1919
use std::sync::Arc;
2020

21-
use arrow::array::{ArrayRef, GenericStringArray, OffsetSizeTrait};
21+
use arrow::array::{ArrayRef, GenericStringArray, OffsetSizeTrait, StringArray};
2222
use arrow::datatypes::DataType;
2323

24-
use datafusion_common::cast::as_generic_string_array;
24+
use datafusion_common::cast::{as_generic_string_array, as_string_view_array};
2525
use datafusion_common::{exec_err, Result};
2626
use datafusion_expr::TypeSignature::*;
2727
use datafusion_expr::{ColumnarValue, Volatility};
@@ -45,7 +45,11 @@ impl ReplaceFunc {
4545
use DataType::*;
4646
Self {
4747
signature: Signature::one_of(
48-
vec![Exact(vec![Utf8, Utf8, Utf8])],
48+
vec![
49+
Exact(vec![Utf8View, Utf8View, Utf8View]),
50+
Exact(vec![Utf8, Utf8, Utf8]),
51+
Exact(vec![LargeUtf8, LargeUtf8, LargeUtf8]),
52+
],
4953
Volatility::Immutable,
5054
),
5155
}
@@ -73,13 +77,31 @@ impl ScalarUDFImpl for ReplaceFunc {
7377
match args[0].data_type() {
7478
DataType::Utf8 => make_scalar_function(replace::<i32>, vec![])(args),
7579
DataType::LargeUtf8 => make_scalar_function(replace::<i64>, vec![])(args),
80+
DataType::Utf8View => make_scalar_function(replace_view, vec![])(args),
7681
other => {
7782
exec_err!("Unsupported data type {other:?} for function replace")
7883
}
7984
}
8085
}
8186
}
8287

88+
fn replace_view(args: &[ArrayRef]) -> Result<ArrayRef> {
89+
let string_array = as_string_view_array(&args[0])?;
90+
let from_array = as_string_view_array(&args[1])?;
91+
let to_array = as_string_view_array(&args[2])?;
92+
93+
let result = string_array
94+
.iter()
95+
.zip(from_array.iter())
96+
.zip(to_array.iter())
97+
.map(|((string, from), to)| match (string, from, to) {
98+
(Some(string), Some(from), Some(to)) => Some(string.replace(from, to)),
99+
_ => None,
100+
})
101+
.collect::<StringArray>();
102+
103+
Ok(Arc::new(result) as ArrayRef)
104+
}
83105
/// Replaces all occurrences in string of substring from with substring to.
84106
/// replace('abcdefabcdef', 'cd', 'XX') = 'abXXefabXXef'
85107
fn replace<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
@@ -100,4 +122,60 @@ fn replace<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
100122
Ok(Arc::new(result) as ArrayRef)
101123
}
102124

103-
mod test {}
125+
#[cfg(test)]
mod tests {
    use super::*;
    use crate::utils::test::test_function;
    use arrow::array::{Array, LargeStringArray, StringArray};
    use arrow::datatypes::DataType::{LargeUtf8, Utf8};
    use datafusion_common::ScalarValue;

    /// Exercises `ReplaceFunc` with scalar arguments of each accepted string
    /// type: `Utf8`, `LargeUtf8` and `Utf8View`.
    #[test]
    fn test_functions() -> Result<()> {
        // Shorthands for building non-null scalar arguments of each string type.
        let utf8 = |s: &str| ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from(s))));
        let large_utf8 =
            |s: &str| ColumnarValue::Scalar(ScalarValue::LargeUtf8(Some(String::from(s))));
        let utf8_view =
            |s: &str| ColumnarValue::Scalar(ScalarValue::Utf8View(Some(String::from(s))));

        // Utf8 in, Utf8 out.
        test_function!(
            ReplaceFunc::new(),
            &[utf8("aabbdqcbb"), utf8("bb"), utf8("ccc")],
            Ok(Some("aacccdqcccc")),
            &str,
            Utf8,
            StringArray
        );

        // LargeUtf8 in, LargeUtf8 out.
        test_function!(
            ReplaceFunc::new(),
            &[large_utf8("aabbb"), large_utf8("bbb"), large_utf8("cc")],
            Ok(Some("aacc")),
            &str,
            LargeUtf8,
            LargeStringArray
        );

        // Utf8View in; the expected output type is Utf8 / StringArray.
        test_function!(
            ReplaceFunc::new(),
            &[utf8_view("aabbbcw"), utf8_view("bb"), utf8_view("cc")],
            Ok(Some("aaccbcw")),
            &str,
            Utf8,
            StringArray
        );

        Ok(())
    }
}

datafusion/sqllogictest/test_files/functions.slt

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -826,6 +826,16 @@ SELECT replace(arrow_cast('foobar', 'Dictionary(Int32, Utf8)'), 'bar', 'hello')
826826
----
827827
foohello
828828

829+
query T
830+
SELECT replace(arrow_cast('foobar', 'Utf8View'), arrow_cast('bar', 'Utf8View'), arrow_cast('hello', 'Utf8View'))
831+
----
832+
foohello
833+
834+
query T
835+
SELECT replace(arrow_cast('foobar', 'LargeUtf8'), arrow_cast('bar', 'LargeUtf8'), arrow_cast('hello', 'LargeUtf8'))
836+
----
837+
foohello
838+
829839
query T
830840
SELECT rtrim(' foo ')
831841
----

datafusion/sqllogictest/test_files/string_view.slt

Lines changed: 14 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -925,7 +925,6 @@ logical_plan
925925
01)Projection: regexp_replace(test.column1_utf8view, Utf8("^https?://(?:www\.)?([^/]+)/.*$"), Utf8("\1")) AS k
926926
02)--TableScan: test projection=[column1_utf8view]
927927

928-
929928
## Ensure no casts for REPEAT
930929
query TT
931930
EXPLAIN SELECT
@@ -937,17 +936,27 @@ logical_plan
937936
02)--TableScan: test projection=[column1_utf8view]
938937

939938
## Ensure no casts for REPLACE
940-
## TODO file ticket
941939
query TT
942940
EXPLAIN SELECT
943941
REPLACE(column1_utf8view, 'foo', 'bar') as c1,
944942
REPLACE(column1_utf8view, column2_utf8view, 'bar') as c2
945943
FROM test;
946944
----
947945
logical_plan
948-
01)Projection: replace(__common_expr_1, Utf8("foo"), Utf8("bar")) AS c1, replace(__common_expr_1, CAST(test.column2_utf8view AS Utf8), Utf8("bar")) AS c2
949-
02)--Projection: CAST(test.column1_utf8view AS Utf8) AS __common_expr_1, test.column2_utf8view
950-
03)----TableScan: test projection=[column1_utf8view, column2_utf8view]
946+
01)Projection: replace(test.column1_utf8view, Utf8View("foo"), Utf8View("bar")) AS c1, replace(test.column1_utf8view, test.column2_utf8view, Utf8View("bar")) AS c2
947+
02)--TableScan: test projection=[column1_utf8view, column2_utf8view]
948+
949+
query TT
950+
SELECT
951+
REPLACE(column1_utf8view, 'foo', 'bar') as c1,
952+
REPLACE(column1_utf8view, column2_utf8view, 'bar') as c2
953+
FROM test;
954+
----
955+
Andrew Andrew
956+
Xiangpeng bar
957+
Raphael baraphael
958+
NULL NULL
959+
951960

952961
## Ensure no casts for REVERSE
953962
query TT

0 commit comments

Comments
 (0)