Skip to content

Commit 1e4ddb6

Browse files
authored
Move repeat, replace, split_part to datafusion_functions (#9784)
* Fix to_timestamp benchmark * Remove reference to simd and nightly build as simd is no longer an available feature in DataFusion and building with nightly may not be a good recommendation when getting started. * Fixed missing trim() function. * Move repeat, replace, split_part to datafusion_functions
1 parent cb9da2b commit 1e4ddb6

File tree

13 files changed

+469
-261
lines changed

13 files changed

+469
-261
lines changed

datafusion/expr/src/built_in_function.rs

Lines changed: 7 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -123,18 +123,12 @@ pub enum BuiltinScalarFunction {
123123
Lpad,
124124
/// random
125125
Random,
126-
/// repeat
127-
Repeat,
128-
/// replace
129-
Replace,
130126
/// reverse
131127
Reverse,
132128
/// right
133129
Right,
134130
/// rpad
135131
Rpad,
136-
/// split_part
137-
SplitPart,
138132
/// strpos
139133
Strpos,
140134
/// substr
@@ -238,12 +232,9 @@ impl BuiltinScalarFunction {
238232
BuiltinScalarFunction::Left => Volatility::Immutable,
239233
BuiltinScalarFunction::Lpad => Volatility::Immutable,
240234
BuiltinScalarFunction::Radians => Volatility::Immutable,
241-
BuiltinScalarFunction::Repeat => Volatility::Immutable,
242-
BuiltinScalarFunction::Replace => Volatility::Immutable,
243235
BuiltinScalarFunction::Reverse => Volatility::Immutable,
244236
BuiltinScalarFunction::Right => Volatility::Immutable,
245237
BuiltinScalarFunction::Rpad => Volatility::Immutable,
246-
BuiltinScalarFunction::SplitPart => Volatility::Immutable,
247238
BuiltinScalarFunction::Strpos => Volatility::Immutable,
248239
BuiltinScalarFunction::Substr => Volatility::Immutable,
249240
BuiltinScalarFunction::Translate => Volatility::Immutable,
@@ -293,22 +284,13 @@ impl BuiltinScalarFunction {
293284
BuiltinScalarFunction::Lpad => utf8_to_str_type(&input_expr_types[0], "lpad"),
294285
BuiltinScalarFunction::Pi => Ok(Float64),
295286
BuiltinScalarFunction::Random => Ok(Float64),
296-
BuiltinScalarFunction::Repeat => {
297-
utf8_to_str_type(&input_expr_types[0], "repeat")
298-
}
299-
BuiltinScalarFunction::Replace => {
300-
utf8_to_str_type(&input_expr_types[0], "replace")
301-
}
302287
BuiltinScalarFunction::Reverse => {
303288
utf8_to_str_type(&input_expr_types[0], "reverse")
304289
}
305290
BuiltinScalarFunction::Right => {
306291
utf8_to_str_type(&input_expr_types[0], "right")
307292
}
308293
BuiltinScalarFunction::Rpad => utf8_to_str_type(&input_expr_types[0], "rpad"),
309-
BuiltinScalarFunction::SplitPart => {
310-
utf8_to_str_type(&input_expr_types[0], "split_part")
311-
}
312294
BuiltinScalarFunction::EndsWith => Ok(Boolean),
313295
BuiltinScalarFunction::Strpos => {
314296
utf8_to_int_type(&input_expr_types[0], "strpos/instr/position")
@@ -417,21 +399,12 @@ impl BuiltinScalarFunction {
417399
self.volatility(),
418400
)
419401
}
420-
BuiltinScalarFunction::Left
421-
| BuiltinScalarFunction::Repeat
422-
| BuiltinScalarFunction::Right => Signature::one_of(
423-
vec![Exact(vec![Utf8, Int64]), Exact(vec![LargeUtf8, Int64])],
424-
self.volatility(),
425-
),
426-
BuiltinScalarFunction::SplitPart => Signature::one_of(
427-
vec![
428-
Exact(vec![Utf8, Utf8, Int64]),
429-
Exact(vec![LargeUtf8, Utf8, Int64]),
430-
Exact(vec![Utf8, LargeUtf8, Int64]),
431-
Exact(vec![LargeUtf8, LargeUtf8, Int64]),
432-
],
433-
self.volatility(),
434-
),
402+
BuiltinScalarFunction::Left | BuiltinScalarFunction::Right => {
403+
Signature::one_of(
404+
vec![Exact(vec![Utf8, Int64]), Exact(vec![LargeUtf8, Int64])],
405+
self.volatility(),
406+
)
407+
}
435408

436409
BuiltinScalarFunction::EndsWith | BuiltinScalarFunction::Strpos => {
437410
Signature::one_of(
@@ -467,7 +440,7 @@ impl BuiltinScalarFunction {
467440
self.volatility(),
468441
),
469442

470-
BuiltinScalarFunction::Replace | BuiltinScalarFunction::Translate => {
443+
BuiltinScalarFunction::Translate => {
471444
Signature::one_of(vec![Exact(vec![Utf8, Utf8, Utf8])], self.volatility())
472445
}
473446
BuiltinScalarFunction::Pi => Signature::exact(vec![], self.volatility()),
@@ -637,12 +610,9 @@ impl BuiltinScalarFunction {
637610
BuiltinScalarFunction::InitCap => &["initcap"],
638611
BuiltinScalarFunction::Left => &["left"],
639612
BuiltinScalarFunction::Lpad => &["lpad"],
640-
BuiltinScalarFunction::Repeat => &["repeat"],
641-
BuiltinScalarFunction::Replace => &["replace"],
642613
BuiltinScalarFunction::Reverse => &["reverse"],
643614
BuiltinScalarFunction::Right => &["right"],
644615
BuiltinScalarFunction::Rpad => &["rpad"],
645-
BuiltinScalarFunction::SplitPart => &["split_part"],
646616
BuiltinScalarFunction::Strpos => &["strpos", "instr", "position"],
647617
BuiltinScalarFunction::Substr => &["substr"],
648618
BuiltinScalarFunction::Translate => &["translate"],

datafusion/expr/src/expr_fn.rs

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -598,11 +598,8 @@ scalar_expr!(
598598
);
599599
scalar_expr!(InitCap, initcap, string, "converts the first letter of each word in `string` in uppercase and the remaining characters in lowercase");
600600
scalar_expr!(Left, left, string n, "returns the first `n` characters in the `string`");
601-
scalar_expr!(Replace, replace, string from to, "replaces all occurrences of `from` with `to` in the `string`");
602-
scalar_expr!(Repeat, repeat, string n, "repeats the `string` to `n` times");
603601
scalar_expr!(Reverse, reverse, string, "reverses the `string`");
604602
scalar_expr!(Right, right, string n, "returns the last `n` characters in the `string`");
605-
scalar_expr!(SplitPart, split_part, string delimiter index, "splits a string based on a delimiter and picks out the desired field based on the index.");
606603
scalar_expr!(EndsWith, ends_with, string suffix, "whether the `string` ends with the `suffix`");
607604
scalar_expr!(Strpos, strpos, string substring, "finds the position from where the `substring` matches the `string`");
608605
scalar_expr!(Substr, substr, string position, "substring from the `position` to the end");
@@ -1056,13 +1053,10 @@ mod test {
10561053
test_scalar_expr!(Left, left, string, count);
10571054
test_nary_scalar_expr!(Lpad, lpad, string, count);
10581055
test_nary_scalar_expr!(Lpad, lpad, string, count, characters);
1059-
test_scalar_expr!(Replace, replace, string, from, to);
1060-
test_scalar_expr!(Repeat, repeat, string, count);
10611056
test_scalar_expr!(Reverse, reverse, string);
10621057
test_scalar_expr!(Right, right, string, count);
10631058
test_nary_scalar_expr!(Rpad, rpad, string, count);
10641059
test_nary_scalar_expr!(Rpad, rpad, string, count, characters);
1065-
test_scalar_expr!(SplitPart, split_part, expr, delimiter, index);
10661060
test_scalar_expr!(EndsWith, ends_with, string, characters);
10671061
test_scalar_expr!(Strpos, strpos, string, substring);
10681062
test_scalar_expr!(Substr, substr, string, position);

datafusion/functions/src/string/mod.rs

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,10 @@ mod lower;
2929
mod ltrim;
3030
mod octet_length;
3131
mod overlay;
32+
mod repeat;
33+
mod replace;
3234
mod rtrim;
35+
mod split_part;
3336
mod starts_with;
3437
mod to_hex;
3538
mod upper;
@@ -43,8 +46,11 @@ make_udf_function!(ltrim::LtrimFunc, LTRIM, ltrim);
4346
make_udf_function!(lower::LowerFunc, LOWER, lower);
4447
make_udf_function!(octet_length::OctetLengthFunc, OCTET_LENGTH, octet_length);
4548
make_udf_function!(overlay::OverlayFunc, OVERLAY, overlay);
49+
make_udf_function!(repeat::RepeatFunc, REPEAT, repeat);
50+
make_udf_function!(replace::ReplaceFunc, REPLACE, replace);
4651
make_udf_function!(rtrim::RtrimFunc, RTRIM, rtrim);
4752
make_udf_function!(starts_with::StartsWithFunc, STARTS_WITH, starts_with);
53+
make_udf_function!(split_part::SplitPartFunc, SPLIT_PART, split_part);
4854
make_udf_function!(to_hex::ToHexFunc, TO_HEX, to_hex);
4955
make_udf_function!(upper::UpperFunc, UPPER, upper);
5056
make_udf_function!(uuid::UuidFunc, UUID, uuid);
@@ -87,11 +93,26 @@ pub mod expr_fn {
8793
super::overlay().call(args)
8894
}
8995

96+
#[doc = "Repeats the `string` to `n` times"]
97+
pub fn repeat(string: Expr, n: Expr) -> Expr {
98+
super::repeat().call(vec![string, n])
99+
}
100+
101+
#[doc = "Replaces all occurrences of `from` with `to` in the `string`"]
102+
pub fn replace(string: Expr, from: Expr, to: Expr) -> Expr {
103+
super::replace().call(vec![string, from, to])
104+
}
105+
90106
#[doc = "Removes all characters, spaces by default, from the end of a string"]
91107
pub fn rtrim(args: Vec<Expr>) -> Expr {
92108
super::rtrim().call(args)
93109
}
94110

111+
#[doc = "Splits a string based on a delimiter and picks out the desired field based on the index."]
112+
pub fn split_part(string: Expr, delimiter: Expr, index: Expr) -> Expr {
113+
super::split_part().call(vec![string, delimiter, index])
114+
}
115+
95116
#[doc = "Returns true if string starts with prefix."]
96117
pub fn starts_with(arg1: Expr, arg2: Expr) -> Expr {
97118
super::starts_with().call(vec![arg1, arg2])
@@ -128,7 +149,10 @@ pub fn functions() -> Vec<Arc<ScalarUDF>> {
128149
ltrim(),
129150
octet_length(),
130151
overlay(),
152+
repeat(),
153+
replace(),
131154
rtrim(),
155+
split_part(),
132156
starts_with(),
133157
to_hex(),
134158
upper(),
Lines changed: 144 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,144 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
use std::any::Any;
19+
use std::sync::Arc;
20+
21+
use arrow::array::{ArrayRef, GenericStringArray, OffsetSizeTrait};
22+
use arrow::datatypes::DataType;
23+
24+
use datafusion_common::cast::{as_generic_string_array, as_int64_array};
25+
use datafusion_common::{exec_err, Result};
26+
use datafusion_expr::TypeSignature::*;
27+
use datafusion_expr::{ColumnarValue, Volatility};
28+
use datafusion_expr::{ScalarUDFImpl, Signature};
29+
30+
use crate::string::common::*;
31+
32+
#[derive(Debug)]
33+
pub(super) struct RepeatFunc {
34+
signature: Signature,
35+
}
36+
37+
impl RepeatFunc {
38+
pub fn new() -> Self {
39+
use DataType::*;
40+
Self {
41+
signature: Signature::one_of(
42+
vec![Exact(vec![Utf8, Int64]), Exact(vec![LargeUtf8, Int64])],
43+
Volatility::Immutable,
44+
),
45+
}
46+
}
47+
}
48+
49+
impl ScalarUDFImpl for RepeatFunc {
50+
fn as_any(&self) -> &dyn Any {
51+
self
52+
}
53+
54+
fn name(&self) -> &str {
55+
"repeat"
56+
}
57+
58+
fn signature(&self) -> &Signature {
59+
&self.signature
60+
}
61+
62+
fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
63+
utf8_to_str_type(&arg_types[0], "repeat")
64+
}
65+
66+
fn invoke(&self, args: &[ColumnarValue]) -> Result<ColumnarValue> {
67+
match args[0].data_type() {
68+
DataType::Utf8 => make_scalar_function(repeat::<i32>, vec![])(args),
69+
DataType::LargeUtf8 => make_scalar_function(repeat::<i64>, vec![])(args),
70+
other => exec_err!("Unsupported data type {other:?} for function repeat"),
71+
}
72+
}
73+
}
74+
75+
/// Repeats string the specified number of times.
76+
/// repeat('Pg', 4) = 'PgPgPgPg'
77+
fn repeat<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
78+
let string_array = as_generic_string_array::<T>(&args[0])?;
79+
let number_array = as_int64_array(&args[1])?;
80+
81+
let result = string_array
82+
.iter()
83+
.zip(number_array.iter())
84+
.map(|(string, number)| match (string, number) {
85+
(Some(string), Some(number)) => Some(string.repeat(number as usize)),
86+
_ => None,
87+
})
88+
.collect::<GenericStringArray<T>>();
89+
90+
Ok(Arc::new(result) as ArrayRef)
91+
}
92+
93+
#[cfg(test)]
94+
mod tests {
95+
use arrow::array::{Array, StringArray};
96+
use arrow::datatypes::DataType::Utf8;
97+
98+
use datafusion_common::Result;
99+
use datafusion_common::ScalarValue;
100+
use datafusion_expr::{ColumnarValue, ScalarUDFImpl};
101+
102+
use crate::string::common::test::test_function;
103+
use crate::string::repeat::RepeatFunc;
104+
105+
#[test]
106+
fn test_functions() -> Result<()> {
107+
test_function!(
108+
RepeatFunc::new(),
109+
&[
110+
ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("Pg")))),
111+
ColumnarValue::Scalar(ScalarValue::Int64(Some(4))),
112+
],
113+
Ok(Some("PgPgPgPg")),
114+
&str,
115+
Utf8,
116+
StringArray
117+
);
118+
119+
test_function!(
120+
RepeatFunc::new(),
121+
&[
122+
ColumnarValue::Scalar(ScalarValue::Utf8(None)),
123+
ColumnarValue::Scalar(ScalarValue::Int64(Some(4))),
124+
],
125+
Ok(None),
126+
&str,
127+
Utf8,
128+
StringArray
129+
);
130+
test_function!(
131+
RepeatFunc::new(),
132+
&[
133+
ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("Pg")))),
134+
ColumnarValue::Scalar(ScalarValue::Int64(None)),
135+
],
136+
Ok(None),
137+
&str,
138+
Utf8,
139+
StringArray
140+
);
141+
142+
Ok(())
143+
}
144+
}

0 commit comments

Comments
 (0)