Skip to content

Commit 09f5a54

Browse files
authored
move Left, Lpad, Reverse, Right, Rpad functions to datafusion_functions (#9841)
* Fix to_timestamp benchmark * Remove reference to simd and nightly build as simd is no longer an available feature in DataFusion and building with nightly may not be a good recommendation when getting started. * Fixed missing trim() function. * Create unicode module in datafusion/functions/src/unicode and unicode_expressions feature flag, move char_length function * move Left, Lpad, Reverse, Right, Rpad functions to datafusion_functions * Code cleanup from PR review.
1 parent 81c96fc commit 09f5a54

File tree

16 files changed

+1428
-1016
lines changed

16 files changed

+1428
-1016
lines changed

datafusion/expr/src/built_in_function.rs

Lines changed: 1 addition & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -111,18 +111,8 @@ pub enum BuiltinScalarFunction {
111111
EndsWith,
112112
/// initcap
113113
InitCap,
114-
/// left
115-
Left,
116-
/// lpad
117-
Lpad,
118114
/// random
119115
Random,
120-
/// reverse
121-
Reverse,
122-
/// right
123-
Right,
124-
/// rpad
125-
Rpad,
126116
/// strpos
127117
Strpos,
128118
/// substr
@@ -220,12 +210,7 @@ impl BuiltinScalarFunction {
220210
BuiltinScalarFunction::ConcatWithSeparator => Volatility::Immutable,
221211
BuiltinScalarFunction::EndsWith => Volatility::Immutable,
222212
BuiltinScalarFunction::InitCap => Volatility::Immutable,
223-
BuiltinScalarFunction::Left => Volatility::Immutable,
224-
BuiltinScalarFunction::Lpad => Volatility::Immutable,
225213
BuiltinScalarFunction::Radians => Volatility::Immutable,
226-
BuiltinScalarFunction::Reverse => Volatility::Immutable,
227-
BuiltinScalarFunction::Right => Volatility::Immutable,
228-
BuiltinScalarFunction::Rpad => Volatility::Immutable,
229214
BuiltinScalarFunction::Strpos => Volatility::Immutable,
230215
BuiltinScalarFunction::Substr => Volatility::Immutable,
231216
BuiltinScalarFunction::Translate => Volatility::Immutable,
@@ -264,17 +249,8 @@ impl BuiltinScalarFunction {
264249
BuiltinScalarFunction::InitCap => {
265250
utf8_to_str_type(&input_expr_types[0], "initcap")
266251
}
267-
BuiltinScalarFunction::Left => utf8_to_str_type(&input_expr_types[0], "left"),
268-
BuiltinScalarFunction::Lpad => utf8_to_str_type(&input_expr_types[0], "lpad"),
269252
BuiltinScalarFunction::Pi => Ok(Float64),
270253
BuiltinScalarFunction::Random => Ok(Float64),
271-
BuiltinScalarFunction::Reverse => {
272-
utf8_to_str_type(&input_expr_types[0], "reverse")
273-
}
274-
BuiltinScalarFunction::Right => {
275-
utf8_to_str_type(&input_expr_types[0], "right")
276-
}
277-
BuiltinScalarFunction::Rpad => utf8_to_str_type(&input_expr_types[0], "rpad"),
278254
BuiltinScalarFunction::EndsWith => Ok(Boolean),
279255
BuiltinScalarFunction::Strpos => {
280256
utf8_to_int_type(&input_expr_types[0], "strpos/instr/position")
@@ -361,28 +337,9 @@ impl BuiltinScalarFunction {
361337
BuiltinScalarFunction::Coalesce => {
362338
Signature::variadic_equal(self.volatility())
363339
}
364-
BuiltinScalarFunction::InitCap | BuiltinScalarFunction::Reverse => {
340+
BuiltinScalarFunction::InitCap => {
365341
Signature::uniform(1, vec![Utf8, LargeUtf8], self.volatility())
366342
}
367-
BuiltinScalarFunction::Lpad | BuiltinScalarFunction::Rpad => {
368-
Signature::one_of(
369-
vec![
370-
Exact(vec![Utf8, Int64]),
371-
Exact(vec![LargeUtf8, Int64]),
372-
Exact(vec![Utf8, Int64, Utf8]),
373-
Exact(vec![LargeUtf8, Int64, Utf8]),
374-
Exact(vec![Utf8, Int64, LargeUtf8]),
375-
Exact(vec![LargeUtf8, Int64, LargeUtf8]),
376-
],
377-
self.volatility(),
378-
)
379-
}
380-
BuiltinScalarFunction::Left | BuiltinScalarFunction::Right => {
381-
Signature::one_of(
382-
vec![Exact(vec![Utf8, Int64]), Exact(vec![LargeUtf8, Int64])],
383-
self.volatility(),
384-
)
385-
}
386343

387344
BuiltinScalarFunction::EndsWith | BuiltinScalarFunction::Strpos => {
388345
Signature::one_of(
@@ -580,11 +537,6 @@ impl BuiltinScalarFunction {
580537
BuiltinScalarFunction::ConcatWithSeparator => &["concat_ws"],
581538
BuiltinScalarFunction::EndsWith => &["ends_with"],
582539
BuiltinScalarFunction::InitCap => &["initcap"],
583-
BuiltinScalarFunction::Left => &["left"],
584-
BuiltinScalarFunction::Lpad => &["lpad"],
585-
BuiltinScalarFunction::Reverse => &["reverse"],
586-
BuiltinScalarFunction::Right => &["right"],
587-
BuiltinScalarFunction::Rpad => &["rpad"],
588540
BuiltinScalarFunction::Strpos => &["strpos", "instr", "position"],
589541
BuiltinScalarFunction::Substr => &["substr"],
590542
BuiltinScalarFunction::Translate => &["translate"],

datafusion/expr/src/expr_fn.rs

Lines changed: 0 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -578,25 +578,11 @@ scalar_expr!(Atan2, atan2, y x, "inverse tangent of a division given in the argu
578578
scalar_expr!(Log, log, base x, "logarithm of a `x` for a particular `base`");
579579

580580
scalar_expr!(InitCap, initcap, string, "converts the first letter of each word in `string` in uppercase and the remaining characters in lowercase");
581-
scalar_expr!(Left, left, string n, "returns the first `n` characters in the `string`");
582-
scalar_expr!(Reverse, reverse, string, "reverses the `string`");
583-
scalar_expr!(Right, right, string n, "returns the last `n` characters in the `string`");
584581
scalar_expr!(EndsWith, ends_with, string suffix, "whether the `string` ends with the `suffix`");
585582
scalar_expr!(Strpos, strpos, string substring, "finds the position from where the `substring` matches the `string`");
586583
scalar_expr!(Substr, substr, string position, "substring from the `position` to the end");
587584
scalar_expr!(Substr, substring, string position length, "substring from the `position` with `length` characters");
588585
scalar_expr!(Translate, translate, string from to, "replaces the characters in `from` with the counterpart in `to`");
589-
//use vec as parameter
590-
nary_scalar_expr!(
591-
Lpad,
592-
lpad,
593-
"fill up a string to the length by prepending the characters"
594-
);
595-
nary_scalar_expr!(
596-
Rpad,
597-
rpad,
598-
"fill up a string to the length by appending the characters"
599-
);
600586
nary_scalar_expr!(Coalesce, coalesce, "returns `coalesce(args...)`, which evaluates to the value of the first [Expr] which is not NULL");
601587
//there is a func concat_ws before, so use concat_ws_expr as name.c
602588
nary_scalar_expr!(
@@ -1028,13 +1014,6 @@ mod test {
10281014
test_scalar_expr!(Gcd, gcd, arg_1, arg_2);
10291015
test_scalar_expr!(Lcm, lcm, arg_1, arg_2);
10301016
test_scalar_expr!(InitCap, initcap, string);
1031-
test_scalar_expr!(Left, left, string, count);
1032-
test_nary_scalar_expr!(Lpad, lpad, string, count);
1033-
test_nary_scalar_expr!(Lpad, lpad, string, count, characters);
1034-
test_scalar_expr!(Reverse, reverse, string);
1035-
test_scalar_expr!(Right, right, string, count);
1036-
test_nary_scalar_expr!(Rpad, rpad, string, count);
1037-
test_nary_scalar_expr!(Rpad, rpad, string, count, characters);
10381017
test_scalar_expr!(EndsWith, ends_with, string, characters);
10391018
test_scalar_expr!(Strpos, strpos, string, substring);
10401019
test_scalar_expr!(Substr, substr, string, position);
Lines changed: 236 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,236 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
use std::any::Any;
19+
use std::cmp::Ordering;
20+
use std::sync::Arc;
21+
22+
use arrow::array::{ArrayRef, GenericStringArray, OffsetSizeTrait};
23+
use arrow::datatypes::DataType;
24+
25+
use datafusion_common::cast::{as_generic_string_array, as_int64_array};
26+
use datafusion_common::exec_err;
27+
use datafusion_common::Result;
28+
use datafusion_expr::TypeSignature::Exact;
29+
use datafusion_expr::{ColumnarValue, ScalarUDFImpl, Signature, Volatility};
30+
31+
use crate::utils::{make_scalar_function, utf8_to_str_type};
32+
33+
#[derive(Debug)]
34+
pub(super) struct LeftFunc {
35+
signature: Signature,
36+
}
37+
38+
impl LeftFunc {
39+
pub fn new() -> Self {
40+
use DataType::*;
41+
Self {
42+
signature: Signature::one_of(
43+
vec![Exact(vec![Utf8, Int64]), Exact(vec![LargeUtf8, Int64])],
44+
Volatility::Immutable,
45+
),
46+
}
47+
}
48+
}
49+
50+
impl ScalarUDFImpl for LeftFunc {
51+
fn as_any(&self) -> &dyn Any {
52+
self
53+
}
54+
55+
fn name(&self) -> &str {
56+
"left"
57+
}
58+
59+
fn signature(&self) -> &Signature {
60+
&self.signature
61+
}
62+
63+
fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
64+
utf8_to_str_type(&arg_types[0], "left")
65+
}
66+
67+
fn invoke(&self, args: &[ColumnarValue]) -> Result<ColumnarValue> {
68+
match args[0].data_type() {
69+
DataType::Utf8 => make_scalar_function(left::<i32>, vec![])(args),
70+
DataType::LargeUtf8 => make_scalar_function(left::<i64>, vec![])(args),
71+
other => exec_err!("Unsupported data type {other:?} for function left"),
72+
}
73+
}
74+
}
75+
76+
/// Returns first n characters in the string, or when n is negative, returns all but last |n| characters.
77+
/// left('abcde', 2) = 'ab'
78+
/// The implementation uses UTF-8 code points as characters
79+
pub fn left<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
80+
let string_array = as_generic_string_array::<T>(&args[0])?;
81+
let n_array = as_int64_array(&args[1])?;
82+
let result = string_array
83+
.iter()
84+
.zip(n_array.iter())
85+
.map(|(string, n)| match (string, n) {
86+
(Some(string), Some(n)) => match n.cmp(&0) {
87+
Ordering::Less => {
88+
let len = string.chars().count() as i64;
89+
Some(if n.abs() < len {
90+
string.chars().take((len + n) as usize).collect::<String>()
91+
} else {
92+
"".to_string()
93+
})
94+
}
95+
Ordering::Equal => Some("".to_string()),
96+
Ordering::Greater => {
97+
Some(string.chars().take(n as usize).collect::<String>())
98+
}
99+
},
100+
_ => None,
101+
})
102+
.collect::<GenericStringArray<T>>();
103+
104+
Ok(Arc::new(result) as ArrayRef)
105+
}
106+
107+
#[cfg(test)]
108+
mod tests {
109+
use arrow::array::{Array, StringArray};
110+
use arrow::datatypes::DataType::Utf8;
111+
112+
use datafusion_common::{Result, ScalarValue};
113+
use datafusion_expr::{ColumnarValue, ScalarUDFImpl};
114+
115+
use crate::unicode::left::LeftFunc;
116+
use crate::utils::test::test_function;
117+
118+
#[test]
119+
fn test_functions() -> Result<()> {
120+
test_function!(
121+
LeftFunc::new(),
122+
&[
123+
ColumnarValue::Scalar(ScalarValue::from("abcde")),
124+
ColumnarValue::Scalar(ScalarValue::from(2i64)),
125+
],
126+
Ok(Some("ab")),
127+
&str,
128+
Utf8,
129+
StringArray
130+
);
131+
test_function!(
132+
LeftFunc::new(),
133+
&[
134+
ColumnarValue::Scalar(ScalarValue::from("abcde")),
135+
ColumnarValue::Scalar(ScalarValue::from(200i64)),
136+
],
137+
Ok(Some("abcde")),
138+
&str,
139+
Utf8,
140+
StringArray
141+
);
142+
test_function!(
143+
LeftFunc::new(),
144+
&[
145+
ColumnarValue::Scalar(ScalarValue::from("abcde")),
146+
ColumnarValue::Scalar(ScalarValue::from(-2i64)),
147+
],
148+
Ok(Some("abc")),
149+
&str,
150+
Utf8,
151+
StringArray
152+
);
153+
test_function!(
154+
LeftFunc::new(),
155+
&[
156+
ColumnarValue::Scalar(ScalarValue::from("abcde")),
157+
ColumnarValue::Scalar(ScalarValue::from(-200i64)),
158+
],
159+
Ok(Some("")),
160+
&str,
161+
Utf8,
162+
StringArray
163+
);
164+
test_function!(
165+
LeftFunc::new(),
166+
&[
167+
ColumnarValue::Scalar(ScalarValue::from("abcde")),
168+
ColumnarValue::Scalar(ScalarValue::from(0i64)),
169+
],
170+
Ok(Some("")),
171+
&str,
172+
Utf8,
173+
StringArray
174+
);
175+
test_function!(
176+
LeftFunc::new(),
177+
&[
178+
ColumnarValue::Scalar(ScalarValue::Utf8(None)),
179+
ColumnarValue::Scalar(ScalarValue::from(2i64)),
180+
],
181+
Ok(None),
182+
&str,
183+
Utf8,
184+
StringArray
185+
);
186+
test_function!(
187+
LeftFunc::new(),
188+
&[
189+
ColumnarValue::Scalar(ScalarValue::from("abcde")),
190+
ColumnarValue::Scalar(ScalarValue::Int64(None)),
191+
],
192+
Ok(None),
193+
&str,
194+
Utf8,
195+
StringArray
196+
);
197+
test_function!(
198+
LeftFunc::new(),
199+
&[
200+
ColumnarValue::Scalar(ScalarValue::from("joséésoj")),
201+
ColumnarValue::Scalar(ScalarValue::from(5i64)),
202+
],
203+
Ok(Some("joséé")),
204+
&str,
205+
Utf8,
206+
StringArray
207+
);
208+
test_function!(
209+
LeftFunc::new(),
210+
&[
211+
ColumnarValue::Scalar(ScalarValue::from("joséésoj")),
212+
ColumnarValue::Scalar(ScalarValue::from(-3i64)),
213+
],
214+
Ok(Some("joséé")),
215+
&str,
216+
Utf8,
217+
StringArray
218+
);
219+
#[cfg(not(feature = "unicode_expressions"))]
220+
test_function!(
221+
LeftFunc::new(),
222+
&[
223+
ColumnarValue::Scalar(ScalarValue::from("abcde")),
224+
ColumnarValue::Scalar(ScalarValue::from(2i64)),
225+
],
226+
internal_err!(
227+
"function left requires compilation with feature flag: unicode_expressions."
228+
),
229+
&str,
230+
Utf8,
231+
StringArray
232+
);
233+
234+
Ok(())
235+
}
236+
}

0 commit comments

Comments
 (0)