Skip to content

Commit 45ed5aa

Browse files
shruti2522 and alamb authored
fix: core_expressions feature flag broken, move overlay into core functions (#15217)
* fix: remove core_expressions, move overlay to core * fix license header * fix header * update doc * Remove CI reference --------- Co-authored-by: Andrew Lamb <[email protected]>
1 parent b38c731 commit 45ed5aa

File tree

9 files changed

+317
-277
lines changed

9 files changed

+317
-277
lines changed

.github/workflows/rust.yml

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -235,9 +235,6 @@ jobs:
235235
#
236236
- name: Check datafusion-functions (no-default-features)
237237
run: cargo check --profile ci --no-default-features -p datafusion-functions
238-
# Fails due https://github.com/apache/datafusion/issues/15207
239-
#- name: Check datafusion-functions (core_expressions)
240-
# run: cargo check --profile ci --no-default-features -p datafusion-functions --features=core_expressions
241238
- name: Check datafusion-functions (crypto_expressions)
242239
run: cargo check --profile ci --no-default-features -p datafusion-functions --features=crypto_expressions
243240
- name: Check datafusion-functions (datetime_expressions)

datafusion/functions/Cargo.toml

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -35,14 +35,11 @@ all-features = true
3535
workspace = true
3636

3737
[features]
38-
# enable core functions
39-
core_expressions = []
4038
crypto_expressions = ["md-5", "sha2", "blake2", "blake3"]
4139
# enable datetime functions
4240
datetime_expressions = []
4341
# Enable encoding by default so the doctests work. In general don't automatically enable all packages.
4442
default = [
45-
"core_expressions",
4643
"datetime_expressions",
4744
"encoding_expressions",
4845
"math_expressions",
@@ -146,7 +143,6 @@ required-features = ["math_expressions"]
146143
[[bench]]
147144
harness = false
148145
name = "nullif"
149-
required-features = ["core_expressions"]
150146

151147
[[bench]]
152148
harness = false

datafusion/functions/src/core/mod.rs

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ pub mod named_struct;
3232
pub mod nullif;
3333
pub mod nvl;
3434
pub mod nvl2;
35+
pub mod overlay;
3536
pub mod planner;
3637
pub mod r#struct;
3738
pub mod union_extract;
@@ -42,6 +43,7 @@ make_udf_function!(arrow_cast::ArrowCastFunc, arrow_cast);
4243
make_udf_function!(nullif::NullIfFunc, nullif);
4344
make_udf_function!(nvl::NVLFunc, nvl);
4445
make_udf_function!(nvl2::NVL2Func, nvl2);
46+
make_udf_function!(overlay::OverlayFunc, overlay);
4547
make_udf_function!(arrowtypeof::ArrowTypeOfFunc, arrow_typeof);
4648
make_udf_function!(r#struct::StructFunc, r#struct);
4749
make_udf_function!(named_struct::NamedStructFunc, named_struct);
@@ -71,6 +73,10 @@ pub mod expr_fn {
7173
nvl2,
7274
"Returns value2 if value1 is not NULL; otherwise, it returns value3.",
7375
arg1 arg2 arg3
76+
),(
77+
overlay,
78+
"replace the substring of string that starts at the start'th character and extends for count characters with new substring",
79+
args,
7480
),(
7581
arrow_typeof,
7682
"Returns the Arrow type of the input expression.",
@@ -115,6 +121,7 @@ pub fn functions() -> Vec<Arc<ScalarUDF>> {
115121
arrow_cast(),
116122
nvl(),
117123
nvl2(),
124+
overlay(),
118125
arrow_typeof(),
119126
named_struct(),
120127
// Note: most users invoke `get_field` indirectly via field access
Lines changed: 278 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,278 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
use std::any::Any;
19+
use std::sync::Arc;
20+
21+
use arrow::array::{ArrayRef, GenericStringArray, OffsetSizeTrait};
22+
use arrow::datatypes::DataType;
23+
24+
use crate::utils::{make_scalar_function, utf8_to_str_type};
25+
use datafusion_common::cast::{
26+
as_generic_string_array, as_int64_array, as_string_view_array,
27+
};
28+
use datafusion_common::{exec_err, Result};
29+
use datafusion_expr::{ColumnarValue, Documentation, TypeSignature, Volatility};
30+
use datafusion_expr::{ScalarFunctionArgs, ScalarUDFImpl, Signature};
31+
use datafusion_macros::user_doc;
32+
33+
#[user_doc(
34+
doc_section(label = "String Functions"),
35+
description = "Returns the string which is replaced by another string from the specified position and specified count length.",
36+
syntax_example = "overlay(str PLACING substr FROM pos [FOR count])",
37+
sql_example = r#"```sql
38+
> select overlay('Txxxxas' placing 'hom' from 2 for 4);
39+
+--------------------------------------------------------+
40+
| overlay(Utf8("Txxxxas"),Utf8("hom"),Int64(2),Int64(4)) |
41+
+--------------------------------------------------------+
42+
| Thomas |
43+
+--------------------------------------------------------+
44+
```"#,
45+
standard_argument(name = "str", prefix = "String"),
46+
argument(name = "substr", description = "Substring to replace in str."),
47+
argument(
48+
name = "pos",
49+
description = "The start position to start the replace in str."
50+
),
51+
argument(
52+
name = "count",
53+
description = "The count of characters to be replaced from start position of str. If not specified, will use substr length instead."
54+
)
55+
)]
56+
#[derive(Debug)]
57+
pub struct OverlayFunc {
58+
signature: Signature,
59+
}
60+
61+
impl Default for OverlayFunc {
62+
fn default() -> Self {
63+
Self::new()
64+
}
65+
}
66+
67+
impl OverlayFunc {
68+
pub fn new() -> Self {
69+
use DataType::*;
70+
Self {
71+
signature: Signature::one_of(
72+
vec![
73+
TypeSignature::Exact(vec![Utf8View, Utf8View, Int64, Int64]),
74+
TypeSignature::Exact(vec![Utf8, Utf8, Int64, Int64]),
75+
TypeSignature::Exact(vec![LargeUtf8, LargeUtf8, Int64, Int64]),
76+
TypeSignature::Exact(vec![Utf8View, Utf8View, Int64]),
77+
TypeSignature::Exact(vec![Utf8, Utf8, Int64]),
78+
TypeSignature::Exact(vec![LargeUtf8, LargeUtf8, Int64]),
79+
],
80+
Volatility::Immutable,
81+
),
82+
}
83+
}
84+
}
85+
86+
impl ScalarUDFImpl for OverlayFunc {
87+
fn as_any(&self) -> &dyn Any {
88+
self
89+
}
90+
91+
fn name(&self) -> &str {
92+
"overlay"
93+
}
94+
95+
fn signature(&self) -> &Signature {
96+
&self.signature
97+
}
98+
99+
fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
100+
utf8_to_str_type(&arg_types[0], "overlay")
101+
}
102+
103+
fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
104+
match args.args[0].data_type() {
105+
DataType::Utf8View | DataType::Utf8 => {
106+
make_scalar_function(overlay::<i32>, vec![])(&args.args)
107+
}
108+
DataType::LargeUtf8 => {
109+
make_scalar_function(overlay::<i64>, vec![])(&args.args)
110+
}
111+
other => exec_err!("Unsupported data type {other:?} for function overlay"),
112+
}
113+
}
114+
115+
fn documentation(&self) -> Option<&Documentation> {
116+
self.doc()
117+
}
118+
}
119+
120+
/// Overlays `characters` onto `string` for a single row.
///
/// `start_pos` is the 1-based character position at which the replacement
/// begins. `len` is the number of characters of `string` that are replaced;
/// it is capped at the length of `string`, and when it is `None` the
/// character length of `characters` is used instead.
///
/// All positions are interpreted in characters and converted to byte offsets
/// before slicing, so multi-byte UTF-8 input is never split inside a
/// character. A tail position that would fall before the start of the string
/// (negative count or non-positive start) is clamped to 0 instead of wrapping
/// around, and the position arithmetic saturates instead of overflowing.
fn overlay_chars(
    string: &str,
    characters: &str,
    start_pos: i64,
    len: Option<i64>,
) -> String {
    let string_len = string.chars().count() as i64;
    let replace_len = match len {
        Some(len) => len.min(string_len),
        None => characters.chars().count() as i64,
    };

    // Byte offset of the character at 0-based index `char_idx`. Callers only
    // pass indices in `0..string_len`; `string.len()` is a defensive fallback.
    let byte_offset = |char_idx: i64| -> usize {
        string
            .char_indices()
            .nth(char_idx as usize)
            .map_or(string.len(), |(offset, _)| offset)
    };

    let mut res = String::with_capacity(string.len() + characters.len());

    // SQL positions start at 1 while string indices start at 0.
    if start_pos > 1 && start_pos - 1 < string_len {
        res.push_str(&string[..byte_offset(start_pos - 1)]);
    }
    res.push_str(characters);

    // If the replaced span reaches the end of `string`, nothing is appended.
    let end = start_pos.saturating_add(replace_len).saturating_sub(1);
    if end < string_len {
        res.push_str(&string[byte_offset(end.max(0))..]);
    }
    res
}

/// Applies `overlay` row by row and collects the rows into a
/// `Result<GenericStringArray<T>>`, where `T` is taken from the call site.
/// A row is null when any of its inputs is null.
macro_rules! process_overlay {
    // For the three-argument case
    ($string_array:expr, $characters_array:expr, $pos_num:expr) => {{
        $string_array
            .iter()
            .zip($characters_array.iter())
            .zip($pos_num.iter())
            .map(|((string, characters), start_pos)| {
                match (string, characters, start_pos) {
                    (Some(string), Some(characters), Some(start_pos)) => {
                        // No count given: replace as many characters as the
                        // replacement string contains.
                        Ok(Some(overlay_chars(string, characters, start_pos, None)))
                    }
                    _ => Ok(None),
                }
            })
            .collect::<Result<GenericStringArray<T>>>()
    }};

    // For the four-argument case
    ($string_array:expr, $characters_array:expr, $pos_num:expr, $len_num:expr) => {{
        $string_array
            .iter()
            .zip($characters_array.iter())
            .zip($pos_num.iter())
            .zip($len_num.iter())
            .map(|(((string, characters), start_pos), len)| {
                match (string, characters, start_pos, len) {
                    (Some(string), Some(characters), Some(start_pos), Some(len)) => {
                        Ok(Some(overlay_chars(
                            string,
                            characters,
                            start_pos,
                            Some(len),
                        )))
                    }
                    _ => Ok(None),
                }
            })
            .collect::<Result<GenericStringArray<T>>>()
    }};
}
190+
191+
/// OVERLAY(string1 PLACING string2 FROM integer FOR integer2)
192+
/// Replaces a substring of string1 with string2 starting at the integer bit
193+
/// pgsql overlay('Txxxxas' placing 'hom' from 2 for 4) → Thomas
194+
/// overlay('Txxxxas' placing 'hom' from 2) -> Thomxas, without for option, str2's len is instead
195+
fn overlay<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
196+
let use_string_view = args[0].data_type() == &DataType::Utf8View;
197+
if use_string_view {
198+
string_view_overlay::<T>(args)
199+
} else {
200+
string_overlay::<T>(args)
201+
}
202+
}
203+
204+
pub fn string_overlay<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
205+
match args.len() {
206+
3 => {
207+
let string_array = as_generic_string_array::<T>(&args[0])?;
208+
let characters_array = as_generic_string_array::<T>(&args[1])?;
209+
let pos_num = as_int64_array(&args[2])?;
210+
211+
let result = process_overlay!(string_array, characters_array, pos_num)?;
212+
Ok(Arc::new(result) as ArrayRef)
213+
}
214+
4 => {
215+
let string_array = as_generic_string_array::<T>(&args[0])?;
216+
let characters_array = as_generic_string_array::<T>(&args[1])?;
217+
let pos_num = as_int64_array(&args[2])?;
218+
let len_num = as_int64_array(&args[3])?;
219+
220+
let result =
221+
process_overlay!(string_array, characters_array, pos_num, len_num)?;
222+
Ok(Arc::new(result) as ArrayRef)
223+
}
224+
other => {
225+
exec_err!("overlay was called with {other} arguments. It requires 3 or 4.")
226+
}
227+
}
228+
}
229+
230+
pub fn string_view_overlay<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
231+
match args.len() {
232+
3 => {
233+
let string_array = as_string_view_array(&args[0])?;
234+
let characters_array = as_string_view_array(&args[1])?;
235+
let pos_num = as_int64_array(&args[2])?;
236+
237+
let result = process_overlay!(string_array, characters_array, pos_num)?;
238+
Ok(Arc::new(result) as ArrayRef)
239+
}
240+
4 => {
241+
let string_array = as_string_view_array(&args[0])?;
242+
let characters_array = as_string_view_array(&args[1])?;
243+
let pos_num = as_int64_array(&args[2])?;
244+
let len_num = as_int64_array(&args[3])?;
245+
246+
let result =
247+
process_overlay!(string_array, characters_array, pos_num, len_num)?;
248+
Ok(Arc::new(result) as ArrayRef)
249+
}
250+
other => {
251+
exec_err!("overlay was called with {other} arguments. It requires 3 or 4.")
252+
}
253+
}
254+
}
255+
256+
#[cfg(test)]
mod tests {
    use arrow::array::{Int64Array, StringArray};

    use super::*;

    /// Runs the four-argument form of `overlay` over four rows and compares
    /// the output with the expected strings.
    #[test]
    fn to_overlay() -> Result<()> {
        let inputs: [ArrayRef; 4] = [
            // source strings
            Arc::new(StringArray::from(vec!["123", "abcdefg", "xyz", "Txxxxas"])),
            // replacement strings
            Arc::new(StringArray::from(vec!["abc", "qwertyasdfg", "ijk", "hom"])),
            // 1-based start positions
            Arc::new(Int64Array::from(vec![4, 1, 1, 2])),
            // number of characters to replace
            Arc::new(Int64Array::from(vec![5, 7, 2, 4])),
        ];

        let output = overlay::<i32>(&inputs).unwrap();
        let actual = as_generic_string_array::<i32>(&output).unwrap();

        let expected = StringArray::from(vec!["abc", "qwertyasdfg", "ijkz", "Thomas"]);
        assert_eq!(&expected, actual);

        Ok(())
    }
}

datafusion/functions/src/core/planner.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,7 @@ impl ExprPlanner for CoreFunctionPlanner {
6060

6161
fn plan_overlay(&self, args: Vec<Expr>) -> Result<PlannerResult<Vec<Expr>>> {
6262
Ok(PlannerResult::Planned(Expr::ScalarFunction(
63-
ScalarFunction::new_udf(crate::string::overlay(), args),
63+
ScalarFunction::new_udf(crate::core::overlay(), args),
6464
)))
6565
}
6666

datafusion/functions/src/lib.rs

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -100,10 +100,8 @@ pub mod string;
100100
make_stub_package!(string, "string_expressions");
101101

102102
/// Core datafusion expressions
103-
/// Enabled via feature flag `core_expressions`
104-
#[cfg(feature = "core_expressions")]
103+
/// These are always available and not controlled by a feature flag
105104
pub mod core;
106-
make_stub_package!(core, "core_expressions");
107105

108106
/// Date and time expressions.
109107
/// Contains functions such as to_timestamp
@@ -148,7 +146,6 @@ pub mod utils;
148146

149147
/// Fluent-style API for creating `Expr`s
150148
pub mod expr_fn {
151-
#[cfg(feature = "core_expressions")]
152149
pub use super::core::expr_fn::*;
153150
#[cfg(feature = "crypto_expressions")]
154151
pub use super::crypto::expr_fn::*;

datafusion/functions/src/string/mod.rs

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,6 @@ make_udf_function!(levenshtein::LevenshteinFunc, levenshtein);
5555
make_udf_function!(ltrim::LtrimFunc, ltrim);
5656
make_udf_function!(lower::LowerFunc, lower);
5757
make_udf_function!(octet_length::OctetLengthFunc, octet_length);
58-
make_udf_function!(overlay::OverlayFunc, overlay);
5958
make_udf_function!(repeat::RepeatFunc, repeat);
6059
make_udf_function!(replace::ReplaceFunc, replace);
6160
make_udf_function!(rtrim::RtrimFunc, rtrim);
@@ -108,10 +107,6 @@ pub mod expr_fn {
108107
octet_length,
109108
"returns the number of bytes of a string",
110109
args
111-
),(
112-
overlay,
113-
"replace the substring of string that starts at the start'th character and extends for count characters with new substring",
114-
args,
115110
),(
116111
repeat,
117112
"Repeats the `string` to `n` times",

0 commit comments

Comments
 (0)