Skip to content

Commit f2685d3

Browse files
authored
Update INITCAP scalar function to support Utf8View (#11888)
* Update INITCAP scalar function to support Utf8View * Fix comments * Fix comments
1 parent 63ca714 commit f2685d3

File tree

2 files changed

+116
-32
lines changed

2 files changed

+116
-32
lines changed

datafusion/functions/src/string/initcap.rs

Lines changed: 72 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -18,10 +18,10 @@
1818
use std::any::Any;
1919
use std::sync::Arc;
2020

21-
use arrow::array::{ArrayRef, GenericStringArray, OffsetSizeTrait};
21+
use arrow::array::{ArrayRef, GenericStringArray, OffsetSizeTrait, StringArray};
2222
use arrow::datatypes::DataType;
2323

24-
use datafusion_common::cast::as_generic_string_array;
24+
use datafusion_common::cast::{as_generic_string_array, as_string_view_array};
2525
use datafusion_common::{exec_err, Result};
2626
use datafusion_expr::{ColumnarValue, Volatility};
2727
use datafusion_expr::{ScalarUDFImpl, Signature};
@@ -45,7 +45,7 @@ impl InitcapFunc {
4545
Self {
4646
signature: Signature::uniform(
4747
1,
48-
vec![Utf8, LargeUtf8],
48+
vec![Utf8, LargeUtf8, Utf8View],
4949
Volatility::Immutable,
5050
),
5151
}
@@ -73,6 +73,7 @@ impl ScalarUDFImpl for InitcapFunc {
7373
match args[0].data_type() {
7474
DataType::Utf8 => make_scalar_function(initcap::<i32>, vec![])(args),
7575
DataType::LargeUtf8 => make_scalar_function(initcap::<i64>, vec![])(args),
76+
DataType::Utf8View => make_scalar_function(initcap_utf8view, vec![])(args),
7677
other => {
7778
exec_err!("Unsupported data type {other:?} for function initcap")
7879
}
@@ -88,28 +89,41 @@ fn initcap<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
8889
// first map is the iterator, second is for the `Option<_>`
8990
let result = string_array
9091
.iter()
91-
.map(|string| {
92-
string.map(|string: &str| {
93-
let mut char_vector = Vec::<char>::new();
94-
let mut previous_character_letter_or_number = false;
95-
for c in string.chars() {
96-
if previous_character_letter_or_number {
97-
char_vector.push(c.to_ascii_lowercase());
98-
} else {
99-
char_vector.push(c.to_ascii_uppercase());
100-
}
101-
previous_character_letter_or_number = c.is_ascii_uppercase()
102-
|| c.is_ascii_lowercase()
103-
|| c.is_ascii_digit();
104-
}
105-
char_vector.iter().collect::<String>()
106-
})
107-
})
92+
.map(initcap_string)
10893
.collect::<GenericStringArray<T>>();
10994

11095
Ok(Arc::new(result) as ArrayRef)
11196
}
11297

98+
fn initcap_utf8view(args: &[ArrayRef]) -> Result<ArrayRef> {
99+
let string_view_array = as_string_view_array(&args[0])?;
100+
101+
let result = string_view_array
102+
.iter()
103+
.map(initcap_string)
104+
.collect::<StringArray>();
105+
106+
Ok(Arc::new(result) as ArrayRef)
107+
}
108+
109+
/// Capitalizes the first letter of every word and lowercases the rest.
///
/// A word is a maximal run of ASCII letters and digits. Every other
/// character (whitespace, punctuation, and any non-ASCII character) is
/// copied through unchanged and ends the current word, so the character
/// following it is treated as the start of a new word.
///
/// Returns `None` when `string` is `None`; otherwise returns the converted
/// text as an owned `String`.
fn initcap_string(string: Option<&str>) -> Option<String> {
    string.map(|input| {
        // ASCII case mapping never changes a character's UTF-8 width, so the
        // output occupies exactly `input.len()` bytes: reserve it once and
        // write straight into the `String` instead of staging a `Vec<char>`.
        let mut result = String::with_capacity(input.len());
        let mut previous_character_letter_or_number = false;
        for c in input.chars() {
            result.push(if previous_character_letter_or_number {
                c.to_ascii_lowercase()
            } else {
                c.to_ascii_uppercase()
            });
            // Only ASCII letters and digits continue a word.
            previous_character_letter_or_number = c.is_ascii_alphanumeric();
        }
        result
    })
}
126+
113127
#[cfg(test)]
114128
mod tests {
115129
use crate::string::initcap::InitcapFunc;
@@ -153,6 +167,44 @@ mod tests {
153167
Utf8,
154168
StringArray
155169
);
170+
test_function!(
171+
InitcapFunc::new(),
172+
&[ColumnarValue::Scalar(ScalarValue::Utf8View(Some(
173+
"hi THOMAS".to_string()
174+
)))],
175+
Ok(Some("Hi Thomas")),
176+
&str,
177+
Utf8,
178+
StringArray
179+
);
180+
test_function!(
181+
InitcapFunc::new(),
182+
&[ColumnarValue::Scalar(ScalarValue::Utf8View(Some(
183+
"hi THOMAS wIth M0re ThAN 12 ChaRs".to_string()
184+
)))],
185+
Ok(Some("Hi Thomas With M0re Than 12 Chars")),
186+
&str,
187+
Utf8,
188+
StringArray
189+
);
190+
test_function!(
191+
InitcapFunc::new(),
192+
&[ColumnarValue::Scalar(ScalarValue::Utf8View(Some(
193+
"".to_string()
194+
)))],
195+
Ok(Some("")),
196+
&str,
197+
Utf8,
198+
StringArray
199+
);
200+
test_function!(
201+
InitcapFunc::new(),
202+
&[ColumnarValue::Scalar(ScalarValue::Utf8View(None))],
203+
Ok(None),
204+
&str,
205+
Utf8,
206+
StringArray
207+
);
156208

157209
Ok(())
158210
}

datafusion/sqllogictest/test_files/string_view.slt

Lines changed: 44 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -425,6 +425,50 @@ logical_plan
425425
01)Projection: starts_with(test.column1_utf8view, Utf8View("äöüß")) AS c1, starts_with(test.column1_utf8view, Utf8View("")) AS c2, starts_with(test.column1_utf8view, Utf8View(NULL)) AS c3, starts_with(Utf8View(NULL), test.column1_utf8view) AS c4
426426
02)--TableScan: test projection=[column1_utf8view]
427427

428+
### Initcap
429+
430+
query TT
431+
EXPLAIN SELECT
432+
INITCAP(column1_utf8view) as c
433+
FROM test;
434+
----
435+
logical_plan
436+
01)Projection: initcap(test.column1_utf8view) AS c
437+
02)--TableScan: test projection=[column1_utf8view]
438+
439+
# Create a table with lowercase strings
440+
statement ok
441+
CREATE TABLE test_lowercase AS SELECT
442+
lower(column1_utf8) as column1_utf8_lower,
443+
lower(column1_large_utf8) as column1_large_utf8_lower,
444+
lower(column1_utf8view) as column1_utf8view_lower
445+
FROM test;
446+
447+
# Test INITCAP with utf8view, utf8, and largeutf8
448+
# Should not cast anything
449+
query TT
450+
EXPLAIN SELECT
451+
INITCAP(column1_utf8view_lower) as c1,
452+
INITCAP(column1_utf8_lower) as c2,
453+
INITCAP(column1_large_utf8_lower) as c3
454+
FROM test_lowercase;
455+
----
456+
logical_plan
457+
01)Projection: initcap(test_lowercase.column1_utf8view_lower) AS c1, initcap(test_lowercase.column1_utf8_lower) AS c2, initcap(test_lowercase.column1_large_utf8_lower) AS c3
458+
02)--TableScan: test_lowercase projection=[column1_utf8_lower, column1_large_utf8_lower, column1_utf8view_lower]
459+
460+
query TTT
461+
SELECT
462+
INITCAP(column1_utf8view_lower) as c1,
463+
INITCAP(column1_utf8_lower) as c2,
464+
INITCAP(column1_large_utf8_lower) as c3
465+
FROM test_lowercase;
466+
----
467+
Andrew Andrew Andrew
468+
Xiangpeng Xiangpeng Xiangpeng
469+
Raphael Raphael Raphael
470+
NULL NULL NULL
471+
428472
# Ensure string functions use native StringView implementation
429473
# and do not fall back to Utf8 or LargeUtf8
430474
# Should see no casts to Utf8 in the plans below
@@ -586,18 +630,6 @@ logical_plan
586630
02)--Projection: CAST(test.column2_utf8view AS Utf8) AS __common_expr_1, test.column1_utf8view
587631
03)----TableScan: test projection=[column1_utf8view, column2_utf8view]
588632

589-
590-
## Ensure no casts for INITCAP
591-
## TODO https://github.com/apache/datafusion/issues/11853
592-
query TT
593-
EXPLAIN SELECT
594-
INITCAP(column1_utf8view) as c
595-
FROM test;
596-
----
597-
logical_plan
598-
01)Projection: initcap(CAST(test.column1_utf8view AS Utf8)) AS c
599-
02)--TableScan: test projection=[column1_utf8view]
600-
601633
## Ensure no casts for LEVENSHTEIN
602634
## TODO https://github.com/apache/datafusion/issues/11854
603635
query TT

0 commit comments

Comments
 (0)