Skip to content

Commit 7e49d69

Browse files
djanderson, demetribu
authored and committed
Update ASCII scalar function to support Utf8View #11834
1 parent 053795c commit 7e49d69

File tree

3 files changed

+192
-32
lines changed

3 files changed

+192
-32
lines changed

datafusion/core/src/datasource/physical_plan/arrow_file.rs

+3 -5
Original file line number | Diff line number | Diff line change
@@ -331,11 +331,9 @@ impl FileOpener for ArrowOpener {
331331
.into_iter()
332332
.zip(recordbatch_results)
333333
.filter_map(move |(block, data)| {
334-
match decoder.read_record_batch(&block, &data.into()) {
335-
Ok(Some(record_batch)) => Some(Ok(record_batch)),
336-
Ok(None) => None,
337-
Err(err) => Some(Err(err)),
338-
}
334+
decoder
335+
.read_record_batch(&block, &data.into())
336+
.transpose()
339337
}),
340338
)
341339
.boxed())

datafusion/functions/src/string/ascii.rs

+89 -27
Original file line number | Diff line number | Diff line change
@@ -17,32 +17,14 @@
1717

1818
use crate::utils::make_scalar_function;
1919
use arrow::array::Int32Array;
20-
use arrow::array::{ArrayRef, OffsetSizeTrait};
20+
use arrow::array::{ArrayRef, AsArray};
2121
use arrow::datatypes::DataType;
22-
use datafusion_common::{cast::as_generic_string_array, internal_err, Result};
22+
use datafusion_common::Result;
2323
use datafusion_expr::ColumnarValue;
2424
use datafusion_expr::{ScalarUDFImpl, Signature, Volatility};
2525
use std::any::Any;
2626
use std::sync::Arc;
2727

28-
/// Returns the numeric code of the first character of the argument.
29-
/// ascii('x') = 120
30-
pub fn ascii<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
31-
let string_array = as_generic_string_array::<T>(&args[0])?;
32-
33-
let result = string_array
34-
.iter()
35-
.map(|string| {
36-
string.map(|string: &str| {
37-
let mut chars = string.chars();
38-
chars.next().map_or(0, |v| v as i32)
39-
})
40-
})
41-
.collect::<Int32Array>();
42-
43-
Ok(Arc::new(result) as ArrayRef)
44-
}
45-
4628
#[derive(Debug)]
4729
pub struct AsciiFunc {
4830
signature: Signature,
@@ -60,7 +42,7 @@ impl AsciiFunc {
6042
Self {
6143
signature: Signature::uniform(
6244
1,
63-
vec![Utf8, LargeUtf8],
45+
vec![Utf8, LargeUtf8, Utf8View],
6446
Volatility::Immutable,
6547
),
6648
}
@@ -87,12 +69,92 @@ impl ScalarUDFImpl for AsciiFunc {
8769
}
8870

8971
fn invoke(&self, args: &[ColumnarValue]) -> Result<ColumnarValue> {
90-
match args[0].data_type() {
91-
DataType::Utf8 => make_scalar_function(ascii::<i32>, vec![])(args),
92-
DataType::LargeUtf8 => {
93-
return make_scalar_function(ascii::<i64>, vec![])(args);
94-
}
95-
_ => internal_err!("Unsupported data type"),
72+
make_scalar_function(ascii, vec![])(args)
73+
}
74+
}
75+
76+
fn calculate_ascii<'a, I>(string_array: I) -> Result<ArrayRef>
77+
where
78+
I: IntoIterator<Item = Option<&'a str>>,
79+
{
80+
let result = string_array
81+
.into_iter()
82+
.map(|string| {
83+
string.map(|s| {
84+
let mut chars = s.chars();
85+
chars.next().map_or(0, |v| v as i32)
86+
})
87+
})
88+
.collect::<Int32Array>();
89+
90+
Ok(Arc::new(result) as ArrayRef)
91+
}
92+
93+
/// Returns the numeric code of the first character of the argument.
94+
pub fn ascii(args: &[ArrayRef]) -> Result<ArrayRef> {
95+
match args[0].data_type() {
96+
DataType::Utf8 => {
97+
let string_array = args[0].as_string::<i32>();
98+
calculate_ascii(string_array.iter())
99+
}
100+
DataType::LargeUtf8 => {
101+
let string_array = args[0].as_string::<i64>();
102+
calculate_ascii(string_array.iter())
96103
}
104+
DataType::Utf8View => {
105+
let string_array = args[0].as_string_view();
106+
calculate_ascii(string_array.iter())
107+
}
108+
_ => unreachable!(),
109+
}
110+
}
111+
112+
#[cfg(test)]
113+
mod tests {
114+
use crate::string::ascii::AsciiFunc;
115+
use crate::utils::test::test_function;
116+
use arrow::array::{Array, Int32Array};
117+
use arrow::datatypes::DataType::Int32;
118+
use datafusion_common::{Result, ScalarValue};
119+
use datafusion_expr::{ColumnarValue, ScalarUDFImpl};
120+
121+
macro_rules! test_ascii {
122+
($INPUT:expr, $EXPECTED:expr) => {
123+
test_function!(
124+
AsciiFunc::new(),
125+
&[ColumnarValue::Scalar(ScalarValue::Utf8($INPUT))],
126+
$EXPECTED,
127+
i32,
128+
Int32,
129+
Int32Array
130+
);
131+
132+
test_function!(
133+
AsciiFunc::new(),
134+
&[ColumnarValue::Scalar(ScalarValue::LargeUtf8($INPUT))],
135+
$EXPECTED,
136+
i32,
137+
Int32,
138+
Int32Array
139+
);
140+
141+
test_function!(
142+
AsciiFunc::new(),
143+
&[ColumnarValue::Scalar(ScalarValue::Utf8View($INPUT))],
144+
$EXPECTED,
145+
i32,
146+
Int32,
147+
Int32Array
148+
);
149+
};
150+
}
151+
152+
#[test]
153+
fn test_functions() -> Result<()> {
154+
test_ascii!(Some(String::from("x")), Ok(Some(120)));
155+
test_ascii!(Some(String::from("a")), Ok(Some(97)));
156+
test_ascii!(Some(String::from("")), Ok(Some(0)));
157+
test_ascii!(None, Ok(None));
158+
Ok(())
97159
}
98160
}

datafusion/sqllogictest/test_files/string_view.slt

+100
Original file line number | Diff line number | Diff line change
@@ -500,3 +500,103 @@ select column2|| ' ' ||column3 from temp;
500500
----
501501
rust fast
502502
datafusion cool
503+
504+
505+
### ASCII
506+
# Set up the initial test data
507+
statement ok
508+
create table test_source as values
509+
('Andrew', 'X'),
510+
('Xiangpeng', 'Xiangpeng'),
511+
('Raphael', 'R'),
512+
(NULL, 'R');
513+
514+
# Table with the different combination of column types
515+
statement ok
516+
create table test as
517+
SELECT
518+
arrow_cast(column1, 'Utf8') as column1_utf8,
519+
arrow_cast(column2, 'Utf8') as column2_utf8,
520+
arrow_cast(column1, 'LargeUtf8') as column1_large_utf8,
521+
arrow_cast(column2, 'LargeUtf8') as column2_large_utf8,
522+
arrow_cast(column1, 'Utf8View') as column1_utf8view,
523+
arrow_cast(column2, 'Utf8View') as column2_utf8view
524+
FROM test_source;
525+
526+
# Test ASCII on Utf8View, Utf8, and LargeUtf8 columns
527+
# (should be no casts)
528+
query TT
529+
EXPLAIN SELECT
530+
ASCII(column1_utf8view) as c1,
531+
ASCII(column2_utf8) as c2,
532+
ASCII(column2_large_utf8) as c3
533+
FROM test;
534+
----
535+
logical_plan
536+
01)Projection: ascii(test.column1_utf8view) AS c1, ascii(test.column2_utf8) AS c2, ascii(test.column2_large_utf8) AS c3
537+
02)--TableScan: test projection=[column2_utf8, column2_large_utf8, column1_utf8view]
538+
539+
query III
540+
SELECT
541+
ASCII(column1_utf8view) as c1,
542+
ASCII(column2_utf8) as c2,
543+
ASCII(column2_large_utf8) as c3
544+
FROM test;
545+
----
546+
65 88 88
547+
88 88 88
548+
82 82 82
549+
NULL 82 82
550+
551+
query TT
552+
EXPLAIN SELECT
553+
ASCII(column1_utf8) as c1,
554+
ASCII(column1_large_utf8) as c2,
555+
ASCII(column2_utf8view) as c3,
556+
ASCII('hello') as c4,
557+
ASCII(arrow_cast('world', 'Utf8View')) as c5
558+
FROM test;
559+
----
560+
logical_plan
561+
01)Projection: ascii(test.column1_utf8) AS c1, ascii(test.column1_large_utf8) AS c2, ascii(test.column2_utf8view) AS c3, Int32(104) AS c4, Int32(119) AS c5
562+
02)--TableScan: test projection=[column1_utf8, column1_large_utf8, column2_utf8view]
563+
564+
query IIIII
565+
SELECT
566+
ASCII(column1_utf8) as c1,
567+
ASCII(column1_large_utf8) as c2,
568+
ASCII(column2_utf8view) as c3,
569+
ASCII('hello') as c4,
570+
ASCII(arrow_cast('world', 'Utf8View')) as c5
571+
FROM test;
572+
----
573+
65 65 88 104 119
574+
88 88 88 104 119
575+
82 82 82 104 119
576+
NULL NULL 82 104 119
577+
578+
# Test ASCII with literals cast to Utf8View
579+
query TT
580+
EXPLAIN SELECT
581+
ASCII(arrow_cast('äöüß', 'Utf8View')) as c1,
582+
ASCII(arrow_cast('', 'Utf8View')) as c2,
583+
ASCII(arrow_cast(NULL, 'Utf8View')) as c3
584+
FROM test;
585+
----
586+
logical_plan
587+
01)Projection: Int32(228) AS c1, Int32(0) AS c2, Int32(NULL) AS c3
588+
02)--TableScan: test projection=[]
589+
590+
query III
591+
SELECT
592+
ASCII(arrow_cast('äöüß', 'Utf8View')) as c1,
593+
ASCII(arrow_cast('', 'Utf8View')) as c2,
594+
ASCII(arrow_cast(NULL, 'Utf8View')) as c3
595+
----
596+
228 0 NULL
597+
598+
statement ok
599+
drop table test;
600+
601+
statement ok
602+
drop table test_source;

0 commit comments

Comments
 (0)