Skip to content

Commit 61fa572

Browse files
authored
feat: Add StringView support to encode, decode, and bit_length (#13332)
* add stringview
* add tests
* remove utf8view
* remove array_to_string changes
* remove use
1 parent 73507c3 commit 61fa572

File tree

4 files changed

+52
-8
lines changed

4 files changed

+52
-8
lines changed

datafusion/functions/src/core/named_struct.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,9 +24,9 @@ use datafusion_expr::{ScalarUDFImpl, Signature, Volatility};
2424
use std::any::Any;
2525
use std::sync::{Arc, OnceLock};
2626

27-
/// put values in a struct array.
27+
/// Put values in a struct array.
2828
fn named_struct_expr(args: &[ColumnarValue]) -> Result<ColumnarValue> {
29-
// do not accept 0 arguments.
29+
// Do not accept 0 arguments.
3030
if args.is_empty() {
3131
return exec_err!(
3232
"named_struct requires at least one pair of arguments, got 0 instead"

datafusion/functions/src/encoding/inner.rs

Lines changed: 14 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -108,7 +108,7 @@ impl ScalarUDFImpl for EncodeFunc {
108108
}
109109

110110
match arg_types[0] {
111-
DataType::Utf8 | DataType::Binary | DataType::Null => {
111+
DataType::Utf8 | DataType::Utf8View | DataType::Binary | DataType::Null => {
112112
Ok(vec![DataType::Utf8; 2])
113113
}
114114
DataType::LargeUtf8 | DataType::LargeBinary => {
@@ -195,7 +195,7 @@ impl ScalarUDFImpl for DecodeFunc {
195195
}
196196

197197
match arg_types[0] {
198-
DataType::Utf8 | DataType::Binary | DataType::Null => {
198+
DataType::Utf8 | DataType::Utf8View | DataType::Binary | DataType::Null => {
199199
Ok(vec![DataType::Binary, DataType::Utf8])
200200
}
201201
DataType::LargeUtf8 | DataType::LargeBinary => {
@@ -224,6 +224,7 @@ fn encode_process(value: &ColumnarValue, encoding: Encoding) -> Result<ColumnarV
224224
ColumnarValue::Array(a) => match a.data_type() {
225225
DataType::Utf8 => encoding.encode_utf8_array::<i32>(a.as_ref()),
226226
DataType::LargeUtf8 => encoding.encode_utf8_array::<i64>(a.as_ref()),
227+
DataType::Utf8View => encoding.encode_utf8_array::<i32>(a.as_ref()),
227228
DataType::Binary => encoding.encode_binary_array::<i32>(a.as_ref()),
228229
DataType::LargeBinary => encoding.encode_binary_array::<i64>(a.as_ref()),
229230
other => exec_err!(
@@ -237,6 +238,9 @@ fn encode_process(value: &ColumnarValue, encoding: Encoding) -> Result<ColumnarV
237238
}
238239
ScalarValue::LargeUtf8(a) => Ok(encoding
239240
.encode_large_scalar(a.as_ref().map(|s: &String| s.as_bytes()))),
241+
ScalarValue::Utf8View(a) => {
242+
Ok(encoding.encode_scalar(a.as_ref().map(|s: &String| s.as_bytes())))
243+
}
240244
ScalarValue::Binary(a) => Ok(
241245
encoding.encode_scalar(a.as_ref().map(|v: &Vec<u8>| v.as_slice()))
242246
),
@@ -255,6 +259,7 @@ fn decode_process(value: &ColumnarValue, encoding: Encoding) -> Result<ColumnarV
255259
ColumnarValue::Array(a) => match a.data_type() {
256260
DataType::Utf8 => encoding.decode_utf8_array::<i32>(a.as_ref()),
257261
DataType::LargeUtf8 => encoding.decode_utf8_array::<i64>(a.as_ref()),
262+
DataType::Utf8View => encoding.decode_utf8_array::<i32>(a.as_ref()),
258263
DataType::Binary => encoding.decode_binary_array::<i32>(a.as_ref()),
259264
DataType::LargeBinary => encoding.decode_binary_array::<i64>(a.as_ref()),
260265
other => exec_err!(
@@ -268,6 +273,9 @@ fn decode_process(value: &ColumnarValue, encoding: Encoding) -> Result<ColumnarV
268273
}
269274
ScalarValue::LargeUtf8(a) => encoding
270275
.decode_large_scalar(a.as_ref().map(|s: &String| s.as_bytes())),
276+
ScalarValue::Utf8View(a) => {
277+
encoding.decode_scalar(a.as_ref().map(|s: &String| s.as_bytes()))
278+
}
271279
ScalarValue::Binary(a) => {
272280
encoding.decode_scalar(a.as_ref().map(|v: &Vec<u8>| v.as_slice()))
273281
}
@@ -512,7 +520,7 @@ impl FromStr for Encoding {
512520
}
513521
}
514522

515-
/// Encodes the given data, accepts Binary, LargeBinary, Utf8 or LargeUtf8 and returns a [`ColumnarValue`].
523+
/// Encodes the given data and returns a [`ColumnarValue`]. Accepts Binary, LargeBinary, Utf8, Utf8View, or LargeUtf8 input.
516524
/// Second argument is the encoding to use.
517525
/// Standard encodings are base64 and hex.
518526
fn encode(args: &[ColumnarValue]) -> Result<ColumnarValue> {
@@ -524,7 +532,7 @@ fn encode(args: &[ColumnarValue]) -> Result<ColumnarValue> {
524532
}
525533
let encoding = match &args[1] {
526534
ColumnarValue::Scalar(scalar) => match scalar {
527-
ScalarValue::Utf8(Some(method)) | ScalarValue::LargeUtf8(Some(method)) => {
535+
ScalarValue::Utf8(Some(method)) | ScalarValue::Utf8View(Some(method)) | ScalarValue::LargeUtf8(Some(method)) => {
528536
method.parse::<Encoding>()
529537
}
530538
_ => not_impl_err!(
@@ -538,7 +546,7 @@ fn encode(args: &[ColumnarValue]) -> Result<ColumnarValue> {
538546
encode_process(&args[0], encoding)
539547
}
540548

541-
/// Decodes the given data, accepts Binary, LargeBinary, Utf8 or LargeUtf8 and returns a [`ColumnarValue`].
549+
/// Decodes the given data and returns a [`ColumnarValue`]. Accepts Binary, LargeBinary, Utf8, Utf8View, or LargeUtf8 input.
542550
/// Second argument is the encoding to use.
543551
/// Standard encodings are base64 and hex.
544552
fn decode(args: &[ColumnarValue]) -> Result<ColumnarValue> {
@@ -550,7 +558,7 @@ fn decode(args: &[ColumnarValue]) -> Result<ColumnarValue> {
550558
}
551559
let encoding = match &args[1] {
552560
ColumnarValue::Scalar(scalar) => match scalar {
553-
ScalarValue::Utf8(Some(method)) | ScalarValue::LargeUtf8(Some(method)) => {
561+
ScalarValue::Utf8(Some(method)) | ScalarValue::Utf8View(Some(method)) | ScalarValue::LargeUtf8(Some(method)) => {
554562
method.parse::<Encoding>()
555563
}
556564
_ => not_impl_err!(

datafusion/sqllogictest/test_files/encoding.slt

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,3 +71,34 @@ select to_hex(num) from test ORDER BY num;
7171
0
7272
1
7373
2
74+
75+
# test for Utf8View support for encode
76+
statement ok
77+
CREATE TABLE test_source AS VALUES
78+
('Andrew', 'X'),
79+
('Xiangpeng', 'Xiangpeng'),
80+
('Raphael', 'R'),
81+
(NULL, 'R');
82+
83+
statement ok
84+
CREATE TABLE test_utf8view AS
85+
select
86+
arrow_cast(column1, 'Utf8View') AS column1_utf8view,
87+
arrow_cast(column2, 'Utf8View') AS column2_utf8view
88+
FROM test_source;
89+
90+
query TTTTTT
91+
SELECT
92+
column1_utf8view,
93+
encode(column1_utf8view, 'base64') AS column1_base64,
94+
encode(column1_utf8view, 'hex') AS column1_hex,
95+
96+
column2_utf8view,
97+
encode(column2_utf8view, 'base64') AS column2_base64,
98+
encode(column2_utf8view, 'hex') AS column2_hex
99+
FROM test_utf8view;
100+
----
101+
Andrew QW5kcmV3 416e64726577 X WA 58
102+
Xiangpeng WGlhbmdwZW5n 5869616e6770656e67 Xiangpeng WGlhbmdwZW5n 5869616e6770656e67
103+
Raphael UmFwaGFlbA 5261706861656c R Ug 52
104+
NULL NULL NULL R Ug 52

datafusion/sqllogictest/test_files/expr.slt

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -364,6 +364,11 @@ SELECT bit_length(NULL)
364364
----
365365
NULL
366366

367+
query I
368+
SELECT bit_length(arrow_cast('jonathan', 'Utf8View'));
369+
----
370+
64
371+
367372
query T
368373
SELECT btrim(' xyxtrimyyx ', NULL)
369374
----

0 commit comments

Comments
 (0)