Skip to content

Commit ccb4baf

Browse files
alambWeijun-HPsiACEXiangpengHao
authored
Initial support for StringView, merge changes from string-view development branch (#11402)
* Update `string-view` branch to arrow-rs main (#10966) * Pin to arrow main * Fix clippy with latest arrow * Uncomment test that needs new arrow-rs to work * Update datafusion-cli Cargo.lock * Update Cargo.lock * tapelo * feat: Implement equality = and inequality <> support for StringView (#10985) * feat: Implement equality = and inequality <> support for StringView * chore: Add tests for the StringView * chore * chore: Update tests for NULL * fix: Used build_array_string! * chore: Update string_coercion function to handle Utf8View type in binary.rs * chore: add tests * chore: ci * Add more StringView comparison test coverage (#10997) * Add more StringView comparison test coverage * add reference * Add another test showing casting on columns works correctly * feat: Implement equality = and inequality <> support for BinaryView (#11004) * feat: Implement equality = and inequality <> support for BinaryView Signed-off-by: Chojan Shang <[email protected]> * chore: make fmt happy Signed-off-by: Chojan Shang <[email protected]> --------- Signed-off-by: Chojan Shang <[email protected]> * Implement support for LargeString and LargeBinary for StringView and BinaryView (#11034) * implement large binary * add tests for large string * better comments for string coercion * Improve filter predicates with `Utf8View` literals (#11043) * refactor: Improve type coercion logic in TypeCoercionRewriter * refactor: Improve type coercion logic in TypeCoercionRewriter * chore * chore: Update test * refactor: Improve type coercion logic in TypeCoercionRewriter * refactor: Remove unused import and update code formatting in unwrap_cast_in_comparison.rs * Remove arrow-patch --------- Signed-off-by: Chojan Shang <[email protected]> Co-authored-by: Alex Huang <[email protected]> Co-authored-by: Chojan Shang <[email protected]> Co-authored-by: Xiangpeng Hao <[email protected]>
1 parent f11bdf0 commit ccb4baf

File tree

5 files changed

+566
-32
lines changed

5 files changed

+566
-32
lines changed

datafusion/common/src/scalar/mod.rs

+2-6
Original file line numberDiff line numberDiff line change
@@ -1682,8 +1682,10 @@ impl ScalarValue {
16821682
DataType::UInt16 => build_array_primitive!(UInt16Array, UInt16),
16831683
DataType::UInt32 => build_array_primitive!(UInt32Array, UInt32),
16841684
DataType::UInt64 => build_array_primitive!(UInt64Array, UInt64),
1685+
DataType::Utf8View => build_array_string!(StringViewArray, Utf8View),
16851686
DataType::Utf8 => build_array_string!(StringArray, Utf8),
16861687
DataType::LargeUtf8 => build_array_string!(LargeStringArray, LargeUtf8),
1688+
DataType::BinaryView => build_array_string!(BinaryViewArray, BinaryView),
16871689
DataType::Binary => build_array_string!(BinaryArray, Binary),
16881690
DataType::LargeBinary => build_array_string!(LargeBinaryArray, LargeBinary),
16891691
DataType::Date32 => build_array_primitive!(Date32Array, Date32),
@@ -1841,8 +1843,6 @@ impl ScalarValue {
18411843
| DataType::Time64(TimeUnit::Millisecond)
18421844
| DataType::Map(_, _)
18431845
| DataType::RunEndEncoded(_, _)
1844-
| DataType::Utf8View
1845-
| DataType::BinaryView
18461846
| DataType::ListView(_)
18471847
| DataType::LargeListView(_) => {
18481848
return _internal_err!(
@@ -5695,16 +5695,12 @@ mod tests {
56955695
DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)),
56965696
);
56975697

5698-
// needs https://github.com/apache/arrow-rs/issues/5893
5699-
/*
57005698
check_scalar_cast(ScalarValue::Utf8(None), DataType::Utf8View);
57015699
check_scalar_cast(ScalarValue::from("foo"), DataType::Utf8View);
57025700
check_scalar_cast(
57035701
ScalarValue::from("larger than 12 bytes string"),
57045702
DataType::Utf8View,
57055703
);
5706-
5707-
*/
57085704
}
57095705

57105706
// mimics how casting works on scalar values by `casting` `scalar` to `desired_type`

datafusion/expr/src/type_coercion/binary.rs

+26-10
Original file line numberDiff line numberDiff line change
@@ -919,16 +919,21 @@ fn string_concat_internal_coercion(
919919
}
920920
}
921921

922-
/// Coercion rules for string types (Utf8/LargeUtf8): If at least one argument is
923-
/// a string type and both arguments can be coerced into a string type, coerce
924-
/// to string type.
922+
/// Coercion rules for string types (Utf8/LargeUtf8/Utf8View):
923+
/// If at least one argument is a string view, we coerce to string view
924+
/// based on the observation that converting a StringArray to a StringViewArray is cheap, but not vice versa.
925+
///
926+
/// Between Utf8 and LargeUtf8, we coerce to LargeUtf8.
925927
fn string_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option<DataType> {
926928
use arrow::datatypes::DataType::*;
927929
match (lhs_type, rhs_type) {
930+
// If Utf8View is in any side, we coerce to Utf8View.
931+
(Utf8View, Utf8View | Utf8 | LargeUtf8) | (Utf8 | LargeUtf8, Utf8View) => {
932+
Some(Utf8View)
933+
}
934+
// Then, if LargeUtf8 is in any side, we coerce to LargeUtf8.
935+
(LargeUtf8, Utf8 | LargeUtf8) | (Utf8, LargeUtf8) => Some(LargeUtf8),
928936
(Utf8, Utf8) => Some(Utf8),
929-
(LargeUtf8, Utf8) => Some(LargeUtf8),
930-
(Utf8, LargeUtf8) => Some(LargeUtf8),
931-
(LargeUtf8, LargeUtf8) => Some(LargeUtf8),
932937
_ => None,
933938
}
934939
}
@@ -975,15 +980,26 @@ fn binary_to_string_coercion(
975980
}
976981
}
977982

978-
/// Coercion rules for binary types (Binary/LargeBinary): If at least one argument is
983+
/// Coercion rules for binary types (Binary/LargeBinary/BinaryView): If at least one argument is
979984
/// a binary type and both arguments can be coerced into a binary type, coerce
980985
/// to binary type.
981986
fn binary_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option<DataType> {
982987
use arrow::datatypes::DataType::*;
983988
match (lhs_type, rhs_type) {
984-
(Binary | Utf8, Binary) | (Binary, Utf8) => Some(Binary),
985-
(LargeBinary | Binary | Utf8 | LargeUtf8, LargeBinary)
986-
| (LargeBinary, Binary | Utf8 | LargeUtf8) => Some(LargeBinary),
989+
// If BinaryView is in any side, we coerce to BinaryView.
990+
(BinaryView, BinaryView | Binary | LargeBinary | Utf8 | LargeUtf8 | Utf8View)
991+
| (LargeBinary | Binary | Utf8 | LargeUtf8 | Utf8View, BinaryView) => {
992+
Some(BinaryView)
993+
}
994+
// Prefer LargeBinary over Binary
995+
(LargeBinary | Binary | Utf8 | LargeUtf8 | Utf8View, LargeBinary)
996+
| (LargeBinary, Binary | Utf8 | LargeUtf8 | Utf8View) => Some(LargeBinary),
997+
998+
// If Utf8View or LargeUtf8 is present alongside Binary, coerce to LargeBinary
999+
(Utf8View | LargeUtf8, Binary) | (Binary, Utf8View | LargeUtf8) => {
1000+
Some(LargeBinary)
1001+
}
1002+
(Binary, Utf8) | (Utf8, Binary) => Some(Binary),
9871003
_ => None,
9881004
}
9891005
}

datafusion/optimizer/src/unwrap_cast_in_comparison.rs

+10-16
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ use datafusion_common::tree_node::{Transformed, TreeNode, TreeNodeRewriter};
3333
use datafusion_common::{internal_err, DFSchema, DFSchemaRef, Result, ScalarValue};
3434
use datafusion_expr::expr::{BinaryExpr, Cast, InList, TryCast};
3535
use datafusion_expr::utils::merge_schema;
36-
use datafusion_expr::{lit, Expr, ExprSchemable, LogicalPlan, Operator};
36+
use datafusion_expr::{lit, Expr, ExprSchemable, LogicalPlan};
3737

3838
/// [`UnwrapCastInComparison`] attempts to remove casts from
3939
/// comparisons to literals ([`ScalarValue`]s) by applying the casts
@@ -146,7 +146,7 @@ impl TreeNodeRewriter for UnwrapCastExprRewriter {
146146
};
147147
is_supported_type(&left_type)
148148
&& is_supported_type(&right_type)
149-
&& is_comparison_op(op)
149+
&& op.is_comparison_operator()
150150
} =>
151151
{
152152
match (left.as_mut(), right.as_mut()) {
@@ -262,18 +262,6 @@ impl TreeNodeRewriter for UnwrapCastExprRewriter {
262262
}
263263
}
264264

265-
fn is_comparison_op(op: &Operator) -> bool {
266-
matches!(
267-
op,
268-
Operator::Eq
269-
| Operator::NotEq
270-
| Operator::Gt
271-
| Operator::GtEq
272-
| Operator::Lt
273-
| Operator::LtEq
274-
)
275-
}
276-
277265
/// Returns true if [UnwrapCastExprRewriter] supports this data type
278266
fn is_supported_type(data_type: &DataType) -> bool {
279267
is_supported_numeric_type(data_type)
@@ -300,7 +288,10 @@ fn is_supported_numeric_type(data_type: &DataType) -> bool {
300288

301289
/// Returns true if [UnwrapCastExprRewriter] supports casting this value as a string
302290
fn is_supported_string_type(data_type: &DataType) -> bool {
303-
matches!(data_type, DataType::Utf8 | DataType::LargeUtf8)
291+
matches!(
292+
data_type,
293+
DataType::Utf8 | DataType::LargeUtf8 | DataType::Utf8View
294+
)
304295
}
305296

306297
/// Returns true if [UnwrapCastExprRewriter] supports casting this value as a dictionary
@@ -473,12 +464,15 @@ fn try_cast_string_literal(
473464
target_type: &DataType,
474465
) -> Option<ScalarValue> {
475466
let string_value = match lit_value {
476-
ScalarValue::Utf8(s) | ScalarValue::LargeUtf8(s) => s.clone(),
467+
ScalarValue::Utf8(s) | ScalarValue::LargeUtf8(s) | ScalarValue::Utf8View(s) => {
468+
s.clone()
469+
}
477470
_ => return None,
478471
};
479472
let scalar_value = match target_type {
480473
DataType::Utf8 => ScalarValue::Utf8(string_value),
481474
DataType::LargeUtf8 => ScalarValue::LargeUtf8(string_value),
475+
DataType::Utf8View => ScalarValue::Utf8View(string_value),
482476
_ => return None,
483477
};
484478
Some(scalar_value)
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,202 @@
1+
# Licensed to the Apache Software Foundation (ASF) under one
2+
# or more contributor license agreements. See the NOTICE file
3+
# distributed with this work for additional information
4+
# regarding copyright ownership. The ASF licenses this file
5+
# to you under the Apache License, Version 2.0 (the
6+
# "License"); you may not use this file except in compliance
7+
# with the License. You may obtain a copy of the License at
8+
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
11+
# Unless required by applicable law or agreed to in writing,
12+
# software distributed under the License is distributed on an
13+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
# KIND, either express or implied. See the License for the
15+
# specific language governing permissions and limitations
16+
# under the License.
17+
18+
########
19+
## Test setup
20+
########
21+
22+
statement ok
23+
create table test_source as values
24+
('Andrew', 'X'),
25+
('Xiangpeng', 'Xiangpeng'),
26+
('Raphael', 'R'),
27+
(NULL, 'R')
28+
;
29+
30+
# Table with the different combinations of column types
31+
statement ok
32+
CREATE TABLE test AS
33+
SELECT
34+
arrow_cast(column1, 'Utf8') as column1_utf8,
35+
arrow_cast(column2, 'Utf8') as column2_utf8,
36+
arrow_cast(column1, 'Binary') AS column1_binary,
37+
arrow_cast(column2, 'Binary') AS column2_binary,
38+
arrow_cast(column1, 'LargeBinary') AS column1_large_binary,
39+
arrow_cast(column2, 'LargeBinary') AS column2_large_binary,
40+
arrow_cast(arrow_cast(column1, 'Binary'), 'BinaryView') AS column1_binaryview,
41+
arrow_cast(arrow_cast(column2, 'Binary'), 'BinaryView') AS column2_binaryview,
42+
arrow_cast(column1, 'Dictionary(Int32, Binary)') AS column1_dict,
43+
arrow_cast(column2, 'Dictionary(Int32, Binary)') AS column2_dict
44+
FROM test_source;
45+
46+
statement ok
47+
drop table test_source
48+
49+
########
50+
## BinaryView to BinaryView
51+
########
52+
53+
# BinaryView scalar to BinaryView scalar
54+
55+
query BBBB
56+
SELECT
57+
arrow_cast(arrow_cast('NULL', 'Binary'), 'BinaryView') = arrow_cast(arrow_cast('Andrew', 'Binary'), 'BinaryView') AS comparison1,
58+
arrow_cast(arrow_cast('NULL', 'Binary'), 'BinaryView') <> arrow_cast(arrow_cast('Andrew', 'Binary'), 'BinaryView') AS comparison2,
59+
arrow_cast(arrow_cast('Andrew', 'Binary'), 'BinaryView') = arrow_cast(arrow_cast('Andrew', 'Binary'), 'BinaryView') AS comparison3,
60+
arrow_cast(arrow_cast('Xiangpeng', 'Binary'), 'BinaryView') <> arrow_cast(arrow_cast('Andrew', 'Binary'), 'BinaryView') AS comparison4;
61+
----
62+
false true true true
63+
64+
65+
# BinaryView column to BinaryView column comparison as filters
66+
67+
query TT
68+
select column1_utf8, column2_utf8 from test where column1_binaryview = column2_binaryview;
69+
----
70+
Xiangpeng Xiangpeng
71+
72+
query TT
73+
select column1_utf8, column2_utf8 from test where column1_binaryview <> column2_binaryview;
74+
----
75+
Andrew X
76+
Raphael R
77+
78+
# BinaryView column to BinaryView column
79+
query TTBB
80+
select
81+
column1_utf8, column2_utf8,
82+
column1_binaryview = column2_binaryview,
83+
column1_binaryview <> column2_binaryview
84+
from test;
85+
----
86+
Andrew X false true
87+
Xiangpeng Xiangpeng true false
88+
Raphael R false true
89+
NULL R NULL NULL
90+
91+
# BinaryView column to BinaryView scalar comparison
92+
query TTBBBB
93+
select
94+
column1_utf8, column2_utf8,
95+
column1_binaryview = arrow_cast(arrow_cast('Andrew', 'Binary'), 'BinaryView'),
96+
arrow_cast(arrow_cast('Andrew', 'Binary'), 'BinaryView') = column1_binaryview,
97+
column1_binaryview <> arrow_cast(arrow_cast('Andrew', 'Binary'), 'BinaryView'),
98+
arrow_cast(arrow_cast('Andrew', 'Binary'), 'BinaryView') <> column1_binaryview
99+
from test;
100+
----
101+
Andrew X true true false false
102+
Xiangpeng Xiangpeng false false true true
103+
Raphael R false false true true
104+
NULL R NULL NULL NULL NULL
105+
106+
########
107+
## BinaryView to Binary
108+
########
109+
110+
# test BinaryViewArray with Binary columns
111+
query TTBBBB
112+
select
113+
column1_utf8, column2_utf8,
114+
column1_binaryview = column2_binary,
115+
column2_binary = column1_binaryview,
116+
column1_binaryview <> column2_binary,
117+
column2_binary <> column1_binaryview
118+
from test;
119+
----
120+
Andrew X false false true true
121+
Xiangpeng Xiangpeng true true false false
122+
Raphael R false false true true
123+
NULL R NULL NULL NULL NULL
124+
125+
# test BinaryViewArray with LargeBinary columns
126+
query TTBBBB
127+
select
128+
column1_utf8, column2_utf8,
129+
column1_binaryview = column2_large_binary,
130+
column2_large_binary = column1_binaryview,
131+
column1_binaryview <> column2_large_binary,
132+
column2_large_binary <> column1_binaryview
133+
from test;
134+
----
135+
Andrew X false false true true
136+
Xiangpeng Xiangpeng true true false false
137+
Raphael R false false true true
138+
NULL R NULL NULL NULL NULL
139+
140+
# BinaryView column to Binary scalar
141+
query TTBBBB
142+
select
143+
column1_utf8, column2_utf8,
144+
column1_binaryview = arrow_cast('Andrew', 'Binary'),
145+
arrow_cast('Andrew', 'Binary') = column1_binaryview,
146+
column1_binaryview <> arrow_cast('Andrew', 'Binary'),
147+
arrow_cast('Andrew', 'Binary') <> column1_binaryview
148+
from test;
149+
----
150+
Andrew X true true false false
151+
Xiangpeng Xiangpeng false false true true
152+
Raphael R false false true true
153+
NULL R NULL NULL NULL NULL
154+
155+
# BinaryView column to LargeBinary scalar
156+
query TTBBBB
157+
select
158+
column1_utf8, column2_utf8,
159+
column1_binaryview = arrow_cast('Andrew', 'LargeBinary'),
160+
arrow_cast('Andrew', 'LargeBinary') = column1_binaryview,
161+
column1_binaryview <> arrow_cast('Andrew', 'LargeBinary'),
162+
arrow_cast('Andrew', 'LargeBinary') <> column1_binaryview
163+
from test;
164+
----
165+
Andrew X true true false false
166+
Xiangpeng Xiangpeng false false true true
167+
Raphael R false false true true
168+
NULL R NULL NULL NULL NULL
169+
170+
# Binary column to BinaryView scalar
171+
query TTBBBB
172+
select
173+
column1_utf8, column2_utf8,
174+
column1_binary = arrow_cast(arrow_cast('Andrew', 'Binary'), 'BinaryView'),
175+
arrow_cast(arrow_cast('Andrew', 'Binary'), 'BinaryView') = column1_binary,
176+
column1_binary <> arrow_cast(arrow_cast('Andrew', 'Binary'), 'BinaryView'),
177+
arrow_cast(arrow_cast('Andrew', 'Binary'), 'BinaryView') <> column1_binary
178+
from test;
179+
----
180+
Andrew X true true false false
181+
Xiangpeng Xiangpeng false false true true
182+
Raphael R false false true true
183+
NULL R NULL NULL NULL NULL
184+
185+
186+
# LargeBinary column to BinaryView scalar
187+
query TTBBBB
188+
select
189+
column1_utf8, column2_utf8,
190+
column1_large_binary = arrow_cast(arrow_cast('Andrew', 'Binary'), 'BinaryView'),
191+
arrow_cast(arrow_cast('Andrew', 'Binary'), 'BinaryView') = column1_large_binary,
192+
column1_large_binary <> arrow_cast(arrow_cast('Andrew', 'Binary'), 'BinaryView'),
193+
arrow_cast(arrow_cast('Andrew', 'Binary'), 'BinaryView') <> column1_large_binary
194+
from test;
195+
----
196+
Andrew X true true false false
197+
Xiangpeng Xiangpeng false false true true
198+
Raphael R false false true true
199+
NULL R NULL NULL NULL NULL
200+
201+
statement ok
202+
drop table test;

0 commit comments

Comments
 (0)