Skip to content

Support utf8view datatype for window #15257

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Mar 16, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 6 additions & 3 deletions datafusion/expr/src/type_coercion/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -79,9 +79,12 @@ pub fn is_datetime(dt: &DataType) -> bool {
)
}

/// Determine whether the given data type `dt` is a `Utf8` or `LargeUtf8`.
pub fn is_utf8_or_large_utf8(dt: &DataType) -> bool {
matches!(dt, DataType::Utf8 | DataType::LargeUtf8)
/// Determine whether the given data type `dt` is a `Utf8` or `Utf8View` or `LargeUtf8`.
pub fn is_utf8_or_utf8view_or_large_utf8(dt: &DataType) -> bool {
matches!(
dt,
DataType::Utf8 | DataType::Utf8View | DataType::LargeUtf8
)
}

/// Determine whether the given data type `dt` is a `Decimal`.
Expand Down
4 changes: 2 additions & 2 deletions datafusion/optimizer/src/analyzer/type_coercion.rs
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ use datafusion_expr::type_coercion::functions::{
use datafusion_expr::type_coercion::other::{
get_coerce_type_for_case_expression, get_coerce_type_for_list,
};
use datafusion_expr::type_coercion::{is_datetime, is_utf8_or_large_utf8};
use datafusion_expr::type_coercion::{is_datetime, is_utf8_or_utf8view_or_large_utf8};
use datafusion_expr::utils::merge_schema;
use datafusion_expr::{
is_false, is_not_false, is_not_true, is_not_unknown, is_true, is_unknown, not,
Expand Down Expand Up @@ -713,7 +713,7 @@ fn coerce_frame_bound(

fn extract_window_frame_target_type(col_type: &DataType) -> Result<DataType> {
if col_type.is_numeric()
|| is_utf8_or_large_utf8(col_type)
|| is_utf8_or_utf8view_or_large_utf8(col_type)
|| matches!(col_type, DataType::Null)
|| matches!(col_type, DataType::Boolean)
{
Expand Down
29 changes: 29 additions & 0 deletions datafusion/sqllogictest/test_files/subquery_sort.slt
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,35 @@ physical_plan
05)--------SortExec: expr=[c1@0 DESC], preserve_partitioning=[false]
06)----------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c1, c3, c9], file_type=csv, has_header=true

#Test with utf8view for window function
statement ok
CREATE TABLE sink_table_with_utf8view AS
SELECT arrow_cast(c1, 'Utf8View') AS c1, c2, c3, c9
FROM sink_table;


query TT
EXPLAIN SELECT t2.c1, t2.r FROM (SELECT c1, RANK() OVER (ORDER BY c1 DESC) AS r, c3, c9 FROM sink_table_with_utf8view ORDER BY c1, c3 LIMIT 2) AS t2 ORDER BY t2.c1, t2.c3, t2.c9;
----
logical_plan
01)Projection: t2.c1, t2.r
02)--Sort: t2.c1 ASC NULLS LAST, t2.c3 ASC NULLS LAST, t2.c9 ASC NULLS LAST
03)----SubqueryAlias: t2
04)------Sort: sink_table_with_utf8view.c1 ASC NULLS LAST, sink_table_with_utf8view.c3 ASC NULLS LAST, fetch=2
05)--------Projection: sink_table_with_utf8view.c1, rank() ORDER BY [sink_table_with_utf8view.c1 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW AS r, sink_table_with_utf8view.c3, sink_table_with_utf8view.c9
06)----------WindowAggr: windowExpr=[[rank() ORDER BY [sink_table_with_utf8view.c1 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]]
07)------------TableScan: sink_table_with_utf8view projection=[c1, c3, c9]
physical_plan
01)ProjectionExec: expr=[c1@0 as c1, r@1 as r]
02)--SortExec: TopK(fetch=2), expr=[c1@0 ASC NULLS LAST, c3@2 ASC NULLS LAST, c9@3 ASC NULLS LAST], preserve_partitioning=[false]
03)----ProjectionExec: expr=[c1@0 as c1, rank() ORDER BY [sink_table_with_utf8view.c1 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@3 as r, c3@1 as c3, c9@2 as c9]
04)------BoundedWindowAggExec: wdw=[rank() ORDER BY [sink_table_with_utf8view.c1 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Ok(Field { name: "rank() ORDER BY [sink_table_with_utf8view.c1 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Utf8View(NULL)), end_bound: CurrentRow, is_causal: false }], mode=[Sorted]
05)--------SortPreservingMergeExec: [c1@0 DESC]
06)----------SortExec: expr=[c1@0 DESC], preserve_partitioning=[true]
07)------------DataSourceExec: partitions=4, partition_sizes=[1, 0, 0, 0]

statement ok
DROP TABLE sink_table_with_utf8view;

query TT
EXPLAIN SELECT c1, c2 FROM (SELECT DISTINCT ON (c1) c1, c2, c3, c9 FROM sink_table ORDER BY c1, c3 DESC, c9) AS t2 ORDER BY t2.c1, t2.c3 DESC, t2.c9
Expand Down
59 changes: 59 additions & 0 deletions datafusion/sqllogictest/test_files/window.slt
Original file line number Diff line number Diff line change
Expand Up @@ -5536,3 +5536,62 @@ physical_plan
01)ProjectionExec: expr=[max(aggregate_test_100_ordered.c5) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING@1 as max_c5]
02)--WindowAggExec: wdw=[max(aggregate_test_100_ordered.c5) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Ok(Field { name: "max(aggregate_test_100_ordered.c5) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }]
03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c5], file_type=csv, has_header=true

# Testing Utf8View with window
statement ok
CREATE TABLE aggregate_test_100_utf8view AS SELECT
arrow_cast(c1, 'Utf8View') as c1,
c9,
c13
FROM aggregate_test_100;


#fn window_frame_ranges_string_check
query II
SELECT
SUM(LENGTH(c13)) OVER(ORDER BY c13),
SUM(LENGTH(c1)) OVER(ORDER BY c1)
FROM aggregate_test_100_utf8view
ORDER BY c9
LIMIT 5
----
2100 100
510 79
1440 21
1830 61
2010 21


#fn test_window_rank
query IIIIIRR
SELECT
c9,
RANK() OVER(ORDER BY c1) AS rank1,
RANK() OVER(ORDER BY c1 ROWS BETWEEN 10 PRECEDING and 1 FOLLOWING) as rank2,
DENSE_RANK() OVER(ORDER BY c1) as dense_rank1,
DENSE_RANK() OVER(ORDER BY c1 ROWS BETWEEN 10 PRECEDING and 1 FOLLOWING) as dense_rank2,
PERCENT_RANK() OVER(ORDER BY c1) as percent_rank1,
PERCENT_RANK() OVER(ORDER BY c1 ROWS BETWEEN 10 PRECEDING and 1 FOLLOWING) as percent_rank2
FROM aggregate_test_100_utf8view
ORDER BY c9
LIMIT 5
----
28774375 80 80 5 5 0.79797979798 0.79797979798
63044568 62 62 4 4 0.616161616162 0.616161616162
141047417 1 1 1 1 0 0
141680161 41 41 3 3 0.40404040404 0.40404040404
145294611 1 1 1 1 0 0


# CTAS with NTILE function
statement ok
CREATE TABLE new_table AS SELECT NTILE(2) OVER(ORDER BY c1) AS ntile_2 FROM aggregate_test_100_utf8view;

statement ok
DROP TABLE new_table;

statement ok
DROP TABLE aggregate_test_100_utf8view;

statement ok
DROP TABLE aggregate_test_100