diff --git a/datafusion/expr/src/type_coercion/mod.rs b/datafusion/expr/src/type_coercion/mod.rs index 3a5c65fb46ee..4fc150ef2996 100644 --- a/datafusion/expr/src/type_coercion/mod.rs +++ b/datafusion/expr/src/type_coercion/mod.rs @@ -79,9 +79,12 @@ pub fn is_datetime(dt: &DataType) -> bool { ) } -/// Determine whether the given data type `dt` is a `Utf8` or `LargeUtf8`. -pub fn is_utf8_or_large_utf8(dt: &DataType) -> bool { - matches!(dt, DataType::Utf8 | DataType::LargeUtf8) +/// Determine whether the given data type `dt` is a `Utf8` or `Utf8View` or `LargeUtf8`. +pub fn is_utf8_or_utf8view_or_large_utf8(dt: &DataType) -> bool { + matches!( + dt, + DataType::Utf8 | DataType::Utf8View | DataType::LargeUtf8 + ) } /// Determine whether the given data type `dt` is a `Decimal`. diff --git a/datafusion/optimizer/src/analyzer/type_coercion.rs b/datafusion/optimizer/src/analyzer/type_coercion.rs index c9c0b7a3b789..c0a43fcd049f 100644 --- a/datafusion/optimizer/src/analyzer/type_coercion.rs +++ b/datafusion/optimizer/src/analyzer/type_coercion.rs @@ -46,7 +46,7 @@ use datafusion_expr::type_coercion::functions::{ use datafusion_expr::type_coercion::other::{ get_coerce_type_for_case_expression, get_coerce_type_for_list, }; -use datafusion_expr::type_coercion::{is_datetime, is_utf8_or_large_utf8}; +use datafusion_expr::type_coercion::{is_datetime, is_utf8_or_utf8view_or_large_utf8}; use datafusion_expr::utils::merge_schema; use datafusion_expr::{ is_false, is_not_false, is_not_true, is_not_unknown, is_true, is_unknown, not, @@ -713,7 +713,7 @@ fn coerce_frame_bound( fn extract_window_frame_target_type(col_type: &DataType) -> Result { if col_type.is_numeric() - || is_utf8_or_large_utf8(col_type) + || is_utf8_or_utf8view_or_large_utf8(col_type) || matches!(col_type, DataType::Null) || matches!(col_type, DataType::Boolean) { diff --git a/datafusion/sqllogictest/test_files/subquery_sort.slt b/datafusion/sqllogictest/test_files/subquery_sort.slt index 4ca19c0b6af7..5d22bf92e7e6 100644 --- a/datafusion/sqllogictest/test_files/subquery_sort.slt +++ b/datafusion/sqllogictest/test_files/subquery_sort.slt @@ -104,6 +104,35 @@ physical_plan 05)--------SortExec: expr=[c1@0 DESC], preserve_partitioning=[false] 06)----------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c1, c3, c9], file_type=csv, has_header=true +#Test with utf8view for window function +statement ok +CREATE TABLE sink_table_with_utf8view AS +SELECT arrow_cast(c1, 'Utf8View') AS c1, c2, c3, c9 +FROM sink_table; + + +query TT +EXPLAIN SELECT t2.c1, t2.r FROM (SELECT c1, RANK() OVER (ORDER BY c1 DESC) AS r, c3, c9 FROM sink_table_with_utf8view ORDER BY c1, c3 LIMIT 2) AS t2 ORDER BY t2.c1, t2.c3, t2.c9; +---- +logical_plan +01)Projection: t2.c1, t2.r +02)--Sort: t2.c1 ASC NULLS LAST, t2.c3 ASC NULLS LAST, t2.c9 ASC NULLS LAST +03)----SubqueryAlias: t2 +04)------Sort: sink_table_with_utf8view.c1 ASC NULLS LAST, sink_table_with_utf8view.c3 ASC NULLS LAST, fetch=2 +05)--------Projection: sink_table_with_utf8view.c1, rank() ORDER BY [sink_table_with_utf8view.c1 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW AS r, sink_table_with_utf8view.c3, sink_table_with_utf8view.c9 +06)----------WindowAggr: windowExpr=[[rank() ORDER BY [sink_table_with_utf8view.c1 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW]] +07)------------TableScan: sink_table_with_utf8view projection=[c1, c3, c9] +physical_plan +01)ProjectionExec: expr=[c1@0 as c1, r@1 as r] +02)--SortExec: TopK(fetch=2), expr=[c1@0 ASC NULLS LAST, c3@2 ASC NULLS LAST, c9@3 ASC NULLS LAST], preserve_partitioning=[false] +03)----ProjectionExec: expr=[c1@0 as c1, rank() ORDER BY [sink_table_with_utf8view.c1 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW@3 as r, c3@1 as c3, c9@2 as c9] +04)------BoundedWindowAggExec: wdw=[rank() ORDER BY [sink_table_with_utf8view.c1 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW: Ok(Field { name: "rank() ORDER BY [sink_table_with_utf8view.c1 DESC NULLS FIRST] RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW", data_type: UInt64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Range, start_bound: Preceding(Utf8View(NULL)), end_bound: CurrentRow, is_causal: false }], mode=[Sorted] +05)--------SortPreservingMergeExec: [c1@0 DESC] +06)----------SortExec: expr=[c1@0 DESC], preserve_partitioning=[true] +07)------------DataSourceExec: partitions=4, partition_sizes=[1, 0, 0, 0] + +statement ok +DROP TABLE sink_table_with_utf8view; query TT EXPLAIN SELECT c1, c2 FROM (SELECT DISTINCT ON (c1) c1, c2, c3, c9 FROM sink_table ORDER BY c1, c3 DESC, c9) AS t2 ORDER BY t2.c1, t2.c3 DESC, t2.c9 diff --git a/datafusion/sqllogictest/test_files/window.slt b/datafusion/sqllogictest/test_files/window.slt index 1a9acc0f531a..fd623b67fe9f 100644 --- a/datafusion/sqllogictest/test_files/window.slt +++ b/datafusion/sqllogictest/test_files/window.slt @@ -5536,3 +5536,62 @@ physical_plan 01)ProjectionExec: expr=[max(aggregate_test_100_ordered.c5) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING@1 as max_c5] 02)--WindowAggExec: wdw=[max(aggregate_test_100_ordered.c5) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING: Ok(Field { name: "max(aggregate_test_100_ordered.c5) ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), frame: WindowFrame { units: Rows, start_bound: Preceding(UInt64(NULL)), end_bound: Following(UInt64(NULL)), is_causal: false }] 03)----DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/testing/data/csv/aggregate_test_100.csv]]}, projection=[c5], file_type=csv, has_header=true + +# Testing Utf8View with window +statement ok +CREATE TABLE aggregate_test_100_utf8view AS SELECT + arrow_cast(c1, 'Utf8View') as c1, + c9, + c13 +FROM aggregate_test_100; + + +#fn window_frame_ranges_string_check +query II +SELECT +SUM(LENGTH(c13)) OVER(ORDER BY c13), +SUM(LENGTH(c1)) OVER(ORDER BY c1) +FROM aggregate_test_100_utf8view +ORDER BY c9 +LIMIT 5 +---- +2100 100 +510 79 +1440 21 +1830 61 +2010 21 + + +#fn test_window_rank +query IIIIIRR +SELECT + c9, + RANK() OVER(ORDER BY c1) AS rank1, + RANK() OVER(ORDER BY c1 ROWS BETWEEN 10 PRECEDING and 1 FOLLOWING) as rank2, + DENSE_RANK() OVER(ORDER BY c1) as dense_rank1, + DENSE_RANK() OVER(ORDER BY c1 ROWS BETWEEN 10 PRECEDING and 1 FOLLOWING) as dense_rank2, + PERCENT_RANK() OVER(ORDER BY c1) as percent_rank1, + PERCENT_RANK() OVER(ORDER BY c1 ROWS BETWEEN 10 PRECEDING and 1 FOLLOWING) as percent_rank2 + FROM aggregate_test_100_utf8view + ORDER BY c9 + LIMIT 5 +---- +28774375 80 80 5 5 0.79797979798 0.79797979798 +63044568 62 62 4 4 0.616161616162 0.616161616162 +141047417 1 1 1 1 0 0 +141680161 41 41 3 3 0.40404040404 0.40404040404 +145294611 1 1 1 1 0 0 + + +# CTAS with NTILE function +statement ok +CREATE TABLE new_table AS SELECT NTILE(2) OVER(ORDER BY c1) AS ntile_2 FROM aggregate_test_100_utf8view; + +statement ok +DROP TABLE new_table; + +statement ok +DROP TABLE aggregate_test_100_utf8view; + +statement ok +DROP TABLE aggregate_test_100