From 3ced8490060066d88bcdcc39b9d59929f4d435d8 Mon Sep 17 00:00:00 2001 From: Junhao Liu Date: Sat, 4 May 2024 11:52:49 -0500 Subject: [PATCH 1/3] add first sqllogictest --- .../test_files/parquet_sorted_statistics.slt | 72 +++++++++++++++++++ 1 file changed, 72 insertions(+) diff --git a/datafusion/sqllogictest/test_files/parquet_sorted_statistics.slt b/datafusion/sqllogictest/test_files/parquet_sorted_statistics.slt index f7a81f08456f..e0d0aea7fa2f 100644 --- a/datafusion/sqllogictest/test_files/parquet_sorted_statistics.slt +++ b/datafusion/sqllogictest/test_files/parquet_sorted_statistics.slt @@ -260,3 +260,75 @@ physical_plan 01)SortPreservingMergeExec: [constant_col@0 ASC NULLS LAST] 02)--SortExec: expr=[constant_col@0 ASC NULLS LAST], preserve_partitioning=[true] 03)----ParquetExec: file_groups={2 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_sorted_statistics/test_table/partition_col=A/0.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_sorted_statistics/test_table/partition_col=B/1.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_sorted_statistics/test_table/partition_col=C/2.parquet]]}, projection=[constant_col] + + +# Create a table as a data source +statement ok +CREATE TABLE my_test ( + a INT, + b INT, + c INT +) AS VALUES +-- first file +(1, 100, 0), +(2, 200, 0), +(3, 300, 0), +-- second file +(7, 700, 0), +(8, 800, 0), +(9, 900, 0); + +# Create files: file1.parquet, file2.parquet both sorted on a +# but file 1 has the columns in the order a, b, c and file 2 has the columns in the order c, b, a +# The keyranges of values of a should be non overlapping + +# Create file1.parquet +query III +COPY (SELECT * FROM my_test WHERE a <= 3 ORDER BY a) +TO 'test_files/scratch/parquet_sorted_statistics/test_table1/0.parquet' +STORED AS PARQUET; +---- +3 + +# Create file2.parquet +query III +COPY (SELECT c, b, a FROM my_test WHERE a > 6 ORDER BY a) +TO 
'test_files/scratch/parquet_sorted_statistics/test_table1/1.parquet' +STORED AS PARQUET; +---- +3 + +# Create an external table a, b, c with explicit order by a +statement ok +CREATE EXTERNAL TABLE my_test_table ( + partition_col TEXT NOT NULL, + a INT NOT NULL, + b INT NOT NULL, + c INT NOT NULL +) +STORED AS PARQUET +PARTITIONED BY (partition_col) +WITH ORDER (a ASC NULLS LAST) +LOCATION 'test_files/scratch/parquet_sorted_statistics/test_table1'; + +query TT +EXPLAIN SELECT * +FROM my_test_table +ORDER BY a; +---- +logical_plan +01)Sort: my_test_table.a ASC NULLS LAST +02)--TableScan: my_test_table projection=[a, b, c, partition_col] +physical_plan ParquetExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_sorted_statistics/test_table1/0.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_sorted_statistics/test_table1/1.parquet]]}, projection=[a, b, c, partition_col], output_ordering=[a@0 ASC NULLS LAST] + +query TT +EXPLAIN SELECT * +FROM my_test_table +ORDER BY b; +---- +logical_plan +01)Sort: my_test_table.b ASC NULLS LAST +02)--TableScan: my_test_table projection=[a, b, c, partition_col] +physical_plan +01)SortExec: expr=[b@1 ASC NULLS LAST], preserve_partitioning=[false] +02)--ParquetExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_sorted_statistics/test_table1/0.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_sorted_statistics/test_table1/1.parquet]]}, projection=[a, b, c, partition_col], output_ordering=[a@0 ASC NULLS LAST] \ No newline at end of file From 2bb40a71fcc6c4811faaa6ad8e7d54f53904f216 Mon Sep 17 00:00:00 2001 From: Junhao Liu Date: Sat, 4 May 2024 11:56:24 -0500 Subject: [PATCH 2/3] change name --- .../test_files/parquet_sorted_statistics.slt | 22 ++++++++++--------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/datafusion/sqllogictest/test_files/parquet_sorted_statistics.slt 
b/datafusion/sqllogictest/test_files/parquet_sorted_statistics.slt index e0d0aea7fa2f..010ee2949c3c 100644 --- a/datafusion/sqllogictest/test_files/parquet_sorted_statistics.slt +++ b/datafusion/sqllogictest/test_files/parquet_sorted_statistics.slt @@ -261,10 +261,12 @@ physical_plan 02)--SortExec: expr=[constant_col@0 ASC NULLS LAST], preserve_partitioning=[true] 03)----ParquetExec: file_groups={2 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_sorted_statistics/test_table/partition_col=A/0.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_sorted_statistics/test_table/partition_col=B/1.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_sorted_statistics/test_table/partition_col=C/2.parquet]]}, projection=[constant_col] +statement ok +DROP TABLE test_table; # Create a table as a data source statement ok -CREATE TABLE my_test ( +CREATE TABLE src_table2 ( a INT, b INT, c INT @@ -284,7 +286,7 @@ CREATE TABLE my_test ( # Create file1.parquet query III -COPY (SELECT * FROM my_test WHERE a <= 3 ORDER BY a) +COPY (SELECT * FROM src_table2 WHERE a <= 3 ORDER BY a) TO 'test_files/scratch/parquet_sorted_statistics/test_table1/0.parquet' STORED AS PARQUET; ---- @@ -292,7 +294,7 @@ STORED AS PARQUET; # Create file2.parquet query III -COPY (SELECT c, b, a FROM my_test WHERE a > 6 ORDER BY a) +COPY (SELECT c, b, a FROM src_table2 WHERE a > 6 ORDER BY a) TO 'test_files/scratch/parquet_sorted_statistics/test_table1/1.parquet' STORED AS PARQUET; ---- @@ -300,7 +302,7 @@ STORED AS PARQUET; # Create an external table a, b, c with explicit order by a statement ok -CREATE EXTERNAL TABLE my_test_table ( +CREATE EXTERNAL TABLE test_table ( partition_col TEXT NOT NULL, a INT NOT NULL, b INT NOT NULL, @@ -313,22 +315,22 @@ LOCATION 'test_files/scratch/parquet_sorted_statistics/test_table1'; query TT EXPLAIN SELECT * -FROM my_test_table +FROM test_table ORDER BY a; ---- logical_plan -01)Sort: my_test_table.a ASC 
NULLS LAST -02)--TableScan: my_test_table projection=[a, b, c, partition_col] +01)Sort: test_table.a ASC NULLS LAST +02)--TableScan: test_table projection=[a, b, c, partition_col] physical_plan ParquetExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_sorted_statistics/test_table1/0.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_sorted_statistics/test_table1/1.parquet]]}, projection=[a, b, c, partition_col], output_ordering=[a@0 ASC NULLS LAST] query TT EXPLAIN SELECT * -FROM my_test_table +FROM test_table ORDER BY b; ---- logical_plan -01)Sort: my_test_table.b ASC NULLS LAST -02)--TableScan: my_test_table projection=[a, b, c, partition_col] +01)Sort: test_table.b ASC NULLS LAST +02)--TableScan: test_table projection=[a, b, c, partition_col] physical_plan 01)SortExec: expr=[b@1 ASC NULLS LAST], preserve_partitioning=[false] 02)--ParquetExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_sorted_statistics/test_table1/0.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_sorted_statistics/test_table1/1.parquet]]}, projection=[a, b, c, partition_col], output_ordering=[a@0 ASC NULLS LAST] \ No newline at end of file From d9f83e2d7a60b9c94e689655489f929e4ab71ad3 Mon Sep 17 00:00:00 2001 From: Junhao Liu Date: Mon, 13 May 2024 21:17:33 -0500 Subject: [PATCH 3/3] add description --- .../test_files/parquet_sorted_statistics.slt | 32 ++++++++++--------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/datafusion/sqllogictest/test_files/parquet_sorted_statistics.slt b/datafusion/sqllogictest/test_files/parquet_sorted_statistics.slt index 010ee2949c3c..1b37ddcb3526 100644 --- a/datafusion/sqllogictest/test_files/parquet_sorted_statistics.slt +++ b/datafusion/sqllogictest/test_files/parquet_sorted_statistics.slt @@ -266,36 +266,36 @@ DROP TABLE test_table; # Create a table as a data source statement ok -CREATE TABLE 
src_table2 ( +CREATE TABLE int_table ( a INT, b INT, c INT ) AS VALUES -- first file -(1, 100, 0), +(3, 100, 0), (2, 200, 0), -(3, 300, 0), +(1, 300, 0), -- second file -(7, 700, 0), +(9, 700, 0), (8, 800, 0), -(9, 900, 0); +(7, 900, 0); -# Create files: file1.parquet, file2.parquet both sorted on a +# Create files: 0.parquet, 1.parquet both sorted on a, the schema of the files is different but compatible # but file 1 has the columns in the order a, b, c and file 2 has the columns in the order c, b, a # The keyranges of values of a should be non overlapping -# Create file1.parquet +# Create 0.parquet query III -COPY (SELECT * FROM src_table2 WHERE a <= 3 ORDER BY a) -TO 'test_files/scratch/parquet_sorted_statistics/test_table1/0.parquet' +COPY (SELECT * FROM int_table WHERE a <= 3 ORDER BY a) +TO 'test_files/scratch/parquet_sorted_statistics/int_table/0.parquet' STORED AS PARQUET; ---- 3 -# Create file2.parquet +# Create 1.parquet query III -COPY (SELECT c, b, a FROM src_table2 WHERE a > 6 ORDER BY a) -TO 'test_files/scratch/parquet_sorted_statistics/test_table1/1.parquet' +COPY (SELECT c, b, a FROM int_table WHERE a > 6 ORDER BY a) +TO 'test_files/scratch/parquet_sorted_statistics/int_table/1.parquet' STORED AS PARQUET; ---- 3 @@ -311,8 +311,9 @@ CREATE EXTERNAL TABLE test_table ( STORED AS PARQUET PARTITIONED BY (partition_col) WITH ORDER (a ASC NULLS LAST) -LOCATION 'test_files/scratch/parquet_sorted_statistics/test_table1'; +LOCATION 'test_files/scratch/parquet_sorted_statistics/int_table'; +# Make sure the output plan doesn't use sort preserving merge query TT EXPLAIN SELECT * FROM test_table @@ -321,8 +322,9 @@ ORDER BY a; logical_plan 01)Sort: test_table.a ASC NULLS LAST 02)--TableScan: test_table projection=[a, b, c, partition_col] -physical_plan ParquetExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_sorted_statistics/test_table1/0.parquet, 
WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_sorted_statistics/test_table1/1.parquet]]}, projection=[a, b, c, partition_col], output_ordering=[a@0 ASC NULLS LAST] +physical_plan ParquetExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_sorted_statistics/int_table/0.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_sorted_statistics/int_table/1.parquet]]}, projection=[a, b, c, partition_col], output_ordering=[a@0 ASC NULLS LAST] +# Ordering by b is not covered by the declared order on a, so the plan must add a SortExec (and still must not use sort preserving merge) query TT EXPLAIN SELECT * FROM test_table @@ -333,4 +335,4 @@ logical_plan 02)--TableScan: test_table projection=[a, b, c, partition_col] physical_plan 01)SortExec: expr=[b@1 ASC NULLS LAST], preserve_partitioning=[false] -02)--ParquetExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_sorted_statistics/test_table1/0.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_sorted_statistics/test_table1/1.parquet]]}, projection=[a, b, c, partition_col], output_ordering=[a@0 ASC NULLS LAST] \ No newline at end of file +02)--ParquetExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_sorted_statistics/int_table/0.parquet, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_sorted_statistics/int_table/1.parquet]]}, projection=[a, b, c, partition_col], output_ordering=[a@0 ASC NULLS LAST] \ No newline at end of file