@@ -5969,6 +5969,188 @@ true false true false false false true true false false true false true
5969
5969
#----
5970
5970
#true false true false false false true true false false true false true
5971
5971
5972
+ # Check that various array_has-style predicates are rewritten to InList when the haystack is a literal list
5973
+ # NB that `col in (a, b, c)` is simplified to OR if there are <= 3 elements, so we make 4-element haystack lists
5974
+
5975
+ query I
5976
+ with test AS (SELECT substr(md5(i)::text, 1, 32) as needle FROM generate_series(1, 100000) t(i))
5977
+ select count(*) from test WHERE needle IN ('7f4b18de3cfeb9b4ac78c381ee2ad278', 'a', 'b', 'c');
5978
+ ----
5979
+ 1
5980
+
5981
+ query TT
5982
+ explain with test AS (SELECT substr(md5(i)::text, 1, 32) as needle FROM generate_series(1, 100000) t(i))
5983
+ select count(*) from test WHERE needle IN ('7f4b18de3cfeb9b4ac78c381ee2ad278', 'a', 'b', 'c');
5984
+ ----
5985
+ logical_plan
5986
+ 01)Projection: count(Int64(1)) AS count(*)
5987
+ 02)--Aggregate: groupBy=[[]], aggr=[[count(Int64(1))]]
5988
+ 03)----SubqueryAlias: test
5989
+ 04)------SubqueryAlias: t
5990
+ 05)--------Projection:
5991
+ 06)----------Filter: substr(CAST(md5(CAST(tmp_table.value AS Utf8)) AS Utf8), Int64(1), Int64(32)) IN ([Utf8View("7f4b18de3cfeb9b4ac78c381ee2ad278"), Utf8View("a"), Utf8View("b"), Utf8View("c")])
5992
+ 07)------------TableScan: tmp_table projection=[value]
5993
+ physical_plan
5994
+ 01)ProjectionExec: expr=[count(Int64(1))@0 as count(*)]
5995
+ 02)--AggregateExec: mode=Final, gby=[], aggr=[count(Int64(1))]
5996
+ 03)----CoalescePartitionsExec
5997
+ 04)------AggregateExec: mode=Partial, gby=[], aggr=[count(Int64(1))]
5998
+ 05)--------ProjectionExec: expr=[]
5999
+ 06)----------CoalesceBatchesExec: target_batch_size=8192
6000
+ 07)------------FilterExec: substr(md5(CAST(value@0 AS Utf8)), 1, 32) IN ([Literal { value: Utf8View("7f4b18de3cfeb9b4ac78c381ee2ad278") }, Literal { value: Utf8View("a") }, Literal { value: Utf8View("b") }, Literal { value: Utf8View("c") }])
6001
+ 08)--------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
6002
+ 09)----------------LazyMemoryExec: partitions=1, batch_generators=[generate_series: start=1, end=100000, batch_size=8192]
6003
+
6004
+ query I
6005
+ with test AS (SELECT substr(md5(i)::text, 1, 32) as needle FROM generate_series(1, 100000) t(i))
6006
+ select count(*) from test WHERE needle = ANY(['7f4b18de3cfeb9b4ac78c381ee2ad278', 'a', 'b', 'c']);
6007
+ ----
6008
+ 1
6009
+
6010
+ query TT
6011
+ explain with test AS (SELECT substr(md5(i)::text, 1, 32) as needle FROM generate_series(1, 100000) t(i))
6012
+ select count(*) from test WHERE needle = ANY(['7f4b18de3cfeb9b4ac78c381ee2ad278', 'a', 'b', 'c']);
6013
+ ----
6014
+ logical_plan
6015
+ 01)Projection: count(Int64(1)) AS count(*)
6016
+ 02)--Aggregate: groupBy=[[]], aggr=[[count(Int64(1))]]
6017
+ 03)----SubqueryAlias: test
6018
+ 04)------SubqueryAlias: t
6019
+ 05)--------Projection:
6020
+ 06)----------Filter: substr(CAST(md5(CAST(tmp_table.value AS Utf8)) AS Utf8), Int64(1), Int64(32)) IN ([Utf8View("7f4b18de3cfeb9b4ac78c381ee2ad278"), Utf8View("a"), Utf8View("b"), Utf8View("c")])
6021
+ 07)------------TableScan: tmp_table projection=[value]
6022
+ physical_plan
6023
+ 01)ProjectionExec: expr=[count(Int64(1))@0 as count(*)]
6024
+ 02)--AggregateExec: mode=Final, gby=[], aggr=[count(Int64(1))]
6025
+ 03)----CoalescePartitionsExec
6026
+ 04)------AggregateExec: mode=Partial, gby=[], aggr=[count(Int64(1))]
6027
+ 05)--------ProjectionExec: expr=[]
6028
+ 06)----------CoalesceBatchesExec: target_batch_size=8192
6029
+ 07)------------FilterExec: substr(md5(CAST(value@0 AS Utf8)), 1, 32) IN ([Literal { value: Utf8View("7f4b18de3cfeb9b4ac78c381ee2ad278") }, Literal { value: Utf8View("a") }, Literal { value: Utf8View("b") }, Literal { value: Utf8View("c") }])
6030
+ 08)--------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
6031
+ 09)----------------LazyMemoryExec: partitions=1, batch_generators=[generate_series: start=1, end=100000, batch_size=8192]
6032
+
6033
+ query I
6034
+ with test AS (SELECT substr(md5(i)::text, 1, 32) as needle FROM generate_series(1, 100000) t(i))
6035
+ select count(*) from test WHERE array_has(['7f4b18de3cfeb9b4ac78c381ee2ad278', 'a', 'b', 'c'], needle);
6036
+ ----
6037
+ 1
6038
+
6039
+ query TT
6040
+ explain with test AS (SELECT substr(md5(i)::text, 1, 32) as needle FROM generate_series(1, 100000) t(i))
6041
+ select count(*) from test WHERE array_has(['7f4b18de3cfeb9b4ac78c381ee2ad278', 'a', 'b', 'c'], needle);
6042
+ ----
6043
+ logical_plan
6044
+ 01)Projection: count(Int64(1)) AS count(*)
6045
+ 02)--Aggregate: groupBy=[[]], aggr=[[count(Int64(1))]]
6046
+ 03)----SubqueryAlias: test
6047
+ 04)------SubqueryAlias: t
6048
+ 05)--------Projection:
6049
+ 06)----------Filter: substr(CAST(md5(CAST(tmp_table.value AS Utf8)) AS Utf8), Int64(1), Int64(32)) IN ([Utf8View("7f4b18de3cfeb9b4ac78c381ee2ad278"), Utf8View("a"), Utf8View("b"), Utf8View("c")])
6050
+ 07)------------TableScan: tmp_table projection=[value]
6051
+ physical_plan
6052
+ 01)ProjectionExec: expr=[count(Int64(1))@0 as count(*)]
6053
+ 02)--AggregateExec: mode=Final, gby=[], aggr=[count(Int64(1))]
6054
+ 03)----CoalescePartitionsExec
6055
+ 04)------AggregateExec: mode=Partial, gby=[], aggr=[count(Int64(1))]
6056
+ 05)--------ProjectionExec: expr=[]
6057
+ 06)----------CoalesceBatchesExec: target_batch_size=8192
6058
+ 07)------------FilterExec: substr(md5(CAST(value@0 AS Utf8)), 1, 32) IN ([Literal { value: Utf8View("7f4b18de3cfeb9b4ac78c381ee2ad278") }, Literal { value: Utf8View("a") }, Literal { value: Utf8View("b") }, Literal { value: Utf8View("c") }])
6059
+ 08)--------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
6060
+ 09)----------------LazyMemoryExec: partitions=1, batch_generators=[generate_series: start=1, end=100000, batch_size=8192]
6061
+
6062
+ # FIXME: because the LargeList haystack is not rewritten to InList (see the explain below), this query is _extremely_ slow to evaluate, so it is disabled
6063
+ # query I
6064
+ # with test AS (SELECT substr(md5(i)::text, 1, 32) as needle FROM generate_series(1, 100000) t(i))
6065
+ # select count(*) from test WHERE array_has(arrow_cast(['7f4b18de3cfeb9b4ac78c381ee2ad278', 'a', 'b', 'c'], 'LargeList(Utf8View)'), needle);
6066
+ # ----
6067
+ # 1
6068
+
6069
+ # FIXME: array_has with a LargeList haystack is not currently rewritten to InList
6070
+ query TT
6071
+ explain with test AS (SELECT substr(md5(i)::text, 1, 32) as needle FROM generate_series(1, 100000) t(i))
6072
+ select count(*) from test WHERE array_has(arrow_cast(['7f4b18de3cfeb9b4ac78c381ee2ad278', 'a', 'b', 'c'], 'LargeList(Utf8View)'), needle);
6073
+ ----
6074
+ logical_plan
6075
+ 01)Projection: count(Int64(1)) AS count(*)
6076
+ 02)--Aggregate: groupBy=[[]], aggr=[[count(Int64(1))]]
6077
+ 03)----SubqueryAlias: test
6078
+ 04)------SubqueryAlias: t
6079
+ 05)--------Projection:
6080
+ 06)----------Filter: array_has(LargeList([7f4b18de3cfeb9b4ac78c381ee2ad278, a, b, c]), substr(CAST(md5(CAST(tmp_table.value AS Utf8)) AS Utf8), Int64(1), Int64(32)))
6081
+ 07)------------TableScan: tmp_table projection=[value]
6082
+ physical_plan
6083
+ 01)ProjectionExec: expr=[count(Int64(1))@0 as count(*)]
6084
+ 02)--AggregateExec: mode=Final, gby=[], aggr=[count(Int64(1))]
6085
+ 03)----CoalescePartitionsExec
6086
+ 04)------AggregateExec: mode=Partial, gby=[], aggr=[count(Int64(1))]
6087
+ 05)--------ProjectionExec: expr=[]
6088
+ 06)----------CoalesceBatchesExec: target_batch_size=8192
6089
+ 07)------------FilterExec: array_has([7f4b18de3cfeb9b4ac78c381ee2ad278, a, b, c], substr(md5(CAST(value@0 AS Utf8)), 1, 32))
6090
+ 08)--------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
6091
+ 09)----------------LazyMemoryExec: partitions=1, batch_generators=[generate_series: start=1, end=100000, batch_size=8192]
6092
+
6093
+ query I
6094
+ with test AS (SELECT substr(md5(i)::text, 1, 32) as needle FROM generate_series(1, 100000) t(i))
6095
+ select count(*) from test WHERE array_has(arrow_cast(['7f4b18de3cfeb9b4ac78c381ee2ad278', 'a', 'b', 'c'], 'FixedSizeList(4, Utf8View)'), needle);
6096
+ ----
6097
+ 1
6098
+
6099
+ query TT
6100
+ explain with test AS (SELECT substr(md5(i)::text, 1, 32) as needle FROM generate_series(1, 100000) t(i))
6101
+ select count(*) from test WHERE array_has(arrow_cast(['7f4b18de3cfeb9b4ac78c381ee2ad278', 'a', 'b', 'c'], 'FixedSizeList(4, Utf8View)'), needle);
6102
+ ----
6103
+ logical_plan
6104
+ 01)Projection: count(Int64(1)) AS count(*)
6105
+ 02)--Aggregate: groupBy=[[]], aggr=[[count(Int64(1))]]
6106
+ 03)----SubqueryAlias: test
6107
+ 04)------SubqueryAlias: t
6108
+ 05)--------Projection:
6109
+ 06)----------Filter: substr(CAST(md5(CAST(tmp_table.value AS Utf8)) AS Utf8), Int64(1), Int64(32)) IN ([Utf8View("7f4b18de3cfeb9b4ac78c381ee2ad278"), Utf8View("a"), Utf8View("b"), Utf8View("c")])
6110
+ 07)------------TableScan: tmp_table projection=[value]
6111
+ physical_plan
6112
+ 01)ProjectionExec: expr=[count(Int64(1))@0 as count(*)]
6113
+ 02)--AggregateExec: mode=Final, gby=[], aggr=[count(Int64(1))]
6114
+ 03)----CoalescePartitionsExec
6115
+ 04)------AggregateExec: mode=Partial, gby=[], aggr=[count(Int64(1))]
6116
+ 05)--------ProjectionExec: expr=[]
6117
+ 06)----------CoalesceBatchesExec: target_batch_size=8192
6118
+ 07)------------FilterExec: substr(md5(CAST(value@0 AS Utf8)), 1, 32) IN ([Literal { value: Utf8View("7f4b18de3cfeb9b4ac78c381ee2ad278") }, Literal { value: Utf8View("a") }, Literal { value: Utf8View("b") }, Literal { value: Utf8View("c") }])
6119
+ 08)--------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
6120
+ 09)----------------LazyMemoryExec: partitions=1, batch_generators=[generate_series: start=1, end=100000, batch_size=8192]
6121
+
6122
+ query I
6123
+ with test AS (SELECT substr(md5(i)::text, 1, 32) as needle FROM generate_series(1, 100000) t(i))
6124
+ select count(*) from test WHERE array_has([needle], needle);
6125
+ ----
6126
+ 100000
6127
+
6128
+ # TODO: it should probably be possible to remove this filter completely, since it is always true for a non-null needle
6129
+ query TT
6130
+ explain with test AS (SELECT substr(md5(i)::text, 1, 32) as needle FROM generate_series(1, 100000) t(i))
6131
+ select count(*) from test WHERE array_has([needle], needle);
6132
+ ----
6133
+ logical_plan
6134
+ 01)Projection: count(Int64(1)) AS count(*)
6135
+ 02)--Aggregate: groupBy=[[]], aggr=[[count(Int64(1))]]
6136
+ 03)----SubqueryAlias: test
6137
+ 04)------SubqueryAlias: t
6138
+ 05)--------Projection:
6139
+ 06)----------Filter: __common_expr_3 = __common_expr_3
6140
+ 07)------------Projection: substr(CAST(md5(CAST(tmp_table.value AS Utf8)) AS Utf8), Int64(1), Int64(32)) AS __common_expr_3
6141
+ 08)--------------TableScan: tmp_table projection=[value]
6142
+ physical_plan
6143
+ 01)ProjectionExec: expr=[count(Int64(1))@0 as count(*)]
6144
+ 02)--AggregateExec: mode=Final, gby=[], aggr=[count(Int64(1))]
6145
+ 03)----CoalescePartitionsExec
6146
+ 04)------AggregateExec: mode=Partial, gby=[], aggr=[count(Int64(1))]
6147
+ 05)--------ProjectionExec: expr=[]
6148
+ 06)----------CoalesceBatchesExec: target_batch_size=8192
6149
+ 07)------------FilterExec: __common_expr_3@0 = __common_expr_3@0
6150
+ 08)--------------ProjectionExec: expr=[substr(md5(CAST(value@0 AS Utf8)), 1, 32) as __common_expr_3]
6151
+ 09)----------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1
6152
+ 10)------------------LazyMemoryExec: partitions=1, batch_generators=[generate_series: start=1, end=100000, batch_size=8192]
6153
+
5972
6154
# any operator
5973
6155
query ?
5974
6156
select column3 from arrays where 'L'=any(column3);
0 commit comments