-
Notifications
You must be signed in to change notification settings - Fork 1.4k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
simplify array_has
UDF to InList
expr when haystack is constant
#15354
Changes from all commits
57bc86c
3a2e170
aed4795
72b311b
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -5960,6 +5960,188 @@ true false true false false false true true false false true false true | |
#---- | ||
#true false true false false false true true false false true false true | ||
|
||
# rewrite various array_has operations to InList where the haystack is a literal list | ||
# NB that `col in (a, b, c)` is simplified to OR if there are <= 3 elements, so we make 4-element haystack lists | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. 👍 |
||
|
||
query I | ||
with test AS (SELECT substr(md5(i)::text, 1, 32) as needle FROM generate_series(1, 100000) t(i)) | ||
select count(*) from test WHERE needle IN ('7f4b18de3cfeb9b4ac78c381ee2ad278', 'a', 'b', 'c'); | ||
---- | ||
1 | ||
|
||
query TT | ||
explain with test AS (SELECT substr(md5(i)::text, 1, 32) as needle FROM generate_series(1, 100000) t(i)) | ||
select count(*) from test WHERE needle IN ('7f4b18de3cfeb9b4ac78c381ee2ad278', 'a', 'b', 'c'); | ||
---- | ||
logical_plan | ||
01)Projection: count(Int64(1)) AS count(*) | ||
02)--Aggregate: groupBy=[[]], aggr=[[count(Int64(1))]] | ||
03)----SubqueryAlias: test | ||
04)------SubqueryAlias: t | ||
05)--------Projection: | ||
06)----------Filter: substr(CAST(md5(CAST(tmp_table.value AS Utf8)) AS Utf8), Int64(1), Int64(32)) IN ([Utf8View("7f4b18de3cfeb9b4ac78c381ee2ad278"), Utf8View("a"), Utf8View("b"), Utf8View("c")]) | ||
07)------------TableScan: tmp_table projection=[value] | ||
physical_plan | ||
01)ProjectionExec: expr=[count(Int64(1))@0 as count(*)] | ||
02)--AggregateExec: mode=Final, gby=[], aggr=[count(Int64(1))] | ||
03)----CoalescePartitionsExec | ||
04)------AggregateExec: mode=Partial, gby=[], aggr=[count(Int64(1))] | ||
05)--------ProjectionExec: expr=[] | ||
06)----------CoalesceBatchesExec: target_batch_size=8192 | ||
07)------------FilterExec: substr(md5(CAST(value@0 AS Utf8)), 1, 32) IN ([Literal { value: Utf8View("7f4b18de3cfeb9b4ac78c381ee2ad278") }, Literal { value: Utf8View("a") }, Literal { value: Utf8View("b") }, Literal { value: Utf8View("c") }]) | ||
08)--------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 | ||
09)----------------LazyMemoryExec: partitions=1, batch_generators=[generate_series: start=1, end=100000, batch_size=8192] | ||
|
||
query I | ||
with test AS (SELECT substr(md5(i)::text, 1, 32) as needle FROM generate_series(1, 100000) t(i)) | ||
select count(*) from test WHERE needle = ANY(['7f4b18de3cfeb9b4ac78c381ee2ad278', 'a', 'b', 'c']); | ||
---- | ||
1 | ||
|
||
query TT | ||
explain with test AS (SELECT substr(md5(i)::text, 1, 32) as needle FROM generate_series(1, 100000) t(i)) | ||
select count(*) from test WHERE needle = ANY(['7f4b18de3cfeb9b4ac78c381ee2ad278', 'a', 'b', 'c']); | ||
---- | ||
logical_plan | ||
01)Projection: count(Int64(1)) AS count(*) | ||
02)--Aggregate: groupBy=[[]], aggr=[[count(Int64(1))]] | ||
03)----SubqueryAlias: test | ||
04)------SubqueryAlias: t | ||
05)--------Projection: | ||
06)----------Filter: substr(CAST(md5(CAST(tmp_table.value AS Utf8)) AS Utf8), Int64(1), Int64(32)) IN ([Utf8View("7f4b18de3cfeb9b4ac78c381ee2ad278"), Utf8View("a"), Utf8View("b"), Utf8View("c")]) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. That is cool to see |
||
07)------------TableScan: tmp_table projection=[value] | ||
physical_plan | ||
01)ProjectionExec: expr=[count(Int64(1))@0 as count(*)] | ||
02)--AggregateExec: mode=Final, gby=[], aggr=[count(Int64(1))] | ||
03)----CoalescePartitionsExec | ||
04)------AggregateExec: mode=Partial, gby=[], aggr=[count(Int64(1))] | ||
05)--------ProjectionExec: expr=[] | ||
06)----------CoalesceBatchesExec: target_batch_size=8192 | ||
07)------------FilterExec: substr(md5(CAST(value@0 AS Utf8)), 1, 32) IN ([Literal { value: Utf8View("7f4b18de3cfeb9b4ac78c381ee2ad278") }, Literal { value: Utf8View("a") }, Literal { value: Utf8View("b") }, Literal { value: Utf8View("c") }]) | ||
08)--------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 | ||
09)----------------LazyMemoryExec: partitions=1, batch_generators=[generate_series: start=1, end=100000, batch_size=8192] | ||
|
||
query I | ||
with test AS (SELECT substr(md5(i)::text, 1, 32) as needle FROM generate_series(1, 100000) t(i)) | ||
select count(*) from test WHERE array_has(['7f4b18de3cfeb9b4ac78c381ee2ad278', 'a', 'b', 'c'], needle); | ||
---- | ||
1 | ||
|
||
query TT | ||
explain with test AS (SELECT substr(md5(i)::text, 1, 32) as needle FROM generate_series(1, 100000) t(i)) | ||
select count(*) from test WHERE array_has(['7f4b18de3cfeb9b4ac78c381ee2ad278', 'a', 'b', 'c'], needle); | ||
---- | ||
logical_plan | ||
01)Projection: count(Int64(1)) AS count(*) | ||
02)--Aggregate: groupBy=[[]], aggr=[[count(Int64(1))]] | ||
03)----SubqueryAlias: test | ||
04)------SubqueryAlias: t | ||
05)--------Projection: | ||
06)----------Filter: substr(CAST(md5(CAST(tmp_table.value AS Utf8)) AS Utf8), Int64(1), Int64(32)) IN ([Utf8View("7f4b18de3cfeb9b4ac78c381ee2ad278"), Utf8View("a"), Utf8View("b"), Utf8View("c")]) | ||
07)------------TableScan: tmp_table projection=[value] | ||
physical_plan | ||
01)ProjectionExec: expr=[count(Int64(1))@0 as count(*)] | ||
02)--AggregateExec: mode=Final, gby=[], aggr=[count(Int64(1))] | ||
03)----CoalescePartitionsExec | ||
04)------AggregateExec: mode=Partial, gby=[], aggr=[count(Int64(1))] | ||
05)--------ProjectionExec: expr=[] | ||
06)----------CoalesceBatchesExec: target_batch_size=8192 | ||
07)------------FilterExec: substr(md5(CAST(value@0 AS Utf8)), 1, 32) IN ([Literal { value: Utf8View("7f4b18de3cfeb9b4ac78c381ee2ad278") }, Literal { value: Utf8View("a") }, Literal { value: Utf8View("b") }, Literal { value: Utf8View("c") }]) | ||
08)--------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 | ||
09)----------------LazyMemoryExec: partitions=1, batch_generators=[generate_series: start=1, end=100000, batch_size=8192] | ||
|
||
# FIXME: due to rewrite below not working, this is _extremely_ slow to evaluate | ||
# query I | ||
# with test AS (SELECT substr(md5(i)::text, 1, 32) as needle FROM generate_series(1, 100000) t(i)) | ||
# select count(*) from test WHERE array_has(arrow_cast(['7f4b18de3cfeb9b4ac78c381ee2ad278', 'a', 'b', 'c'], 'LargeList(Utf8View)'), needle); | ||
# ---- | ||
# 1 | ||
|
||
# FIXME: array_has with large list haystack not currently rewritten to InList | ||
query TT | ||
explain with test AS (SELECT substr(md5(i)::text, 1, 32) as needle FROM generate_series(1, 100000) t(i)) | ||
select count(*) from test WHERE array_has(arrow_cast(['7f4b18de3cfeb9b4ac78c381ee2ad278', 'a', 'b', 'c'], 'LargeList(Utf8View)'), needle); | ||
---- | ||
logical_plan | ||
01)Projection: count(Int64(1)) AS count(*) | ||
02)--Aggregate: groupBy=[[]], aggr=[[count(Int64(1))]] | ||
03)----SubqueryAlias: test | ||
04)------SubqueryAlias: t | ||
05)--------Projection: | ||
06)----------Filter: array_has(LargeList([7f4b18de3cfeb9b4ac78c381ee2ad278, a, b, c]), substr(CAST(md5(CAST(tmp_table.value AS Utf8)) AS Utf8), Int64(1), Int64(32))) | ||
07)------------TableScan: tmp_table projection=[value] | ||
physical_plan | ||
01)ProjectionExec: expr=[count(Int64(1))@0 as count(*)] | ||
02)--AggregateExec: mode=Final, gby=[], aggr=[count(Int64(1))] | ||
03)----CoalescePartitionsExec | ||
04)------AggregateExec: mode=Partial, gby=[], aggr=[count(Int64(1))] | ||
05)--------ProjectionExec: expr=[] | ||
06)----------CoalesceBatchesExec: target_batch_size=8192 | ||
07)------------FilterExec: array_has([7f4b18de3cfeb9b4ac78c381ee2ad278, a, b, c], substr(md5(CAST(value@0 AS Utf8)), 1, 32)) | ||
08)--------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 | ||
09)----------------LazyMemoryExec: partitions=1, batch_generators=[generate_series: start=1, end=100000, batch_size=8192] | ||
|
||
query I | ||
with test AS (SELECT substr(md5(i)::text, 1, 32) as needle FROM generate_series(1, 100000) t(i)) | ||
select count(*) from test WHERE array_has(arrow_cast(['7f4b18de3cfeb9b4ac78c381ee2ad278', 'a', 'b', 'c'], 'FixedSizeList(4, Utf8View)'), needle); | ||
---- | ||
1 | ||
|
||
query TT | ||
explain with test AS (SELECT substr(md5(i)::text, 1, 32) as needle FROM generate_series(1, 100000) t(i)) | ||
select count(*) from test WHERE array_has(arrow_cast(['7f4b18de3cfeb9b4ac78c381ee2ad278', 'a', 'b', 'c'], 'FixedSizeList(4, Utf8View)'), needle); | ||
---- | ||
logical_plan | ||
01)Projection: count(Int64(1)) AS count(*) | ||
02)--Aggregate: groupBy=[[]], aggr=[[count(Int64(1))]] | ||
03)----SubqueryAlias: test | ||
04)------SubqueryAlias: t | ||
05)--------Projection: | ||
06)----------Filter: substr(CAST(md5(CAST(tmp_table.value AS Utf8)) AS Utf8), Int64(1), Int64(32)) IN ([Utf8View("7f4b18de3cfeb9b4ac78c381ee2ad278"), Utf8View("a"), Utf8View("b"), Utf8View("c")]) | ||
07)------------TableScan: tmp_table projection=[value] | ||
physical_plan | ||
01)ProjectionExec: expr=[count(Int64(1))@0 as count(*)] | ||
02)--AggregateExec: mode=Final, gby=[], aggr=[count(Int64(1))] | ||
03)----CoalescePartitionsExec | ||
04)------AggregateExec: mode=Partial, gby=[], aggr=[count(Int64(1))] | ||
05)--------ProjectionExec: expr=[] | ||
06)----------CoalesceBatchesExec: target_batch_size=8192 | ||
07)------------FilterExec: substr(md5(CAST(value@0 AS Utf8)), 1, 32) IN ([Literal { value: Utf8View("7f4b18de3cfeb9b4ac78c381ee2ad278") }, Literal { value: Utf8View("a") }, Literal { value: Utf8View("b") }, Literal { value: Utf8View("c") }]) | ||
08)--------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 | ||
09)----------------LazyMemoryExec: partitions=1, batch_generators=[generate_series: start=1, end=100000, batch_size=8192] | ||
|
||
query I | ||
with test AS (SELECT substr(md5(i)::text, 1, 32) as needle FROM generate_series(1, 100000) t(i)) | ||
select count(*) from test WHERE array_has([needle], needle); | ||
---- | ||
100000 | ||
|
||
# TODO: it should probably be possible to remove this filter entirely, since it is always true? | |
query TT | ||
explain with test AS (SELECT substr(md5(i)::text, 1, 32) as needle FROM generate_series(1, 100000) t(i)) | ||
select count(*) from test WHERE array_has([needle], needle); | ||
---- | ||
logical_plan | ||
01)Projection: count(Int64(1)) AS count(*) | ||
02)--Aggregate: groupBy=[[]], aggr=[[count(Int64(1))]] | ||
03)----SubqueryAlias: test | ||
04)------SubqueryAlias: t | ||
05)--------Projection: | ||
06)----------Filter: __common_expr_3 = __common_expr_3 | ||
07)------------Projection: substr(CAST(md5(CAST(tmp_table.value AS Utf8)) AS Utf8), Int64(1), Int64(32)) AS __common_expr_3 | ||
08)--------------TableScan: tmp_table projection=[value] | ||
physical_plan | ||
01)ProjectionExec: expr=[count(Int64(1))@0 as count(*)] | ||
02)--AggregateExec: mode=Final, gby=[], aggr=[count(Int64(1))] | ||
03)----CoalescePartitionsExec | ||
04)------AggregateExec: mode=Partial, gby=[], aggr=[count(Int64(1))] | ||
05)--------ProjectionExec: expr=[] | ||
06)----------CoalesceBatchesExec: target_batch_size=8192 | ||
07)------------FilterExec: __common_expr_3@0 = __common_expr_3@0 | ||
08)--------------ProjectionExec: expr=[substr(md5(CAST(value@0 AS Utf8)), 1, 32) as __common_expr_3] | ||
09)----------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 | ||
10)------------------LazyMemoryExec: partitions=1, batch_generators=[generate_series: start=1, end=100000, batch_size=8192] | ||
Comment on lines
+6119
to
+6143
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I think this is probably #15387 |
||
|
||
# any operator | ||
query ? | ||
select column3 from arrays where 'L'=any(column3); | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I would expect that during constant evaluation make_array would be turned into a literal, so this case would be unnecessary.
However, you wouldn't observe that simplification happening in unit tests (only in the slt tests when everything was put together)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I tested removing this case and the slt tests failed like this
I found that unexpected but don't have time to look into it more now