Commit 976192a

vladimirg-db authored and MaxGekk committed
[SPARK-50559][SQL] Store Except, Intersect and Union's outputs as lazy vals
### What changes were proposed in this pull request?

Store `Except`, `Intersect` and `Union`'s outputs as lazy vals.

### Why are the changes needed?

Currently `Union`'s `output` (and likewise `Except`'s and `Intersect`'s) is a `def`. This creates performance issues for queries with a large number of stacked `UNION`s, because rules like `WidenSetOperationTypes` traverse the logical plan and call `output` on each `Union` node. This has quadratic complexity: O(number_of_unions * (1 + 2 + 3 + ... + number_of_unions)).

Profile:

![image](https://github.com/user-attachments/assets/97192bf3-c38e-47dd-81ac-49ee9a546525)
![image](https://github.com/user-attachments/assets/68ed13d7-b108-4c8d-b156-1dcec07c324b)

[flamegraph.tar.gz](https://github.com/user-attachments/files/18118260/flamegraph.tar.gz)

The improvement in parsing + analysis wall-clock time for a query with 500 UNIONs over 30 columns each is about 13x (5.5 s -> 400 ms):

![image](https://github.com/user-attachments/assets/a824c693-0a6b-4c6b-8a90-a783a3c44d6d)

Repro:

```scala
def genValues(num: Int) = s"VALUES (${(0 until num).mkString(", ")})"

def genUnions(numUnions: Int, numValues: Int) =
  (0 until numUnions).map(_ => genValues(numValues)).mkString(" UNION ALL ")

spark.time {
  spark.sql(s"SELECT * FROM ${genUnions(numUnions = 500, numValues = 30)}").queryExecution.analyzed
}
```

For `EXCEPT` the performance difference is less noticeable, perhaps because it reuses the same `Seq` (its output is just `left.output`).

### Does this PR introduce _any_ user-facing change?

No, this is an optimization.

### How was this patch tested?

- Ran the async-profiler.
- Ran the benchmark in spark-shell.
- Existing tests.

### Was this patch authored or co-authored using generative AI tooling?

copilot.nvim.

Closes apache#49166 from vladimirg-db/vladimirg-db/store-union-output-as-lazy-val.

Authored-by: Vladimir Golubev <[email protected]>
Signed-off-by: Max Gekk <[email protected]>
1 parent 2b9eb08 commit 976192a
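
Before the file-by-file diff, a minimal self-contained Scala sketch (an editor's illustration, not part of the commit; all names are made up) of the `def` vs `lazy val` behavior the commit message describes: with `def`, a rule pass that asks each node for its `output` re-walks the whole subtree on every call, giving the quadratic cost; with `lazy val`, each node computes its output once.

```scala
object LazyOutputSketch {

  sealed trait Node { def output: Seq[String] }

  final case class Leaf(cols: Seq[String]) extends Node {
    override def output: Seq[String] = cols
  }

  // `def` variant: recomputed on every call, like the old Union#output.
  final case class EagerUnion(child: Node) extends Node {
    override def output: Seq[String] = child.output.map(identity)
  }

  // `lazy val` variant: computed once per node, then served from the cache.
  final case class CachedUnion(child: Node) extends Node {
    override lazy val output: Seq[String] = child.output.map(identity)
  }

  // Mimics a rule like WidenSetOperationTypes: visit every node top-down
  // and ask each one for its output.
  @scala.annotation.tailrec
  def visitAll(node: Node): Unit = {
    node.output
    node match {
      case EagerUnion(c)  => visitAll(c)
      case CachedUnion(c) => visitAll(c)
      case Leaf(_)        => ()
    }
  }

  def main(args: Array[String]): Unit = {
    def stack(n: Int, mk: Node => Node): Node =
      (1 to n).foldLeft[Node](Leaf(Seq.tabulate(30)(i => s"c$i")))((acc, _) => mk(acc))

    def timeMs(body: => Unit): Long = {
      val start = System.nanoTime(); body
      (System.nanoTime() - start) / 1000000
    }

    // The eager chain does O(n^2) work across the visit; the cached one O(n).
    println(s"def output:      ${timeMs(visitAll(stack(2000, EagerUnion.apply)))} ms")
    println(s"lazy val output: ${timeMs(visitAll(stack(2000, CachedUnion.apply)))} ms")
  }
}
```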

File tree

5 files changed: +177 -16 lines changed

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala (+48, -16)
@@ -376,10 +376,13 @@ case class Intersect(

   final override val nodePatterns: Seq[TreePattern] = Seq(INTERSECT)

-  override def output: Seq[Attribute] =
-    left.output.zip(right.output).map { case (leftAttr, rightAttr) =>
-      leftAttr.withNullability(leftAttr.nullable && rightAttr.nullable)
+  override def output: Seq[Attribute] = {
+    if (conf.getConf(SQLConf.LAZY_SET_OPERATOR_OUTPUT)) {
+      lazyOutput
+    } else {
+      computeOutput()
     }
+  }

   override def metadataOutput: Seq[Attribute] = Nil

@@ -396,15 +399,29 @@ case class Intersect(

   override protected def withNewChildrenInternal(
     newLeft: LogicalPlan, newRight: LogicalPlan): Intersect = copy(left = newLeft, right = newRight)
+
+  private lazy val lazyOutput: Seq[Attribute] = computeOutput()
+
+  /** We don't use right.output because those rows get excluded from the set. */
+  private def computeOutput(): Seq[Attribute] =
+    left.output.zip(right.output).map { case (leftAttr, rightAttr) =>
+      leftAttr.withNullability(leftAttr.nullable && rightAttr.nullable)
+    }
 }

 case class Except(
     left: LogicalPlan,
     right: LogicalPlan,
     isAll: Boolean) extends SetOperation(left, right) {
   override def nodeName: String = getClass.getSimpleName + ( if ( isAll ) " All" else "" )
-  /** We don't use right.output because those rows get excluded from the set. */
-  override def output: Seq[Attribute] = left.output
+
+  override def output: Seq[Attribute] = {
+    if (conf.getConf(SQLConf.LAZY_SET_OPERATOR_OUTPUT)) {
+      lazyOutput
+    } else {
+      computeOutput()
+    }
+  }

   override def metadataOutput: Seq[Attribute] = Nil

@@ -416,6 +433,11 @@ case class Except(

   override protected def withNewChildrenInternal(
     newLeft: LogicalPlan, newRight: LogicalPlan): Except = copy(left = newLeft, right = newRight)
+
+  private lazy val lazyOutput: Seq[Attribute] = computeOutput()
+
+  /** We don't use right.output because those rows get excluded from the set. */
+  private def computeOutput(): Seq[Attribute] = left.output
 }

 /** Factory for constructing new `Union` nodes. */

@@ -479,18 +501,11 @@ case class Union(
       AttributeSet.fromAttributeSets(children.map(_.outputSet)).size
   }

-  // updating nullability to make all the children consistent
   override def output: Seq[Attribute] = {
-    children.map(_.output).transpose.map { attrs =>
-      val firstAttr = attrs.head
-      val nullable = attrs.exists(_.nullable)
-      val newDt = attrs.map(_.dataType).reduce(StructType.unionLikeMerge)
-      if (firstAttr.dataType == newDt) {
-        firstAttr.withNullability(nullable)
-      } else {
-        AttributeReference(firstAttr.name, newDt, nullable, firstAttr.metadata)(
-          firstAttr.exprId, firstAttr.qualifier)
-      }
+    if (conf.getConf(SQLConf.LAZY_SET_OPERATOR_OUTPUT)) {
+      lazyOutput
+    } else {
+      computeOutput()
     }
   }

@@ -509,6 +524,23 @@ case class Union(
     children.length > 1 && !(byName || allowMissingCol) && childrenResolved && allChildrenCompatible
   }

+  private lazy val lazyOutput: Seq[Attribute] = computeOutput()
+
+  // updating nullability to make all the children consistent
+  private def computeOutput(): Seq[Attribute] = {
+    children.map(_.output).transpose.map { attrs =>
+      val firstAttr = attrs.head
+      val nullable = attrs.exists(_.nullable)
+      val newDt = attrs.map(_.dataType).reduce(StructType.unionLikeMerge)
+      if (firstAttr.dataType == newDt) {
+        firstAttr.withNullability(nullable)
+      } else {
+        AttributeReference(firstAttr.name, newDt, nullable, firstAttr.metadata)(
+          firstAttr.exprId, firstAttr.qualifier)
+      }
+    }
+  }
+
   /**
    * Maps the constraints containing a given (original) sequence of attributes to those with a
    * given (reference) sequence of attributes. Given the nature of union, we expect that the
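
One design question this diff raises is whether caching `output` can return stale results after a tree rewrite. A hedged sketch of why it cannot, assuming (as in Catalyst) that plan nodes are immutable case classes and rewrites allocate new nodes via `copy`; the names below are made up for illustration:

```scala
// Editor's sketch (a simplified model, not Catalyst itself): a rewrite goes
// through copy(), which returns a brand-new instance whose lazy val has not
// been forced yet, so a cached output is never observed stale.
final case class MiniUnion(children: Seq[MiniUnion], cols: Seq[String]) {
  lazy val output: Seq[String] =
    if (children.isEmpty) cols else children.head.output

  // Analogue of withNewChildrenInternal in the diff above.
  def withNewChildren(newChildren: Seq[MiniUnion]): MiniUnion =
    copy(children = newChildren)
}

object ImmutabilitySketch extends App {
  val u = MiniUnion(Seq(MiniUnion(Nil, Seq("x"))), Nil)
  assert(u.output == Seq("x"))                   // forces and caches on u
  val u2 = u.withNewChildren(Seq(MiniUnion(Nil, Seq("y"))))
  assert(u2.output == Seq("y"))                  // fresh node, fresh cache
}
```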

sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala (+13, -0)
@@ -5306,6 +5306,19 @@ object SQLConf {
       .booleanConf
       .createWithDefault(true)

+  val LAZY_SET_OPERATOR_OUTPUT = buildConf("spark.sql.lazySetOperatorOutput.enabled")
+    .internal()
+    .doc(
+      "When set to true, Except/Intersect/Union operator's output will be a lazy val. It " +
+      "is a performance optimization for queries with a large number of stacked set operators. " +
+      "This is because of rules like WidenSetOperationTypes that traverse the logical plan tree " +
+      "and call output on each Except/Intersect/Union node. Such traversal has quadratic " +
+      "complexity: O(number_of_nodes * (1 + 2 + 3 + ... + number_of_nodes))."
+    )
+    .version("4.0.0")
+    .booleanConf
+    .createWithDefault(true)
+
   /**
    * Holds information about keys that have been deprecated.
    *
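
The flag defaults to true; since it is an ordinary (if internal) conf entry, it can presumably be disabled per session as an escape hatch, for example while investigating a suspected regression. A usage sketch (`spark` is an existing `SparkSession`):

```scala
// Assumed escape hatch: revert Except/Intersect/Union to the eager `def`
// behavior for this session. The key is copied from the diff above.
spark.conf.set("spark.sql.lazySetOperatorOutput.enabled", "false")
```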
New file (+13): benchmark results (OpenJDK 21)

@@ -0,0 +1,13 @@
+================================================================================================
+Set Operations Benchmark
+================================================================================================
+
+OpenJDK 64-Bit Server VM 21.0.5+11-Ubuntu-1ubuntu120.04 on Linux 5.4.0-1131-aws-fips
+Intel(R) Xeon(R) Platinum 8375C CPU @ 2.90GHz
+Parsing + Analysis:                       Best Time(ms)   Avg Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
+------------------------------------------------------------------------------------------------------------------------
+UNION ALL                                           342            358          22          0.0       22784.2       1.0X
+EXCEPT ALL                                          310            351          44          0.0       20675.4       1.1X
+INTERSECT ALL                                       305            309           5          0.0       20301.6       1.1X
+
+
New file (+13): benchmark results (OpenJDK 17)

@@ -0,0 +1,13 @@
+================================================================================================
+Set Operations Benchmark
+================================================================================================
+
+OpenJDK 64-Bit Server VM 17.0.12+7-Ubuntu-1ubuntu220.04 on Linux 5.4.0-1131-aws-fips
+Intel(R) Xeon(R) Platinum 8375C CPU @ 2.90GHz
+Parsing + Analysis:                       Best Time(ms)   Avg Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
+------------------------------------------------------------------------------------------------------------------------
+UNION ALL                                           360            423          70          0.0       24019.4       1.0X
+EXCEPT ALL                                          322            328           5          0.0       21463.2       1.1X
+INTERSECT ALL                                       327            360          33          0.0       21777.2       1.1X
+
+
New file (+90): SetOperationsBenchmark.scala

@@ -0,0 +1,90 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.execution.benchmark
+
+import org.apache.spark.benchmark.Benchmark
+
+/**
+ * Benchmark to measure performance for set operations.
+ * To run this benchmark:
+ * {{{
+ *   1. without sbt:
+ *      bin/spark-submit --class <this class>
+ *        --jars <spark core test jar>,<spark catalyst test jar> <spark sql test jar>
+ *   2. build/sbt "sql/Test/runMain <this class>"
+ *   3. generate result:
+ *      SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "sql/Test/runMain <this class>"
+ *      Results will be written to "benchmarks/SetOperationsBenchmark-results.txt".
+ * }}}
+ */
+object SetOperationsBenchmark extends SqlBasedBenchmark {
+  private val setOperations = Seq("UNION ALL", "EXCEPT ALL", "INTERSECT ALL")
+
+  override def runBenchmarkSuite(mainArgs: Array[String]): Unit = {
+    runBenchmark("Set Operations Benchmark") {
+      val numOperations = 500
+      val numValues = 30
+
+      val benchmark =
+        new Benchmark(
+          "Parsing + Analysis",
+          valuesPerIteration = numOperations * numValues,
+          output = output
+        )
+
+      for (operation <- setOperations) {
+        benchmark.addCase(operation) { _ =>
+          spark
+            .sql(
+              generateQuery(
+                operation = operation,
+                numOperations = numOperations,
+                numValues = numValues
+              )
+            )
+            .queryExecution
+            .analyzed
+          ()
+        }
+      }
+
+      benchmark.run()
+    }
+  }
+
+  private def generateQuery(operation: String, numOperations: Int, numValues: Int) = {
+    s"""
+    SELECT
+      *
+    FROM
+      ${generateOperations(
+        operation = operation,
+        numOperations = numOperations,
+        numValues = numValues
+      )}
+    """
+  }
+
+  private def generateOperations(operation: String, numOperations: Int, numValues: Int) = {
+    (0 until numOperations).map(_ => generateValues(numValues)).mkString(s" ${operation} ")
+  }
+
+  private def generateValues(num: Int) = {
+    s"VALUES (${(0 until num).mkString(", ")})"
+  }
+}
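
For completeness, substituting the object's fully qualified name into the scaladoc's `<this class>` placeholder, the result-generation command (step 3 above) would presumably be:

```
SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "sql/Test/runMain org.apache.spark.sql.execution.benchmark.SetOperationsBenchmark"
```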
