Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
70 commits
Select commit Hold shift + click to select a range
1746bcc
feat: Arrow-direct codegen dispatcher for Spark expressions and Scala…
mbutrovich May 8, 2026
08d6b78
prettier, add new suites to CI checks.
mbutrovich May 8, 2026
557752e
make format, fix shims for 4.0+
mbutrovich May 8, 2026
896f61f
make format, fix shims for 4.0+
mbutrovich May 8, 2026
a82e160
Merge branch 'main' into codegen_scala_udf
mbutrovich May 8, 2026
2a158f4
strengthen tests for composed expressions
mbutrovich May 8, 2026
654bbad
make format, again.
mbutrovich May 8, 2026
10df7e0
fix pr_benchmark_check.yml
mbutrovich May 8, 2026
7afe69f
fix arrow shading issue in CI.
mbutrovich May 8, 2026
0dc5855
fix Spark 4.0 collation expression shim
mbutrovich May 8, 2026
43a7b0c
apply common subexpression elimination, add tests for subqueries in UDFs
mbutrovich May 8, 2026
9640897
make format
mbutrovich May 8, 2026
f0c8296
decimal fast path. document 64KB limitation right now
mbutrovich May 9, 2026
2173f40
pass through task context to get around tokio worker pool calling ove…
mbutrovich May 9, 2026
2f9585b
fix compilation on scala 2.12, fix format issue
mbutrovich May 9, 2026
582cd17
Merge branch 'main' into codegen_scala_udf
mbutrovich May 9, 2026
22f3256
decimal output, utf8 output, non-nullable output optimizations
mbutrovich May 9, 2026
7666715
optimization menu
mbutrovich May 9, 2026
0a34636
estimate binaryview and binary size
mbutrovich May 9, 2026
e94b6db
fix "CSE collapses a repeated subtree to one evaluation in the genera…
mbutrovich May 9, 2026
d0f1f27
Merge remote-tracking branch 'origin/codegen_scala_udf' into codegen_…
mbutrovich May 9, 2026
07e37ea
add some complex type support, remove #4239 code. update docs.
mbutrovich May 9, 2026
ebf77c4
split codegen input and output, basic struct WIP
mbutrovich May 9, 2026
6836c30
split massive codegen file, handle recursive nested types
mbutrovich May 9, 2026
5d91a8f
map input
mbutrovich May 9, 2026
2a28aaf
more struct support
mbutrovich May 9, 2026
0c6586a
revert some benchmark changes
mbutrovich May 9, 2026
8497fe7
cleanup part 1
mbutrovich May 10, 2026
8d703c3
cleanup part 2
mbutrovich May 10, 2026
5ec0e3f
cleanup part 3
mbutrovich May 10, 2026
a22051e
remove view support, it's dead code right now
mbutrovich May 10, 2026
421c60c
use cometplainvector part 1
mbutrovich May 10, 2026
0705dff
use cometplainvector part 2
mbutrovich May 10, 2026
9a00874
make generated class final
mbutrovich May 10, 2026
d7b43fc
clean up test names
mbutrovich May 10, 2026
034e1f5
fix format
mbutrovich May 11, 2026
317feaf
Merge branch 'main' into codegen_scala_udf
mbutrovich May 11, 2026
db1f1f2
Merge branch 'main' into codegen_scala_udf
mbutrovich May 12, 2026
caffed9
fix 2.12 mapvalues usage
mbutrovich May 12, 2026
4be8144
Remove code related to #4239.
mbutrovich May 12, 2026
6fcd81c
Merge remote-tracking branch 'apache/main' into codegen_scala_udf
mbutrovich May 14, 2026
9f8aa07
fix after merging in upstream/main.
mbutrovich May 14, 2026
17b2714
switch to taskid-keyed state for CometUDFs.
mbutrovich May 14, 2026
ff8ee79
Merge branch 'main' into codegen_scala_udf
mbutrovich May 14, 2026
7ed806a
reduce the scope to just ScalaUDF instead of general spark expression…
mbutrovich May 14, 2026
6ff5aa0
update docs
mbutrovich May 14, 2026
935aec6
reorg codegen
mbutrovich May 14, 2026
cbf96df
more tests
mbutrovich May 14, 2026
5966055
cleanup
mbutrovich May 15, 2026
748f943
document optimizations
mbutrovich May 15, 2026
f9318d8
fix tests
mbutrovich May 15, 2026
19ac9f6
try to trim comments a bit
mbutrovich May 15, 2026
13270bf
update two tests
mbutrovich May 15, 2026
1111c6f
revert unintended diff from main
mbutrovich May 15, 2026
61ae5b7
add Java UDF test
mbutrovich May 15, 2026
6643208
update stale TODO references
mbutrovich May 15, 2026
965c2ba
better input fuzz coverage
mbutrovich May 15, 2026
948f3b9
better input fuzz coverage
mbutrovich May 15, 2026
41fc046
better input fuzz coverage
mbutrovich May 15, 2026
25c2511
simplify input logic
mbutrovich May 15, 2026
a057687
fix format
mbutrovich May 15, 2026
650f619
add fallback for too many args and a test, clean up printing code
mbutrovich May 15, 2026
b1e1c55
stronger tests
mbutrovich May 15, 2026
0f6f68c
Merge branch 'main' into codegen_scala_udf
mbutrovich May 15, 2026
d967143
fix(udf): scope the dispatcher's compile cache per task to isolate bo…
mbutrovich May 15, 2026
10da742
update docs
mbutrovich May 15, 2026
23df354
add missing suite
mbutrovich May 15, 2026
b161169
synchronize per-task UDF evaluation
mbutrovich May 16, 2026
f86e70b
Merge branch 'main' into codegen_scala_udf
mbutrovich May 16, 2026
dca8b22
update spark diffs
mbutrovich May 16, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 3 additions & 5 deletions .github/workflows/pr_benchmark_check.yml
Original file line number Diff line number Diff line change
Expand Up @@ -84,9 +84,7 @@ jobs:
${{ runner.os }}-benchmark-maven-

- name: Check Scala compilation and linting
# Pin to spark-4.0 (Scala 2.13.16) because the default profile is now
# spark-4.1 / Scala 2.13.17, and semanticdb-scalac_2.13.17 is not yet
# published, which breaks `-Psemanticdb`. See pr_build_linux.yml for
# the same exclusion in the main lint matrix.
# Pinned to spark-4.0 because semanticdb-scalac_2.13.17 (spark-4.1 default)
# is not yet published, which breaks the -Psemanticdb scalafix lint.
run: |
./mvnw -B compile test-compile scalafix:scalafix -Dscalafix.mode=CHECK -Psemanticdb -Pspark-4.0 -DskipTests
./mvnw -B compile test-compile scalafix:scalafix -Dscalafix.mode=CHECK -Pspark-4.0 -Psemanticdb -DskipTests
4 changes: 4 additions & 0 deletions .github/workflows/pr_build_linux.yml
Original file line number Diff line number Diff line change
Expand Up @@ -307,6 +307,7 @@ jobs:
org.apache.comet.CometFuzzAggregateSuite
org.apache.comet.CometFuzzIcebergSuite
org.apache.comet.CometFuzzMathSuite
org.apache.comet.CometCodegenDispatchFuzzSuite
org.apache.comet.DataGeneratorSuite
- name: "shuffle"
value: |
Expand Down Expand Up @@ -385,6 +386,9 @@ jobs:
org.apache.comet.expressions.conditional.CometIfSuite
org.apache.comet.expressions.conditional.CometCoalesceSuite
org.apache.comet.expressions.conditional.CometCaseWhenSuite
org.apache.comet.CometCodegenDispatchSmokeSuite
org.apache.comet.CometCodegenSourceSuite
org.apache.comet.CometCodegenHOFSuite
- name: "sql"
value: |
org.apache.spark.sql.CometToPrettyStringSuite
Expand Down
4 changes: 4 additions & 0 deletions .github/workflows/pr_build_macos.yml
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,7 @@ jobs:
org.apache.comet.CometFuzzAggregateSuite
org.apache.comet.CometFuzzIcebergSuite
org.apache.comet.CometFuzzMathSuite
org.apache.comet.CometCodegenDispatchFuzzSuite
org.apache.comet.DataGeneratorSuite
- name: "shuffle"
value: |
Expand Down Expand Up @@ -232,6 +233,9 @@ jobs:
org.apache.comet.expressions.conditional.CometIfSuite
org.apache.comet.expressions.conditional.CometCoalesceSuite
org.apache.comet.expressions.conditional.CometCaseWhenSuite
org.apache.comet.CometCodegenDispatchSmokeSuite
org.apache.comet.CometCodegenSourceSuite
org.apache.comet.CometCodegenHOFSuite
- name: "sql"
value: |
org.apache.spark.sql.CometToPrettyStringSuite
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package org.apache.comet.codegen;

import org.apache.arrow.vector.FieldVector;
import org.apache.arrow.vector.ValueVector;

/**
* Abstract base extended by the Janino-compiled batch kernel emitted by {@code
* CometBatchKernelCodegen}. The generated subclass extends {@code CometInternalRow} (so Spark's
* {@code BoundReference.genCode} can call {@code this.getUTF8String(ord)} directly) and carries
* typed input fields baked at codegen time, one per input column. Expression evaluation plus Arrow
* read/write fuse into one method per expression tree.
*
* <p>Input scope: any {@code ValueVector[]}; the generated subclass casts each slot to the concrete
* Arrow type the compile-time schema specified. Output is a generic {@code FieldVector}; the
* generated subclass casts to the concrete type matching the bound expression's {@code dataType}.
* Widen input support by adding vector classes to the getter switch in {@code
* CometBatchKernelCodegen.emitTypedGetters}; widen output support by adding cases in {@code
* CometBatchKernelCodegen.allocateOutput} and {@code emitOutputWriter}.
*/
public abstract class CometBatchKernel extends CometInternalRow {

protected final Object[] references;

protected CometBatchKernel(Object[] references) {
this.references = references;
}

/**
* Process one batch.
*
* @param inputs Arrow input vectors; length and concrete classes must match the schema the kernel
* was compiled against
* @param output Arrow output vector; caller allocates to the expression's {@code dataType}
* @param numRows number of rows in this batch
*/
public abstract void process(ValueVector[] inputs, FieldVector output, int numRows);

/**
* Run partition-dependent initialization. The generated subclass overrides this to execute
* statements collected via {@code CodegenContext.addPartitionInitializationStatement}, for
* example reseeding {@code Rand}'s {@code XORShiftRandom} from {@code seed + partitionIndex}.
* Deterministic expressions leave this as a no-op.
*
* <p>The caller must invoke this before the first {@code process} call of each partition. The
* generated subclass is not thread-safe across concurrent {@code process} calls, so kernels are
* allocated per dispatcher invocation and init is run once on the fresh instance.
*/
public void init(int partitionIndex) {}
Comment thread
mbutrovich marked this conversation as resolved.
}
11 changes: 11 additions & 0 deletions common/src/main/scala/org/apache/comet/CometConf.scala
Original file line number Diff line number Diff line change
Expand Up @@ -380,6 +380,17 @@ object CometConf extends ShimCometConf {
.booleanConf
.createWithDefault(false)

// Feature flag for the Arrow-direct ScalaUDF codegen path. Enabled by default; when a
// specific UDF shape is unsupported the dispatcher still falls back to Spark for the
// enclosing operator, so disabling this is primarily an escape hatch.
val COMET_SCALA_UDF_CODEGEN_ENABLED: ConfigEntry[Boolean] =
conf("spark.comet.exec.scalaUDF.codegen.enabled")
.category(CATEGORY_EXEC)
.doc(
"Whether to route Spark `ScalaUDF` expressions through Comet's Arrow-direct codegen " +
"dispatcher. When enabled, a supported ScalaUDF is compiled into a per-batch kernel " +
"that reads and writes Arrow vectors directly from native execution. When disabled, " +
"plans containing a ScalaUDF fall back to Spark for the enclosing operator.")
.booleanConf
.createWithDefault(true)

val COMET_EXEC_SHUFFLE_WITH_HASH_PARTITIONING_ENABLED: ConfigEntry[Boolean] =
conf("spark.comet.native.shuffle.partitioning.hash.enabled")
.category(CATEGORY_SHUFFLE)
Expand Down
106 changes: 106 additions & 0 deletions common/src/main/scala/org/apache/comet/codegen/CometArrayData.scala
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package org.apache.comet.codegen

import scala.util.control.NonFatal

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.util.{ArrayData, MapData}
import org.apache.spark.sql.types._
import org.apache.spark.unsafe.types.{CalendarInterval, UTF8String}

import org.apache.comet.shims.CometInternalRowShim

/**
* Throwing-default base for [[ArrayData]] in the Arrow-direct codegen kernel. Subclasses override
* only the getters their element type needs (e.g. `numElements`, `isNullAt`, `getUTF8String` for
* an `ArrayType(StringType)` input).
*
* Consumer: `InputArray_${path}` nested classes the input emitter generates per `ArrayType` input
* column. They back `getArray(ord)` plus the recursion for `Array<Array<...>>` and array-typed
* map keys / struct fields.
*
* `ArrayData` and [[CometInternalRow]]'s [[InternalRow]] are sibling abstract classes in Spark
* (both extend `SpecializedGetters`, neither inherits the other), so a base aimed at one cannot
* serve the other. The dispatch body that '''is''' shared between them lives in
* [[CometSpecializedGettersDispatch]]. The third sibling, [[CometMapData]], backs `InputMap_*`
* and routes `keyArray()` / `valueArray()` through `CometArrayData` instances.
*
* Mixes in [[CometInternalRowShim]] for the same reason `CometInternalRow` does: Spark 4.x adds
* abstract `SpecializedGetters` methods (`getVariant`, `getGeography`, `getGeometry`) that both
* `InternalRow` and `ArrayData` inherit; the per-profile shim provides throwing defaults.
*/
abstract class CometArrayData extends ArrayData with CometInternalRowShim {

  /**
   * Fails with a descriptive error naming the accessor the concrete array shape did not
   * override. `Nothing` return lets it terminate any getter signature.
   */
  protected def unsupported(method: String): Nothing =
    throw new UnsupportedOperationException(
      s"${getClass.getSimpleName}: $method not implemented for this array shape")

  // Untyped getter: dispatch on the runtime DataType through the logic shared with
  // CometInternalRow and CometMapData (see CometSpecializedGettersDispatch).
  override def get(ordinal: Int, dataType: DataType): AnyRef =
    CometSpecializedGettersDispatch.get(this, ordinal, dataType)

  // --- Typed getters: throwing defaults; subclasses override only what their element
  // --- type needs (e.g. numElements / isNullAt / getUTF8String for Array<String>).
  override def numElements(): Int = unsupported("numElements")

  override def isNullAt(ordinal: Int): Boolean = unsupported("isNullAt")

  override def getBoolean(ordinal: Int): Boolean = unsupported("getBoolean")

  override def getByte(ordinal: Int): Byte = unsupported("getByte")

  override def getShort(ordinal: Int): Short = unsupported("getShort")

  override def getInt(ordinal: Int): Int = unsupported("getInt")

  override def getLong(ordinal: Int): Long = unsupported("getLong")

  override def getFloat(ordinal: Int): Float = unsupported("getFloat")

  override def getDouble(ordinal: Int): Double = unsupported("getDouble")

  override def getDecimal(ordinal: Int, precision: Int, scale: Int): Decimal =
    unsupported("getDecimal")

  override def getUTF8String(ordinal: Int): UTF8String = unsupported("getUTF8String")

  override def getBinary(ordinal: Int): Array[Byte] = unsupported("getBinary")

  override def getInterval(ordinal: Int): CalendarInterval = unsupported("getInterval")

  override def getStruct(ordinal: Int, numFields: Int): InternalRow = unsupported("getStruct")

  override def getArray(ordinal: Int): ArrayData = unsupported("getArray")

  override def getMap(ordinal: Int): MapData = unsupported("getMap")

  // --- Mutation / copy: the codegen kernel only ever reads, so these always throw.
  override def setNullAt(i: Int): Unit = unsupported("setNullAt")

  override def update(i: Int, value: Any): Unit = unsupported("update")

  override def copy(): ArrayData = unsupported("copy")

  override def array: Array[Any] = unsupported("array")

  /**
   * Debug-friendly rendering. `numElements` may itself be unimplemented for this shape, so
   * probe it best-effort; only non-fatal failures degrade to "?" — fatal errors (OOM,
   * interruption) still propagate rather than being swallowed.
   */
  override def toString(): String = {
    val n =
      try numElements().toString
      catch {
        case NonFatal(_) => "?"
      }
    s"${getClass.getSimpleName}(numElements=$n)"
  }
}
Loading
Loading