diff --git a/aggregator/pom.xml b/aggregator/pom.xml
index 5e3e8be4abc..983c4ee4bac 100644
--- a/aggregator/pom.xml
+++ b/aggregator/pom.xml
@@ -45,6 +45,7 @@
initializenone
+ <rapids.aggregator.downstream.refresh.skip>false</rapids.aggregator.downstream.refresh.skip>
@@ -196,6 +197,7 @@
runprocess-classes
+ ${rapids.aggregator.downstream.refresh.skip}/dev/null 2>&1 && pwd)
+SOURCE_DIR=$(cd "$SCRIPT_DIR/.." >/dev/null 2>&1 && pwd)
+
+function first_pom_value() {
+ local key="$1"
+ local pom="$2"
+ sed -n "0,/<$key>/{s|.*<$key>\([^<]*\)</$key>.*|\1|p}" "$pom" | head -n 1
+}
+
+function last_pom_value() {
+ local key="$1"
+ local pom="$2"
+ sed -n "s|.*<$key>\([^<]*\)</$key>.*|\1|p" "$pom" | tail -n 1
+}
function join_by { local IFS="$1"; shift; echo "$*"; }
@@ -56,6 +75,14 @@ function print_usage() {
echo " repackage the dist module artifact using installed dependencies"
echo " --scala213"
echo " build 2.13 shims"
+ echo " --unshim-fast"
+ echo " skip Maven checks/docs, tests, build metadata, coverage, enforcer, and snapshot refresh for repeated unshim/dist iteration"
+ echo " --parallel-world-only, --unshim-parallel-world-only"
+ echo " build analyzer-only parallel-world output without the final Maven dist invocation"
+ echo " --unshim-reuse-built-jars"
+ echo " with --unshim-fast --parallel-world-only, skip shim Maven builds and reuse existing target jars"
+ echo " --unshim-allowlist-only"
+ echo " imply --unshim-fast --parallel-world-only --unshim-reuse-built-jars and require only unshim allowlist changes"
}
function bloopInstall() {
@@ -148,6 +175,25 @@ case "$1" in
SCALA213=1
;;
+--unshim-fast|--fast-unshim)
+ UNSHIM_FAST=1
+ ;;
+
+--parallel-world-only|--unshim-parallel-world-only)
+ UNSHIM_PARALLEL_WORLD_ONLY=1
+ ;;
+
+--unshim-reuse-built-jars)
+ UNSHIM_REUSE_BUILT_JARS=1
+ ;;
+
+--unshim-allowlist-only)
+ UNSHIM_ALLOWLIST_ONLY=1
+ UNSHIM_FAST=1
+ UNSHIM_PARALLEL_WORLD_ONLY=1
+ UNSHIM_REUSE_BUILT_JARS=1
+ ;;
+
--rebuild-dist-only)
SKIP_DIST_DEPS="1"
MODULE="dist"
@@ -174,14 +220,62 @@ if [[ "$DIST_PROFILE" == *Scala213 ]]; then
SCALA213=1
fi
+if [[ "$UNSHIM_PARALLEL_WORLD_ONLY" == "1" ]]; then
+ FINAL_OP="generate-resources"
+ MODULE="${MODULE:-dist}"
+fi
+
MVN=${MVN:-"mvn"}
# include options to mvn command
export MVN="$MVN -Dmaven.wagon.http.retryHandler.count=3 ${MVN_OPT}"
+if [[ "$UNSHIM_FAST" == "1" ]]; then
+ export MAVEN_REFRESH_OPT="--no-snapshot-updates"
+ export MVN_FAST_SKIP_OPTS="-Dmaven.test.skip=true -Drat.skip=true -Dmaven.scalastyle.skip=true -Dmaven.scaladoc.skip=true -Dmaven.javadoc.skip=true -Ddist.jar.compress=false -Djacoco.skip=true -Denforcer.skip=true -Drapids.build.info.skip=true -Dignore.shim.revisions.check=true"
+else
+ export MAVEN_REFRESH_OPT="-U"
+ export MVN_FAST_SKIP_OPTS=""
+fi
+export UNSHIM_FAST
+export UNSHIM_PARALLEL_WORLD_ONLY
+export UNSHIM_ALLOWLIST_ONLY
+
+if [[ "$UNSHIM_REUSE_BUILT_JARS" == "1" && \
+ ( "$UNSHIM_FAST" != "1" || "$UNSHIM_PARALLEL_WORLD_ONLY" != "1" ) ]]; then
+ echo >&2 "--unshim-reuse-built-jars requires --unshim-fast --parallel-world-only"
+ exit 1
+fi
+
+if [[ "$UNSHIM_ALLOWLIST_ONLY" == "1" ]] && \
+ git -C "$SOURCE_DIR" rev-parse --is-inside-work-tree >/dev/null 2>&1; then
+ ALLOWLIST_ONLY_DIRTY=$(
+ {
+ git -C "$SOURCE_DIR" diff --name-only -- \
+ . \
+ ':(exclude)dist/unshimmed-common-from-single-shim.txt' \
+ ':(exclude)dist/unshimmed-from-each-spark3xx.txt' \
+ ':(exclude)dist/keep-in-spark-shared.txt'
+ git -C "$SOURCE_DIR" diff --cached --name-only -- \
+ . \
+ ':(exclude)dist/unshimmed-common-from-single-shim.txt' \
+ ':(exclude)dist/unshimmed-from-each-spark3xx.txt' \
+ ':(exclude)dist/keep-in-spark-shared.txt'
+ } | sort -u
+ )
+ if [[ -n "$ALLOWLIST_ONLY_DIRTY" ]]; then
+ echo >&2 "--unshim-allowlist-only can only reuse jars when tracked changes are limited to dist/unshimmed*.txt or dist/keep-in-spark-shared.txt"
+ echo >&2 "$ALLOWLIST_ONLY_DIRTY"
+ exit 1
+ fi
+fi
if [[ "$SCALA213" == "1" ]]; then
POM_FILE="scala2.13/pom.xml"
export MVN="$MVN -f scala2.13/"
- $(dirname $0)/make-scala-version-build-files.sh 2.13
+ if [[ "$UNSHIM_FAST" == "1" && -f "$POM_FILE" ]]; then
+ echo "Unshim fast: reusing existing Scala 2.13 POMs"
+ else
+ "$SCRIPT_DIR"/make-scala-version-build-files.sh 2.13
+ fi
else
POM_FILE="pom.xml"
fi
@@ -216,7 +310,26 @@ case $DIST_PROFILE in
esac
echo "Spark versions involved: ${SPARK_SHIM_VERSIONS[@]} ..."
-export MVN_BASE_DIR=$($MVN help:evaluate -Dexpression=project.basedir -q -DforceStdout)
+if [[ "$UNSHIM_FAST" == "1" ]]; then
+ if [[ "$SCALA213" == "1" ]]; then
+ export MVN_BASE_DIR="$SOURCE_DIR/scala2.13"
+ else
+ export MVN_BASE_DIR="$SOURCE_DIR"
+ fi
+ export RAPIDS_PROJECT_VERSION=$(first_pom_value version "$POM_FILE")
+ export RAPIDS_SCALA_BINARY_VERSION=$(last_pom_value scala.binary.version "$POM_FILE")
+else
+ export MVN_BASE_DIR=$($MVN help:evaluate -Dexpression=project.basedir -q -DforceStdout)
+fi
+
+if [[ "$UNSHIM_PARALLEL_WORLD_ONLY" == "1" ]]; then
+ echo "Unshim parallel-world-only: preparing analyzer-only output and skipping JNI unpack, shimplify, and reduced POM generation"
+ MVN_FAST_SKIP_OPTS="$MVN_FAST_SKIP_OPTS -Drapids.jni.unpack.skip=true -Drapids.shimplify.skip=true -Drapids.parallel.world.skip.reduced.pom=true -Drapids.aggregator.downstream.refresh.skip=true"
+elif [[ "$UNSHIM_FAST" == "1" && -d "$MVN_BASE_DIR/dist/target/jni-deps" ]]; then
+ echo "Unshim fast: reusing existing JNI deps from $MVN_BASE_DIR/dist/target/jni-deps"
+ MVN_FAST_SKIP_OPTS="$MVN_FAST_SKIP_OPTS -Drapids.jni.unpack.skip=true"
+fi
+export MVN_FAST_SKIP_OPTS
if [[ "$GEN_BLOOP" == "true" ]]; then
bloopInstall
@@ -237,9 +350,45 @@ fi
echo "Building a combined dist jar with Shims for ${SPARK_SHIM_VERSIONS[@]} ..."
+function refresh_fast_aggregator_jar() {
+ [[ "$UNSHIM_FAST" == "1" ]] || return 0
+ local BUILD_VER=$1
+ local agg_dir="$MVN_BASE_DIR/aggregator/target/spark$BUILD_VER"
+ local agg_base="rapids-4-spark-aggregator_${RAPIDS_SCALA_BINARY_VERSION}-${RAPIDS_PROJECT_VERSION}"
+ local shaded_jar="$agg_dir/${agg_base}-shaded.jar"
+ local downstream_jar="$agg_dir/${agg_base}-spark$BUILD_VER.jar"
+ if [[ ! -f "$shaded_jar" ]]; then
+ echo >&2 "Expected shaded aggregator jar missing: $shaded_jar"
+ exit 255
+ fi
+ if [[ -f "$downstream_jar" ]] && cmp -s "$shaded_jar" "$downstream_jar"; then
+ return 0
+ fi
+ cp -p "$shaded_jar" "$downstream_jar"
+}
+export -f refresh_fast_aggregator_jar
+
+function verify_reusable_unshim_artifacts() {
+ local BUILD_VER=$1
+ local classifier="spark$BUILD_VER"
+ local api_base="rapids-4-spark-sql-plugin-api_${RAPIDS_SCALA_BINARY_VERSION}-${RAPIDS_PROJECT_VERSION}"
+ local agg_base="rapids-4-spark-aggregator_${RAPIDS_SCALA_BINARY_VERSION}-${RAPIDS_PROJECT_VERSION}"
+ local api_jar="$MVN_BASE_DIR/sql-plugin-api/target/$classifier/${api_base}-$classifier.jar"
+ local agg_shaded_jar="$MVN_BASE_DIR/aggregator/target/$classifier/${agg_base}-shaded.jar"
+ local jar_path
+ for jar_path in "$api_jar" "$agg_shaded_jar"; do
+ if [[ ! -f "$jar_path" ]]; then
+ echo >&2 "Expected reusable unshim artifact missing: $jar_path"
+ echo >&2 "Re-run without --unshim-reuse-built-jars after source or dependency changes."
+ exit 255
+ fi
+ done
+}
+export -f verify_reusable_unshim_artifacts
+
function build_single_shim() {
[[ "$BUILD_ALL_DEBUG" == "1" ]] && set -x
- BUILD_VER=$1
+ local BUILD_VER=$1
mkdir -p "$MVN_BASE_DIR/target"
if (( BUILD_PARALLEL == 1 || NUM_SHIMS == 1 )); then
# Single-shim/serial build: stream Maven output live rather than to a log
@@ -255,8 +404,8 @@ function build_single_shim() {
LOG_FILE="$MVN_BASE_DIR/target/mvn-build-$BUILD_VER.log"
fi
- if [[ "$BUILD_VER" == "$BASE_VER" ]]; then
- SKIP_CHECKS="false"
+ if [[ "$BUILD_VER" == "$BASE_VER" && \
+ ( "$UNSHIM_FAST" != "1" || "$UNSHIM_PARALLEL_WORLD_ONLY" != "1" ) ]]; then
# WORKAROUND:
# maven build on L193 currently relies on aggregator dependency which
# will removed by
@@ -267,10 +416,20 @@ function build_single_shim() {
#
MVN_PHASE="install"
else
- SKIP_CHECKS="true"
MVN_PHASE="package"
fi
+ if [[ "$UNSHIM_FAST" == "1" || "$BUILD_VER" != "$BASE_VER" ]]; then
+ SKIP_CHECKS="true"
+ else
+ SKIP_CHECKS="false"
+ fi
+
+ local BUILD_PROJECTS="tools"
+ if [[ "$UNSHIM_FAST" == "1" ]]; then
+ BUILD_PROJECTS="aggregator"
+ fi
+
echo "#### REDIRECTING mvn output to ${LOG_FILE:-stdout} ####"
(
if [[ "$LOG_FILE" == "" ]]; then
@@ -278,13 +437,15 @@ function build_single_shim() {
else
exec > "$LOG_FILE" 2>&1 || exit $?
fi
- $MVN -U "$MVN_PHASE" \
+ $MVN $MAVEN_REFRESH_OPT "$MVN_PHASE" \
-DskipTests \
-Dbuildver="$BUILD_VER" \
-Drat.skip="$SKIP_CHECKS" \
- -Dmaven.scaladoc.skip \
+ -Dmaven.scaladoc.skip=true \
+ -Dmaven.javadoc.skip=true \
-Dmaven.scalastyle.skip="$SKIP_CHECKS" \
- -pl tools -am
+ $MVN_FAST_SKIP_OPTS \
+ -pl "$BUILD_PROJECTS" -am
) || {
# Only tail when output went to a real log file; for a live stream
# (/dev/tty or existing stdout) the failure output is already on screen.
@@ -294,6 +455,7 @@ function build_single_shim() {
esac
exit 255
}
+ refresh_fast_aggregator_jar "$BUILD_VER"
}
export -f build_single_shim
@@ -310,25 +472,62 @@ export -f build_single_shim
time (
# printf a single buildver array element per line
if [[ "$SKIP_DIST_DEPS" != "1" ]]; then
+ if [[ "$UNSHIM_REUSE_BUILT_JARS" == "1" ]]; then
+ echo "Unshim fast: reusing existing per-shim jars and skipping Maven shim builds"
+ for bv in "${SPARK_SHIM_VERSIONS[@]}"; do
+ verify_reusable_unshim_artifacts "$bv"
+ refresh_fast_aggregator_jar "$bv"
+ done
+ else
# Execute initialize to download a massive jar for spark-rapids-jni in a single thread to
- # avoid repeating this work in parallel
- # Initialize sql-plugin-api only to avoid dealing with missing submodule dependencies
- #
- $MVN initialize -pl sql-plugin-api -am
+ # avoid repeating this work in parallel. This is unnecessary in unshim-fast modes that skip
+ # JNI unpacking.
+ if [[ "$UNSHIM_FAST" == "1" && "$MVN_FAST_SKIP_OPTS" == *"-Drapids.jni.unpack.skip=true"* ]]; then
+ echo "Unshim fast: skipping serial Maven initialize preflight"
+ else
+ # Initialize sql-plugin-api only to avoid dealing with missing submodule dependencies.
+ $MVN initialize -pl sql-plugin-api -am
+ fi
printf "%s\n" "${SPARK_SHIM_VERSIONS[@]}" | \
xargs -t -I% -P "$BUILD_PARALLEL" -n 1 \
bash -c 'build_single_shim "$@"' _ %
+ fi
fi
- # This used to resume from dist. However, without including aggregator in the build
- # the build does not properly initialize spark.version property via buildver profiles
- # in the root pom, and we get a missing spark330 dependency even for --profile=330,331
- # where the build does not require it. Moving it to aggregator resolves this issue with
- # a negligible increase of the build time by ~2 seconds.
+ if [[ "$UNSHIM_FAST" == "1" && "$UNSHIM_REUSE_BUILT_JARS" != "1" ]]; then
+ for bv in "${SPARK_SHIM_VERSIONS[@]}"; do
+ refresh_fast_aggregator_jar "$bv"
+ done
+ fi
+ # Non-fast builds resume from aggregator so Maven initializes the buildver-derived
+ # spark.version.classifier before dist resolves its aggregator dependency. The unshim-fast
+ # dist path can skip that extra aggregator pass because the per-shim builds above already
+ # installed the base aggregator jar and refreshed all target aggregator jars.
joinShimBuildFrom="aggregator"
INCLUDED_BUILDVERS_OPT=-Dincluded_buildvers=$(join_by , "${SPARK_SHIM_VERSIONS[@]}")
- echo "Resuming from $joinShimBuildFrom build only using $BASE_VER"
- $MVN $FINAL_OP -rf $joinShimBuildFrom $MODULE_OPT $MVN_PROFILE_OPT $INCLUDED_BUILDVERS_OPT \
+ if [[ "$UNSHIM_FAST" == "1" && "$MODULE" == "dist" ]]; then
+ if [[ "$UNSHIM_PARALLEL_WORLD_ONLY" == "1" ]]; then
+ echo "Unshim fast: assembling parallel-world directly without final Maven dist invocation"
+ python3 "$SOURCE_DIR/dist/scripts/build-unshim-parallel-world.py" \
+ --mvn-base-dir "$MVN_BASE_DIR" \
+ --source-dir "$SOURCE_DIR" \
+ --project-version "$RAPIDS_PROJECT_VERSION" \
+ --scala-binary-version "$RAPIDS_SCALA_BINARY_VERSION" \
+ --buildvers "$(join_by , "${SPARK_SHIM_VERSIONS[@]}")" \
+ --ignore-shim-revisions-check
+ exit 0
+ else
+ echo "Resuming at dist only using $BASE_VER"
+ FINAL_RESUME_OPT=""
+ FINAL_MODULE_OPT="--projects dist"
+ fi
+ else
+ echo "Resuming from $joinShimBuildFrom build only using $BASE_VER"
+ FINAL_RESUME_OPT="-rf $joinShimBuildFrom"
+ FINAL_MODULE_OPT="$MODULE_OPT"
+ fi
+ $MVN $FINAL_OP $FINAL_RESUME_OPT $FINAL_MODULE_OPT $MVN_PROFILE_OPT $INCLUDED_BUILDVERS_OPT \
-Dbuildver="$BASE_VER" \
- -DskipTests -Dmaven.scaladoc.skip
+ -DskipTests -Dmaven.scaladoc.skip=true -Dmaven.javadoc.skip=true \
+ $MVN_FAST_SKIP_OPTS
)
diff --git a/delta-lake/common/src/main/databricks/scala/com/databricks/sql/transaction/tahoe/rapids/GpuCheckDeltaInvariant.scala b/delta-lake/common/src/main/databricks/scala/com/databricks/sql/transaction/tahoe/rapids/GpuCheckDeltaInvariant.scala
index f7d5a9f402b..045ca4d238b 100644
--- a/delta-lake/common/src/main/databricks/scala/com/databricks/sql/transaction/tahoe/rapids/GpuCheckDeltaInvariant.scala
+++ b/delta-lake/common/src/main/databricks/scala/com/databricks/sql/transaction/tahoe/rapids/GpuCheckDeltaInvariant.scala
@@ -136,7 +136,7 @@ object GpuCheckDeltaInvariant extends Logging {
ExprChecks.projectOnly(
TypeSig.all,
TypeSig.all,
- paramCheck = Seq(ParamCheck("input", TypeSig.all, TypeSig.all))),
+ paramCheck = Seq(new ParamCheck("input", TypeSig.all, TypeSig.all))),
(c, conf, p, r) => new GpuCheckDeltaInvariantMeta(c, conf, p, r))
def maybeConvertToGpu(
diff --git a/delta-lake/common/src/main/databricks/scala/com/nvidia/spark/rapids/delta/DatabricksDeltaProviderBase.scala b/delta-lake/common/src/main/databricks/scala/com/nvidia/spark/rapids/delta/DatabricksDeltaProviderBase.scala
index 0520a924367..156cd168768 100644
--- a/delta-lake/common/src/main/databricks/scala/com/nvidia/spark/rapids/delta/DatabricksDeltaProviderBase.scala
+++ b/delta-lake/common/src/main/databricks/scala/com/nvidia/spark/rapids/delta/DatabricksDeltaProviderBase.scala
@@ -174,10 +174,10 @@ trait DatabricksDeltaProviderBase extends DeltaProviderImplBase {
"Delta RTAS was tagged as unsupported and should not be converted to GPU")
}
- protected case class DeltaWriteV1Config(
- deltaLog: DeltaLog,
- forceOverwrite: Boolean,
- options: mutable.HashMap[String, String])
+ protected class DeltaWriteV1Config(
+ val deltaLog: DeltaLog,
+ val forceOverwrite: Boolean,
+ val options: mutable.HashMap[String, String])
private def extractWriteV1Config(
meta: RapidsMeta[_, _, _],
@@ -210,7 +210,7 @@ trait DatabricksDeltaProviderBase extends DeltaProviderImplBase {
f.get(outerObj).asInstanceOf[mutable.HashMap[String, String]]
}
if (forceOverwrite.isDefined && options.isDefined) {
- Some(DeltaWriteV1Config(deltaLog, forceOverwrite.get, options.get))
+ Some(new DeltaWriteV1Config(deltaLog, forceOverwrite.get, options.get))
} else {
meta.willNotWorkOnGpu(s"write class has unsupported outer class $outerClass")
None
diff --git a/delta-lake/common/src/main/delta-33x-41x/scala/com/nvidia/spark/rapids/delta/common/DeltaProviderBase.scala b/delta-lake/common/src/main/delta-33x-41x/scala/com/nvidia/spark/rapids/delta/common/DeltaProviderBase.scala
index 84ab70e804a..0cee43f69d0 100644
--- a/delta-lake/common/src/main/delta-33x-41x/scala/com/nvidia/spark/rapids/delta/common/DeltaProviderBase.scala
+++ b/delta-lake/common/src/main/delta-33x-41x/scala/com/nvidia/spark/rapids/delta/common/DeltaProviderBase.scala
@@ -41,11 +41,12 @@ import org.apache.spark.sql.types._
import org.apache.spark.sql.vectorized.ColumnarBatch
// Expression support shared across versions - defined outside class to avoid serialization issues
-case class GpuIncrementMetricMeta(
- cpuInc: IncrementMetric,
- override val conf: RapidsConf,
- p: Option[RapidsMeta[_, _, _]],
- r: DataFromReplacementRule) extends ExprMeta[IncrementMetric](cpuInc, conf, p, r) {
+class GpuIncrementMetricMeta(
+ val cpuInc: IncrementMetric,
+ override val conf: RapidsConf,
+ val p: Option[RapidsMeta[_, _, _]],
+ val r: DataFromReplacementRule)
+ extends ExprMeta[IncrementMetric](cpuInc, conf, p, r) with Serializable {
override def convertToGpuImpl(): GpuExpression = {
val gpuChild = childExprs.head.convertToGpu()
GpuIncrementMetric(cpuInc, gpuChild)
@@ -88,7 +89,7 @@ abstract class DeltaProviderBase extends DeltaIOProvider {
GpuOverrides.expr[IncrementMetric](
"IncrementMetric",
ExprChecks.unaryProject(TypeSig.all, TypeSig.all, TypeSig.all, TypeSig.all),
- (cpuInc, conf, p, r) => GpuIncrementMetricMeta(cpuInc, conf, p, r)
+ (cpuInc, conf, p, r) => new GpuIncrementMetricMeta(cpuInc, conf, p, r)
)
).map(r => (r.getClassFor.asSubclass(classOf[Expression]), r)).toMap
diff --git a/delta-lake/common/src/main/delta-33x-41x/scala/com/nvidia/spark/rapids/delta/common/GpuDeltaParquetFileFormatBase.scala b/delta-lake/common/src/main/delta-33x-41x/scala/com/nvidia/spark/rapids/delta/common/GpuDeltaParquetFileFormatBase.scala
index 61c586b955e..fa3997dbdbc 100644
--- a/delta-lake/common/src/main/delta-33x-41x/scala/com/nvidia/spark/rapids/delta/common/GpuDeltaParquetFileFormatBase.scala
+++ b/delta-lake/common/src/main/delta-33x-41x/scala/com/nvidia/spark/rapids/delta/common/GpuDeltaParquetFileFormatBase.scala
@@ -26,7 +26,6 @@ import org.apache.hadoop.fs.Path
import scala.collection.mutable.ArrayBuffer
import org.apache.spark.broadcast.Broadcast
-import org.apache.spark.internal.Logging
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.connector.read.{InputPartition, PartitionReader, PartitionReaderFactory}
@@ -52,7 +51,7 @@ class GpuDeltaParquetFileFormatBase(
optimizationsEnabled: Boolean = true,
tablePath: Option[String] = None,
isCDCRead: Boolean = false
- ) extends com.nvidia.spark.rapids.delta.GpuDeltaParquetFileFormat with Logging {
+ ) extends com.nvidia.spark.rapids.delta.GpuDeltaParquetFileFormat {
// Validate either we have all arguments for DV enabled read or none of them.
diff --git a/delta-lake/common/src/main/delta-33x-41x/scala/com/nvidia/spark/rapids/delta/common/GpuDeltaParquetFileFormatBase2.scala b/delta-lake/common/src/main/delta-33x-41x/scala/com/nvidia/spark/rapids/delta/common/GpuDeltaParquetFileFormatBase2.scala
index 48587ce3bca..b1a06781ec1 100644
--- a/delta-lake/common/src/main/delta-33x-41x/scala/com/nvidia/spark/rapids/delta/common/GpuDeltaParquetFileFormatBase2.scala
+++ b/delta-lake/common/src/main/delta-33x-41x/scala/com/nvidia/spark/rapids/delta/common/GpuDeltaParquetFileFormatBase2.scala
@@ -36,7 +36,6 @@ import org.apache.parquet.schema.MessageType
import org.apache.spark.TaskContext
import org.apache.spark.broadcast.Broadcast
-import org.apache.spark.internal.Logging
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.util.QuotingUtils
@@ -71,7 +70,7 @@ class GpuDeltaParquetFileFormatBase2(
optimizationsEnabled: Boolean = true,
tablePath: Option[String] = None,
isCDCRead: Boolean = false
-) extends com.nvidia.spark.rapids.delta.GpuDeltaParquetFileFormat with Logging {
+) extends com.nvidia.spark.rapids.delta.GpuDeltaParquetFileFormat {
// Validate either we have all arguments for DV enabled read or none of them.
@@ -443,11 +442,11 @@ class GpuDeltaParquetFileFormatBase2(
* @param rowGroupNumRows number of rows in each row group
* @param partitionIndex index into rowsPerPartition / allPartValues this file contributes to
*/
- case class PerFileDVEntry(
- dvDescriptor: Option[String],
- rowGroupOffsets: Array[Long],
- rowGroupNumRows: Array[Int],
- partitionIndex: Int)
+ class PerFileDVEntry(
+ val dvDescriptor: Option[String],
+ val rowGroupOffsets: Array[Long],
+ val rowGroupNumRows: Array[Int],
+ val partitionIndex: Int)
/**
* Per-file DV load result produced during [[prepareForDecode]].
@@ -455,7 +454,7 @@ class GpuDeltaParquetFileFormatBase2(
* @param gpuBitmap serialized roaring bitmap buffer for the file's deletion vector
* @param aliveCount number of alive (non-deleted) rows in the file
*/
- case class SerializedRoaringBitmap(gpuBitmap: SpillableHostBuffer, aliveCount: Long)
+ class SerializedRoaringBitmap(val gpuBitmap: SpillableHostBuffer, val aliveCount: Long)
/**
* Per-batch DV info that replaces [[ParquetExtraInfo]] in [[CurrentChunkMeta]] after batch
@@ -464,13 +463,13 @@ class GpuDeltaParquetFileFormatBase2(
* - [[loadedDVResults]] is filled in by [[prepareForDecode]] after the copy phase.
* [[perFileEntries]] and [[loadedDVResults]] are always parallel sequences of the same length.
*/
- case class DeltaBatchExtraInfo(
+ class DeltaBatchExtraInfo(
override val dateRebaseMode: DateTimeRebaseMode,
override val timestampRebaseMode: DateTimeRebaseMode,
override val hasInt96Timestamps: Boolean,
val perFileEntries: Seq[PerFileDVEntry],
// Filled by prepareForDecode() after the copy phase; empty until then.
- val loadedDVResults: Seq[SerializedRoaringBitmap] = Seq.empty
+ val loadedDVResults: Seq[SerializedRoaringBitmap]
) extends ParquetExtraInfo(dateRebaseMode, timestampRebaseMode, hasInt96Timestamps) {
/**
* True if at least one file in this batch carries a deletion vector descriptor.
@@ -481,7 +480,8 @@ class GpuDeltaParquetFileFormatBase2(
* Returns a copy of this instance with [[loadedDVResults]] set.
*/
def withLoadedDVResults(loadedDVResults: Seq[SerializedRoaringBitmap]): DeltaBatchExtraInfo =
- this.copy(loadedDVResults = loadedDVResults)
+ new DeltaBatchExtraInfo(dateRebaseMode, timestampRebaseMode, hasInt96Timestamps,
+ perFileEntries, loadedDVResults)
/**
* Closes the DV bitmaps in [[loadedDVResults]].
@@ -502,7 +502,7 @@ class GpuDeltaParquetFileFormatBase2(
queryUsesInputFile: Boolean)
extends AbstractGpuParquetMultiFilePartitionReaderFactory(sqlConf, broadcastedConf,
dataSchema, readDataSchema, partitionSchema, filters, rapidsConf, poolConfBuilder,
- metrics, queryUsesInputFile) with Logging {
+ metrics, queryUsesInputFile) {
logDebug("Using GpuDeltaParquetMultiFilePartitionReaderFactory for multi-threaded Parquet " +
"reading with deletion vectors")
@@ -584,11 +584,11 @@ class GpuDeltaParquetFileFormatBase2(
val (rowGroupOffsets, rowGroupNumRows) =
RapidsDeletionVectors.getRowGroupMetadata(singleFileInfo.blocks)
clippedBlocks ++= singleFileInfo.blocks.zipWithIndex.map { case (block, i) =>
- ParquetSingleDataBlockMeta(
+ new ParquetSingleDataBlockMeta(
singleFileInfo.filePath,
- ParquetDataBlock(block, compressCfg),
+ new ParquetDataBlock(block, compressCfg),
metaAndFile.file.partitionValues,
- ParquetSchemaWrapper(singleFileInfo.schema),
+ new ParquetSchemaWrapper(singleFileInfo.schema),
singleFileInfo.readSchema,
new DeltaParquetExtraInfo(
singleFileInfo.dateRebaseMode,
@@ -745,55 +745,53 @@ class GpuDeltaParquetFileFormatBase2(
/**
* Deletion vector metadata for a single host memory buffer containing a part of data.
*/
- private case class SingleBufferDVMetadata(
- maybeDvInfo: Option[SpillableDeletionVectorInfo]
- )
-
- private case class DeletionVectorMetadata(
- metadatas: Array[SingleBufferDVMetadata]
- )
-
- private object DeletionVectorMetadata {
- def forSingleBuffer(maybeDvInfo: Option[SpillableDeletionVectorInfo]) = {
- DeletionVectorMetadata(
- Array(
- SingleBufferDVMetadata(maybeDvInfo)
- )
+ private class SingleBufferDVMetadata(
+ val maybeDvInfo: Option[SpillableDeletionVectorInfo])
+
+ private class DeletionVectorMetadata(
+ val metadatas: Array[SingleBufferDVMetadata])
+
+ private def deletionVectorMetadataForSingleBuffer(
+ maybeDvInfo: Option[SpillableDeletionVectorInfo]): DeletionVectorMetadata = {
+ new DeletionVectorMetadata(
+ Array(
+ new SingleBufferDVMetadata(maybeDvInfo)
)
- }
+ )
+ }
- def combine(metadatas: Array[DeletionVectorMetadata]): DeletionVectorMetadata = {
- DeletionVectorMetadata(metadatas.flatMap(_.metadatas))
- }
+ private def combineDeletionVectorMetadata(
+ metadatas: Array[DeletionVectorMetadata]): DeletionVectorMetadata = {
+ new DeletionVectorMetadata(metadatas.flatMap(_.metadatas))
}
- private case class DeltaParquetHostMemoryEmptyMetaData(
+ private class DeltaParquetHostMemoryEmptyMetaData(
override val partitionedFile: PartitionedFile,
- bufferSize: Long,
+ val bufferSize: Long,
override val bytesRead: Long,
- dateRebaseMode: DateTimeRebaseMode,
- timestampRebaseMode: DateTimeRebaseMode,
- hasInt96Timestamps: Boolean,
- clippedSchema: MessageType,
- readSchema: StructType,
- numRows: Long,
- dvMetadata: Array[DeletionVectorMetadata],
- override val allPartValues: Option[Array[(Long, InternalRow)]] = None)
+ val dateRebaseMode: DateTimeRebaseMode,
+ val timestampRebaseMode: DateTimeRebaseMode,
+ val hasInt96Timestamps: Boolean,
+ val clippedSchema: MessageType,
+ val readSchema: StructType,
+ val numRows: Long,
+ val dvMetadata: Array[DeletionVectorMetadata],
+ override val allPartValues: Option[Array[(Long, InternalRow)]])
extends HostMemoryEmptyMetaData {}
- private case class DeltaParquetHostMemoryBuffersWithMetaData(
+ private class DeltaParquetHostMemoryBuffersWithMetaData(
override val partitionedFile: PartitionedFile,
override val memBuffersAndSizes: Array[SingleHMBAndMeta],
override val bytesRead: Long,
- dateRebaseMode: DateTimeRebaseMode,
- timestampRebaseMode: DateTimeRebaseMode,
- hasInt96Timestamps: Boolean,
- clippedSchema: MessageType,
- readSchema: StructType,
+ val dateRebaseMode: DateTimeRebaseMode,
+ val timestampRebaseMode: DateTimeRebaseMode,
+ val hasInt96Timestamps: Boolean,
+ val clippedSchema: MessageType,
+ val readSchema: StructType,
override val allPartValues: Option[Array[(Long, InternalRow)]],
// deletion vector metadata. should be aligned with memBuffersAndSizes if deletion vectors
// are present.
- dvMetadata: Array[DeletionVectorMetadata]
+ val dvMetadata: Array[DeletionVectorMetadata]
) extends HostMemoryBuffersWithMetaData {
override def consumeHeadBuffer(): HostMemoryBuffersWithMetaData = {
@@ -806,7 +804,17 @@ class GpuDeltaParquetFileFormatBase2(
} else {
(Array.empty[SingleHMBAndMeta], Array.empty[DeletionVectorMetadata])
}
- this.copy(memBuffersAndSizes = remainingBuffers, dvMetadata = newDvMetadata)
+ new DeltaParquetHostMemoryBuffersWithMetaData(
+ partitionedFile,
+ remainingBuffers,
+ bytesRead,
+ dateRebaseMode,
+ timestampRebaseMode,
+ hasInt96Timestamps,
+ clippedSchema,
+ readSchema,
+ allPartValues,
+ newDvMetadata)
}
}
@@ -843,7 +851,7 @@ class GpuDeltaParquetFileFormatBase2(
}
closeOnExcept(maybeSerializedDV) { _ =>
- val dvMetadata = DeletionVectorMetadata.forSingleBuffer(
+ val dvMetadata = deletionVectorMetadataForSingleBuffer(
maybeSerializedDV.map{ serializedDV =>
val (rowGroupOffsets, rowGroupNumRows) = RapidsDeletionVectors
.getRowGroupMetadata(blocks)
@@ -853,7 +861,7 @@ class GpuDeltaParquetFileFormatBase2(
rowGroupOffsets,
rowGroupNumRows)}
)
- DeltaParquetHostMemoryEmptyMetaData(
+ new DeltaParquetHostMemoryEmptyMetaData(
partitionedFile,
bufferSize,
bytesRead,
@@ -863,7 +871,8 @@ class GpuDeltaParquetFileFormatBase2(
clippedSchema,
readSchema,
numRows,
- Array(dvMetadata)
+ Array(dvMetadata),
+ None
)
}
}
@@ -872,9 +881,9 @@ class GpuDeltaParquetFileFormatBase2(
nonEmptyMeta: CombinedMeta): HostMemoryEmptyMetaData = {
val metaForEmpty = emptyMeta.metaForEmpty
val toCombine = emptyMeta.emptyMetas.map(_.asInstanceOf[DeltaParquetHostMemoryEmptyMetaData])
- val combinedDVMeta = DeletionVectorMetadata.combine(toCombine.flatMap(_.dvMetadata))
+ val combinedDVMeta = combineDeletionVectorMetadata(toCombine.flatMap(_.dvMetadata))
- DeltaParquetHostMemoryEmptyMetaData(
+ new DeltaParquetHostMemoryEmptyMetaData(
metaForEmpty.partitionedFile, // just pick one since not used
emptyMeta.emptyBufferSize,
emptyMeta.emptyTotalBytesRead,
@@ -912,7 +921,7 @@ class GpuDeltaParquetFileFormatBase2(
.map(_.asInstanceOf[ParquetDataBlock].dataBlock)
val (rowGroupOffsets, rowGroupNumRows) = RapidsDeletionVectors
.getRowGroupMetadata(dataBlocks)
- DeletionVectorMetadata.forSingleBuffer(
+ deletionVectorMetadataForSingleBuffer(
maybeSerializedDV.map { serializedDV =>
serializedDV.incRefCount()
SpillableDeletionVectorInfo(
@@ -923,7 +932,7 @@ class GpuDeltaParquetFileFormatBase2(
})
}
- DeltaParquetHostMemoryBuffersWithMetaData(
+ new DeltaParquetHostMemoryBuffersWithMetaData(
partitionedFile,
memBuffersAndSize,
bytesRead,
@@ -946,9 +955,9 @@ class GpuDeltaParquetFileFormatBase2(
val metaToUse = combinedMeta.firstNonEmpty
val toCombine = combinedMeta.toCombine
.collect { case hmb: DeltaParquetHostMemoryBuffersWithMetaData => hmb }
- val combinedDVMeta = DeletionVectorMetadata.combine(toCombine.flatMap(_.dvMetadata))
+ val combinedDVMeta = combineDeletionVectorMetadata(toCombine.flatMap(_.dvMetadata))
- DeltaParquetHostMemoryBuffersWithMetaData(
+ new DeltaParquetHostMemoryBuffersWithMetaData(
metaToUse.partitionedFile,
Array(newHmbBufferInfo),
offset,
@@ -1113,12 +1122,12 @@ class GpuDeltaParquetFileFormatBase2(
fileNumRows += extra.rowGroupNumRows
}
- PerFileDVEntry(fileDesc, fileOffsets.toArray, fileNumRows.toArray, partitionIndex)
+ new PerFileDVEntry(fileDesc, fileOffsets.toArray, fileNumRows.toArray, partitionIndex)
}.toSeq
val batchExtra = new DeltaBatchExtraInfo(
meta.extraInfo.dateRebaseMode, meta.extraInfo.timestampRebaseMode,
- meta.extraInfo.hasInt96Timestamps, fileEntries)
+ meta.extraInfo.hasInt96Timestamps, fileEntries, Seq.empty)
meta.copy(extraInfo = batchExtra)
}
@@ -1160,7 +1169,7 @@ class GpuDeltaParquetFileFormatBase2(
require(numDeleted <= totalRows,
s"Deletion vector cardinality ($numDeleted) exceeds " +
s"file row count ($totalRows)")
- SerializedRoaringBitmap(gpuBitmap, totalRows - numDeleted)
+ new SerializedRoaringBitmap(gpuBitmap, totalRows - numDeleted)
}
}
})
@@ -1238,8 +1247,8 @@ class GpuDeltaParquetFileFormatBase2(
* A simple wrapper to adapt the DeletionVector.ParquetChunkedReader to the ChunkedReader interface
* expected by AbstractParquetTableReader.
*/
-case class DeltaParquetChunkedReader(delegate: DeletionVector.ParquetChunkedReader)
- extends ChunkedReader {
+class DeltaParquetChunkedReader(val delegate: DeletionVector.ParquetChunkedReader)
+ extends ChunkedReader with Serializable {
override def hasNext: Boolean = delegate.hasNext
override def next: Table = delegate.readChunk()
override def close(): Unit = delegate.close()
@@ -1248,7 +1257,7 @@ case class DeltaParquetChunkedReader(delegate: DeletionVector.ParquetChunkedRead
/**
* A chunked reader for Parquet files with deletion vectors.
*/
-case class DeltaParquetTableReader(
+class DeltaParquetTableReader(
conf: Configuration,
chunkSizeByteLimit: Long,
maxChunkedReaderMemoryUsageSizeBytes: Long,
@@ -1268,11 +1277,11 @@ case class DeltaParquetTableReader(
conf, chunkSizeByteLimit, maxChunkedReaderMemoryUsageSizeBytes, opts, buffers, metrics,
dateRebaseMode, timestampRebaseMode, isSchemaCaseSensitive, useFieldId, readDataSchema,
clippedParquetSchema, splits, debugDumpPrefix, debugDumpAlways
-) {
+) with Serializable {
logDebug("Using DeltaParquetTableReader for reading Parquet with deletion vectors")
- override protected val reader = DeltaParquetChunkedReader(
+ override protected val reader = new DeltaParquetChunkedReader(
DeletionVector.newParquetChunkedReader(chunkSizeByteLimit,
maxChunkedReaderMemoryUsageSizeBytes, opts, buffers, dvInfos)
)
@@ -1287,7 +1296,7 @@ case class DeltaParquetTableReader(
}
}
-object MakeParquetTableWithDVProducer extends Logging {
+object MakeParquetTableWithDVProducer extends RapidsLocalLog {
def apply(
useChunkedReader: Boolean,
maxChunkedReaderMemoryUsageSizeBytes: Long,
@@ -1318,7 +1327,7 @@ object MakeParquetTableWithDVProducer extends Logging {
}
}
if (useChunkedReader) {
- DeltaParquetTableReader(conf, chunkSizeByteLimit, maxChunkedReaderMemoryUsageSizeBytes,
+ new DeltaParquetTableReader(conf, chunkSizeByteLimit, maxChunkedReaderMemoryUsageSizeBytes,
opts, buffers, metrics, dateRebaseMode, timestampRebaseMode,
isSchemaCaseSensitive, useFieldId, readDataSchema, clippedParquetSchema,
splits, debugDumpPrefix, debugDumpAlways, deletionVectorInfos)
diff --git a/delta-lake/common/src/main/delta-33x-41x/scala/org/apache/spark/sql/delta/rapids/DeltaRuntimeShimBase.scala b/delta-lake/common/src/main/delta-33x-41x/scala/org/apache/spark/sql/delta/rapids/DeltaRuntimeShimBase.scala
index b72f13da81a..f59d892f73e 100644
--- a/delta-lake/common/src/main/delta-33x-41x/scala/org/apache/spark/sql/delta/rapids/DeltaRuntimeShimBase.scala
+++ b/delta-lake/common/src/main/delta-33x-41x/scala/org/apache/spark/sql/delta/rapids/DeltaRuntimeShimBase.scala
@@ -58,7 +58,7 @@ abstract class DeltaRuntimeShimBase extends DeltaRuntimeShim {
override def startTransaction(log: DeltaLog, conf: RapidsConf, clock: Clock):
GpuOptimisticTransactionBase = {
- startTransaction(StartTransactionArg(log, conf, clock))
+ startTransaction(new StartTransactionArg(log, conf, clock))
}
override def startTransaction(arg: StartTransactionArg): GpuOptimisticTransactionBase = {
diff --git a/delta-lake/common/src/main/delta-33x/scala/com/nvidia/spark/rapids/delta/shims/StatsExprShim.scala b/delta-lake/common/src/main/delta-33x/scala/com/nvidia/spark/rapids/delta/shims/StatsExprShim.scala
index a388c5256f4..99c4f148f5d 100644
--- a/delta-lake/common/src/main/delta-33x/scala/com/nvidia/spark/rapids/delta/shims/StatsExprShim.scala
+++ b/delta-lake/common/src/main/delta-33x/scala/com/nvidia/spark/rapids/delta/shims/StatsExprShim.scala
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2025, NVIDIA CORPORATION.
+ * Copyright (c) 2025-2026, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -24,3 +24,4 @@ import org.apache.spark.sql.catalyst.expressions.Expression
object StatsExprShim {
def unwrapRuntimeReplaceable(expr: Expression): Expression = expr
}
+// Keep executable line numbers aligned with older Delta shims for binary-dedupe.
diff --git a/delta-lake/common/src/main/delta-33x/scala/org/apache/spark/sql/delta/rapids/ShimDeltaInvariantCheckerExec.scala b/delta-lake/common/src/main/delta-33x/scala/org/apache/spark/sql/delta/rapids/ShimDeltaInvariantCheckerExec.scala
index a488356e6f2..52019f7db2a 100644
--- a/delta-lake/common/src/main/delta-33x/scala/org/apache/spark/sql/delta/rapids/ShimDeltaInvariantCheckerExec.scala
+++ b/delta-lake/common/src/main/delta-33x/scala/org/apache/spark/sql/delta/rapids/ShimDeltaInvariantCheckerExec.scala
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2025, NVIDIA CORPORATION.
+ * Copyright (c) 2025-2026, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -32,3 +32,4 @@ object ShimDeltaInvariantCheckerExec {
DeltaInvariantCheckerExec(plan, constraints)
}
}
+// Keep executable line numbers aligned with older Delta shims for binary-dedupe.
diff --git a/delta-lake/common/src/main/delta-io/scala/com/nvidia/spark/rapids/delta/DeltaIOProvider.scala b/delta-lake/common/src/main/delta-io/scala/com/nvidia/spark/rapids/delta/DeltaIOProvider.scala
index 8fd372c0a1b..f04dc75ff69 100644
--- a/delta-lake/common/src/main/delta-io/scala/com/nvidia/spark/rapids/delta/DeltaIOProvider.scala
+++ b/delta-lake/common/src/main/delta-io/scala/com/nvidia/spark/rapids/delta/DeltaIOProvider.scala
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022-2025, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2026, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -106,10 +106,10 @@ abstract class DeltaIOProvider extends DeltaProviderImplBase {
writeOptionsFromExec(cpuExec.writeOptions), cpuExec.session)
}
- private case class DeltaWriteV1Config(
- deltaLog: DeltaLog,
- forceOverwrite: Boolean,
- options: mutable.HashMap[String, String])
+ private class DeltaWriteV1Config(
+ val deltaLog: DeltaLog,
+ val forceOverwrite: Boolean,
+ val options: mutable.HashMap[String, String])
private def extractWriteV1Config(
meta: RapidsMeta[_, _, _],
@@ -142,7 +142,7 @@ abstract class DeltaIOProvider extends DeltaProviderImplBase {
f.get(outerObj).asInstanceOf[mutable.HashMap[String, String]]
}
if (forceOverwrite.isDefined && options.isDefined) {
- Some(DeltaWriteV1Config(deltaLog, forceOverwrite.get, options.get))
+ Some(new DeltaWriteV1Config(deltaLog, forceOverwrite.get, options.get))
} else {
meta.willNotWorkOnGpu(s"write class has unsupported outer class $outerClass")
None
diff --git a/delta-lake/common/src/main/delta-io/scala/org/apache/spark/sql/delta/rapids/DeltaRuntimeShim.scala b/delta-lake/common/src/main/delta-io/scala/org/apache/spark/sql/delta/rapids/DeltaRuntimeShim.scala
index 039368e7356..e6d02552d3b 100644
--- a/delta-lake/common/src/main/delta-io/scala/org/apache/spark/sql/delta/rapids/DeltaRuntimeShim.scala
+++ b/delta-lake/common/src/main/delta-io/scala/org/apache/spark/sql/delta/rapids/DeltaRuntimeShim.scala
@@ -30,15 +30,15 @@ import org.apache.spark.sql.execution.datasources.FileFormat
import org.apache.spark.sql.expressions.UserDefinedFunction
import org.apache.spark.util.Clock
-case class StartTransactionArg(log: DeltaLog, conf: RapidsConf, clock: Clock,
- catalogTable: Option[CatalogTable] = None, snapshot: Option[Snapshot] = None)
+class StartTransactionArg(val log: DeltaLog, val conf: RapidsConf, val clock: Clock,
+ val catalogTable: Option[CatalogTable] = None, val snapshot: Option[Snapshot] = None)
trait DeltaRuntimeShim {
def getDeltaConfigChecker: DeltaConfigChecker
def getDeltaProvider: DeltaProvider
def startTransaction(log: DeltaLog, conf: RapidsConf, clock: Clock)
: GpuOptimisticTransactionBase = {
- startTransaction(StartTransactionArg(log, conf, clock))
+ startTransaction(new StartTransactionArg(log, conf, clock))
}
def startTransaction(arg: StartTransactionArg): GpuOptimisticTransactionBase
def stringFromStringUdf(f: String => String): UserDefinedFunction
diff --git a/delta-lake/common/src/main/delta-io/scala/org/apache/spark/sql/delta/rapids/GpuCheckDeltaInvariant.scala b/delta-lake/common/src/main/delta-io/scala/org/apache/spark/sql/delta/rapids/GpuCheckDeltaInvariant.scala
index 67164017bbe..bcbbef9b9a4 100644
--- a/delta-lake/common/src/main/delta-io/scala/org/apache/spark/sql/delta/rapids/GpuCheckDeltaInvariant.scala
+++ b/delta-lake/common/src/main/delta-io/scala/org/apache/spark/sql/delta/rapids/GpuCheckDeltaInvariant.scala
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022-2025, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2026, NVIDIA CORPORATION.
*
* This file was derived from CheckDeltaInvariant.scala in the
* Delta Lake project at https://github.com/delta-io/delta.
@@ -132,8 +132,8 @@ object GpuCheckDeltaInvariant extends Logging {
ExprChecks.projectOnly(
TypeSig.all,
TypeSig.all,
- paramCheck = Seq(ParamCheck("input", TypeSig.all, TypeSig.all)),
- repeatingParamCheck = Some(RepeatingParamCheck("extra", TypeSig.all, TypeSig.all))
+ paramCheck = Seq(new ParamCheck("input", TypeSig.all, TypeSig.all)),
+ repeatingParamCheck = Some(new RepeatingParamCheck("extra", TypeSig.all, TypeSig.all))
),
(c, conf, p, r) => new GpuCheckDeltaInvariantMeta(c, conf, p, r))
diff --git a/delta-lake/common/src/main/delta-io/scala/org/apache/spark/sql/delta/rapids/GpuDeltaLog.scala b/delta-lake/common/src/main/delta-io/scala/org/apache/spark/sql/delta/rapids/GpuDeltaLog.scala
index cdc0b7582e6..8fb66bfb4f2 100644
--- a/delta-lake/common/src/main/delta-io/scala/org/apache/spark/sql/delta/rapids/GpuDeltaLog.scala
+++ b/delta-lake/common/src/main/delta-io/scala/org/apache/spark/sql/delta/rapids/GpuDeltaLog.scala
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022-2025, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2026, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -40,7 +40,7 @@ class GpuDeltaLog(val deltaLog: DeltaLog, val rapidsConf: RapidsConf) {
* directly to the DeltaLog otherwise they will not be checked for conflicts.
*/
def startTransaction(): GpuOptimisticTransactionBase = {
- DeltaRuntimeShim.startTransaction(StartTransactionArg(deltaLog, rapidsConf, _clock, None,
+ DeltaRuntimeShim.startTransaction(new StartTransactionArg(deltaLog, rapidsConf, _clock, None,
None))
}
@@ -62,7 +62,7 @@ class GpuDeltaLog(val deltaLog: DeltaLog, val rapidsConf: RapidsConf) {
def startTransaction(
catalogTableOpt: Option[CatalogTable],
snapshotOpt: Option[Snapshot] = None): GpuOptimisticTransactionBase = {
- DeltaRuntimeShim.startTransaction(StartTransactionArg(deltaLog, rapidsConf, _clock,
+ DeltaRuntimeShim.startTransaction(new StartTransactionArg(deltaLog, rapidsConf, _clock,
catalogTableOpt, snapshotOpt))
}
diff --git a/delta-lake/common/src/main/scala/com/nvidia/spark/rapids/delta/GpuDeltaTaskStatisticsTracker.scala b/delta-lake/common/src/main/scala/com/nvidia/spark/rapids/delta/GpuDeltaTaskStatisticsTracker.scala
index 4cbd693b01f..464368a326c 100644
--- a/delta-lake/common/src/main/scala/com/nvidia/spark/rapids/delta/GpuDeltaTaskStatisticsTracker.scala
+++ b/delta-lake/common/src/main/scala/com/nvidia/spark/rapids/delta/GpuDeltaTaskStatisticsTracker.scala
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022-2025, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2026, NVIDIA CORPORATION.
*
* This file was derived from DataSkippingStatsTracker.scala
* in the Delta Lake project at https://github.com/delta-io/delta.
@@ -38,7 +38,7 @@ import org.apache.spark.sql.vectorized.ColumnarBatch
* A [[WriteTaskStats]] that contains a map from file name to the json representation
* of the collected statistics.
*/
-case class GpuDeltaFileStatistics(stats: Map[String, String]) extends WriteTaskStats
+class GpuDeltaFileStatistics(val stats: Map[String, String]) extends WriteTaskStats
/**
* GPU version of DeltaTaskStatisticsTracker.
@@ -168,7 +168,7 @@ class GpuDeltaTaskStatisticsTracker(
}
override def getFinalStats(taskCommitTime: Long): GpuDeltaFileStatistics = {
- GpuDeltaFileStatistics(results.toMap)
+ new GpuDeltaFileStatistics(results.toMap)
}
}
diff --git a/delta-lake/delta-spark400db173/src/main/scala/com/nvidia/spark/rapids/delta/GpuDeltaParquetFileFormatNativeDV.scala b/delta-lake/delta-spark400db173/src/main/scala/com/nvidia/spark/rapids/delta/GpuDeltaParquetFileFormatNativeDV.scala
index 1e39cbc79ef..141b20bd339 100644
--- a/delta-lake/delta-spark400db173/src/main/scala/com/nvidia/spark/rapids/delta/GpuDeltaParquetFileFormatNativeDV.scala
+++ b/delta-lake/delta-spark400db173/src/main/scala/com/nvidia/spark/rapids/delta/GpuDeltaParquetFileFormatNativeDV.scala
@@ -636,11 +636,11 @@ case class GpuDeltaParquetFileFormatNativeDV(
val (rowGroupOffsets, rowGroupNumRows) =
RapidsDeletionVectors.getRowGroupMetadata(singleFileInfo.blocks)
clippedBlocks ++= singleFileInfo.blocks.zipWithIndex.map { case (block, i) =>
- ParquetSingleDataBlockMeta(
+ new ParquetSingleDataBlockMeta(
singleFileInfo.filePath,
- ParquetDataBlock(block, compressCfg),
+ new ParquetDataBlock(block, compressCfg),
metaAndFile.file.partitionValues,
- ParquetSchemaWrapper(singleFileInfo.schema),
+ new ParquetSchemaWrapper(singleFileInfo.schema),
singleFileInfo.readSchema,
new DeltaParquetExtraInfo(
singleFileInfo.dateRebaseMode,
diff --git a/dist/README.md b/dist/README.md
index aa23b6a6332..840f9a52ee6 100644
--- a/dist/README.md
+++ b/dist/README.md
@@ -28,10 +28,8 @@ provider discovery mechanism
[ParallelWorldClassloader](https://github.com/openjdk/jdk/blob/jdk8-b120/jaxws/src/share/jaxws_classes/com/sun/istack/internal/tools/ParallelWorldClassLoader.java))
for each version of Spark supported in the jar, i.e., spark330/, spark341/, etc.
-If you have to change the contents of the uber jar the following files control what goes into the base jar as classes that are not shaded.
+By default, the packaging promotes common classes to the base jar when binary dedupe proves they are bitwise-identical across shims, so new common classes should normally remain unshimmed. If you have to change the contents of the uber jar, the following files control explicit exceptions and non-class resources.
-1. `unshimmed-common-from-single-shim.txt` - This has classes and files that should go into the base jar with their normal
-package name (not shaded). This includes user visible classes (i.e., com/nvidia/spark/SQLPlugin), python files,
-and other files that aren't version specific. Uses Spark 3.2.0 built jar for these base classes as explained above.
-2. `unshimmed-from-each-spark3xx.txt` - This is applied to all the individual Spark specific version jars to pull
-any files that need to go into the base of the jar and not into the Spark specific directory.
+1. `keep-in-spark-shared.txt` - Patterns for bitwise-identical common `spark-shared` class files that must stay in `spark-shared` instead of being promoted to the base jar. This should stay small; add entries only for compatibility or packaging exceptions.
+2. `unshimmed-common-from-single-shim.txt` - Files that must go into the base jar from one representative shim but are not selected by default class promotion, such as root `META-INF` resources and Python worker files. Avoid adding class files here unless they need special root-layout treatment outside bitwise-identical default promotion.
+3. `unshimmed-from-each-spark3xx.txt` - This is applied to all the individual Spark specific version jars to pull any files that need to go into the base of the jar and not into the Spark specific directory. These are per-shim root artifacts rather than common `spark-shared` classes.
diff --git a/dist/build/package-parallel-worlds.py b/dist/build/package-parallel-worlds.py
index 4698c4a8ca0..e612b05b490 100644
--- a/dist/build/package-parallel-worlds.py
+++ b/dist/build/package-parallel-worlds.py
@@ -26,6 +26,30 @@ def shell_exec(shell_cmd):
self.fail("failed to execute %s" % shell_cmd)
+def has_fnmatch_magic(pattern):
+    return any(token in pattern for token in "*?[")  # fnmatch wildcard starters
+
+
+def select_matching_members(namelist, patterns):
+    """Return the entries of namelist selected by patterns, in pattern order.
+
+    With UNSHIM_FAST=1, wildcard-free patterns use an exact-name lookup.
+    """
+    fast_mode = os.environ.get("UNSHIM_FAST") == "1"
+    exact_index = {}
+    if fast_mode:
+        for entry in namelist:
+            exact_index.setdefault(entry, []).append(entry)
+
+    selected = []
+    for pattern in patterns:
+        if fast_mode and not has_fnmatch_magic(pattern):
+            selected.extend(exact_index.get(pattern, []))
+        else:
+            selected.extend(fnmatch.filter(namelist, pattern))
+    return selected
+
+
artifacts = attributes.get('artifact_csv').split(',')
buildver_list = re.sub(r'\s+', '', project.getProperty('included_buildvers'),
flags=re.UNICODE).split(',')
@@ -40,6 +64,12 @@ def shell_exec(shell_cmd):
art_url = project.getProperty('env.ART_URL')
jenkins_settings = os.sep.join([source_basedir, 'jenkins', 'settings.xml'])
repo_local = project.getProperty('maven.repo.local')
+dist_dir = os.sep.join([source_basedir, 'dist'])
+with open(os.sep.join([dist_dir, 'unshimmed-common-from-single-shim.txt']), 'r') as f:
+ from_single_shim = f.read().splitlines()
+with open(os.sep.join([dist_dir, 'unshimmed-from-each-spark3xx.txt']), 'r') as f:
+ from_each = f.read().splitlines()
+from_single_shim_or_each = from_single_shim + from_each
for bv in buildver_list:
classifier = 'spark' + bv
@@ -73,11 +103,6 @@ def shell_exec(shell_cmd):
mvn_cmd.append('='.join(['-Dmaven.repo.local', repo_local]))
shell_exec(mvn_cmd)
- dist_dir = os.sep.join([source_basedir, 'dist'])
- with open(os.sep.join([dist_dir, 'unshimmed-common-from-single-shim.txt']), 'r') as f:
- from_single_shim = f.read().splitlines()
- with open(os.sep.join([dist_dir, 'unshimmed-from-each-spark3xx.txt']), 'r') as f:
- from_each = f.read().splitlines()
with zipfile.ZipFile(os.sep.join([deps_dir, art_jar]), 'r') as zip_handle:
if project.getProperty('should.build.conventional.jar'):
zip_handle.extractall(path=top_dist_jar_dir)
@@ -88,9 +113,6 @@ def shell_exec(shell_cmd):
zip_handle.extractall(path=top_dist_jar_dir)
# TODO deprecate
namelist = zip_handle.namelist()
- matching_members = []
- glob_list = from_single_shim + from_each if bv == buildver_list[0] else from_each
- for pat in glob_list:
- new_matches = fnmatch.filter(namelist, pat)
- matching_members += new_matches
+ glob_list = from_single_shim_or_each if bv == buildver_list[0] else from_each
+ matching_members = select_matching_members(namelist, glob_list)
zip_handle.extractall(path=top_dist_jar_dir, members=matching_members)
diff --git a/dist/keep-in-spark-shared.txt b/dist/keep-in-spark-shared.txt
new file mode 100644
index 00000000000..5fc420febc9
--- /dev/null
+++ b/dist/keep-in-spark-shared.txt
@@ -0,0 +1,6 @@
+# Patterns for common spark-shared class files that must not be promoted to
+# the root layout even when binary dedupe marks them bitwise-identical.
+#
+# Add entries only when a class is bitwise-identical but must remain loaded
+# from spark-shared for compatibility or packaging reasons. New common classes
+# should normally stay unshimmed by default.
diff --git a/dist/maven-antrun/build-parallel-worlds.xml b/dist/maven-antrun/build-parallel-worlds.xml
index afde7c2d755..f6ccf8cb0b9 100644
--- a/dist/maven-antrun/build-parallel-worlds.xml
+++ b/dist/maven-antrun/build-parallel-worlds.xml
@@ -123,6 +123,10 @@
failonerror="false">
+
+
@@ -132,13 +136,14 @@
-
+
-
+ Generating dependency-reduced-pom.xml<dependency>
diff --git a/dist/scripts/analyze-parallel-world-deps.py b/dist/scripts/analyze-parallel-world-deps.py
new file mode 100644
index 00000000000..ab2867db7c4
--- /dev/null
+++ b/dist/scripts/analyze-parallel-world-deps.py
@@ -0,0 +1,617 @@
+#!/usr/bin/env python3
+
+# Copyright (c) 2026, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Analyze dependencies between conventional, spark-shared, and shim classes.
+
+The dist jar contains classes in the conventional root layout, in spark-shared,
+and in one or more Spark-version-specific directories. This script inspects the
+class files and reports which root or spark-shared classes still have a static
+dependency path to version-specific bytecode.
+"""
+
+import argparse
+import collections
+import json
+import os
+import re
+import struct
+import sys
+import zipfile
+
+
+SHIM_DIR_RE = re.compile(r"^spark[0-9][0-9a-z]*$")
+CLASSIFIER_PACKAGE_RE = re.compile(r"(^|\.)spark[0-9][0-9a-z]*($|\.)")
+DESCRIPTOR_CLASS_RE = re.compile(r"L([^;<>\[\]\(\)]+);")
+
+DEFAULT_EXCLUDES = (
+ "ai.rapids.cudf.",
+ "com.nvidia.shaded.",
+ "org.openucx.",
+)
+
+
+ClassInfo = collections.namedtuple("ClassInfo", ("name", "location", "entry", "deps"))
+
+
+def _read_u1(data, offset):
+ return data[offset], offset + 1
+
+
+def _read_u2(data, offset):
+ return struct.unpack_from(">H", data, offset)[0], offset + 2
+
+
+def _read_u4(data, offset):
+ return struct.unpack_from(">I", data, offset)[0], offset + 4
+
+
+def _class_names_from_descriptor(value):
+ for match in DESCRIPTOR_CLASS_RE.finditer(value):
+ yield match.group(1)
+
+
+def _normalize_internal_name(value):
+    """Return the class names referenced by a constant-pool class entry value."""
+    if not value:
+        return []
+    # A plain slash-separated internal name is itself the dependency; arrays and
+    # anything else are scanned for embedded L<name>; descriptors.
+    is_plain_name = "/" in value and not value.startswith(("[", "("))
+    return [value] if is_plain_name else list(_class_names_from_descriptor(value))
+
+
+def parse_class_file(data):
+ magic, offset = _read_u4(data, 0)
+ if magic != 0xCAFEBABE:
+ raise ValueError("not a class file")
+
+ # minor_version, major_version
+ _, offset = _read_u2(data, offset)
+ _, offset = _read_u2(data, offset)
+
+ cp_count, offset = _read_u2(data, offset)
+ constant_pool = [None] * cp_count
+ class_name_indexes = []
+ utf8_values = []
+
+ index = 1
+ while index < cp_count:
+ tag, offset = _read_u1(data, offset)
+ if tag == 1: # CONSTANT_Utf8
+ length, offset = _read_u2(data, offset)
+ raw = data[offset:offset + length]
+ offset += length
+ value = raw.decode("utf-8", errors="replace")
+ constant_pool[index] = value
+ utf8_values.append(value)
+ elif tag in (3, 4): # Integer, Float
+ offset += 4
+ elif tag in (5, 6): # Long, Double
+ offset += 8
+ index += 1
+ elif tag == 7: # Class
+ name_index, offset = _read_u2(data, offset)
+ constant_pool[index] = name_index
+ class_name_indexes.append(name_index)
+ elif tag == 8: # String
+ offset += 2
+ elif tag in (9, 10, 11, 12, 17, 18): # refs, NameAndType, Dynamic, InvokeDynamic
+ offset += 4
+ elif tag == 15: # MethodHandle
+ offset += 3
+ elif tag in (16, 19, 20): # MethodType, Module, Package
+ offset += 2
+ else:
+ raise ValueError("unknown constant pool tag %s" % tag)
+ index += 1
+
+ # access_flags
+ _, offset = _read_u2(data, offset)
+ this_class_index, offset = _read_u2(data, offset)
+ this_name_index = constant_pool[this_class_index]
+ this_name = constant_pool[this_name_index]
+
+ deps = set()
+ for name_index in class_name_indexes:
+ for dep in _normalize_internal_name(constant_pool[name_index]):
+ deps.add(dep.replace("/", "."))
+ for value in utf8_values:
+ for dep in _class_names_from_descriptor(value):
+ deps.add(dep.replace("/", "."))
+
+ class_name = this_name.replace("/", ".")
+ deps.discard(class_name)
+ return class_name, deps
+
+
+def location_from_entry(entry):
+    """Classify an archive entry by its top-level directory."""
+    top_level = entry.partition("/")[0]
+    # spark-shared and shim directories name themselves; everything else is root.
+    if top_level == "spark-shared" or SHIM_DIR_RE.match(top_level):
+        return top_level
+    return "root"
+
+
+def is_classifier_class(class_name):
+ return bool(CLASSIFIER_PACKAGE_RE.search(class_name))
+
+
+def is_version_location(location):
+ return bool(SHIM_DIR_RE.match(location))
+
+
+def is_version_node(node):
+ class_name, location = node
+ return is_version_location(location) or is_classifier_class(class_name)
+
+
+def iter_class_entries(path):
+    """Yield (entry, bytes) for each .class under a jar or directory, skipping module-info."""
+    if zipfile.is_zipfile(path):
+        with zipfile.ZipFile(path) as zf:
+            for name in zf.namelist():
+                # Compare the base name so a root-level module-info.class is skipped too.
+                if name.endswith(".class") and name.rsplit("/", 1)[-1] != "module-info.class":
+                    yield name, zf.read(name)
+        return
+    for root, _, files in os.walk(path):
+        for file_name in files:
+            if not file_name.endswith(".class") or file_name == "module-info.class":
+                continue
+            full_path = os.path.join(root, file_name)
+            with open(full_path, "rb") as fh:
+                yield os.path.relpath(full_path, path).replace(os.sep, "/"), fh.read()
+
+
+def should_exclude(class_name, prefixes):
+    return class_name.startswith(tuple(prefixes))  # str.startswith accepts a tuple
+
+
+def load_classes(path, exclude_prefixes):
+    """Parse every class under path; return (classes, name_locations, errors)."""
+    classes = {}
+    name_locations = collections.defaultdict(set)
+    errors = []
+    for entry, data in iter_class_entries(path):
+        # Corrupt or truncated bytecode raises more than ValueError; record it, don't abort.
+        try:
+            class_name, deps = parse_class_file(data)
+        except (ValueError, struct.error, IndexError, TypeError, AttributeError) as exc:
+            errors.append("%s: %s" % (entry, exc))
+            continue
+        if should_exclude(class_name, exclude_prefixes):
+            continue
+        location = location_from_entry(entry)
+        classes[(class_name, location)] = ClassInfo(class_name, location, entry, deps)
+        name_locations[class_name].add(location)
+    return classes, name_locations, errors
+
+
+def resolve_dependency_targets(source_location, dep_name, name_locations):
+ locations = name_locations.get(dep_name)
+ if not locations:
+ return []
+
+ # Parent/root class loading wins in the current layout. Prefer a conventional
+ # class when one exists, then the source archive, then spark-shared, then the
+ # remaining version-specific locations.
+ ordered = []
+ for preferred in ("root", source_location, "spark-shared"):
+ if preferred in locations and preferred not in ordered:
+ ordered.append(preferred)
+ ordered.extend(sorted(loc for loc in locations if loc not in ordered))
+ return [(dep_name, loc) for loc in ordered]
+
+
+def build_graph(classes, name_locations):
+ graph = {node: set() for node in classes}
+ for node, info in classes.items():
+ for dep_name in info.deps:
+ for target in resolve_dependency_targets(info.location, dep_name, name_locations):
+ if target in classes:
+ graph[node].add(target)
+ return graph
+
+
+def reverse_graph(graph):
+ rev = {node: set() for node in graph}
+ for source, targets in graph.items():
+ for target in targets:
+ rev[target].add(source)
+ return rev
+
+
+def reachable_to_version_specific(graph):
+ rev = reverse_graph(graph)
+ version_nodes = {node for node in graph if is_version_node(node)}
+ marked = set(version_nodes)
+ queue = collections.deque(version_nodes)
+ while queue:
+ node = queue.popleft()
+ for parent in rev[node]:
+ if parent not in marked:
+ marked.add(parent)
+ queue.append(parent)
+ return marked, version_nodes
+
+
+def shortest_path_to_version(graph, start):
+ queue = collections.deque([(start, [start])])
+ seen = {start}
+ while queue:
+ node, path = queue.popleft()
+ if node != start and is_version_node(node):
+ return path
+ for next_node in sorted(graph[node]):
+ if next_node not in seen:
+ seen.add(next_node)
+ queue.append((next_node, path + [next_node]))
+ return None
+
+
+def tarjan_scc(graph):
+ sys.setrecursionlimit(max(sys.getrecursionlimit(), len(graph) * 2 + 1000))
+
+ index = 0
+ stack = []
+ on_stack = set()
+ indexes = {}
+ lowlinks = {}
+ components = []
+
+ def strongconnect(node):
+ nonlocal index
+ indexes[node] = index
+ lowlinks[node] = index
+ index += 1
+ stack.append(node)
+ on_stack.add(node)
+
+ for next_node in graph[node]:
+ if next_node not in indexes:
+ strongconnect(next_node)
+ lowlinks[node] = min(lowlinks[node], lowlinks[next_node])
+ elif next_node in on_stack:
+ lowlinks[node] = min(lowlinks[node], indexes[next_node])
+
+ if lowlinks[node] == indexes[node]:
+ component = []
+ while True:
+ item = stack.pop()
+ on_stack.remove(item)
+ component.append(item)
+ if item == node:
+ break
+ components.append(component)
+
+ for node in graph:
+ if node not in indexes:
+ strongconnect(node)
+ return components
+
+
+def dependency_first_component_order(graph, components):
+ comp_by_node = {}
+ for comp_id, component in enumerate(components):
+ for node in component:
+ comp_by_node[node] = comp_id
+
+ # Source -> target means "source depends on target". Reverse component
+ # edges so Kahn's algorithm emits dependencies before their users.
+ prereq_edges = collections.defaultdict(set)
+ indegree = collections.Counter()
+ for source, targets in graph.items():
+ source_comp = comp_by_node[source]
+ indegree.setdefault(source_comp, 0)
+ for target in targets:
+ target_comp = comp_by_node[target]
+ if source_comp == target_comp:
+ continue
+ if source_comp not in prereq_edges[target_comp]:
+ prereq_edges[target_comp].add(source_comp)
+ indegree[source_comp] += 1
+ indegree.setdefault(target_comp, indegree[target_comp])
+
+ ready = collections.deque(sorted(
+ comp_id for comp_id in range(len(components)) if indegree[comp_id] == 0))
+ ordered = []
+ while ready:
+ comp_id = ready.popleft()
+ ordered.append(comp_id)
+ for dependent in sorted(prereq_edges[comp_id]):
+ indegree[dependent] -= 1
+ if indegree[dependent] == 0:
+ ready.append(dependent)
+ return ordered
+
+
+def format_node(node):
+ class_name, location = node
+ return "%s (%s)" % (class_name, location)
+
+
+def print_path(path):
+ return " -> ".join(format_node(node) for node in path)
+
+
+def json_node(node):
+ class_name, location = node
+ return {
+ "className": class_name,
+ "location": location,
+ }
+
+
+def location_relative_entry(info):
+    """Return info.entry without its leading location directory, if it has one."""
+    if info.location == "root":
+        return info.entry
+    # partition yields an empty separator when the entry has no slash.
+    _, slash, rest = info.entry.partition("/")
+    return rest if slash else info.entry
+
+
+def direct_classifier_edges(graph):
+ edges = []
+ for source, targets in graph.items():
+ if is_classifier_class(source[0]):
+ continue
+ for target in targets:
+ if is_classifier_class(target[0]):
+ edges.append((source, target))
+ return sorted(edges)
+
+
+def version_blocker_counts(graph, version_nodes, root_or_shared):
+ """Count root/shared classes that can reach each version-specific node."""
+ rev = reverse_graph(graph)
+ counts = []
+ for version_node in sorted(version_nodes):
+ seen = {version_node}
+ queue = collections.deque([version_node])
+ impacted = set()
+ while queue:
+ node = queue.popleft()
+ for parent in rev[node]:
+ if parent in seen:
+ continue
+ seen.add(parent)
+ queue.append(parent)
+ if parent in root_or_shared:
+ impacted.add(parent)
+ if impacted:
+ counts.append((len(impacted), version_node))
+ return sorted(counts, key=lambda item: (-item[0], item[1]))
+
+
+def nearest_version_target_counts(graph, blocked):
+ """Count terminal version nodes from each blocked node's shortest path."""
+ rev = reverse_graph(graph)
+ distance = {}
+ queue = collections.deque()
+ for node in sorted(node for node in graph if is_version_node(node)):
+ distance[node] = 0
+ queue.append(node)
+
+ while queue:
+ node = queue.popleft()
+ for parent in sorted(rev[node]):
+ if parent in distance:
+ continue
+ distance[parent] = distance[node] + 1
+ queue.append(parent)
+
+ def rebuild_path(start):
+ path = [start]
+ node = start
+ while not is_version_node(node):
+ next_node = None
+ for candidate in sorted(graph[node]):
+ if distance.get(candidate) == distance[node] - 1:
+ next_node = candidate
+ break
+ if next_node is None:
+ return None
+ path.append(next_node)
+ node = next_node
+ return path
+
+ counts = collections.Counter()
+ examples = {}
+ paths = []
+ for node in blocked:
+ if node not in distance:
+ continue
+ path = rebuild_path(node)
+ if not path:
+ continue
+ paths.append((node, path))
+ target = path[-1]
+ counts[target] += 1
+ examples.setdefault(target, path)
+ ranked = sorted(
+ ((count, target, examples[target]) for target, count in counts.items()),
+ key=lambda item: (-item[0], item[1]))
+ return ranked, paths
+
+
+def main():
+ parser = argparse.ArgumentParser(description=__doc__)
+ parser.add_argument("path", help="dist/target/parallel-world directory or a dist jar")
+ parser.add_argument("--limit", type=int, default=20,
+ help="maximum number of examples to print per section")
+ parser.add_argument("--exclude-prefix", action="append", default=[],
+ help="class name prefix to exclude; may be passed more than once")
+ parser.add_argument("--show-safe", action="store_true",
+ help="print examples of spark-shared classes with no path to version-specific code")
+ parser.add_argument("--show-topo", action="store_true",
+ help="print root-safe spark-shared SCCs in dependency-first order")
+ parser.add_argument("--show-reachability", action="store_true",
+ help="print overlapping reachability counts for version-specific nodes")
+ parser.add_argument("--format", choices=("text", "json"), default="text",
+ help="output format")
+ parser.add_argument("--write-safe-paths",
+ help="write root-safe spark-shared class paths, one per line")
+ args = parser.parse_args()
+
+ exclude_prefixes = tuple(DEFAULT_EXCLUDES) + tuple(args.exclude_prefix)
+ classes, name_locations, errors = load_classes(args.path, exclude_prefixes)
+ graph = build_graph(classes, name_locations)
+ contaminated, version_nodes = reachable_to_version_specific(graph)
+ components = tarjan_scc(graph)
+ component_order = dependency_first_component_order(graph, components)
+
+ by_location = collections.Counter(info.location for info in classes.values())
+ root_or_shared = {
+ node for node, info in classes.items()
+ if info.location in ("root", "spark-shared") and not is_classifier_class(info.name)
+ }
+ blocked = sorted(root_or_shared & contaminated)
+ safe_shared = sorted(
+ node for node in root_or_shared - contaminated
+ if classes[node].location == "spark-shared")
+ classifier_edges = direct_classifier_edges(graph)
+ version_components = [comp for comp in components if any(is_version_node(node) for node in comp)]
+ safe_sccs = []
+ for comp_id in component_order:
+ component = components[comp_id]
+ safe_members = sorted(node for node in component if node in safe_shared)
+ if safe_members:
+ safe_sccs.append((comp_id, safe_members))
+ version_blockers = (
+ version_blocker_counts(graph, version_nodes, root_or_shared)
+ if args.show_reachability or args.format == "json" else [])
+ nearest_targets, blocked_paths = nearest_version_target_counts(graph, blocked)
+ safe_shared_paths = sorted(location_relative_entry(classes[node]) for node in safe_shared)
+
+ if args.write_safe_paths:
+ with open(args.write_safe_paths, "w", encoding="utf-8") as out:
+ for path in safe_shared_paths:
+ out.write(path)
+ out.write("\n")
+
+ if args.format == "json":
+ output = {
+ "path": args.path,
+ "classCount": len(classes),
+ "locationCounts": dict(sorted(by_location.items())),
+ "versionSpecificNodeCount": len(version_nodes),
+ "rootOrSharedBlockedCount": len(blocked),
+ "rootSafeSparkSharedCount": len(safe_shared),
+ "sccCount": len(components),
+ "versionSpecificSccCount": len(version_components),
+ "directClassifierDependencyCount": len(classifier_edges),
+ "rootSafeSparkSharedPaths": safe_shared_paths,
+ "directClassifierDependencyExamples": [
+ {
+ "source": json_node(source),
+ "target": json_node(target),
+ }
+ for source, target in classifier_edges[:args.limit]
+ ],
+ "topVersionBlockersByReachability": [
+ {
+ "blockedRootOrSharedCount": count,
+ "target": json_node(target),
+ }
+ for count, target in version_blockers[:args.limit]
+ ],
+ "nearestVersionTargetCounts": [
+ {
+ "blockedShortestPathCount": count,
+ "target": json_node(target),
+ "examplePath": [json_node(node) for node in path],
+ }
+ for count, target, path in nearest_targets[:args.limit]
+ ],
+ "rootSafeSparkSharedSccCount": len(safe_sccs),
+ "rootSafeSparkSharedSccExamples": [
+ {
+ "componentId": comp_id,
+ "classCount": len(members),
+ "classExamples": [json_node(node) for node in members[:args.limit]],
+ }
+ for comp_id, members in safe_sccs[:args.limit]
+ ],
+ "blockedExamples": [
+ [json_node(node) for node in path]
+ for _, path in blocked_paths[:args.limit]
+ ],
+ }
+ json.dump(output, sys.stdout, indent=2, sort_keys=True)
+ print()
+ return
+
+ print("Loaded %d classes from %s" % (len(classes), args.path))
+ if errors:
+ print("Skipped %d malformed class files" % len(errors))
+ print("Class locations:")
+ for location, count in sorted(by_location.items()):
+ print(" %s: %d" % (location, count))
+ print("Version-specific/classifier nodes: %d" % len(version_nodes))
+ print("Root or spark-shared nodes with a path to version-specific code: %d" % len(blocked))
+ print("Root-safe spark-shared nodes: %d" % len(safe_shared))
+ print("SCCs: %d total, %d containing version-specific code" %
+ (len(components), len(version_components)))
+
+ print("\nDirect classifier-package dependencies: %d" % len(classifier_edges))
+ for source, target in classifier_edges[:args.limit]:
+ print(" %s -> %s" % (format_node(source), format_node(target)))
+ if len(classifier_edges) > args.limit:
+ print(" ... %d more" % (len(classifier_edges) - args.limit))
+
+ if args.show_reachability:
+ print("\nTop version-specific blockers by upstream root/shared reachability:")
+ for count, target in version_blockers[:args.limit]:
+ print(" %d <- %s" % (count, format_node(target)))
+ if len(version_blockers) > args.limit:
+ print(" ... %d more" % (len(version_blockers) - args.limit))
+
+ print("\nNearest version targets from shortest blocked paths:")
+ for count, target, path in nearest_targets[:args.limit]:
+ print(" %d -> %s" % (count, format_node(target)))
+ print(" e.g. %s" % print_path(path))
+ if len(nearest_targets) > args.limit:
+ print(" ... %d more" % (len(nearest_targets) - args.limit))
+
+ print("\nNearest paths from root/spark-shared code to version-specific code:")
+ for _, path in blocked_paths[:args.limit]:
+ print(" %s" % print_path(path))
+ if len(blocked) > args.limit:
+ print(" ... %d more blocked classes" % (len(blocked) - args.limit))
+
+ if args.show_safe:
+ print("\nSpark-shared classes with no path to version-specific code:")
+ for node in safe_shared[:args.limit]:
+ print(" %s" % format_node(node))
+ if len(safe_shared) > args.limit:
+ print(" ... %d more" % (len(safe_shared) - args.limit))
+
+ if args.show_topo:
+ print("\nRoot-safe spark-shared SCCs in dependency-first order:")
+ for printed, (comp_id, safe_members) in enumerate(safe_sccs):
+ print(" component %d, %d class(es)" % (comp_id, len(safe_members)))
+ for node in safe_members[:3]:
+ print(" %s" % format_node(node))
+ if len(safe_members) > 3:
+ print(" ... %d more in component" % (len(safe_members) - 3))
+ if printed + 1 >= args.limit:
+ break
+
+
+if __name__ == "__main__":
+ sys.exit(main())
diff --git a/dist/scripts/binary-dedupe.sh b/dist/scripts/binary-dedupe.sh
index 2054e18ccf9..ea3ac931413 100755
--- a/dist/scripts/binary-dedupe.sh
+++ b/dist/scripts/binary-dedupe.sh
@@ -35,10 +35,34 @@ esac
STEP=0
export SPARK_SHARED_TXT="$PWD/spark-shared.txt"
+export SPARK_SHARED_CLASSES_TXT="$PWD/spark-shared-classes.txt"
export SPARK_SHARED_COPY_LIST="$PWD/spark-shared-copy-list.txt"
export DELETE_DUPLICATES_TXT="$PWD/delete-duplicates.txt"
export SPARK_SHARED_DIR="$PWD/spark-shared"
export UNSHIMMED_FROM_SPARK_SHARED_COPY_LIST="$PWD/unshimmed-from-spark-shared-copy-list.txt"
+export ROOT_SAFE_SPARK_SHARED_TXT="$PWD/root-safe-spark-shared.txt"
+export DEFAULT_UNSHIMMED_SPARK_SHARED_TXT="$PWD/default-unshimmed-spark-shared.txt"
+export UNSHIMMED_NEED_SHARED_TXT="$PWD/unshimmed-need-shared.txt"
+export UNSHIMMED_MISSING_SHARED_TXT="$PWD/unshimmed-missing-shared.txt"
+
+# Shim directories directly under ./parallel-world (names matching spark[34]*),
+# in sorted order. The list is only collected when UNSHIM_FAST=1 and stays
+# empty otherwise.
+SPARK_SHIM_DIRS=()
+if [[ "${UNSHIM_FAST:-0}" == "1" ]]; then
+  while IFS= read -r shim_dir; do
+    SPARK_SHIM_DIRS+=("$shim_dir")
+  done < <(find ./parallel-world -maxdepth 1 -mindepth 1 -type d -name 'spark[34]*' | sort)
+fi
+
+# Optional cache of the intermediate lists, rooted at UNSHIM_DEDUPE_CACHE_DIR.
+# Every cache path expands to the empty string when no cache directory is set.
+DEDUPE_CACHE_DIR="${UNSHIM_DEDUPE_CACHE_DIR:-}"
+DEDUPE_CACHE_SPARK_SHARED_TXT="${DEDUPE_CACHE_DIR:+$DEDUPE_CACHE_DIR/spark-shared.txt}"
+DEDUPE_CACHE_SHA1_FILES_TXT="${DEDUPE_CACHE_DIR:+$DEDUPE_CACHE_DIR/tmp-sha1-files.txt}"
+DEDUPE_CACHE_SHIM_SHA_PACKAGE_FILES_TXT="${DEDUPE_CACHE_DIR:+$DEDUPE_CACHE_DIR/tmp-shim-sha-package-files.txt}"
+DEDUPE_CACHE_COUNT_SHIM_SHA_PACKAGE_FILES_TXT="${DEDUPE_CACHE_DIR:+$DEDUPE_CACHE_DIR/tmp-count-shim-sha-package-files.txt}"
# This script de-duplicates .class files at the binary level.
# We could also diff classes using scalap / javap outputs.
@@ -55,24 +79,54 @@ export UNSHIMMED_FROM_SPARK_SHARED_COPY_LIST="$PWD/unshimmed-from-spark-shared-c
# - put the path starting with /sparkxyz back together for the final list
echo "Retrieving class files hashing to a single value ..."
-
-echo "$((++STEP))/ SHA1 of all non-META files > tmp-sha1-files.txt"
-find ./parallel-world/spark[34]* -name META-INF -prune -o -name webapps -prune -o \( -type f -print0 \) | \
- xargs --null $SHASUM > tmp-sha1-files.txt
-
-echo "$((++STEP))/ make shim column 1 > tmp-shim-sha-package-files.txt"
-< tmp-sha1-files.txt awk -F/ '$1=$1' | \
- awk '{checksum=$1; shim=$4; $1=shim; $2=$3=""; $4=checksum; print $0}' | \
- tr -s ' ' > tmp-shim-sha-package-files.txt
-
-echo "$((++STEP))/ sort by path, sha1; output first from each group > tmp-count-shim-sha-package-files.txt"
-sort -k3 -k2,2 -u tmp-shim-sha-package-files.txt | \
- uniq -f 2 -c > tmp-count-shim-sha-package-files.txt
-
-echo "$((++STEP))/ files with unique sha1 > $SPARK_SHARED_TXT"
-grep '^\s\+1 .*' tmp-count-shim-sha-package-files.txt | \
- awk '{$1=""; $3=""; print $0 }' | \
- tr -s ' ' | sed 's/\ /\//g' > "$SPARK_SHARED_TXT"
+# Produce $SPARK_SHARED_TXT (one /sparkXYZ/<path> entry per line) in one of three ways:
+# 1. cache hit: copy the four cached lists back into the working directory;
+# 2. UNSHIM_FAST with exactly one shim directory: list that shim's files directly;
+# 3. otherwise: hash every non-META file and keep the paths that have a single
+# distinct sha1.
+# NOTE(review): a cache hit is decided by file presence only; the cached content
+# is not validated here, so the cache directory name must identify the inputs.
+CACHE_HIT=0
+if [[ -n "$DEDUPE_CACHE_SPARK_SHARED_TXT" && \
+ -f "$DEDUPE_CACHE_SPARK_SHARED_TXT" && \
+ -f "$DEDUPE_CACHE_SHA1_FILES_TXT" && \
+ -f "$DEDUPE_CACHE_SHIM_SHA_PACKAGE_FILES_TXT" && \
+ -f "$DEDUPE_CACHE_COUNT_SHIM_SHA_PACKAGE_FILES_TXT" ]]; then
+ echo "$((++STEP))/ reusing cached files with unique sha1 > $SPARK_SHARED_TXT"
+ cp "$DEDUPE_CACHE_SPARK_SHARED_TXT" "$SPARK_SHARED_TXT"
+ cp "$DEDUPE_CACHE_SHA1_FILES_TXT" tmp-sha1-files.txt
+ cp "$DEDUPE_CACHE_SHIM_SHA_PACKAGE_FILES_TXT" tmp-shim-sha-package-files.txt
+ cp "$DEDUPE_CACHE_COUNT_SHIM_SHA_PACKAGE_FILES_TXT" tmp-count-shim-sha-package-files.txt
+ CACHE_HIT=1
+# With one shim there is no cross-shim identity proof to perform; every
+# non-META file is the sole representative for its path.
+elif [[ "${UNSHIM_FAST:-0}" == "1" && "${#SPARK_SHIM_DIRS[@]}" == "1" ]]; then
+ echo "$((++STEP))/ single shim fast path; listing files > $SPARK_SHARED_TXT"
+ # No hashing happens on this path; the intermediate lists are created empty so
+ # that they still exist for the cache copy below.
+ : > tmp-sha1-files.txt
+ : > tmp-shim-sha-package-files.txt
+ : > tmp-count-shim-sha-package-files.txt
+ # Stripping the leading ./parallel-world leaves entries of the form /sparkXYZ/<path>.
+ find "${SPARK_SHIM_DIRS[0]}" -name META-INF -prune -o -name webapps -prune -o \( -type f -print \) | \
+ sort | sed 's|^\./parallel-world||' > "$SPARK_SHARED_TXT"
+else
+ echo "$((++STEP))/ SHA1 of all non-META files > tmp-sha1-files.txt"
+ find ./parallel-world/spark[34]* -name META-INF -prune -o -name webapps -prune -o \( -type f -print0 \) | \
+ xargs --null $SHASUM > tmp-sha1-files.txt
+
+ # The first awk turns every "/" into a space; the second reorders the columns
+ # to: shim, checksum, then the path components (still space separated).
+ echo "$((++STEP))/ make shim column 1 > tmp-shim-sha-package-files.txt"
+ < tmp-sha1-files.txt awk -F/ '$1=$1' | \
+ awk '{checksum=$1; shim=$4; $1=shim; $2=$3=""; $4=checksum; print $0}' | \
+ tr -s ' ' > tmp-shim-sha-package-files.txt
+
+ # sort -u keeps one line per (path, sha1) pair; uniq -f 2 -c then counts the
+ # distinct sha1 values seen for each path.
+ echo "$((++STEP))/ sort by path, sha1; output first from each group > tmp-count-shim-sha-package-files.txt"
+ sort -k3 -k2,2 -u tmp-shim-sha-package-files.txt | \
+ uniq -f 2 -c > tmp-count-shim-sha-package-files.txt
+
+ # Keep paths with a count of 1, drop the count and sha1 columns, and join the
+ # remaining columns with "/" to form /sparkXYZ/<path>.
+ echo "$((++STEP))/ files with unique sha1 > $SPARK_SHARED_TXT"
+ grep '^\s\+1 .*' tmp-count-shim-sha-package-files.txt | \
+ awk '{$1=""; $3=""; print $0 }' | \
+ tr -s ' ' | sed 's/\ /\//g' > "$SPARK_SHARED_TXT"
+fi
+
+# Copies one list into the dedupe cache through a temporary sibling file that
+# is renamed into place once the copy has finished. The cache-hit test only
+# checks that the cached files exist, so a file that becomes visible must
+# already be complete; an interrupted or failed copy must not leave a
+# truncated file behind under the final name.
+#   $1 - source file
+#   $2 - destination path inside the cache directory
+# Returns non-zero when the copy or the rename fails.
+function store_in_dedupe_cache() {
+  local src="$1"
+  local dst="$2"
+  local staged="$dst.tmp.$$"
+  cp "$src" "$staged" && mv -f "$staged" "$dst"
+}
+
+# Populate the cache after a miss so the next run with the same cache
+# directory can skip hashing.
+if [[ "$CACHE_HIT" == "0" && -n "$DEDUPE_CACHE_SPARK_SHARED_TXT" ]]; then
+  mkdir -p "$DEDUPE_CACHE_DIR"
+  store_in_dedupe_cache "$SPARK_SHARED_TXT" "$DEDUPE_CACHE_SPARK_SHARED_TXT"
+  store_in_dedupe_cache tmp-sha1-files.txt "$DEDUPE_CACHE_SHA1_FILES_TXT"
+  store_in_dedupe_cache tmp-shim-sha-package-files.txt "$DEDUPE_CACHE_SHIM_SHA_PACKAGE_FILES_TXT"
+  store_in_dedupe_cache tmp-count-shim-sha-package-files.txt "$DEDUPE_CACHE_COUNT_SHIM_SHA_PACKAGE_FILES_TXT"
+fi
function retain_single_copy() {
set -e
@@ -100,9 +154,10 @@ function retain_single_copy() {
done >> "$DELETE_DUPLICATES_TXT" || exit 255
}
-function copy_unshimmed_from_spark_shared() {
+function append_matching_spark_shared_patterns() {
set -e
- local unshimmed_patterns_txt="${UNSHIMMED_COMMON_FROM_SINGLE_SHIM_TXT:-}"
+ local unshimmed_patterns_txt="$1"
+ local output_txt="$2"
[[ -n "$unshimmed_patterns_txt" ]] || return 0
[[ -f "$unshimmed_patterns_txt" ]] || {
@@ -110,23 +165,102 @@ function copy_unshimmed_from_spark_shared() {
exit 255
}
- : > "$UNSHIMMED_FROM_SPARK_SHARED_COPY_LIST"
- while read -r shared_path; do
- local rel_path="${shared_path#./parallel-world/spark-shared/}"
- local pattern
- while read -r pattern; do
- [[ -n "$pattern" ]] || continue
- [[ "$pattern" =~ ^[[:space:]]*# ]] && continue
- # shellcheck disable=SC2053
- if [[ "$rel_path" == $pattern ]]; then
- echo "$rel_path" >> "$UNSHIMMED_FROM_SPARK_SHARED_COPY_LIST"
- break
- fi
- done < "$unshimmed_patterns_txt"
- done < <(find ./parallel-world/spark-shared -type f)
+ local shared_dir="./parallel-world/spark-shared"
+ local pattern
+ while IFS= read -r pattern; do
+ [[ -n "$pattern" ]] || continue
+ [[ "$pattern" =~ ^[[:space:]]*# ]] && continue
+ case "$pattern" in
+ *[\*\?\[]*)
+ find "$shared_dir" -type f -path "$shared_dir/$pattern" |
+ sed "s|^\./parallel-world/spark-shared/||" >> "$output_txt"
+ ;;
+ *)
+ if [[ -f "$shared_dir/$pattern" ]]; then
+ echo "$pattern" >> "$output_txt"
+ fi
+ ;;
+ esac
+ done < "$unshimmed_patterns_txt"
+}
+
+# Runs analyze-parallel-world-deps.py over ./parallel-world and has it write
+# its safe-path list to $ROOT_SAFE_SPARK_SHARED_TXT.
+# The analyzer is taken from UNSHIM_ANALYZER_SCRIPT when set; otherwise it is
+# looked up under scripts/ next to UNSHIMMED_COMMON_FROM_SINGLE_SHIM_TXT.
+# Exits the script with status 255 when no analyzer file can be found.
+function write_root_safe_spark_shared_classes() {
+  set -e
+  local analyzer_script="${UNSHIM_ANALYZER_SCRIPT:-}"
+  local allowlist_txt="${UNSHIMMED_COMMON_FROM_SINGLE_SHIM_TXT:-}"
+  if [[ -z "$analyzer_script" && -n "$allowlist_txt" ]]; then
+    analyzer_script="$(dirname "$allowlist_txt")/scripts/analyze-parallel-world-deps.py"
+  fi
+  if [[ -z "$analyzer_script" || ! -f "$analyzer_script" ]]; then
+    echo >&2 "Cannot locate analyze-parallel-world-deps.py for default unshim analysis"
+    exit 255
+  fi
+
+  echo "$((++STEP))/ analyzing spark-shared dependency paths > $ROOT_SAFE_SPARK_SHARED_TXT"
+  python3 "$analyzer_script" ./parallel-world \
+    --write-safe-paths "$ROOT_SAFE_SPARK_SHARED_TXT"
+}
+# Writes the default promotion list to $DEFAULT_UNSHIMMED_SPARK_SHARED_TXT:
+# every .class entry of $SPARK_SHARED_TXT with its leading /sparkXYZ/ component
+# removed, sorted and de-duplicated.
+function write_default_unshimmed_spark_shared_classes() {
+  set -e
+  echo "$((++STEP))/ selecting all bitwise-identical spark-shared classes > $DEFAULT_UNSHIMMED_SPARK_SHARED_TXT"
+  # Filtering before or after the prefix strip selects the same lines: the
+  # stripped prefix ends in "/" and so never overlaps the ".class" suffix.
+  grep '\.class$' "$SPARK_SHARED_TXT" \
+    | sed -E "s|^/spark[^/]*/||" \
+    | sort -u > "$DEFAULT_UNSHIMMED_SPARK_SHARED_TXT"
+}
+
+# Tells whether a spark-shared relative path must stay in spark-shared.
+#   $1 - path relative to spark-shared, for example com/foo/Bar.class
+# Reads glob patterns, one per line, from the file named by
+# KEEP_IN_SPARK_SHARED_TXT; empty lines and lines starting with '#' (after
+# optional whitespace) are skipped.
+# Returns 0 when a pattern matches, 1 when none matches or when
+# KEEP_IN_SPARK_SHARED_TXT is unset or empty.
+# Exits the script with status 255 when the variable names a missing file.
+function keep_in_spark_shared() {
+  set -e
+  local class_file="$1"
+  local keep_patterns_txt="${KEEP_IN_SPARK_SHARED_TXT:-}"
+  [[ -n "$keep_patterns_txt" ]] || return 1
+  [[ -f "$keep_patterns_txt" ]] || {
+    echo >&2 "Keep-in-spark-shared list does not exist: $keep_patterns_txt"
+    exit 255
+  }
+
+  local pattern
+  # `read` returns non-zero on a final line that lacks a trailing newline but
+  # still fills in the variable; the extra test keeps that last pattern from
+  # being dropped silently.
+  while IFS= read -r pattern || [[ -n "$pattern" ]]; do
+    [[ -n "$pattern" ]] || continue
+    [[ "$pattern" =~ ^[[:space:]]*# ]] && continue
+    # The right-hand side is deliberately unquoted so it acts as a glob.
+    # shellcheck disable=SC2053
+    if [[ "$class_file" == $pattern ]]; then
+      return 0
+    fi
+  done < "$keep_patterns_txt"
+  return 1
+}
+
+# Copies a list of paths, leaving out the entries that keep_in_spark_shared
+# accepts and any empty lines.
+#   $1 - input list, one path per line
+#   $2 - output list; written through "$2.tmp" and then moved into place
+function filter_keep_in_spark_shared() {
+  set -e
+  local input_txt="$1"
+  local output_txt="$2"
+  local staged_txt="$output_txt.tmp"
+  local candidate
+  : > "$output_txt"
+  while IFS= read -r candidate; do
+    [[ -z "$candidate" ]] && continue
+    keep_in_spark_shared "$candidate" || echo "$candidate"
+  done < "$input_txt" > "$staged_txt"
+  mv "$staged_txt" "$output_txt"
+}
+
+function copy_unshimmed_from_spark_shared() {
+ set -e
+ local raw_copy_list="$UNSHIMMED_FROM_SPARK_SHARED_COPY_LIST.raw"
+ local sorted_copy_list="$UNSHIMMED_FROM_SPARK_SHARED_COPY_LIST.sorted"
+
+ : > "$raw_copy_list"
+ write_root_safe_spark_shared_classes
+ write_default_unshimmed_spark_shared_classes
+ cat "$DEFAULT_UNSHIMMED_SPARK_SHARED_TXT" >> "$raw_copy_list"
+ append_matching_spark_shared_patterns \
+ "${UNSHIMMED_COMMON_FROM_SINGLE_SHIM_TXT:-}" "$raw_copy_list"
+
+ sort -u "$raw_copy_list" > "$sorted_copy_list"
+ filter_keep_in_spark_shared "$sorted_copy_list" "$UNSHIMMED_FROM_SPARK_SHARED_COPY_LIST"
if [[ -s "$UNSHIMMED_FROM_SPARK_SHARED_COPY_LIST" ]]; then
- echo "Promoting root-layout files from spark-shared via $unshimmed_patterns_txt"
+ echo "Promoting root-layout files from spark-shared by default"
rsync --files-from="$UNSHIMMED_FROM_SPARK_SHARED_COPY_LIST" \
./parallel-world/spark-shared ./parallel-world
fi
@@ -141,9 +275,23 @@ rm -rf "$SPARK_SHARED_DIR"
mkdir -p "$SPARK_SHARED_DIR"
echo "$((++STEP))/ retaining a single copy of spark-shared classes"
-while read -r spark_common_class; do
- retain_single_copy "$spark_common_class"
-done < "$SPARK_SHARED_TXT"
+# Split the /sparkXYZ/<path> entries of $SPARK_SHARED_TXT into one copy list per
+# shim, named from-<shim>-to-spark-shared.txt and holding shim-relative paths.
+awk -F/ '
+  NF >= 3 {
+    shim = $2
+    rel_path = $0
+    sub("^/spark[34][^/]*/", "", rel_path)
+    print rel_path >> ("from-" shim "-to-spark-shared.txt")
+  }
+' "$SPARK_SHARED_TXT"
+# Queue every shared path for deletion under every shim directory. Each path is
+# paired with each directory without checking that the file exists there.
+for pw in ./parallel-world/spark[34]* ; do
+  awk -v pw="$pw" '
+    {
+      rel_path = $0
+      sub("^/spark[34][^/]*/", "", rel_path)
+      print pw "/" rel_path
+    }
+  ' "$SPARK_SHARED_TXT"
+done >> "$DELETE_DUPLICATES_TXT"
echo "$((++STEP))/ rsyncing common classes to $SPARK_SHARED_DIR"
for copy_list in from-spark[34]*-to-spark-shared.txt; do
@@ -157,7 +305,7 @@ done
mv "$SPARK_SHARED_DIR" parallel-world/
-echo "$((++STEP))/ promoting allowlisted spark-shared files to root layout"
+echo "$((++STEP))/ promoting default spark-shared files to root layout"
copy_unshimmed_from_spark_shared
# Verify that all class files in the conventional jar location are bitwise
@@ -184,11 +332,16 @@ copy_unshimmed_from_spark_shared
# Determine the list of unshimmed class files
UNSHIMMED_LIST_TXT=unshimmed-result.txt
-echo "$((++STEP))/ creating sorted list of unshimmed classes > $UNSHIMMED_LIST_TXT"
-find ./parallel-world -name '*.class' -not -path './parallel-world/spark[34-]*' | \
+# Root-layout classes are the .class files under ./parallel-world that are not
+# inside a sparkXYZ shim directory or spark-shared. `cut -d/ -f 3-` drops the
+# leading "./parallel-world/" so each entry is a package-relative path.
+# NOTE(review): the 'spark[34-]*' pattern already matches spark-shared (the
+# bracket includes '-'), so the explicit spark-shared exclusion looks
+# redundant — confirm before removing it.
+echo "$((++STEP))/ creating sorted list of root-layout unshimmed classes > $UNSHIMMED_LIST_TXT"
+find ./parallel-world -name '*.class' \
+ -not -path './parallel-world/spark[34-]*' \
+ -not -path './parallel-world/spark-shared/*' | \
+ cut -d/ -f 3- | sort > "$UNSHIMMED_LIST_TXT"
-function verify_same_sha_for_unshimmed() {
+echo "$((++STEP))/ creating sorted list of spark-shared classes > $SPARK_SHARED_CLASSES_TXT"
+sed -E "s|^/spark[^/]*/||" "$SPARK_SHARED_TXT" | sort -u > "$SPARK_SHARED_CLASSES_TXT"
+
+function unshimmed_class_needs_shared_identity() {
set -e
class_file="$1"
@@ -196,7 +349,7 @@ function verify_same_sha_for_unshimmed() {
# including the ones that are unshimmed. Instead of expensively recomputing
# sha1 look up if there is an entry with the unshimmed class as a suffix
- class_file_quoted=$(printf '%q' "$class_file")
+ class_file_quoted=$(printf "%q" "$class_file")
# TODO currently RapidsShuffleManager is "removed" from /spark* by construction in
# dist pom.xml via ant. We could delegate this logic to this script
# and make both simmpler
@@ -211,34 +364,72 @@ function verify_same_sha_for_unshimmed() {
# the class provides concrete implementations for ALL getReader variants,
# so the JVM resolves the correct one at runtime regardless of which
# ShuffleManager version the class was compiled against.
- if [[ ! "$class_file_quoted" =~ com/nvidia/spark/rapids/spark[34].*/.*ShuffleManager.class && \
- "$class_file_quoted" != "com/nvidia/spark/ParquetCachedBatchSerializer.class" && \
- ! "$class_file_quoted" =~ org/apache/spark/sql/rapids/ProxyRapidsShuffleInternalManagerBase ]]; then
- if ! grep -q "/spark.\+/$class_file_quoted" "$SPARK_SHARED_TXT"; then
- echo >&2 "$class_file is not bitwise-identical across shims"
- exit 255
- fi
+ # GpuShuffleDependency has identical JVM bytecode and descriptors between
+ # Spark 3.5 and 4.1. Only ScalaSignature metadata differs after compiling
+ # the same source against different Spark dependency jars. WindowInPandasExecTypeShim
+ # has no methods in the class shell; its companion carries the behavior.
+ # CloseableColumnBatchIterator has identical descriptors and code; Scala 2.13 only
+ # renames generic Signature-attribute type variables across the Spark 3.5/4.1 compiles.
+ # GpuReadCSVFileFormat and GpuReadJsonFileFormat have identical descriptors and
+ # executable javap output; only ScalaSignature metadata differs across Spark deps.
+ # PythonMapInArrowExecShims and PythonArgumentUtils class shells have identical
+ # executable bytecode; only source-file metadata differs across shim source names.
+ # GpuUnionExecShim and RapidsErrorUtils class shells have identical executable
+ # bytecode; only ScalaSignature metadata differs.
+ # GpuStringTrim* differs after Spark 4.1 because String2TrimExpression adds
+ # collation/context-independent foldability methods. The case-class fields,
+ # product surface, and Spark 3.5-callable methods remain compatible; Spark 3.x
+ # does not invoke the added methods.
+ # GpuAtomicCreateTableAsSelectExec companion has identical executable bytecode;
+ # only line-number debug metadata differs across shim sources.
+ if [[ "$class_file_quoted" =~ com/nvidia/spark/rapids/spark[34].*/.*ShuffleManager.class || \
+ "$class_file_quoted" == "com/nvidia/spark/ParquetCachedBatchSerializer.class" || \
+ "$class_file_quoted" =~ org/apache/spark/sql/rapids/ProxyRapidsShuffleInternalManagerBase || \
+ "$class_file_quoted" == "org/apache/spark/sql/rapids/GpuShuffleDependency.class" || \
+ "$class_file_quoted" == "com/nvidia/spark/rapids/parquet/CloseableColumnBatchIterator.class" || \
+ "$class_file_quoted" == "com/nvidia/spark/rapids/GpuReadCSVFileFormat.class" || \
+ "$class_file_quoted" == "org/apache/spark/sql/catalyst/json/rapids/GpuReadJsonFileFormat.class" || \
+ "$class_file_quoted" == "com/nvidia/spark/rapids/shims/PythonMapInArrowExecShims.class" || \
+ "$class_file_quoted" == "org/apache/spark/sql/rapids/execution/python/shims/PythonArgumentUtils.class" || \
+ "$class_file_quoted" == "com/nvidia/spark/rapids/shims/GpuUnionExecShim.class" || \
+ "$class_file_quoted" == "org/apache/spark/sql/rapids/GpuStringTrim.class" || \
+ "$class_file_quoted" == "org/apache/spark/sql/rapids/GpuStringTrimLeft.class" || \
+ "$class_file_quoted" == "org/apache/spark/sql/rapids/GpuStringTrimRight.class" || \
+ "$class_file" == "org/apache/spark/sql/execution/datasources/v2/rapids/GpuAtomicCreateTableAsSelectExec$.class" || \
+ "$class_file_quoted" == "org/apache/spark/sql/rapids/shims/RapidsErrorUtils.class" || \
+ "$class_file_quoted" == "org/apache/spark/sql/rapids/execution/python/shims/WindowInPandasExecTypeShim.class" ]]; then
+ return 1
fi
+ return 0
}
-echo "$((++STEP))/ verifying unshimmed classes have unique sha1 across shims"
+echo "$((++STEP))/ filtering unshimmed classes that require shared identity > $UNSHIMMED_NEED_SHARED_TXT"
while read -r unshimmed_class; do
- verify_same_sha_for_unshimmed "$unshimmed_class"
-done < "$UNSHIMMED_LIST_TXT"
+ if unshimmed_class_needs_shared_identity "$unshimmed_class"; then
+ echo "$unshimmed_class"
+ fi
+done < "$UNSHIMMED_LIST_TXT" | sort -u > "$UNSHIMMED_NEED_SHARED_TXT"
+
+echo "$((++STEP))/ verifying unshimmed classes have unique sha1 across shims"
+# comm -23 keeps the lines found only in the first list: unshimmed classes that
+# need a shared identity but have no spark-shared entry. Both inputs were
+# written by `sort -u`, which comm requires.
+comm -23 "$UNSHIMMED_NEED_SHARED_TXT" "$SPARK_SHARED_CLASSES_TXT" > "$UNSHIMMED_MISSING_SHARED_TXT"
+if [[ -s "$UNSHIMMED_MISSING_SHARED_TXT" ]]; then
+  # Report every offending class in one run instead of only the first, so a
+  # single failed build shows the full list that needs attention.
+  while IFS= read -r missing_unshimmed_class; do
+    echo >&2 "$missing_unshimmed_class is not bitwise-identical across shims"
+  done < "$UNSHIMMED_MISSING_SHARED_TXT"
+  exit 255
+fi
# Remove unshimmed classes from parallel worlds
# TODO rework with low priority, only a few classes.
echo "$((++STEP))/ removing duplicates of unshimmed classes"
-
-while read -r unshimmed_class; do
+# Queue each root-layout class for deletion from spark-shared and from every
+# directory matching spark[34-]*. Paths are generated without checking that
+# the file exists.
+{
+ sed "s|^|./parallel-world/spark-shared/|" "$UNSHIMMED_LIST_TXT"
for pw in ./parallel-world/spark[34-]* ; do
- unshimmed_path="$pw/$unshimmed_class"
- [[ -f "$unshimmed_path" ]] && echo "$unshimmed_path" || true
- done >> "$DELETE_DUPLICATES_TXT"
-done < "$UNSHIMMED_LIST_TXT"
+ awk -v pw="$pw" "{ print pw \"/\" \$0 }" "$UNSHIMMED_LIST_TXT"
+ done
+} >> "$DELETE_DUPLICATES_TXT"
echo "$((++STEP))/ deleting all class files listed in $DELETE_DUPLICATES_TXT"
-< "$DELETE_DUPLICATES_TXT" sort -u | xargs rm
+< "$DELETE_DUPLICATES_TXT" sort -u | xargs rm -f
end_time=$(date +%s)
echo "binary-dedupe completed in $((end_time - start_time)) seconds"
diff --git a/dist/scripts/build-unshim-parallel-world.py b/dist/scripts/build-unshim-parallel-world.py
new file mode 100644
index 00000000000..9f41be63558
--- /dev/null
+++ b/dist/scripts/build-unshim-parallel-world.py
@@ -0,0 +1,292 @@
+#!/usr/bin/env python3
+
+# Copyright (c) 2026, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Build dist/target/parallel-world directly for repeated unshim analysis.
+
+This mirrors the analyzer-relevant part of dist/maven-antrun/build-parallel-worlds.xml
+without starting a final Maven dist generate-resources invocation. It assumes buildall
+has already built the per-shim sql-plugin-api and aggregator jars under target/sparkXYZ.
+"""
+
+import argparse
+import fnmatch
+import hashlib
+import os
+from pathlib import Path
+import shutil
+import subprocess
+import sys
+import zipfile
+
+
+ARTIFACTS = ("sql-plugin-api", "aggregator")
+
+
+def read_patterns(path):
+    """Return the active patterns listed in the text file ``path``.
+
+    ``path`` is a ``pathlib.Path``. Each line is stripped of surrounding
+    whitespace; blank lines and lines whose first non-blank character is
+    ``#`` are skipped. The remaining lines are returned in file order.
+    """
+    patterns = []
+    # Decode explicitly so the result does not depend on the locale's
+    # default encoding.
+    with path.open(encoding="utf-8") as fh:
+        for raw_line in fh:
+            entry = raw_line.strip()
+            if entry and not entry.startswith("#"):
+                patterns.append(entry)
+    return patterns
+
+
+def has_fnmatch_magic(pattern):
+    """Tell whether ``pattern`` contains any of the characters ``*``, ``?`` or ``[``."""
+    for wildcard in ("*", "?", "["):
+        if wildcard in pattern:
+            return True
+    return False
+
+
+def matching_members(namelist, patterns):
+    """Collect the entries of ``namelist`` selected by ``patterns``.
+
+    Patterns are processed in order. A pattern with wildcard characters is
+    applied with ``fnmatch.filter``; any other pattern selects the entries
+    equal to it, once per occurrence in ``namelist``. An entry matched by
+    several patterns appears once per match.
+    """
+    occurrences = {}
+    for entry in namelist:
+        occurrences[entry] = occurrences.get(entry, 0) + 1
+
+    selected = []
+    for pattern in patterns:
+        if has_fnmatch_magic(pattern):
+            selected += fnmatch.filter(namelist, pattern)
+        else:
+            selected += [pattern] * occurrences.get(pattern, 0)
+    return selected
+
+
+def safe_extract(zip_handle, destination, members=None):
+    """Extract archive members into ``destination``, refusing escaping paths.
+
+    ``members`` defaults to every name in the archive. Members are handled
+    in order; a ``RuntimeError`` is raised for the first member whose
+    resolved target does not lie below the resolved ``destination``.
+    Members before it have already been extracted.
+    """
+    destination = destination.resolve()
+    allowed_prefix = str(destination) + os.sep
+    names = zip_handle.namelist() if members is None else members
+    for name in names:
+        resolved = (destination / name).resolve()
+        if not str(resolved).startswith(allowed_prefix):
+            raise RuntimeError("refusing to extract outside destination: %s" % name)
+        zip_handle.extract(name, destination)
+
+
+def clean_output(target_dir):
+    """Reset the output locations under ``target_dir``.
+
+    The ``parallel-world``, ``deps`` and ``extra-resources`` directories are
+    removed if present and recreated empty; every ``*.jar`` file directly in
+    ``target_dir`` is deleted.
+    """
+    output_dirs = [
+        target_dir / name
+        for name in ("parallel-world", "deps", "extra-resources")
+    ]
+    for output_dir in output_dirs:
+        if output_dir.exists():
+            shutil.rmtree(output_dir)
+        output_dir.mkdir(parents=True, exist_ok=True)
+    for stale_jar in target_dir.glob("*.jar"):
+        stale_jar.unlink()
+
+
+def artifact_jar(base_dir, artifact, scala_binary_version, project_version, buildver):
+    """Return the path of the built jar for one artifact and build version.
+
+    The jar is expected at
+    ``<base_dir>/<artifact>/target/spark<buildver>/rapids-4-spark-<artifact>_<scala>-<version>-spark<buildver>.jar``.
+    Raises ``FileNotFoundError`` when that file does not exist.
+    """
+    classifier = "spark%s" % buildver
+    artifact_id = "rapids-4-spark-%s_%s" % (artifact, scala_binary_version)
+    classifier_dir = base_dir / artifact / "target" / classifier
+    candidate = classifier_dir / ("%s-%s-%s.jar" % (artifact_id, project_version, classifier))
+    if candidate.is_file():
+        return candidate
+    raise FileNotFoundError(
+        "expected built %s jar missing: %s" % (artifact, candidate))
+
+
+def jar_signature(jar_path):
+    """Describe a jar by its path, size and modification time.
+
+    Returns three ``key=value`` lines (``path``, ``size``, ``mtime_ns``),
+    each terminated by a newline.
+    """
+    info = jar_path.stat()
+    fields = [
+        "path=%s" % jar_path,
+        "size=%s" % info.st_size,
+        "mtime_ns=%s" % info.st_mtime_ns,
+    ]
+    return "\n".join(fields) + "\n"
+
+
+def dedupe_cache_key(base_dir, scala_binary_version, project_version, buildvers):
+    """Return a hex SHA-1 digest identifying the set of input jars.
+
+    For every build version (in descending sort order) and every artifact in
+    ``ARTIFACTS``, the digest covers the build version, the artifact name and
+    the jar signature (path, size, mtime). Raises ``FileNotFoundError`` via
+    ``artifact_jar`` when a jar is missing.
+    """
+    digest_input = []
+    for buildver in sorted(buildvers, reverse=True):
+        for artifact in ARTIFACTS:
+            signature = jar_signature(artifact_jar(
+                base_dir, artifact, scala_binary_version, project_version, buildver))
+            digest_input += [
+                "buildver=%s" % buildver,
+                "artifact=%s" % artifact,
+                signature,
+            ]
+    encoded = "\n".join(digest_input).encode("utf-8")
+    return hashlib.sha1(encoded).hexdigest()
+
+
+def ensure_extracted_cache(jar_path, cache_dir):
+    """Return ``cache_dir/contents`` holding the extracted ``jar_path``.
+
+    ``cache_dir/.source`` records the jar signature of the last extraction.
+    When it matches the current signature the existing contents are reused;
+    otherwise ``cache_dir`` is wiped and the jar is extracted again. The
+    marker is written only after extraction has finished.
+    """
+    signature = jar_signature(jar_path)
+    marker = cache_dir / ".source"
+    contents_dir = cache_dir / "contents"
+
+    up_to_date = marker.is_file() and marker.read_text() == signature
+    if not up_to_date:
+        if cache_dir.exists():
+            shutil.rmtree(cache_dir)
+        contents_dir.mkdir(parents=True, exist_ok=True)
+        with zipfile.ZipFile(jar_path) as archive:
+            safe_extract(archive, contents_dir)
+        marker.write_text(signature)
+    return contents_dir
+
+
+def link_or_copy(src, dst):
+    """Place ``src`` at ``dst`` as a hard link, copying when linking fails.
+
+    Parent directories of ``dst`` are created as needed and an existing file
+    or symlink at ``dst`` is removed first. When ``os.link`` raises
+    ``OSError`` the file is copied with ``shutil.copy2`` instead.
+    A hard-linked ``dst`` shares its data with ``src``, so modifying one in
+    place is visible through the other.
+    """
+    dst.parent.mkdir(parents=True, exist_ok=True)
+    occupied = dst.exists() or dst.is_symlink()
+    if occupied:
+        dst.unlink()
+    try:
+        os.link(src, dst)
+    except OSError:
+        shutil.copy2(src, dst)
+
+
+def link_tree_contents(src_dir, dst_dir):
+    """Mirror the directory tree under ``src_dir`` into ``dst_dir``.
+
+    Every directory is recreated under ``dst_dir`` and every file is placed
+    there with ``link_or_copy``, replacing files that already exist.
+    """
+    for current, _subdirs, filenames in os.walk(src_dir):
+        current_path = Path(current)
+        mirror = dst_dir / current_path.relative_to(src_dir)
+        mirror.mkdir(parents=True, exist_ok=True)
+        for filename in filenames:
+            link_or_copy(current_path / filename, mirror / filename)
+
+
+def link_members(contents_dir, destination, members):
+    """Link the named archive members from ``contents_dir`` into ``destination``.
+
+    Names ending in ``/`` are skipped, as are names that do not refer to a
+    regular file below ``contents_dir``. The relative path of each member is
+    preserved under ``destination``.
+    """
+    file_members = (name for name in members if not name.endswith("/"))
+    for name in file_members:
+        cached = contents_dir / name
+        if cached.is_file():
+            link_or_copy(cached, destination / name)
+
+
+def copy_and_extract_jars(
+ base_dir,
+ target_dir,
+ scala_binary_version,
+ project_version,
+ buildvers,
+ from_single_shim,
+ from_each):
+ """Assemble target_dir/parallel-world from the per-shim jars.
+
+ Build versions are processed in descending sort order (plain ``sorted``, so
+ string inputs compare lexicographically); the first one is the root shim.
+ For every build version and every artifact in ARTIFACTS the jar is extracted
+ into a cache under target_dir/unshim-parallel-world-cache and linked into
+ parallel-world/spark<buildver>. Members matching ``from_each`` are also
+ linked into the parallel-world root. For the root shim only, the whole
+ sql-plugin-api jar and the members matching ``from_single_shim`` are linked
+ into the root as well.
+
+ Later links replace earlier files at the same path, so the processing order
+ decides which shim's copy ends up in the root layout.
+ Raises FileNotFoundError (from artifact_jar) when an expected jar is missing.
+ """
+ parallel_world = target_dir / "parallel-world"
+ cache_root = target_dir / "unshim-parallel-world-cache"
+ sorted_buildvers = sorted(buildvers, reverse=True)
+ root_buildver = sorted_buildvers[0]
+
+ for buildver in sorted_buildvers:
+ classifier = "spark%s" % buildver
+ for artifact in ARTIFACTS:
+ jar_path = artifact_jar(
+ base_dir, artifact, scala_binary_version, project_version, buildver)
+ contents_dir = ensure_extracted_cache(
+ jar_path, cache_root / classifier / artifact)
+ # The jar is opened only for its member names; file contents come from the
+ # extracted cache directory.
+ with zipfile.ZipFile(jar_path) as zip_handle:
+ namelist = zip_handle.namelist()
+
+ link_tree_contents(contents_dir, parallel_world / classifier)
+ if buildver == root_buildver and artifact == "sql-plugin-api":
+ link_tree_contents(contents_dir, parallel_world)
+
+ # Only the root shim contributes the single-shim allowlist entries.
+ patterns = from_each
+ if buildver == root_buildver:
+ patterns = from_single_shim + from_each
+ members = matching_members(namelist, patterns)
+ link_members(contents_dir, parallel_world, members)
+
+
+def run_checked(command, cwd, env=None):
+    """Run ``command`` in directory ``cwd`` and fail on a non-zero exit status.
+
+    ``env`` replaces the child's environment when given. Raises
+    ``subprocess.CalledProcessError`` when the command exits non-zero.
+    """
+    completed = subprocess.run(command, cwd=str(cwd), env=env)
+    completed.check_returncode()
+
+
+def remove_allowlisted_from_spark_shared(parallel_world, from_single_shim):
+    """Delete the files under ``parallel_world/spark-shared`` that match a pattern.
+
+    ``from_single_shim`` holds paths relative to spark-shared. A pattern
+    without wildcard characters names one file; a pattern with ``*``, ``?``
+    or ``[`` is matched with ``fnmatch`` against each file's POSIX-style
+    relative path. Does nothing when the spark-shared directory is absent.
+    """
+    shared_dir = parallel_world / "spark-shared"
+    if not shared_dir.is_dir():
+        return
+
+    wildcard_patterns = []
+    for pattern in from_single_shim:
+        if has_fnmatch_magic(pattern):
+            wildcard_patterns.append(pattern)
+            continue
+        literal_path = shared_dir / pattern
+        if literal_path.is_file():
+            literal_path.unlink()
+
+    if not wildcard_patterns:
+        return
+
+    # Walk the tree once for all wildcard patterns instead of once per
+    # pattern. The listing is materialized before any file is removed.
+    for path in list(shared_dir.rglob("*")):
+        if not path.is_file():
+            continue
+        relative = path.relative_to(shared_dir).as_posix()
+        if any(fnmatch.fnmatch(relative, pattern) for pattern in wildcard_patterns):
+            path.unlink()
+
+
+def main():
+ """Command-line entry point: assemble and dedupe dist/target/parallel-world.
+
+ Steps, in order: parse arguments, read the two unshim pattern lists from
+ <source-dir>/dist, clean the output directories, link the per-shim jar
+ contents into parallel-world, run check-shims-revisions.sh, run
+ binary-dedupe.sh with the unshim environment, and finally remove the
+ single-shim allowlist matches from spark-shared.
+
+ Raises RuntimeError when --buildvers yields no versions and
+ subprocess.CalledProcessError when a helper script fails (the revision check
+ failure is tolerated with --ignore-shim-revisions-check).
+ Returns None.
+ """
+ parser = argparse.ArgumentParser(description=__doc__)
+ parser.add_argument("--mvn-base-dir", required=True,
+ help="Maven build root containing module target directories")
+ parser.add_argument("--source-dir", required=True,
+ help="Top-level spark-rapids source directory")
+ parser.add_argument("--project-version", required=True)
+ parser.add_argument("--scala-binary-version", required=True)
+ parser.add_argument("--buildvers", required=True,
+ help="Comma-separated Spark build versions, for example 350,411")
+ parser.add_argument("--ignore-shim-revisions-check", action="store_true",
+ help="Continue when per-shim build metadata revisions differ")
+ args = parser.parse_args()
+
+ # Scripts and pattern lists come from the source tree; build outputs live
+ # under the Maven base directory.
+ base_dir = Path(args.mvn_base_dir).resolve()
+ source_dir = Path(args.source_dir).resolve()
+ dist_dir = source_dir / "dist"
+ target_dir = base_dir / "dist" / "target"
+ parallel_world = target_dir / "parallel-world"
+ buildvers = [item.strip() for item in args.buildvers.split(",") if item.strip()]
+
+ if len(buildvers) == 0:
+ raise RuntimeError("no build versions were supplied")
+
+ from_single_shim = read_patterns(dist_dir / "unshimmed-common-from-single-shim.txt")
+ from_each = read_patterns(dist_dir / "unshimmed-from-each-spark3xx.txt")
+
+ print("Direct unshim parallel-world assembly for Spark versions: %s" %
+ ", ".join(buildvers),
+ flush=True)
+ clean_output(target_dir)
+ copy_and_extract_jars(
+ base_dir,
+ target_dir,
+ args.scala_binary_version,
+ args.project_version,
+ buildvers,
+ from_single_shim,
+ from_each)
+
+ # Run with check=False so a failure can be downgraded to a message when the
+ # caller asked to ignore it.
+ revision_check = subprocess.run(
+ [str(dist_dir / "scripts" / "check-shims-revisions.sh"), ",".join(buildvers)],
+ cwd=str(target_dir),
+ check=False)
+ if revision_check.returncode != 0:
+ if args.ignore_shim_revisions_check:
+ print("Ignoring shim revision check failure for direct unshim parallel-world assembly",
+ flush=True)
+ else:
+ revision_check.check_returncode()
+
+ # binary-dedupe.sh is configured through environment variables. The cache
+ # directory name is derived from the identity of the input jars.
+ dedupe_env = os.environ.copy()
+ dedupe_env["UNSHIM_FAST"] = "1"
+ dedupe_env["UNSHIM_DEDUPE_CACHE_DIR"] = str(
+ target_dir / "unshim-dedupe-cache" / dedupe_cache_key(
+ base_dir,
+ args.scala_binary_version,
+ args.project_version,
+ buildvers))
+ dedupe_env["UNSHIMMED_COMMON_FROM_SINGLE_SHIM_TXT"] = str(
+ dist_dir / "unshimmed-common-from-single-shim.txt")
+ dedupe_env["KEEP_IN_SPARK_SHARED_TXT"] = str(dist_dir / "keep-in-spark-shared.txt")
+ dedupe_env["UNSHIM_ANALYZER_SCRIPT"] = str(
+ dist_dir / "scripts" / "analyze-parallel-world-deps.py")
+ run_checked([str(dist_dir / "scripts" / "binary-dedupe.sh")],
+ cwd=target_dir,
+ env=dedupe_env)
+ remove_allowlisted_from_spark_shared(parallel_world, from_single_shim)
+
+ print("Direct unshim parallel-world output: %s" % parallel_world, flush=True)
+
+
+if __name__ == "__main__":
+ sys.exit(main())
diff --git a/dist/unshimmed-common-from-single-shim.txt b/dist/unshimmed-common-from-single-shim.txt
index 5802807a250..a3dc3ed0214 100644
--- a/dist/unshimmed-common-from-single-shim.txt
+++ b/dist/unshimmed-common-from-single-shim.txt
@@ -1,53 +1,9 @@
+# Files that must be promoted to the root layout from one representative shim
+# but are not selected by default class promotion. Common class files are
+# unshimmed by default when binary-dedupe proves they are bitwise-identical
+# across shims.
META-INF/DEPENDENCIES
META-INF/LICENSE
META-INF/NOTICE
-com/nvidia/spark/rapids/ExplainPlan.class
-com/nvidia/spark/rapids/ExplainPlan$.class
-com/nvidia/spark/rapids/ExplainPlanBase.class
-com/nvidia/spark/rapids/Optimizer.class
-com/nvidia/spark/rapids/optimizer/SQLOptimizerPlugin*
-com/nvidia/spark/rapids/ShimLoaderTemp*
-com/nvidia/spark/rapids/SparkShims*
-com/nvidia/spark/rapids/fileio/iceberg/IcebergInputFile.class
-com/nvidia/spark/rapids/fileio/iceberg/IcebergInputStream.class
-com/nvidia/spark/rapids/fileio/iceberg/IcebergOutputFile.class
-com/nvidia/spark/rapids/fileio/iceberg/IcebergOutputStream.class
-com/nvidia/spark/rapids/iceberg/GpuInternalRow.class
-com/nvidia/spark/rapids/iceberg/GpuInternalRowBase.class
-com/nvidia/spark/rapids/iceberg/data/GpuDeleteFilter2.class
-com/nvidia/spark/rapids/iceberg/package.class
-com/nvidia/spark/rapids/iceberg/package$.class
-com/nvidia/spark/rapids/iceberg/parquet/FileSchemaAccessors.class
-com/nvidia/spark/rapids/iceberg/parquet/GpuIcebergParquetReader$.class
-com/nvidia/spark/rapids/iceberg/parquet/SingleFile.class
-com/nvidia/spark/rapids/iceberg/parquet/SingleFile$.class
-com/nvidia/spark/rapids/iceberg/parquet/ThreadConf.class
-com/nvidia/spark/rapids/iceberg/spark/GpuSparkReadOptions.class
-com/nvidia/spark/rapids/iceberg/spark/GpuSparkReadOptions$.class
-com/nvidia/spark/rapids/iceberg/spark/GpuSparkSQLProperties.class
-com/nvidia/spark/rapids/iceberg/spark/GpuSparkSQLProperties$.class
-com/nvidia/spark/rapids/iceberg/spark/GpuSparkUtil.class
-com/nvidia/spark/rapids/iceberg/spark/GpuSparkUtil$.class
-com/nvidia/spark/rapids/iceberg/spark/RapidsSparkCatalog.class
-com/nvidia/spark/rapids/iceberg/spark/RapidsSparkSessionCatalog.class
-com/nvidia/spark/rapids/iceberg/spark/source/RapidsSparkTable.class
-org/apache/iceberg/aws/s3/IcebergS3InputFileAccess.class
-org/apache/iceberg/data/GpuFileHelpers.class
-org/apache/iceberg/io/GpuClusteredWriterBridge.class
-org/apache/iceberg/io/GpuFanoutWriterBridge.class
-org/apache/iceberg/io/GpuPositionDeleteFileWriter$.class
-org/apache/iceberg/parquet/GpuParquetIOAccess.class
-org/apache/iceberg/spark/GpuTypeToSparkType.class
-org/apache/iceberg/spark/GpuTypeToSparkType$.class
-org/apache/iceberg/spark/GpuSparkReadConf.class
-org/apache/iceberg/spark/GpuSparkReadConfAccess.class
-org/apache/iceberg/spark/package.class
-org/apache/iceberg/spark/package$.class
-org/apache/iceberg/spark/source/GpuBaseReader.class
-org/apache/iceberg/spark/source/GpuSparkPlanningUtil.class
-org/apache/iceberg/spark/source/GpuSparkScanAccess.class
-org/apache/iceberg/spark/source/GpuSparkWriteAccess.class
-org/apache/iceberg/spark/source/GpuStructInternalRow.class
-org/apache/spark/sql/rapids/AdaptiveSparkPlanHelperShim*
-org/apache/spark/sql/rapids/ExecutionPlanCaptureCallback*
+rapids4spark-private-version-info.properties
rapids/*.py
diff --git a/dist/unshimmed-from-each-spark3xx.txt b/dist/unshimmed-from-each-spark3xx.txt
index 918a572722b..1f96d9d0781 100644
--- a/dist/unshimmed-from-each-spark3xx.txt
+++ b/dist/unshimmed-from-each-spark3xx.txt
@@ -9,4 +9,6 @@ com/nvidia/spark/rapids/delta/DeltaProbe.class
com/nvidia/spark/rapids/delta/DeltaProvider.class
com/nvidia/spark/rapids/delta/DeltaProvider$.class
com/nvidia/spark/rapids/PlanShims*
+org/apache/spark/sql/rapids/GpuShuffleDependency.class
+org/apache/spark/sql/rapids/execution/python/shims/WindowInPandasExecTypeShim.class
spark-*-info.properties
diff --git a/docs/additional-functionality/rapids-udfs.md b/docs/additional-functionality/rapids-udfs.md
index d498a841ef1..e4144460f0e 100644
--- a/docs/additional-functionality/rapids-udfs.md
+++ b/docs/additional-functionality/rapids-udfs.md
@@ -152,7 +152,7 @@ The GPU support for Pandas UDF is an experimental feature, and may change at any
---
GPU support for Pandas UDF is built on Apache Spark's [Pandas UDF(user defined
-function)](https://archive.apache.org/dist/spark/docs/3.2.0/api/python/user_guide/sql/arrow_pandas.html#pandas-udfs-a-k-a-vectorized-udfs),
+function)](https://spark.apache.org/docs/3.5.7/api/python/user_guide/sql/arrow_pandas.html#pandas-udfs-a-k-a-vectorized-udfs),
and has two features:
- **GPU Assignment(Scheduling) in Python Process**: Let the Python process share the same GPU with
@@ -201,12 +201,12 @@ Accelerator has a 1-1 mapping support for each of them.
|Spark Execution Plan|Data Transfer Accelerated|Use Case|
|----------------------|----------|--------|
- |ArrowEvalPythonExec|yes|[Series to Series](https://archive.apache.org/dist/spark/docs/3.2.0/api/python/user_guide/sql/arrow_pandas.html#series-to-series), [Iterator of Series to Iterator of Series](https://archive.apache.org/dist/spark/docs/3.2.0/api/python/user_guide/sql/arrow_pandas.html#iterator-of-series-to-iterator-of-series) and [Iterator of Multiple Series to Iterator of Series](https://archive.apache.org/dist/spark/docs/3.2.0/api/python/user_guide/sql/arrow_pandas.html#iterator-of-multiple-series-to-iterator-of-series)|
- |MapInPandasExec|yes|[Map](https://archive.apache.org/dist/spark/docs/3.2.0/api/python/user_guide/sql/arrow_pandas.html#map)|
- |WindowInPandasExec|yes|[Window](https://archive.apache.org/dist/spark/docs/3.2.0/api/python/user_guide/sql/arrow_pandas.html#series-to-scalar)|
- |FlatMapGroupsInPandasExec|yes|[Grouped Map](https://archive.apache.org/dist/spark/docs/3.2.0/api/python/user_guide/sql/arrow_pandas.html#grouped-map)|
- |AggregateInPandasExec|yes|[Aggregate](https://archive.apache.org/dist/spark/docs/3.2.0/api/python/user_guide/sql/arrow_pandas.html#series-to-scalar)|
- |FlatMapCoGroupsInPandasExec|yes|[Co-grouped Map](https://archive.apache.org/dist/spark/docs/3.2.0/api/python/user_guide/sql/arrow_pandas.html#co-grouped-map)|
+ |ArrowEvalPythonExec|yes|[Series to Series](https://spark.apache.org/docs/3.5.7/api/python/user_guide/sql/arrow_pandas.html#series-to-series), [Iterator of Series to Iterator of Series](https://spark.apache.org/docs/3.5.7/api/python/user_guide/sql/arrow_pandas.html#iterator-of-series-to-iterator-of-series) and [Iterator of Multiple Series to Iterator of Series](https://spark.apache.org/docs/3.5.7/api/python/user_guide/sql/arrow_pandas.html#iterator-of-multiple-series-to-iterator-of-series)|
+ |MapInPandasExec|yes|[Map](https://spark.apache.org/docs/3.5.7/api/python/user_guide/sql/arrow_pandas.html#map)|
+ |WindowInPandasExec|yes|[Window](https://spark.apache.org/docs/3.5.7/api/python/user_guide/sql/arrow_pandas.html#series-to-scalar)|
+ |FlatMapGroupsInPandasExec|yes|[Grouped Map](https://spark.apache.org/docs/3.5.7/api/python/user_guide/sql/arrow_pandas.html#grouped-map)|
+ |AggregateInPandasExec|yes|[Aggregate](https://spark.apache.org/docs/3.5.7/api/python/user_guide/sql/arrow_pandas.html#series-to-scalar)|
+ |FlatMapCoGroupsInPandasExec|yes|[Co-grouped Map](https://spark.apache.org/docs/3.5.7/api/python/user_guide/sql/arrow_pandas.html#co-grouped-map)|
### Other Configuration
diff --git a/docs/dev/adaptive-query.md b/docs/dev/adaptive-query.md
index c3e5568bfb4..cf9c8c126e4 100644
--- a/docs/dev/adaptive-query.md
+++ b/docs/dev/adaptive-query.md
@@ -51,7 +51,7 @@ optimizer rules:
```scala
extensions.injectColumnar(_ => ColumnarOverrideRules())
-extensions.injectQueryStagePrepRule(_ => GpuQueryStagePrepOverrides())
+extensions.injectQueryStagePrepRule(_ => new GpuQueryStagePrepOverrides)
```
The `ColumnarOverrideRules` are used whether AQE is enabled or not, and the
diff --git a/docs/dev/shimplify.md b/docs/dev/shimplify.md
index 4fefd824c7c..dd1f83f871d 100644
--- a/docs/dev/shimplify.md
+++ b/docs/dev/shimplify.md
@@ -266,4 +266,4 @@ See [CPD user doc][7] for more details about the options you can pass inside `cp
[4]: https://jsonlines.org/
[5]: https://spark.apache.org/versioning-policy.html
[6]: https://plugins.jetbrains.com/plugin/16429-idea-resolve-symlinks
-[7]: https://docs.pmd-code.org/latest/pmd_userdocs_cpd.html
+[7]: https://pmd.github.io/pmd/pmd_userdocs_cpd.html
diff --git a/docs/dev/shims.md b/docs/dev/shims.md
index 38a368df73b..f68b5e61e81 100644
--- a/docs/dev/shims.md
+++ b/docs/dev/shims.md
@@ -22,6 +22,100 @@ class as a tight entry point for interacting with the host Spark runtime.
In the following we provide recipes for typical scenarios addressed by the Shim layer.
+## One-way Shim Module Boundary
+
+Shim source can be split among three layers when the implementation does not have to live
+in the same module as the Spark-version-specific API reference.
+
+1. `sql-plugin-api` contains the narrow shared types that both sides can see. These types must
+ not depend on `sql-plugin` implementation classes.
+2. `sql-plugin-shims` depends on `sql-plugin-api` and Spark. It may reference Spark classes whose
+ source or binary shape varies by build version, but it must not reference implementation types
+ such as `GpuOverrides`, `RapidsMeta`, `ExprRule`, `ExecRule`, or GPU meta classes.
+3. `sql-plugin` depends on `sql-plugin-shims`. It turns API-level shim descriptors into concrete
+ plugin rules and owns the RAPIDS metadata factories.
+
+For replacement rules, use descriptor objects when the shim only needs to identify a Spark class
+and provide stable rule metadata. For example, `ShimDataWritingCommandRule`,
+`ShimRunnableCommandRule`, and `ShimExecRule` live in `sql-plugin-api`; versioned objects in
+`sql-plugin-shims` instantiate those descriptors with Spark-specific class tags; `sql-plugin`
+then calls the corresponding `GpuOverrides.*FromShim` method and supplies the actual `RapidsMeta`
+factory. This keeps the call direction one-way: shared plugin code can consume shim descriptors,
+while shim code cannot call back into shared plugin implementation.
+
+Classes whose `spark-rapids-shim-json-lines` entries cover all build versions can be unshimmed
+into a common source root when there is no special-version sibling and the source is truly
+compatible across the supported Spark APIs. When a file has Databricks-specific, Spark 4.1-specific,
+or otherwise divergent siblings, keep the version-specific source and move only the API-safe part
+behind the one-way boundary.
+
+## Reducing Parallel-World Classes
+
+The long-term goal is to maximize bytecode in the conventional jar layout and shrink the amount
+of code that must be loaded through the parallel-world mechanism. A class can move from
+`spark-shared` to the conventional layout only when it has no static dependency path to
+Spark-version-specific bytecode. The dependency path matters transitively: a `spark-shared` class
+that calls another `spark-shared` class that eventually calls a `sparkXYZ` class is not root-safe.
+
+`dist/unshimmed-common-from-single-shim.txt` names classes and resources that are allowed to be
+stored in the conventional layout after the dist jar is assembled. During `binary-dedupe.sh`, files
+from that allowlist may be promoted out of `spark-shared` into the root layout before the bitwise
+identity check runs. This is important for profiles where the highest Spark build contributes only a
+stub module, while a lower Spark build contributes the real implementation. For example, root-safe
+Iceberg helpers can still be placed in the conventional layout even when the Spark 4.1 shim uses the
+Iceberg stub.
+
+Use a small bootstrap allowlist for classes that are allowed to refer to packages generated with
+`$_spark.version.classifier_`, such as `com.nvidia.spark.rapids.spark330.RapidsShuffleManager`.
+Ordinary shared implementation classes should not have direct static dependencies on those
+classifier packages. They should instead call through stable contracts in `sql-plugin-api` or
+through descriptor objects in `sql-plugin-shims`.
+
+For an inventory of a released artifact, download the complete dist jar from Maven Central and run
+the dependency analyzer directly against the jar:
+
+```bash
+VERSION=26.04.2
+curl -fL -o /tmp/rapids-4-spark_2.12-${VERSION}-cuda12.jar \
+ https://repo.maven.apache.org/maven2/com/nvidia/rapids-4-spark_2.12/${VERSION}/rapids-4-spark_2.12-${VERSION}-cuda12.jar
+
+python3 dist/scripts/analyze-parallel-world-deps.py \
+ /tmp/rapids-4-spark_2.12-${VERSION}-cuda12.jar \
+ --show-topo
+```
+
+Run the same command for the Scala 2.13 artifact when checking Spark 4.x coverage. Internal
+snapshot artifacts can be analyzed the same way after downloading a timestamped dist jar from the
+configured artifact repository; keep repository credentials in local Maven or environment
+configuration rather than embedding them in scripts or docs.
+
+For local branch validation, build representative two-shim dist jars that span the widest
+differences in each Scala line:
+
+```bash
+./build/buildall --profile=350,411 --scala213 --module=dist
+python3 dist/scripts/analyze-parallel-world-deps.py \
+ scala2.13/dist/target/parallel-world \
+ --show-topo
+
+./build/buildall --profile=330,358 --module=dist
+python3 dist/scripts/analyze-parallel-world-deps.py \
+ dist/target/parallel-world \
+ --show-topo
+```
+
+The analyzer reports:
+
+1. direct classifier-package dependencies, which should remain limited to bootstrap/facade code;
+2. root or `spark-shared` classes with transitive paths to version-specific classes;
+3. root-safe `spark-shared` strongly connected components in dependency-first order.
+
+Use `--format=json` when comparing safe components across artifacts or build outputs. JSON output
+keeps counts exact and bounds example sections with `--limit`.
+Shortest paths explain why a class is blocked and usually identify the adapter boundary to cut.
+Strongly connected components, not shortest paths, provide the migration ordering because classes in
+the same component have to move or be refactored together.
+
## Method signature discrepancies
It's among the easiest issues to resolve. We define a method in SparkShims
diff --git a/iceberg/common/src/main/java/org/apache/iceberg/spark/source/GpuSparkWriteAccess.java b/iceberg/common/src/main/java/org/apache/iceberg/spark/source/GpuSparkWriteAccess.java
index 91ed87280da..8e3f7b55a51 100644
--- a/iceberg/common/src/main/java/org/apache/iceberg/spark/source/GpuSparkWriteAccess.java
+++ b/iceberg/common/src/main/java/org/apache/iceberg/spark/source/GpuSparkWriteAccess.java
@@ -17,11 +17,14 @@
package org.apache.iceberg.spark.source;
import java.lang.reflect.Field;
+import java.lang.reflect.InvocationTargetException;
+import java.lang.reflect.Method;
import java.util.Map;
import org.apache.iceberg.DataFile;
import org.apache.iceberg.FileFormat;
import org.apache.iceberg.Schema;
+import org.apache.iceberg.SnapshotUpdate;
import org.apache.iceberg.Table;
import org.apache.iceberg.deletes.DeleteGranularity;
import org.apache.iceberg.io.DeleteWriteResult;
@@ -94,6 +97,23 @@ public static Map writeProperties(Write write) {
return readField(sparkWrite(write), "writeProperties", Map.class);
}
+  /**
+   * Aborts a write by reflectively invoking {@code abort(WriterCommitMessage[])} on the
+   * object returned by {@code sparkWrite(write)}.
+   *
+   * @param write the write whose underlying Spark write object receives the call
+   * @param messages commit messages forwarded unchanged to the target method
+   * @throws IllegalStateException if the method cannot be found or accessed, or if it
+   *     throws a checked exception
+   */
+  public static void abort(Write write, WriterCommitMessage[] messages) {
+    invokeMethod(
+        sparkWrite(write),
+        "abort",
+        new Class<?>[] {WriterCommitMessage[].class},
+        new Object[] {messages});
+  }
+
+  /**
+   * Commits a snapshot operation by reflectively invoking
+   * {@code commitOperation(SnapshotUpdate, String)} on the object returned by
+   * {@code sparkWrite(write)}.
+   *
+   * @param write the write whose underlying Spark write object receives the call
+   * @param operation the snapshot update forwarded to the target method
+   * @param description the description string forwarded to the target method
+   * @throws IllegalStateException if the method cannot be found or accessed, or if it
+   *     throws a checked exception
+   */
+  public static void commitOperation(
+      Write write, SnapshotUpdate<?> operation, String description) {
+    invokeMethod(
+        sparkWrite(write),
+        "commitOperation",
+        new Class<?>[] {SnapshotUpdate.class, String.class},
+        new Object[] {operation, description});
+  }
+
public static Table table(DeltaWrite write) {
return readField(positionDeltaWrite(write), "table", Table.class);
}
@@ -169,6 +189,10 @@ public static WriterCommitMessage taskCommit(DataFile[] files) {
return commit;
}
+ /**
+  * Returns the data files carried by a task commit message.
+  *
+  * <p>The message is cast to {@code SparkWrite.TaskCommit} without a type check, so a
+  * message of any other type fails with a {@code ClassCastException}.
+  */
+ public static DataFile[] taskCommitFiles(WriterCommitMessage message) {
+ return ((SparkWrite.TaskCommit) message).files();
+ }
+
public static WriterCommitMessage deltaTaskCommit(WriteResult result) {
return new SparkPositionDeltaWrite.DeltaTaskCommit(result);
}
@@ -208,4 +232,38 @@ private static Field findField(Class> targetClass, String fieldName) {
throw new IllegalStateException("No field " + fieldName + " in " + targetClass.getName());
}
+  /**
+   * Reflectively invokes a (possibly non-public) method on {@code target} and discards
+   * its result.
+   *
+   * <p>The method is looked up with {@code findMethod} starting at the runtime class of
+   * {@code target} and is made accessible before the call.
+   *
+   * @param target object to invoke the method on
+   * @param methodName name of the method to invoke
+   * @param parameterTypes exact declared parameter types used for the lookup
+   * @param args arguments passed to the method
+   * @throws IllegalStateException if the method is missing, cannot be accessed, or throws
+   *     a checked exception (which is attached as the cause)
+   */
+  private static void invokeMethod(
+      Object target, String methodName, Class<?>[] parameterTypes, Object[] args) {
+    try {
+      Method method = findMethod(target.getClass(), methodName, parameterTypes);
+      method.setAccessible(true);
+      method.invoke(target, args);
+    } catch (IllegalAccessException e) {
+      throw new IllegalStateException(
+          "Unable to invoke " + methodName + " on " + target.getClass().getName(), e);
+    } catch (InvocationTargetException e) {
+      // Unwrap the reflection wrapper so callers see the target method's own unchecked
+      // exception or error, exactly as they would from a direct call.
+      Throwable cause = e.getCause();
+      if (cause instanceof RuntimeException) {
+        throw (RuntimeException) cause;
+      }
+      if (cause instanceof Error) {
+        throw (Error) cause;
+      }
+      // Checked exceptions cannot be rethrown from this signature; wrap them instead.
+      throw new IllegalStateException(
+          "Unable to invoke " + methodName + " on " + target.getClass().getName(), cause);
+    }
+  }
+
+  /**
+   * Finds a declared method by name and parameter types, searching {@code targetClass}
+   * first and then each superclass in turn. Interfaces are not searched.
+   *
+   * @param targetClass class at which the search starts
+   * @param methodName name of the method to find
+   * @param parameterTypes exact declared parameter types of the method
+   * @return the first matching method found while walking up the hierarchy
+   * @throws IllegalStateException if no class in the hierarchy declares such a method
+   */
+  private static Method findMethod(
+      Class<?> targetClass, String methodName, Class<?>[] parameterTypes) {
+    Class<?> current = targetClass;
+    while (current != null) {
+      try {
+        return current.getDeclaredMethod(methodName, parameterTypes);
+      } catch (NoSuchMethodException e) {
+        // Not declared at this level; continue with the parent class.
+        current = current.getSuperclass();
+      }
+    }
+    throw new IllegalStateException("No method " + methodName + " in " + targetClass.getName());
+  }
}
diff --git a/iceberg/common/src/main/scala/com/nvidia/spark/rapids/iceberg/GpuIcebergPartitioner.scala b/iceberg/common/src/main/scala/com/nvidia/spark/rapids/iceberg/GpuIcebergPartitioner.scala
index a93e77533ed..0887ea7281d 100644
--- a/iceberg/common/src/main/scala/com/nvidia/spark/rapids/iceberg/GpuIcebergPartitioner.scala
+++ b/iceberg/common/src/main/scala/com/nvidia/spark/rapids/iceberg/GpuIcebergPartitioner.scala
@@ -98,7 +98,7 @@ class GpuIcebergPartitioner(
// Combine the partition keys and partitioned tables
partitionKeys.zip(partitions).map { case (partKey, partition) =>
- ColumnarBatchWithPartition(SpillableColumnarBatch(partition,
+ new ColumnarBatchWithPartition(SpillableColumnarBatch(partition,
valueSparkType,
SpillPriorities.ACTIVE_BATCHING_PRIORITY),
partKey)
@@ -178,8 +178,9 @@ class GpuIcebergSpecPartitioner(val spec: PartitionSpec,
}
}
-case class ColumnarBatchWithPartition(batch: SpillableColumnarBatch, partition: StructLike) extends
- AutoCloseable {
+class ColumnarBatchWithPartition(
+ val batch: SpillableColumnarBatch,
+ val partition: StructLike) extends AutoCloseable {
override def close(): Unit = {
batch.close()
}
diff --git a/iceberg/common/src/main/scala/com/nvidia/spark/rapids/iceberg/parquet/GpuCoalescingIcebergParquetReader.scala b/iceberg/common/src/main/scala/com/nvidia/spark/rapids/iceberg/parquet/GpuCoalescingIcebergParquetReader.scala
index 56ab66a20ee..2df18a7d7fd 100644
--- a/iceberg/common/src/main/scala/com/nvidia/spark/rapids/iceberg/parquet/GpuCoalescingIcebergParquetReader.scala
+++ b/iceberg/common/src/main/scala/com/nvidia/spark/rapids/iceberg/parquet/GpuCoalescingIcebergParquetReader.scala
@@ -68,11 +68,11 @@ class GpuCoalescingIcebergParquetReader(
conf.metrics)
info.blocks.map { block =>
- ParquetSingleDataBlockMeta(
+ new ParquetSingleDataBlockMeta(
info.filePath,
- ParquetDataBlock(block, CpuCompressionConfig.disabled()),
+ new ParquetDataBlock(block, CpuCompressionConfig.disabled()),
InternalRow.empty,
- ParquetSchemaWrapper(info.schema),
+ new ParquetSchemaWrapper(info.schema),
info.readSchema,
IcebergParquetExtraInfo(
info.dateRebaseMode,
diff --git a/iceberg/common/src/main/scala/org/apache/iceberg/spark/functions/transforms.scala b/iceberg/common/src/main/scala/org/apache/iceberg/spark/functions/transforms.scala
index 33b68811d79..f1674fc618a 100644
--- a/iceberg/common/src/main/scala/org/apache/iceberg/spark/functions/transforms.scala
+++ b/iceberg/common/src/main/scala/org/apache/iceberg/spark/functions/transforms.scala
@@ -98,7 +98,7 @@ object GpuTransform {
}
}
-case class GpuFieldTransform(sourceFieldId: Int, transform: GpuTransform) {
+class GpuFieldTransform(val sourceFieldId: Int, val transform: GpuTransform) {
def supports(inputType: StructType, inputSchema: Schema): Boolean = {
// Iceberg allows partition source fields to reference nested-leaf field ids
// (e.g. `bucket(4, contact.email)`). Those ids do not appear in
diff --git a/iceberg/common/src/main/scala/org/apache/iceberg/spark/source/GpuReaderFactory.scala b/iceberg/common/src/main/scala/org/apache/iceberg/spark/source/GpuReaderFactory.scala
index 0efbee1da56..4e9b2a6ec6d 100644
--- a/iceberg/common/src/main/scala/org/apache/iceberg/spark/source/GpuReaderFactory.scala
+++ b/iceberg/common/src/main/scala/org/apache/iceberg/spark/source/GpuReaderFactory.scala
@@ -108,7 +108,7 @@ class GpuReaderFactory(private val metrics: Map[String, GpuMetric],
queryUsesInputFile || hasFilePathMetadata || hasRowPositionMetadata ||
!hasNoDeletes
MultiThread(poolConfBuilder, partition.maxNumParquetFilesParallel,
- CombineConf(combineThresholdSize, combineWaitTime),
+ new CombineConf(combineThresholdSize, combineWaitTime),
disableCombining,
hasFilePathMetadata,
hasRowPositionMetadata)
diff --git a/iceberg/common/src/main/scala/org/apache/iceberg/spark/source/GpuSparkPositionDeltaWrite.scala b/iceberg/common/src/main/scala/org/apache/iceberg/spark/source/GpuSparkPositionDeltaWrite.scala
index ce146a6522f..6eabbbee6e4 100644
--- a/iceberg/common/src/main/scala/org/apache/iceberg/spark/source/GpuSparkPositionDeltaWrite.scala
+++ b/iceberg/common/src/main/scala/org/apache/iceberg/spark/source/GpuSparkPositionDeltaWrite.scala
@@ -347,7 +347,7 @@ trait GpuDeltaWriter extends DeltaWriter[ColumnarBatch] {
protected def newDeleteWriteContext(metadata: ColumnarBatch, rowId: ColumnarBatch)
: DeleteWriteContext = {
withResource(Seq(metadata, rowId)) { _ =>
- var ret = DeleteWriteContext(spillPartValues = SpillableColumnarBatch(
+ var ret = new DeleteWriteContext(spillPartValues = SpillableColumnarBatch(
extractToStruct(metadata, context.partitionOrdinal()),
ACTIVE_ON_DECK_PRIORITY))
@@ -371,17 +371,22 @@ trait GpuDeltaWriter extends DeltaWriter[ColumnarBatch] {
}
}
-case class DeleteWriteContext(
- spillPartValues: SpillableColumnarBatch = null,
- spillPosDeletes: SpillableColumnarBatch = null,
- uniqueSpecIdCol: RapidsHostColumnVector = null,
- specIdCol: CudfColumnVector = null) extends AutoCloseable {
+/**
+ * Bundle of up to four closeable resources; every constructor argument defaults to null.
+ *
+ * This is a plain class rather than a case class, so it declares `copy` explicitly and
+ * `close()` lists the fields by hand instead of iterating over `productIterator`.
+ */
+class DeleteWriteContext(
+ val spillPartValues: SpillableColumnarBatch = null,
+ val spillPosDeletes: SpillableColumnarBatch = null,
+ val uniqueSpecIdCol: RapidsHostColumnVector = null,
+ val specIdCol: CudfColumnVector = null) extends AutoCloseable {
+
+ /**
+  * Returns a new context in which each field is either the supplied argument or, when the
+  * argument is omitted, this instance's current value. Resources are shared with the
+  * original instance, not duplicated.
+  */
+ def copy(
+ spillPartValues: SpillableColumnarBatch = this.spillPartValues,
+ spillPosDeletes: SpillableColumnarBatch = this.spillPosDeletes,
+ uniqueSpecIdCol: RapidsHostColumnVector = this.uniqueSpecIdCol,
+ specIdCol: CudfColumnVector = this.specIdCol): DeleteWriteContext = {
+ new DeleteWriteContext(spillPartValues, spillPosDeletes, uniqueSpecIdCol, specIdCol)
+ }
 override def close(): Unit = {
- productIterator
- .map(_.asInstanceOf[AutoCloseable])
- .toSeq
- .safeClose()
+ // Closes all four fields through safeClose(). Fields may still be null here;
+ // NOTE(review): safeClose is defined outside this view - presumably it skips nulls, confirm.
+ Seq[AutoCloseable](spillPartValues, spillPosDeletes, uniqueSpecIdCol, specIdCol).safeClose()
 }
 }
@@ -480,7 +485,7 @@ trait GpuDeleteAndDataDeltaWriter extends GpuDeltaWriter {
}
} else {
// Unpartitioned spec
- Seq(ColumnarBatchWithPartition(
+ Seq(new ColumnarBatchWithPartition(
SpillableColumnarBatch(filteredPositionDeletes,
SpillPriorities.ACTIVE_ON_DECK_PRIORITY),
emptyPartitionData
@@ -601,7 +606,7 @@ class GpuDeleteOnlyDeltaWriter(
}
} else {
// Unpartitioned spec
- Seq(ColumnarBatchWithPartition(
+ Seq(new ColumnarBatchWithPartition(
SpillableColumnarBatch(filteredPositionDeletes,
SpillPriorities.ACTIVE_ON_DECK_PRIORITY),
emptyPartitionData
diff --git a/iceberg/common/src/main/scala/org/apache/iceberg/spark/source/GpuSparkWrite.scala b/iceberg/common/src/main/scala/org/apache/iceberg/spark/source/GpuSparkWrite.scala
index 5628ce311db..ad76c489b20 100644
--- a/iceberg/common/src/main/scala/org/apache/iceberg/spark/source/GpuSparkWrite.scala
+++ b/iceberg/common/src/main/scala/org/apache/iceberg/spark/source/GpuSparkWrite.scala
@@ -86,6 +86,10 @@ class GpuSparkWrite(cpu: Write) extends GpuWrite with RequiresDistributionAndOrd
override def toString: String = s"GpuIcebergWrite(table=$table, format=$format)"
+ /** Forwards the abort request for `messages` to the wrapped CPU write via GpuSparkWriteAccess. */
+ private[source] def abort(messages: Array[WriterCommitMessage]): Unit = {
+ GpuSparkWriteAccess.abort(cpu, messages)
+ }
+
override def distributionStrictlyRequired(): Boolean =
writeRequirements.distributionStrictlyRequired()
@@ -95,7 +99,6 @@ class GpuSparkWrite(cpu: Write) extends GpuWrite with RequiresDistributionAndOrd
writeRequirements.advisoryPartitionSizeInBytes()
override def requiredDistribution(): Distribution = writeRequirements.requiredDistribution()
-
override def requiredOrdering(): Array[SortOrder] = writeRequirements.requiredOrdering()
private[source] def createDataWriterFactory: DataWriterFactory = {
@@ -154,6 +157,16 @@ class GpuSparkWrite(cpu: Write) extends GpuWrite with RequiresDistributionAndOrd
statsTracker,
serializedHadoopConf)
}
+
+ /**
+  * Collects the data files reported by the given commit messages. Null entries in
+  * `messages` are skipped; the files of the remaining messages are concatenated in order.
+  */
+ private[source] def files(messages: Array[WriterCommitMessage]): Seq[DataFile] = {
+ messages.filter(_ != null)
+ .flatMap(GpuSparkWriteAccess.taskCommitFiles)
+ .toSeq
+ }
+
+  /**
+   * Forwards a snapshot operation to the wrapped CPU write via GpuSparkWriteAccess.
+   *
+   * @param operation the snapshot update to commit
+   * @param desc description string passed along with the operation
+   */
+  private[source] def commitOperation(operation: SnapshotUpdate[_], desc: String): Unit = {
+    GpuSparkWriteAccess.commitOperation(cpu, operation, desc)
+  }
}
object GpuSparkWrite {
@@ -250,7 +263,7 @@ object GpuSparkWrite {
val transform = partitionField.transform()
GpuTransform.tryFrom(transform) match {
case Success(t) =>
- val fieldTransform = GpuFieldTransform(partitionField.sourceId(), t)
+ val fieldTransform = new GpuFieldTransform(partitionField.sourceId(), t)
if (!fieldTransform.supports(dataSparkType.get, dataSchema.get)) {
meta.willNotWorkOnGpu(
s"Iceberg partition transform $transform is not supported on GPU")
diff --git a/integration_tests/src/main/python/delta_lake_test.py b/integration_tests/src/main/python/delta_lake_test.py
index 80afd59d5bf..4b6f46864d2 100644
--- a/integration_tests/src/main/python/delta_lake_test.py
+++ b/integration_tests/src/main/python/delta_lake_test.py
@@ -599,7 +599,7 @@ def test_delta_deletion_vector_interleaved_file_splits(
"""
Tests deletion vector handling when files are interleaved in a way that causes their
blocks to be split non-consecutively.
-
+
For this test, we set up two files A (large) and B (small) such that:
- A is split into N PartitionedFiles: [max, ..., max, tail].
- tail(A) < len(B) < max_split.
diff --git a/integration_tests/src/main/python/iceberg/iceberg_append_test.py b/integration_tests/src/main/python/iceberg/iceberg_append_test.py
index 152c5fe1377..c7223cd51db 100644
--- a/integration_tests/src/main/python/iceberg/iceberg_append_test.py
+++ b/integration_tests/src/main/python/iceberg/iceberg_append_test.py
@@ -494,4 +494,3 @@ def insert_data(spark):
return spark.sql(f"INSERT INTO {table_name} SELECT * FROM {view_name}")
assert_gpu_fallback_collect(insert_data, "AppendDataExec", conf=iceberg_write_enabled_conf)
-
diff --git a/pom.xml b/pom.xml
index 450211bcc4a..df560199c9b 100644
--- a/pom.xml
+++ b/pom.xml
@@ -67,8 +67,12 @@
distintegration_testsshuffle-plugin
- sql-pluginsql-plugin-api
+ sql-plugin-format
+ sql-plugin-fileio
+ sql-plugin-columnar
+ sql-plugin-shims
+ sql-pluginteststoolsudf-compiler
@@ -805,6 +809,8 @@
.${spark.rapids.project.basedir}/target/${spark.version.classifier}/.sbt/1.0/zinc/org.scala-sbtfalse
+ false
+ false3301.88
@@ -1191,6 +1197,7 @@
rungenerate-sources
+ ${rapids.shimplify.skip}
@@ -1216,6 +1223,7 @@
generate-build-infogenerate-resources
+ ${rapids.build.info.skip}
diff --git a/scala2.13/aggregator/pom.xml b/scala2.13/aggregator/pom.xml
index a6fb5f60651..d538fba4254 100644
--- a/scala2.13/aggregator/pom.xml
+++ b/scala2.13/aggregator/pom.xml
@@ -45,6 +45,7 @@
initializenone
+ false
@@ -196,6 +197,7 @@
runprocess-classes
+ ${rapids.aggregator.downstream.refresh.skip}dist
integration_testsshuffle-plugin
- sql-pluginsql-plugin-api
+ sql-plugin-format
+ sql-plugin-fileio
+ sql-plugin-columnar
+ sql-plugin-shims
+ sql-pluginteststoolsudf-compiler
@@ -805,6 +809,8 @@
.${spark.rapids.project.basedir}/target/${spark.version.classifier}/.sbt/1.0/zinc/org.scala-sbtfalse
+ false
+ false3301.88
@@ -1191,6 +1197,7 @@
rungenerate-sources
+ ${rapids.shimplify.skip}
@@ -1216,6 +1223,7 @@
generate-build-infogenerate-resources
+ ${rapids.build.info.skip}
diff --git a/scala2.13/shuffle-plugin/pom.xml b/scala2.13/shuffle-plugin/pom.xml
index 191036cb1c0..ca29953086d 100644
--- a/scala2.13/shuffle-plugin/pom.xml
+++ b/scala2.13/shuffle-plugin/pom.xml
@@ -42,6 +42,12 @@
spark-rapids-jni${jni.classifier}
+
+ com.nvidia
+ rapids-4-spark-sql-plugin-api_${scala.binary.version}
+ ${project.version}
+ ${spark.version.classifier}
+ org.scala-langscala-library
diff --git a/scala2.13/sql-plugin-columnar/pom.xml b/scala2.13/sql-plugin-columnar/pom.xml
new file mode 100644
index 00000000000..ced1a674ea2
--- /dev/null
+++ b/scala2.13/sql-plugin-columnar/pom.xml
@@ -0,0 +1,124 @@
+
+
+
+ 4.0.0
+
+
+ com.nvidia
+ rapids-4-spark-shim-deps-parent_2.13
+ 26.08.0-SNAPSHOT
+ ../shim-deps/pom.xml
+
+
+ rapids-4-spark-sql-plugin-columnar_2.13
+ Java-only columnar runtime plumbing for the RAPIDS SQL plugin
+ 26.08.0-SNAPSHOT
+
+
+ sql-plugin-columnar
+ false
+ **/*
+ package
+ true
+
+
+
+
+ com.nvidia
+ spark-rapids-jni
+ ${jni.classifier}
+
+
+ com.nvidia
+ rapids-4-spark-sql-plugin-format_${scala.binary.version}
+ ${project.version}
+ ${spark.version.classifier}
+
+
+ org.apache.spark
+ spark-sql_${scala.binary.version}
+ ${spark.version}
+ provided
+
+
+
+
+
+
+ org.apache.maven.plugins
+ maven-compiler-plugin
+
+
+ default-compile
+ compile
+
+ compile
+
+
+
+ default-testCompile
+ test-compile
+
+ testCompile
+
+
+
+
+ ${java.major.version}
+
+ -Xlint:all,-serial,-path,-try,-processing
+
+
+
+
+ net.alchim31.maven
+ scala-maven-plugin
+
+
+ eclipse-add-source
+ none
+
+
+ scala-compile-first
+ none
+
+
+ scala-test-compile-first
+ none
+
+
+ attach-scaladocs
+ none
+
+
+
+
+ org.apache.rat
+ apache-rat-plugin
+
+
+ org.apache.maven.plugins
+ maven-surefire-plugin
+
+ true
+
+
+
+
+
diff --git a/scala2.13/sql-plugin-fileio/pom.xml b/scala2.13/sql-plugin-fileio/pom.xml
new file mode 100644
index 00000000000..6c9a666e8bb
--- /dev/null
+++ b/scala2.13/sql-plugin-fileio/pom.xml
@@ -0,0 +1,118 @@
+
+
+
+ 4.0.0
+
+
+ com.nvidia
+ rapids-4-spark-shim-deps-parent_2.13
+ 26.08.0-SNAPSHOT
+ ../shim-deps/pom.xml
+
+
+ rapids-4-spark-sql-plugin-fileio_2.13
+ Java-only file I/O runtime plumbing for the RAPIDS SQL plugin
+ 26.08.0-SNAPSHOT
+
+
+ sql-plugin-fileio
+ false
+ **/*
+ package
+ true
+
+
+
+
+ com.nvidia
+ spark-rapids-jni
+ ${jni.classifier}
+
+
+ org.apache.spark
+ spark-core_${scala.binary.version}
+ ${spark.version}
+ provided
+
+
+
+
+
+
+ org.apache.maven.plugins
+ maven-compiler-plugin
+
+
+ default-compile
+ compile
+
+ compile
+
+
+
+ default-testCompile
+ test-compile
+
+ testCompile
+
+
+
+
+ ${java.major.version}
+
+ -Xlint:all,-serial,-path,-try,-processing
+
+
+
+
+ net.alchim31.maven
+ scala-maven-plugin
+
+
+ eclipse-add-source
+ none
+
+
+ scala-compile-first
+ none
+
+
+ scala-test-compile-first
+ none
+
+
+ attach-scaladocs
+ none
+
+
+
+
+ org.apache.rat
+ apache-rat-plugin
+
+
+ org.apache.maven.plugins
+ maven-surefire-plugin
+
+ true
+
+
+
+
+
diff --git a/scala2.13/sql-plugin-format/pom.xml b/scala2.13/sql-plugin-format/pom.xml
new file mode 100644
index 00000000000..471656fe973
--- /dev/null
+++ b/scala2.13/sql-plugin-format/pom.xml
@@ -0,0 +1,111 @@
+
+
+
+ 4.0.0
+
+
+ com.nvidia
+ rapids-4-spark-shim-deps-parent_2.13
+ 26.08.0-SNAPSHOT
+ ../shim-deps/pom.xml
+
+
+ rapids-4-spark-sql-plugin-format_2.13
+ Java-only FlatBuffers format classes for the RAPIDS SQL plugin
+ 26.08.0-SNAPSHOT
+
+
+ sql-plugin-format
+ false
+ **/*
+ package
+ true
+
+
+
+
+ com.google.flatbuffers
+ flatbuffers-java
+
+
+
+
+
+
+ org.apache.maven.plugins
+ maven-compiler-plugin
+
+
+ default-compile
+ compile
+
+ compile
+
+
+
+ default-testCompile
+ test-compile
+
+ testCompile
+
+
+
+
+ ${java.major.version}
+
+ -Xlint:all,-serial,-path,-try,-processing
+
+
+
+
+ net.alchim31.maven
+ scala-maven-plugin
+
+
+ eclipse-add-source
+ none
+
+
+ scala-compile-first
+ none
+
+
+ scala-test-compile-first
+ none
+
+
+ attach-scaladocs
+ none
+
+
+
+
+ org.apache.rat
+ apache-rat-plugin
+
+
+ org.apache.maven.plugins
+ maven-surefire-plugin
+
+ true
+
+
+
+
+
diff --git a/scala2.13/sql-plugin-shims/pom.xml b/scala2.13/sql-plugin-shims/pom.xml
new file mode 100644
index 00000000000..db47aac3d38
--- /dev/null
+++ b/scala2.13/sql-plugin-shims/pom.xml
@@ -0,0 +1,68 @@
+
+
+
+ 4.0.0
+
+
+ com.nvidia
+ rapids-4-spark-shim-deps-parent_2.13
+ 26.08.0-SNAPSHOT
+ ../shim-deps/pom.xml
+
+ rapids-4-spark-sql-shims_2.13
+ RAPIDS Accelerator for Apache Spark SQL Plugin Shims
+ Compile-time isolated SQL plugin shims
+ 26.08.0-SNAPSHOT
+
+
+ sql-plugin-shims
+ false
+ **/*
+ package
+
+
+
+
+ com.nvidia
+ rapids-4-spark-sql-plugin-api_${scala.binary.version}
+ ${project.version}
+ ${spark.version.classifier}
+
+
+ org.scala-lang
+ scala-library
+
+
+
+
+
+
+ net.alchim31.maven
+ scala-maven-plugin
+
+
+ maven-antrun-plugin
+
+
+ org.apache.rat
+ apache-rat-plugin
+
+
+
+
diff --git a/scala2.13/sql-plugin/pom.xml b/scala2.13/sql-plugin/pom.xml
index 14efec5aea3..6b0925787f8 100644
--- a/scala2.13/sql-plugin/pom.xml
+++ b/scala2.13/sql-plugin/pom.xml
@@ -54,12 +54,37 @@
${spark-rapids-private.version}${spark.version.classifier}
+
+ com.nvidia
+ rapids-4-spark-sql-plugin-format_${scala.binary.version}
+ ${project.version}
+ ${spark.version.classifier}
+
+
+ com.nvidia
+ rapids-4-spark-sql-plugin-fileio_${scala.binary.version}
+ ${project.version}
+ ${spark.version.classifier}
+
+
+ com.nvidia
+ rapids-4-spark-sql-plugin-columnar_${scala.binary.version}
+ ${project.version}
+ ${spark.version.classifier}
+ com.nvidiarapids-4-spark-sql-plugin-api_${scala.binary.version}${project.version}${spark.version.classifier}
+
+ com.nvidia
+ rapids-4-spark-sql-shims_${scala.binary.version}
+ ${project.version}
+ ${spark.version.classifier}
+ provided
+ org.scala-langscala-library
@@ -219,6 +244,27 @@
net.alchim31.mavenscala-maven-plugin
+
+ org.apache.maven.plugins
+ maven-dependency-plugin
+ 3.6.1
+
+
+ unpack-sql-plugin-shims
+ prepare-package
+
+ unpack-dependencies
+
+
+ com.nvidia
+ rapids-4-spark-sql-shims_${scala.binary.version}
+ true
+ **/*.class
+ ${project.build.outputDirectory}
+
+
+
+ org.apache.ratapache-rat-plugin
diff --git a/shuffle-plugin/pom.xml b/shuffle-plugin/pom.xml
index ff481d52819..e1148736125 100644
--- a/shuffle-plugin/pom.xml
+++ b/shuffle-plugin/pom.xml
@@ -42,6 +42,12 @@
spark-rapids-jni${jni.classifier}
+
+ com.nvidia
+ rapids-4-spark-sql-plugin-api_${scala.binary.version}
+ ${project.version}
+ ${spark.version.classifier}
+ org.scala-langscala-library
diff --git a/shuffle-plugin/src/main/java/com/nvidia/spark/rapids/shuffle/ucx/Rkeys.java b/shuffle-plugin/src/main/java/com/nvidia/spark/rapids/shuffle/ucx/Rkeys.java
new file mode 100644
index 00000000000..2882b1db81d
--- /dev/null
+++ b/shuffle-plugin/src/main/java/com/nvidia/spark/rapids/shuffle/ucx/Rkeys.java
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2020-2026, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.nvidia.spark.rapids.shuffle.ucx;
+
+import java.nio.ByteBuffer;
+import java.util.Objects;
+
+import scala.collection.Seq;
+
+/**
+ * Immutable holder for the UCX remote keys (rkeys) registered for a peer.
+ *
+ * <p>Replaces the Scala {@code case class Rkeys(rkeys: Seq[ByteBuffer])}, keeping its
+ * accessor name, value-based equality and {@code toString} format.
+ */
+public final class Rkeys {
+  private final Seq<ByteBuffer> rkeys;
+
+  /**
+   * @param rkeys remote key buffers; the sequence is stored as-is, not copied
+   */
+  public Rkeys(Seq<ByteBuffer> rkeys) {
+    this.rkeys = rkeys;
+  }
+
+  /** Returns the remote key buffers exactly as passed to the constructor. */
+  public Seq<ByteBuffer> rkeys() {
+    return rkeys;
+  }
+
+  /** Null-safe value equality on the wrapped sequence. */
+  @Override
+  public boolean equals(Object obj) {
+    if (this == obj) {
+      return true;
+    }
+    if (!(obj instanceof Rkeys)) {
+      return false;
+    }
+    Rkeys other = (Rkeys) obj;
+    return Objects.equals(rkeys, other.rkeys);
+  }
+
+  @Override
+  public int hashCode() {
+    return Objects.hash(rkeys);
+  }
+
+  @Override
+  public String toString() {
+    return "Rkeys(" + rkeys + ")";
+  }
+}
diff --git a/shuffle-plugin/src/main/java/com/nvidia/spark/rapids/shuffle/ucx/UCXActiveMessage.java b/shuffle-plugin/src/main/java/com/nvidia/spark/rapids/shuffle/ucx/UCXActiveMessage.java
new file mode 100644
index 00000000000..cb1622c3adb
--- /dev/null
+++ b/shuffle-plugin/src/main/java/com/nvidia/spark/rapids/shuffle/ucx/UCXActiveMessage.java
@@ -0,0 +1,91 @@
+/*
+ * Copyright (c) 2020-2026, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.nvidia.spark.rapids.shuffle.ucx;
+
+/**
+ * Pairs an Active Message id with the header that travels with it.
+ *
+ * <p>The id is a fire-and-forget registration with UCX, while the header is a dynamic
+ * {@code long} that keeps being updated (it contains the local executor id and the
+ * transaction id). A request carries a header its response handler knows about, and
+ * the request handler echoes that header back once it is done.
+ *
+ * <p>{@code forceRndv} takes part in equality and hashing but is not printed by
+ * {@link #toString()}.
+ */
+public final class UCXActiveMessage {
+  private final int activeMessageId;
+  private final long header;
+  private final boolean forceRndv;
+
+  /**
+   * @param activeMessageId id the message is registered under
+   * @param header dynamic header value sent along with the message
+   * @param forceRndv flag stored verbatim and exposed through {@link #forceRndv()}
+   */
+  public UCXActiveMessage(int activeMessageId, long header, boolean forceRndv) {
+    this.activeMessageId = activeMessageId;
+    this.header = header;
+    this.forceRndv = forceRndv;
+  }
+
+  /** Returns the Active Message id. */
+  public int activeMessageId() {
+    return activeMessageId;
+  }
+
+  /** Returns the header value. */
+  public long header() {
+    return header;
+  }
+
+  /** Returns the {@code forceRndv} flag given at construction. */
+  public boolean forceRndv() {
+    return forceRndv;
+  }
+
+  /** Field-by-field equality over id, header and {@code forceRndv}. */
+  @Override
+  public boolean equals(Object obj) {
+    if (obj == this) {
+      return true;
+    }
+    if (obj instanceof UCXActiveMessage) {
+      UCXActiveMessage that = (UCXActiveMessage) obj;
+      return that.activeMessageId == activeMessageId
+          && that.header == header
+          && that.forceRndv == forceRndv;
+    }
+    return false;
+  }
+
+  /** Yields the same value as {@code Objects.hash(activeMessageId, header, forceRndv)}. */
+  @Override
+  public int hashCode() {
+    int result = 1;
+    result = 31 * result + Integer.hashCode(activeMessageId);
+    result = 31 * result + Long.hashCode(header);
+    result = 31 * result + Boolean.hashCode(forceRndv);
+    return result;
+  }
+
+  /** Renders the id as 8 and the header as 16 zero-padded upper-case hex digits. */
+  @Override
+  public String toString() {
+    return String.format("[amId=0x%08X, hdr=0x%016X]", activeMessageId, header);
+  }
+}
diff --git a/shuffle-plugin/src/main/java/com/nvidia/spark/rapids/shuffle/ucx/UCXError.java b/shuffle-plugin/src/main/java/com/nvidia/spark/rapids/shuffle/ucx/UCXError.java
new file mode 100644
index 00000000000..48526126a9d
--- /dev/null
+++ b/shuffle-plugin/src/main/java/com/nvidia/spark/rapids/shuffle/ucx/UCXError.java
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2020-2026, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.nvidia.spark.rapids.shuffle.ucx;
+
+import java.util.Objects;
+
+/**
+ * Error reported by UCX: a status code together with its message.
+ *
+ * <p>Immutable value type; equality, hashing and {@code toString} cover both fields.
+ */
+public final class UCXError {
+  private final int ucsStatus;
+  private final String errorMsg;
+
+  /**
+   * @param ucsStatus status code of the error
+   * @param errorMsg message describing the error; stored as given
+   */
+  public UCXError(int ucsStatus, String errorMsg) {
+    this.ucsStatus = ucsStatus;
+    this.errorMsg = errorMsg;
+  }
+
+  /** Returns the status code. */
+  public int ucsStatus() {
+    return ucsStatus;
+  }
+
+  /** Returns the error message. */
+  public String errorMsg() {
+    return errorMsg;
+  }
+
+  /** Equal when the status codes match and the messages are equal (null-safe). */
+  @Override
+  public boolean equals(Object obj) {
+    if (obj == this) {
+      return true;
+    }
+    if (obj instanceof UCXError) {
+      UCXError that = (UCXError) obj;
+      return that.ucsStatus == ucsStatus
+          && Objects.equals(errorMsg, that.errorMsg);
+    }
+    return false;
+  }
+
+  /** Yields the same value as {@code Objects.hash(ucsStatus, errorMsg)}. */
+  @Override
+  public int hashCode() {
+    return 31 * (31 + ucsStatus) + Objects.hashCode(errorMsg);
+  }
+
+  /** Formats as {@code UCXError(status,message)}. */
+  @Override
+  public String toString() {
+    StringBuilder text = new StringBuilder("UCXError(");
+    text.append(ucsStatus).append(',').append(errorMsg).append(')');
+    return text.toString();
+  }
+}
diff --git a/shuffle-plugin/src/main/scala/com/nvidia/spark/rapids/shuffle/ucx/UCX.scala b/shuffle-plugin/src/main/scala/com/nvidia/spark/rapids/shuffle/ucx/UCX.scala
index 6a8336f2a4a..9f6d87f1e50 100644
--- a/shuffle-plugin/src/main/scala/com/nvidia/spark/rapids/shuffle/ucx/UCX.scala
+++ b/shuffle-plugin/src/main/scala/com/nvidia/spark/rapids/shuffle/ucx/UCX.scala
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2026, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -40,22 +40,6 @@ import org.apache.spark.internal.Logging
import org.apache.spark.sql.rapids.storage.RapidsStorageUtils
import org.apache.spark.storage.BlockManagerId
-case class Rkeys(rkeys: Seq[ByteBuffer])
-
-/**
- * A simple wrapper for an Active Message Id and a header. This pair
- * is used together when dealing with Active Messages, with `activeMessageId`
- * being a fire-and-forget registration with UCX, and `header` being a dynamic long
- * we continue to update (it contains the local executor id, and the transaction id).
- *
- * This allows us to send a request (with a header that the response handler knows about),
- * and for the request handler to echo back that header when it's done.
- */
-case class UCXActiveMessage(activeMessageId: Int, header: Long, forceRndv: Boolean) {
- override def toString: String =
- UCX.formatAmIdAndHeader(activeMessageId, header)
-}
-
/**
* The UCX class wraps JUCX classes and handles all communication with UCX from other
* parts of the shuffle code. It manages a `UcpContext` and `UcpWorker`, for the
@@ -427,7 +411,7 @@ class UCX(transport: UCXShuffleTransport, executor: BlockManagerId, rapidsConf:
s"Received message with wrong header size $headerSize")
} else {
val header = UcxUtils.getByteBufferView(headerAddr, headerSize).getLong()
- val am = UCXActiveMessage(reg.activeMessageId, header, reg.useRndv)
+ val am = new UCXActiveMessage(reg.activeMessageId, header, reg.useRndv)
withResource(new NvtxRange("AM Receive", NvtxColor.YELLOW)) { _ =>
logDebug(s"Active Message received: $am")
@@ -448,7 +432,7 @@ class UCX(transport: UCXShuffleTransport, executor: BlockManagerId, rapidsConf:
cb.onSuccess(am, mtb)
case _ =>
cb.onError(am,
- UCXError(0, "Received an eager message for non-metadata message"))
+ new UCXError(0, "Received an eager message for non-metadata message"))
})
// we return OK telling UCX `amData` is ok to be closed, along with the eagerly
@@ -475,7 +459,7 @@ class UCX(transport: UCXShuffleTransport, executor: BlockManagerId, rapidsConf:
s" status=$ucsStatus, msg=$errorMsg")
cb.onCancel(am)
} else {
- cb.onError(am, UCXError(ucsStatus, errorMsg))
+ cb.onError(am, new UCXError(ucsStatus, errorMsg))
}
}
}
@@ -833,7 +817,7 @@ class UCX(transport: UCXShuffleTransport, executor: BlockManagerId, rapidsConf:
s"for ${connectionRequest.getClientAddress}")
// Register a `Control` active message for a handshake response
- val responseAm = UCXActiveMessage(
+ val responseAm = new UCXActiveMessage(
UCXConnection.composeResponseAmId(MessageType.Control), ep.getNativeId, false)
registerResponseHandler(responseAm, new UCXAmCallback {
@@ -918,7 +902,7 @@ class UCX(transport: UCXShuffleTransport, executor: BlockManagerId, rapidsConf:
if (reverseLookupEndpoints.containsKey(ucpEndpoint)) {
val executorId = reverseLookupEndpoints.get(ucpEndpoint)
if (!isShuttingDown) {
- val error = UCXError(errorCode, errorString)
+ val error = new UCXError(errorCode, errorString)
logError(s"UcpListener detected an error for executorId $executorId: " +
s"$error")
}
@@ -1036,7 +1020,7 @@ class UCX(transport: UCXShuffleTransport, executor: BlockManagerId, rapidsConf:
// called from progress thread - on ConnectionRequest
private def sendControlRequest(ep: UcpEndpoint, responseAm: UCXActiveMessage): Unit = {
- val requestAm = UCXActiveMessage(
+ val requestAm = new UCXActiveMessage(
UCXConnection.composeRequestAmId(MessageType.Control), ep.getNativeId, false)
val handshakeMsg =
@@ -1048,7 +1032,7 @@ class UCX(transport: UCXShuffleTransport, executor: BlockManagerId, rapidsConf:
TransportUtils.getAddress(handshakeMsg), handshakeMsg.remaining(),
new UcxCallback {
override def onError(ucsStatus: Int, errorMsg: String): Unit = {
- val error = UCXError(ucsStatus, errorMsg)
+ val error = new UCXError(ucsStatus, errorMsg)
logError(s"Error sending handshake header, " +
s"error: $error active message: $requestAm handshake: $handshakeMsg")
RapidsStorageUtils.dispose(handshakeMsg)
@@ -1071,7 +1055,7 @@ class UCX(transport: UCXShuffleTransport, executor: BlockManagerId, rapidsConf:
// reply
val handshakeMsg = UCXConnection.packHandshake(localExecutorId, localRkeys)
val responseAmId = UCXConnection.composeResponseAmId(MessageType.Control)
- val responseAm = UCXActiveMessage(responseAmId, requestAm.header, false)
+ val responseAm = new UCXActiveMessage(responseAmId, requestAm.header, false)
val address = TransportUtils.getAddress(handshakeMsg)
val len = handshakeMsg.remaining()
@@ -1080,7 +1064,7 @@ class UCX(transport: UCXShuffleTransport, executor: BlockManagerId, rapidsConf:
sendActiveMessage(ep, responseAm, address, len,
new UcxCallback {
override def onError(ucsStatus: Int, errorMsg: String): Unit = {
- val error = UCXError(ucsStatus, errorMsg)
+ val error = new UCXError(ucsStatus, errorMsg)
logError(s"Error replying to sending handshake header, " +
s"error: $error active message: $responseAm")
RapidsStorageUtils.dispose(handshakeMsg)
diff --git a/shuffle-plugin/src/main/scala/com/nvidia/spark/rapids/shuffle/ucx/UCXConnection.scala b/shuffle-plugin/src/main/scala/com/nvidia/spark/rapids/shuffle/ucx/UCXConnection.scala
index f6966df02a8..5464ff6a507 100644
--- a/shuffle-plugin/src/main/scala/com/nvidia/spark/rapids/shuffle/ucx/UCXConnection.scala
+++ b/shuffle-plugin/src/main/scala/com/nvidia/spark/rapids/shuffle/ucx/UCXConnection.scala
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2026, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -16,7 +16,7 @@
package com.nvidia.spark.rapids.shuffle.ucx
-import java.nio.ByteBuffer
+import java.nio.{Buffer, ByteBuffer}
import java.util.concurrent.ConcurrentHashMap
import ai.rapids.cudf.MemoryBuffer
@@ -30,7 +30,6 @@ import org.apache.spark.internal.Logging
* These are private apis used within the ucx package.
*/
-case class UCXError(ucsStatus: Int, errorMsg: String)
/**
* `UCXAmCallback` is used by [[Transaction]] to handle UCX Active Messages operations.
@@ -94,8 +93,8 @@ class UCXServerConnection(ucx: UCX, transport: UCXShuffleTransport)
logDebug(s"Sending to ${peerExecutorId} at ${TransportUtils.toHex(header)} " +
s"with ${buffer}")
- val sendAm = UCXActiveMessage(UCXConnection.composeSendAmId(messageType),
- header, forceRndv = true)
+ val sendAm = new UCXActiveMessage(UCXConnection.composeSendAmId(messageType),
+ header, true)
ucx.sendActiveMessage(peerExecutorId, sendAm, buffer,
new UcxCallback {
@@ -123,7 +122,7 @@ class UCXServerConnection(ucx: UCX, transport: UCXShuffleTransport)
logDebug(s"Responding to ${peerExecutorId} at ${TransportUtils.toHex(header)} " +
s"with ${response}")
- val responseAm = UCXActiveMessage(
+ val responseAm = new UCXActiveMessage(
UCXConnection.composeResponseAmId(messageType), header, false)
ucx.sendActiveMessage(peerExecutorId, responseAm, response,
new UcxCallback {
@@ -191,12 +190,12 @@ class UCXClientConnection(peerExecutorId: Long, ucx: UCX, transport: UCXShuffleT
// Register the active message response handler. Note that the `requestHeader`
// is expected to come back with the response, and is used to find the
// correct callback (this is an implementation detail in UCX.scala)
- val responseAm = UCXActiveMessage(
+ val responseAm = new UCXActiveMessage(
UCXConnection.composeResponseAmId(messageType), requestHeader, false)
ucx.registerResponseHandler(responseAm, amCallback)
// kick-off the request
- val requestAm = UCXActiveMessage(
+ val requestAm = new UCXActiveMessage(
UCXConnection.composeRequestAmId(messageType), requestHeader, false)
logDebug(s"Performing a ${messageType} request of size ${request.remaining()} " +
@@ -285,7 +284,7 @@ class UCXConnection(peerExecutorId: Long, val ucx: UCX) extends Logging {
}
}
-object UCXConnection extends Logging {
+object UCXConnection {
/**
* 1) client gets upper 28 bits
* 2) then comes the type, which gets 4 bits
@@ -389,8 +388,8 @@ object UCXConnection extends Logging {
val rkeys = (0 until numRkeys).map { _ =>
val rkeySize = buff.getInt
val rkeySlice = buff.slice()
- rkeySlice.limit(rkeySize)
- buff.position(buff.position() + rkeySize)
+ rkeySlice.asInstanceOf[Buffer].limit(rkeySize)
+ buff.asInstanceOf[Buffer].position(buff.position() + rkeySize)
rkeySlice
}
(remoteExecutorId, rkeys)
@@ -419,7 +418,7 @@ object UCXConnection extends Logging {
hsBuff.putInt(rkey.capacity)
hsBuff.put(rkey)
}
- hsBuff.flip()
+ hsBuff.asInstanceOf[Buffer].flip()
hsBuff
}
}
diff --git a/shuffle-plugin/src/main/scala/com/nvidia/spark/rapids/shuffle/ucx/UCXShuffleTransport.scala b/shuffle-plugin/src/main/scala/com/nvidia/spark/rapids/shuffle/ucx/UCXShuffleTransport.scala
index 3a71fd769e7..6406fa44aaf 100644
--- a/shuffle-plugin/src/main/scala/com/nvidia/spark/rapids/shuffle/ucx/UCXShuffleTransport.scala
+++ b/shuffle-plugin/src/main/scala/com/nvidia/spark/rapids/shuffle/ucx/UCXShuffleTransport.scala
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2020-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2026, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -180,10 +180,10 @@ class UCXShuffleTransport(shuffleServerId: BlockManagerId, rapidsConf: RapidsCon
val hostBuffer = tryAcquireBounceBuffers(hostSendBuffMgr, numBuffs)
if (hostBuffer.nonEmpty) {
deviceBuffer.zip(hostBuffer).map { case (d, h) =>
- SendBounceBuffers(d, Some(h))
+ new SendBounceBuffers(d, Some(h))
}
} else {
- deviceBuffer.map(d => SendBounceBuffers(d, None))
+ deviceBuffer.map(d => new SendBounceBuffers(d, None))
}
} else {
Seq.empty
@@ -377,8 +377,8 @@ class UCXShuffleTransport(shuffleServerId: BlockManagerId, rapidsConf: RapidsCon
}
}
- private case class ClientAndBufferReceiveState(client: RapidsShuffleClient,
- brs: BufferReceiveState)
+ private class ClientAndBufferReceiveState(val client: RapidsShuffleClient,
+ val brs: BufferReceiveState)
private val pendingBrs = new ConcurrentHashMap[Long, ClientAndBufferReceiveState]()
def handleBufferReceive(size: Long, header: Long,
@@ -498,7 +498,7 @@ class UCXShuffleTransport(shuffleServerId: BlockManagerId, rapidsConf: RapidsCon
perClientRequests.bounceBuffer,
perClientRequests.transferRequests.toSeq,
() => bufferReceiveStateComplete(brsId))
- pendingBrs.put(brs.id, ClientAndBufferReceiveState(client, brs))
+ pendingBrs.put(brs.id, new ClientAndBufferReceiveState(client, brs))
client.issueBufferReceives(brs)
}
} else if (!hasBounceBuffers) {
diff --git a/shuffle-plugin/src/main/scala/com/nvidia/spark/rapids/shuffle/ucx/UCXTransaction.scala b/shuffle-plugin/src/main/scala/com/nvidia/spark/rapids/shuffle/ucx/UCXTransaction.scala
index db7cee1a87e..24feebcef16 100644
--- a/shuffle-plugin/src/main/scala/com/nvidia/spark/rapids/shuffle/ucx/UCXTransaction.scala
+++ b/shuffle-plugin/src/main/scala/com/nvidia/spark/rapids/shuffle/ucx/UCXTransaction.scala
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2020-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2026, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -280,7 +280,7 @@ private[ucx] class UCXTransaction(conn: UCXConnection, val txId: Long)
val diff: Double = (end - start)/1000000.0D
val sendThroughput: Double = (sendSize.get()/1024.0D/1024.0D/1024.0D) / (diff / 1000.0D)
val recvThroughput: Double = (receiveSize.get()/1024.0D/1024.0D/1024.0D) / (diff / 1000.0D)
- TransactionStats(diff, sendSize.get(), receiveSize.get(), sendThroughput, recvThroughput)
+ new TransactionStats(diff, sendSize.get(), receiveSize.get(), sendThroughput, recvThroughput)
}
var callbackCalled: Boolean = false
diff --git a/sql-plugin/src/main/java/com/nvidia/spark/rapids/HashedPriorityQueue.java b/sql-plugin-api/src/main/java/com/nvidia/spark/rapids/HashedPriorityQueue.java
similarity index 100%
rename from sql-plugin/src/main/java/com/nvidia/spark/rapids/HashedPriorityQueue.java
rename to sql-plugin-api/src/main/java/com/nvidia/spark/rapids/HashedPriorityQueue.java
diff --git a/sql-plugin-api/src/main/java/com/nvidia/spark/rapids/ThreadFactoryBuilder.java b/sql-plugin-api/src/main/java/com/nvidia/spark/rapids/ThreadFactoryBuilder.java
new file mode 100644
index 00000000000..19cf340c167
--- /dev/null
+++ b/sql-plugin-api/src/main/java/com/nvidia/spark/rapids/ThreadFactoryBuilder.java
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2022-2026, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.nvidia.spark.rapids;
+
+import java.util.concurrent.Executors;
+import java.util.concurrent.ThreadFactory;
+import java.util.concurrent.atomic.AtomicLong;
+
+/**
+ * This is similar to Guava ThreadFactoryBuilder. Avoid using Guava as it is a messy dependency
+ * in practice.
+ *
+ * <p>Each {@link #build()} call snapshots the settings made so far; changing the builder
+ * afterwards does not affect factories that were already built.
+ */
+public class ThreadFactoryBuilder {
+  private String nameFormat;
+  private Boolean daemon;
+
+  /**
+   * Sets the {@link String#format(String, Object...)} pattern used to name new threads. The
+   * pattern receives one argument: a per-factory counter starting at 0.
+   *
+   * @param nameFormat thread name pattern, e.g. {@code "worker-%d"}
+   * @return this builder
+   * @throws java.util.IllegalFormatException if the pattern cannot format a number
+   */
+  public ThreadFactoryBuilder setNameFormat(String nameFormat) {
+    // Trial run so that a broken pattern is rejected here rather than in newThread().
+    String.format(nameFormat, 0);
+    this.nameFormat = nameFormat;
+    return this;
+  }
+
+  /**
+   * Sets whether new threads are daemon threads. When never called, threads keep the daemon
+   * status given by {@link Executors#defaultThreadFactory()}.
+   *
+   * @param daemon daemon status to apply to every new thread
+   * @return this builder
+   */
+  public ThreadFactoryBuilder setDaemon(boolean daemon) {
+    this.daemon = daemon;
+    return this;
+  }
+
+  /**
+   * Creates a factory that delegates to {@link Executors#defaultThreadFactory()} and then
+   * applies the configured name pattern and daemon flag, if any.
+   *
+   * @return a new thread factory with its own thread counter
+   */
+  public ThreadFactory build() {
+    // Copy the builder fields into locals. Reading the fields from inside the factory would
+    // let a later setNameFormat() turn naming on while `count` is still null (NPE), and would
+    // let any later setter call silently reconfigure a factory that is already in use.
+    final String format = nameFormat;
+    final Boolean daemonFlag = daemon;
+    final AtomicLong count = format == null ? null : new AtomicLong(0);
+    return new ThreadFactory() {
+      private final ThreadFactory defaultThreadFactory = Executors.defaultThreadFactory();
+
+      @Override
+      public Thread newThread(Runnable runnable) {
+        Thread thread = defaultThreadFactory.newThread(runnable);
+        if (format != null) {
+          thread.setName(String.format(format, count.getAndIncrement()));
+        }
+        if (daemonFlag != null) {
+          thread.setDaemon(daemonFlag);
+        }
+        return thread;
+      }
+    };
+  }
+}
diff --git a/sql-plugin-api/src/main/scala/com/nvidia/spark/rapids/ShimCommandRules.scala b/sql-plugin-api/src/main/scala/com/nvidia/spark/rapids/ShimCommandRules.scala
new file mode 100644
index 00000000000..f402b69d749
--- /dev/null
+++ b/sql-plugin-api/src/main/scala/com/nvidia/spark/rapids/ShimCommandRules.scala
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2026, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.nvidia.spark.rapids
+
+import scala.reflect.ClassTag
+
+import org.apache.spark.sql.execution.SparkPlan
+import org.apache.spark.sql.execution.command.{DataWritingCommand, RunnableCommand}
+
+/**
+ * Rule descriptor for a [[SparkPlan]] subtype: a description plus the `ClassTag` of `INPUT`.
+ * The constructor is private; instances come from the companion's `apply`.
+ */
+final class ShimExecRule[INPUT <: SparkPlan] private (
+    val desc: String,
+    val tag: ClassTag[INPUT])
+
+object ShimExecRule {
+  /**
+   * Creates a descriptor for `INPUT`, picking up its `ClassTag` implicitly.
+   *
+   * Throws `IllegalArgumentException` when `desc` is null.
+   */
+  def apply[INPUT <: SparkPlan](desc: String)(
+      implicit tag: ClassTag[INPUT]): ShimExecRule[INPUT] = {
+    require(desc != null, "desc must not be null")
+    new ShimExecRule[INPUT](desc, tag)
+  }
+}
+
+/**
+ * Rule descriptor for a [[DataWritingCommand]] subtype: a description plus the `ClassTag`
+ * of `INPUT`. The constructor is private; instances come from the companion's `apply`.
+ */
+final class ShimDataWritingCommandRule[INPUT <: DataWritingCommand] private (
+    val desc: String,
+    val tag: ClassTag[INPUT])
+
+object ShimDataWritingCommandRule {
+  /**
+   * Creates a descriptor for `INPUT`, picking up its `ClassTag` implicitly.
+   *
+   * Throws `IllegalArgumentException` when `desc` is null.
+   */
+  def apply[INPUT <: DataWritingCommand](desc: String)(
+      implicit tag: ClassTag[INPUT]): ShimDataWritingCommandRule[INPUT] = {
+    require(desc != null, "desc must not be null")
+    new ShimDataWritingCommandRule[INPUT](desc, tag)
+  }
+}
+
+/**
+ * Rule descriptor for a [[RunnableCommand]] subtype: a description plus the `ClassTag`
+ * of `INPUT`. The constructor is private; instances come from the companion's `apply`.
+ */
+final class ShimRunnableCommandRule[INPUT <: RunnableCommand] private (
+    val desc: String,
+    val tag: ClassTag[INPUT])
+
+object ShimRunnableCommandRule {
+  /**
+   * Creates a descriptor for `INPUT`, picking up its `ClassTag` implicitly.
+   *
+   * Throws `IllegalArgumentException` when `desc` is null.
+   */
+  def apply[INPUT <: RunnableCommand](desc: String)(
+      implicit tag: ClassTag[INPUT]): ShimRunnableCommandRule[INPUT] = {
+    require(desc != null, "desc must not be null")
+    new ShimRunnableCommandRule[INPUT](desc, tag)
+  }
+}
diff --git a/sql-plugin-columnar/pom.xml b/sql-plugin-columnar/pom.xml
new file mode 100644
index 00000000000..c1bed4d032d
--- /dev/null
+++ b/sql-plugin-columnar/pom.xml
@@ -0,0 +1,124 @@
+
+
+
+ 4.0.0
+
+
+ com.nvidia
+ rapids-4-spark-shim-deps-parent_2.12
+ 26.08.0-SNAPSHOT
+ ../shim-deps/pom.xml
+
+
+ rapids-4-spark-sql-plugin-columnar_2.12
+ Java-only columnar runtime plumbing for the RAPIDS SQL plugin
+ 26.08.0-SNAPSHOT
+
+
+ sql-plugin-columnar
+ false
+ **/*
+ package
+ true
+
+
+
+
+ com.nvidia
+ spark-rapids-jni
+ ${jni.classifier}
+
+
+ com.nvidia
+ rapids-4-spark-sql-plugin-format_${scala.binary.version}
+ ${project.version}
+ ${spark.version.classifier}
+
+
+ org.apache.spark
+ spark-sql_${scala.binary.version}
+ ${spark.version}
+ provided
+
+
+
+
+
+
+ org.apache.maven.plugins
+ maven-compiler-plugin
+
+
+ default-compile
+ compile
+
+ compile
+
+
+
+ default-testCompile
+ test-compile
+
+ testCompile
+
+
+
+
+ ${java.major.version}
+
+ -Xlint:all,-serial,-path,-try,-processing
+
+
+
+
+ net.alchim31.maven
+ scala-maven-plugin
+
+
+ eclipse-add-source
+ none
+
+
+ scala-compile-first
+ none
+
+
+ scala-test-compile-first
+ none
+
+
+ attach-scaladocs
+ none
+
+
+
+
+ org.apache.rat
+ apache-rat-plugin
+
+
+ org.apache.maven.plugins
+ maven-surefire-plugin
+
+ true
+
+
+
+
+
diff --git a/sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/AbstractHostByteBufferIterator.java b/sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/AbstractHostByteBufferIterator.java
new file mode 100644
index 00000000000..99a03734d90
--- /dev/null
+++ b/sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/AbstractHostByteBufferIterator.java
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2023-2026, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.nvidia.spark.rapids;
+
+import java.nio.ByteBuffer;
+
+import scala.collection.AbstractIterator;
+
+/**
+ * Iterates over a range of {@link #totalLength()} bytes as a series of {@link ByteBuffer}s.
+ *
+ * <p>Subclasses supply the range size and materialize each chunk in
+ * {@link #getByteBuffer(long, long)}. Chunks are handed out back to back starting at
+ * offset 0, each at most {@link #limit()} bytes long.
+ */
+public abstract class AbstractHostByteBufferIterator extends AbstractIterator<ByteBuffer> {
+  private long nextBufferStart = 0L;
+
+  /** Total number of bytes this iterator walks over. */
+  public abstract long totalLength();
+
+  /** Upper bound on the length of one chunk; {@link Integer#MAX_VALUE} unless overridden. */
+  public long limit() {
+    return Integer.MAX_VALUE;
+  }
+
+  /**
+   * Produces the chunk starting at {@code offset} and spanning {@code length} bytes.
+   *
+   * @param offset start of the chunk within the iterated range
+   * @param length number of bytes in the chunk
+   */
+  public abstract ByteBuffer getByteBuffer(long offset, long length);
+
+  @Override
+  public boolean hasNext() {
+    return nextBufferStart < totalLength();
+  }
+
+  /**
+   * Returns the next chunk and advances past it.
+   *
+   * <p>There is no exhaustion check here: callers are expected to consult
+   * {@link #hasNext()} first.
+   */
+  @Override
+  public ByteBuffer next() {
+    long offset = nextBufferStart;
+    long length = Math.min(totalLength() - nextBufferStart, limit());
+    nextBufferStart += length;
+    return getByteBuffer(offset, length);
+  }
+}
diff --git a/sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/AggregateModeInfo.java b/sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/AggregateModeInfo.java
new file mode 100644
index 00000000000..80834a02f5c
--- /dev/null
+++ b/sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/AggregateModeInfo.java
@@ -0,0 +1,122 @@
+/*
+ * Copyright (c) 2026, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.nvidia.spark.rapids;
+
+import java.io.Serializable;
+import java.util.Objects;
+
+import org.apache.spark.sql.catalyst.expressions.aggregate.AggregateMode;
+import org.apache.spark.sql.catalyst.expressions.aggregate.Complete$;
+import org.apache.spark.sql.catalyst.expressions.aggregate.Final$;
+import org.apache.spark.sql.catalyst.expressions.aggregate.Partial$;
+import org.apache.spark.sql.catalyst.expressions.aggregate.PartialMerge$;
+
+import scala.collection.Seq;
+
+/**
+ * Information on the aggregation modes being used.
+ *
+ * <p>Holds the distinct modes together with one flag per mode. {@link #from(Seq)} derives
+ * the flags from the sequence; the public constructor stores whatever flags it is given.
+ */
+public class AggregateModeInfo implements Serializable {
+  private static final long serialVersionUID = 1L;
+
+  private final Seq<AggregateMode> uniqueModes;
+  private final boolean hasPartialMode;
+  private final boolean hasPartialMergeMode;
+  private final boolean hasFinalMode;
+  private final boolean hasCompleteMode;
+
+  /**
+   * Stores the arguments as-is; the flags are not checked against {@code uniqueModes}.
+   */
+  public AggregateModeInfo(
+      Seq<AggregateMode> uniqueModes,
+      boolean hasPartialMode,
+      boolean hasPartialMergeMode,
+      boolean hasFinalMode,
+      boolean hasCompleteMode) {
+    this.uniqueModes = uniqueModes;
+    this.hasPartialMode = hasPartialMode;
+    this.hasPartialMergeMode = hasPartialMergeMode;
+    this.hasFinalMode = hasFinalMode;
+    this.hasCompleteMode = hasCompleteMode;
+  }
+
+  /**
+   * Builds an instance whose flags reflect which modes {@code uniqueModes} contains.
+   *
+   * @param uniqueModes modes to inspect; must not be null
+   */
+  public static AggregateModeInfo from(Seq<AggregateMode> uniqueModes) {
+    return new AggregateModeInfo(
+        uniqueModes,
+        uniqueModes.contains(Partial$.MODULE$),
+        uniqueModes.contains(PartialMerge$.MODULE$),
+        uniqueModes.contains(Final$.MODULE$),
+        uniqueModes.contains(Complete$.MODULE$));
+  }
+
+  public Seq<AggregateMode> uniqueModes() {
+    return uniqueModes;
+  }
+
+  public boolean hasPartialMode() {
+    return hasPartialMode;
+  }
+
+  public boolean hasPartialMergeMode() {
+    return hasPartialMergeMode;
+  }
+
+  public boolean hasFinalMode() {
+    return hasFinalMode;
+  }
+
+  public boolean hasCompleteMode() {
+    return hasCompleteMode;
+  }
+
+  @Override
+  public boolean equals(Object other) {
+    if (this == other) {
+      return true;
+    }
+    if (!(other instanceof AggregateModeInfo)) {
+      return false;
+    }
+    AggregateModeInfo that = (AggregateModeInfo) other;
+    return hasPartialMode == that.hasPartialMode
+        && hasPartialMergeMode == that.hasPartialMergeMode
+        && hasFinalMode == that.hasFinalMode
+        && hasCompleteMode == that.hasCompleteMode
+        && Objects.equals(uniqueModes, that.uniqueModes);
+  }
+
+  @Override
+  public int hashCode() {
+    return Objects.hash(
+        uniqueModes, hasPartialMode, hasPartialMergeMode, hasFinalMode, hasCompleteMode);
+  }
+
+  @Override
+  public String toString() {
+    return "AggregateModeInfo(" + uniqueModes + "," + hasPartialMode + ","
+        + hasPartialMergeMode + "," + hasFinalMode + "," + hasCompleteMode + ")";
+  }
+}
diff --git a/sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/ArrayIndexUtils.java b/sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/ArrayIndexUtils.java
new file mode 100644
index 00000000000..cab1ecfced3
--- /dev/null
+++ b/sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/ArrayIndexUtils.java
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2022-2026, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.nvidia.spark.rapids;
+
+import ai.rapids.cudf.ColumnVector;
+import ai.rapids.cudf.ColumnView;
+import ai.rapids.cudf.HostColumnVector;
+import ai.rapids.cudf.Table;
+
+public final class ArrayIndexUtils {
+ private ArrayIndexUtils() {} // static helpers only; not instantiable
+
+ public static final class IndexAndNumElement { // immutable (index, numElements) pair
+ private final int index;
+ private final int numElements;
+
+ IndexAndNumElement(int index, int numElements) { // package-private constructor
+ this.index = index;
+ this.numElements = numElements;
+ }
+
+ public int getIndex() {
+ return index;
+ }
+
+ public int getNumElements() {
+ return numElements;
+ }
+ }
+
+ /**
+ * Return the first int value (should be valid) in {@code indices} and
+ * {@code numElements} where the corresponding row in {@code mask} is true.
+ * Null rows in {@code mask} are skipped.
+ *
+ * <p>{@code indices} and {@code numElements} should be int columns with the
+ * same row count. {@code mask} should be a boolean column with the same row
+ * count. Otherwise, behavior is undefined.
+ */
+ public static IndexAndNumElement firstIndexAndNumElementUnchecked(
+ ColumnView mask, ColumnVector indices, ColumnVector numElements) {
+ try (Table indexTable = new Table(indices, numElements); // filter both columns together
+ Table filteredTable = indexTable.filter(mask)) { // both tables are closed on exit
+ assert filteredTable.getRowCount() > 0; // checked only when JVM assertions are enabled
+ int index;
+ try (HostColumnVector indicesH = filteredTable.getColumn(0).copyToHost()) { // column 0 = indices
+ assert !indicesH.isNull(0);
+ index = indicesH.getInt(0); // row 0 of the filtered table is the first match
+ }
+ int numElement;
+ try (HostColumnVector numElemsH = filteredTable.getColumn(1).copyToHost()) { // column 1 = numElements
+ assert !numElemsH.isNull(0);
+ numElement = numElemsH.getInt(0);
+ }
+ return new IndexAndNumElement(index, numElement);
+ }
+ }
+}
diff --git a/sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/AutoCloseableTargetSize.java b/sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/AutoCloseableTargetSize.java
new file mode 100644
index 00000000000..db294308c4f
--- /dev/null
+++ b/sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/AutoCloseableTargetSize.java
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) 2024-2026, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.nvidia.spark.rapids;
+
+import java.io.Serializable;
+import java.util.Objects;
+
+/** Immutable size triple usable in try-with-resources; {@code close()} does nothing. */
+public class AutoCloseableTargetSize implements AutoCloseable, Serializable {
+  private static final long serialVersionUID = 1L;
+
+  public final long targetSize;
+  public final long minSize;
+  public final long dataSize;
+
+  /** Two-argument form; {@code dataSize} is set to 0. */
+  public AutoCloseableTargetSize(long targetSize, long minSize) {
+    this(targetSize, minSize, 0L);
+  }
+
+  /** Stores the three values as given; no validation is performed. */
+  public AutoCloseableTargetSize(long targetSize, long minSize, long dataSize) {
+    this.targetSize = targetSize;
+    this.minSize = minSize;
+    this.dataSize = dataSize;
+  }
+
+  public long targetSize() { return targetSize; }
+
+  public long minSize() { return minSize; }
+
+  public long dataSize() { return dataSize; }
+
+  /** Nothing to release. */
+  @Override
+  public void close() {}
+
+  @Override
+  public boolean equals(Object other) {
+    if (other == this) {
+      return true;
+    }
+    if (other instanceof AutoCloseableTargetSize) {
+      AutoCloseableTargetSize rhs = (AutoCloseableTargetSize) other;
+      return rhs.targetSize == targetSize
+          && rhs.minSize == minSize
+          && rhs.dataSize == dataSize;
+    }
+    return false;
+  }
+
+  @Override
+  public int hashCode() {
+    return Objects.hash(targetSize, minSize, dataSize);
+  }
+
+  /** Renders as {@code AutoCloseableTargetSize(targetSize,minSize,dataSize)}. */
+  @Override
+  public String toString() {
+    return new StringBuilder("AutoCloseableTargetSize(")
+        .append(targetSize).append(',').append(minSize).append(',').append(dataSize)
+        .append(')').toString();
+  }
+}
diff --git a/sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/BlockInfo.java b/sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/BlockInfo.java
new file mode 100644
index 00000000000..300d8026f18
--- /dev/null
+++ b/sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/BlockInfo.java
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) 2022-2026, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.nvidia.spark.rapids;
+
+import java.io.Serializable;
+import java.util.Objects;
+
+/**
+ * Avro block metadata, held as an immutable serializable value.
+ */
+public final class BlockInfo implements Serializable {
+  private static final long serialVersionUID = 1L;
+
+  private final long blockStart;
+  private final long blockSize;
+  private final long dataSize;
+  private final long count;
+
+  public BlockInfo(long blockStart, long blockSize, long dataSize, long count) {
+    this.blockStart = blockStart;
+    this.blockSize = blockSize;
+    this.dataSize = dataSize;
+    this.count = count;
+  }
+
+  public long blockStart() { return blockStart; }
+
+  public long blockSize() { return blockSize; }
+
+  public long dataSize() { return dataSize; }
+
+  public long count() { return count; }
+
+  /** Field-by-field comparison; only another {@code BlockInfo} can be equal. */
+  @Override
+  public boolean equals(Object obj) {
+    if (obj == this) {
+      return true;
+    }
+    if (obj instanceof BlockInfo) {
+      BlockInfo rhs = (BlockInfo) obj;
+      return rhs.blockStart == blockStart
+          && rhs.blockSize == blockSize
+          && rhs.dataSize == dataSize
+          && rhs.count == count;
+    }
+    return false;
+  }
+
+  /** Hash over the same four fields that {@code equals} compares. */
+  @Override
+  public int hashCode() {
+    return Objects.hash(blockStart, blockSize, dataSize, count);
+  }
+
+  /** Renders as {@code BlockInfo(blockStart,blockSize,dataSize,count)}. */
+  @Override
+  public String toString() {
+    return new StringBuilder("BlockInfo(")
+        .append(blockStart).append(',').append(blockSize).append(',')
+        .append(dataSize).append(',').append(count).append(')')
+        .toString();
+  }
+}
diff --git a/sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/BoolUtils.java b/sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/BoolUtils.java
new file mode 100644
index 00000000000..d02a83c77b1
--- /dev/null
+++ b/sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/BoolUtils.java
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2022-2026, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.nvidia.spark.rapids;
+
+import ai.rapids.cudf.ColumnVector;
+import ai.rapids.cudf.DType;
+import ai.rapids.cudf.Scalar;
+
+public final class BoolUtils {
+ private BoolUtils() {} // static helpers only; not instantiable
+
+ /**
+ * Whether all the valid rows in {@code col} are true. An empty column will get true.
+ * Null rows are skipped.
+ */
+ public static boolean isAllValidTrue(ColumnVector col) {
+ assert DType.BOOL8 == col.getType() : "input column type is not bool"; // only checked with -ea
+ if (col.getRowCount() == 0) {
+ return true;
+ }
+
+ if (col.getRowCount() == col.getNullCount()) {
+ // Every row is null: treated the same as an empty column because nulls are skipped.
+ return true;
+ }
+ try (Scalar allTrue = col.all()) {
+ // Both early returns above were not taken, so there is at least one non-null row
+ // and the reduction result is expected to be a valid scalar.
+ return allTrue.getBoolean();
+ }
+ }
+
+ /**
+ * Whether there is any valid row in {@code col} and it is true. An empty column will get false.
+ * Null rows are skipped.
+ */
+ public static boolean isAnyValidTrue(ColumnVector col) {
+ assert DType.BOOL8 == col.getType() : "input column type is not bool"; // only checked with -ea
+
+ if (col.getRowCount() == col.getNullCount()) {
+ // All rows are null, so none is a valid true; a 0-row column presumably takes this path too.
+ return false;
+ }
+ try (Scalar anyTrue = col.any()) {
+ // The early return above was not taken, so there is at least one non-null row
+ // and the reduction result is expected to be a valid scalar.
+ return anyTrue.getBoolean();
+ }
+ }
+}
diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/ColumnViewUtils.scala b/sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/ColumnViewUtils.java
similarity index 56%
rename from sql-plugin/src/main/scala/com/nvidia/spark/rapids/ColumnViewUtils.scala
rename to sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/ColumnViewUtils.java
index 8bea6481220..84e643d3c1f 100644
--- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/ColumnViewUtils.scala
+++ b/sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/ColumnViewUtils.java
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2025, NVIDIA CORPORATION.
+ * Copyright (c) 2025-2026, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -13,22 +13,25 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package com.nvidia.spark.rapids
-import ai.rapids.cudf.ColumnView
-import com.nvidia.spark.rapids.Arm.withResource
+package com.nvidia.spark.rapids;
+
+import ai.rapids.cudf.ColumnView;
+import ai.rapids.cudf.Scalar;
+
+public final class ColumnViewUtils {
+ private ColumnViewUtils() {}
-object ColumnViewUtils {
/**
- * Get the `toString` on the scalar element at the specified row index in a column view.
+ * Get the {@code toString} on the scalar element at the specified row index in a column view.
* E.g., returns: Scalar{type=INT32 value=-1250858453} (ID: 143 7149580cdd60)
*/
- def getElementStringFromColumnView(cv: ColumnView, rowIndex: Int): String = {
- withResource(cv.getScalarElement(rowIndex)) { scalar =>
- if (scalar.isValid) {
- scalar.toString
+ public static String getElementStringFromColumnView(ColumnView cv, int rowIndex) {
+ try (Scalar scalar = cv.getScalarElement(rowIndex)) {
+ if (scalar.isValid()) {
+ return scalar.toString();
} else {
- "null"
+ return "null";
}
}
}
diff --git a/sql-plugin/src/main/java/com/nvidia/spark/rapids/ColumnarCopyHelper.java b/sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/ColumnarCopyHelper.java
similarity index 100%
rename from sql-plugin/src/main/java/com/nvidia/spark/rapids/ColumnarCopyHelper.java
rename to sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/ColumnarCopyHelper.java
diff --git a/sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/CombineConf.java b/sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/CombineConf.java
new file mode 100644
index 00000000000..1bb19bc9261
--- /dev/null
+++ b/sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/CombineConf.java
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2024-2026, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.nvidia.spark.rapids;
+
+import java.io.Serializable;
+import java.util.Objects;
+
+/** Immutable, serializable pair of a combine threshold size and a combine wait time. */
+public class CombineConf implements Serializable {
+  private static final long serialVersionUID = 1L;
+
+  private final long combineThresholdSize;
+  private final int combineWaitTime;
+
+  public CombineConf(long combineThresholdSize, int combineWaitTime) {
+    this.combineThresholdSize = combineThresholdSize;
+    this.combineWaitTime = combineWaitTime;
+  }
+
+  public long combineThresholdSize() { return combineThresholdSize; }
+
+  public int combineWaitTime() { return combineWaitTime; }
+
+  /** Field-by-field comparison; only another {@code CombineConf} can be equal. */
+  @Override
+  public boolean equals(Object other) {
+    if (other == this) {
+      return true;
+    }
+    if (other instanceof CombineConf) {
+      CombineConf rhs = (CombineConf) other;
+      return rhs.combineThresholdSize == combineThresholdSize
+          && rhs.combineWaitTime == combineWaitTime;
+    }
+    return false;
+  }
+
+  @Override
+  public int hashCode() {
+    return Objects.hash(combineThresholdSize, combineWaitTime);
+  }
+
+  /** Renders as {@code CombineConf(combineThresholdSize,combineWaitTime)}. */
+  @Override
+  public String toString() {
+    return new StringBuilder("CombineConf(")
+        .append(combineThresholdSize).append(',').append(combineWaitTime).append(')').toString();
+  }
+}
diff --git a/sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/CompressedTable.java b/sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/CompressedTable.java
new file mode 100644
index 00000000000..06c410d0c8c
--- /dev/null
+++ b/sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/CompressedTable.java
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) 2020-2026, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.nvidia.spark.rapids;
+
+import java.util.Objects;
+
+import ai.rapids.cudf.DeviceMemoryBuffer;
+import com.nvidia.spark.rapids.format.TableMeta;
+
+/**
+ * Compressed table descriptor: a compressed size, its {@link TableMeta}, and the device
+ * buffer. Closing the descriptor closes the buffer.
+ */
+public class CompressedTable implements AutoCloseable {
+  public final long compressedSize;
+  public final TableMeta meta;
+  public final DeviceMemoryBuffer buffer;
+
+  public CompressedTable(long compressedSize, TableMeta meta, DeviceMemoryBuffer buffer) {
+    this.compressedSize = compressedSize;
+    this.meta = meta;
+    this.buffer = buffer;
+  }
+
+  public long compressedSize() { return compressedSize; }
+
+  public TableMeta meta() { return meta; }
+
+  public DeviceMemoryBuffer buffer() { return buffer; }
+
+  /** Closes {@code buffer}; {@code meta} is left untouched. */
+  @Override
+  public void close() {
+    buffer.close();
+  }
+
+  /** Equal when the size matches and both {@code meta} and {@code buffer} are equal. */
+  @Override
+  public boolean equals(Object other) {
+    if (other == this) {
+      return true;
+    }
+    if (other instanceof CompressedTable) {
+      CompressedTable rhs = (CompressedTable) other;
+      return compressedSize == rhs.compressedSize
+          && Objects.equals(meta, rhs.meta)
+          && Objects.equals(buffer, rhs.buffer);
+    }
+    return false;
+  }
+
+  /** Hash over the same three fields that {@code equals} compares. */
+  @Override
+  public int hashCode() {
+    return Objects.hash(compressedSize, meta, buffer);
+  }
+
+  /** Renders as {@code CompressedTable(compressedSize,meta,buffer)}. */
+  @Override
+  public String toString() {
+    return new StringBuilder("CompressedTable(").append(compressedSize)
+        .append(',').append(meta).append(',').append(buffer).append(')').toString();
+  }
+}
diff --git a/sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/DecimalUtil.java b/sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/DecimalUtil.java
new file mode 100644
index 00000000000..5dc43a5678c
--- /dev/null
+++ b/sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/DecimalUtil.java
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2021-2026, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.nvidia.spark.rapids;
+
+import ai.rapids.cudf.ColumnVector;
+import ai.rapids.cudf.ColumnView;
+import ai.rapids.cudf.DType;
+import ai.rapids.cudf.DecimalUtils;
+
+import org.apache.spark.sql.types.BooleanType;
+import org.apache.spark.sql.types.ByteType;
+import org.apache.spark.sql.types.DataType;
+import org.apache.spark.sql.types.DataTypes;
+import org.apache.spark.sql.types.Decimal;
+import org.apache.spark.sql.types.DecimalType;
+import org.apache.spark.sql.types.IntegerType;
+import org.apache.spark.sql.types.LongType;
+import org.apache.spark.sql.types.ShortType;
+
+import scala.Option;
+
+public final class DecimalUtil {
+  private static final DecimalType BOOLEAN_DECIMAL = DataTypes.createDecimalType(1, 0);
+
+  private DecimalUtil() {} // static helpers only; not instantiable
+
+  public static DType createCudfDecimal(DecimalType dt) { // same precision/scale, as a cudf DType
+    return DecimalUtils.createDecimalType(dt.precision(), dt.scale());
+  }
+
+  public static ColumnVector outOfBounds(ColumnView input, DecimalType to) { // delegates to cudf
+    return DecimalUtils.outOfBounds(input, to.precision(), to.scale());
+  }
+
+  /**
+   * Return the size in bytes of the fixed-width data types.
+   * WARNING: Do not use this method for variable-width data types.
+   */
+  public static int getDataTypeSize(DataType dt) {
+    if (dt instanceof DecimalType && ((DecimalType) dt).precision() <= Decimal.MAX_INT_DIGITS()) {
+      return 4; // overrides defaultSize() for decimals of at most MAX_INT_DIGITS precision
+    }
+    return dt.defaultSize();
+  }
+
+  public static Option<DecimalType> optionallyAsDecimalType(DataType t) { // empty if unsupported
+    if (t instanceof DecimalType) {
+      return Option.apply((DecimalType) t);
+    } else if (t instanceof ByteType) {
+      return decimalTypeFor(DType.INT8);
+    } else if (t instanceof ShortType) {
+      return decimalTypeFor(DType.INT16);
+    } else if (t instanceof IntegerType) {
+      return decimalTypeFor(DType.INT32);
+    } else if (t instanceof LongType) {
+      return decimalTypeFor(DType.INT64);
+    } else if (t instanceof BooleanType) {
+      return Option.apply(BOOLEAN_DECIMAL); // booleans map to decimal(1, 0)
+    }
+    return Option.empty();
+  }
+
+  public static DecimalType asDecimalType(DataType t) { // throws instead of returning empty
+    Option<DecimalType> dt = optionallyAsDecimalType(t); // typed so get() yields a DecimalType
+    if (dt.isDefined()) {
+      return dt.get();
+    }
+    throw new IllegalArgumentException(
+        "Internal Error: type " + t + " cannot automatically be cast to a supported DecimalType");
+  }
+
+  private static Option<DecimalType> decimalTypeFor(DType dtype) { // scale-0 decimal for an int
+    return Option.apply(DataTypes.createDecimalType(dtype.getPrecisionForInt(), 0));
+  }
+}
diff --git a/sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/DefaultThreadPoolConf.java b/sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/DefaultThreadPoolConf.java
new file mode 100644
index 00000000000..c42edeb8405
--- /dev/null
+++ b/sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/DefaultThreadPoolConf.java
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2024-2026, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.nvidia.spark.rapids;
+
+import java.util.Objects;
+
+/** Plain {@link ThreadPoolConf} implementation backed by two final fields. */
+public class DefaultThreadPoolConf implements ThreadPoolConf {
+  private static final long serialVersionUID = 1L;
+
+  private final int maxThreadNumber;
+  private final boolean stageLevelPool;
+
+  public DefaultThreadPoolConf(int maxThreadNumber, boolean stageLevelPool) {
+    this.maxThreadNumber = maxThreadNumber;
+    this.stageLevelPool = stageLevelPool;
+  }
+
+  @Override
+  public int maxThreadNumber() { return maxThreadNumber; }
+
+  @Override
+  public boolean stageLevelPool() { return stageLevelPool; }
+
+  /** Field-by-field comparison; only another {@code DefaultThreadPoolConf} can be equal. */
+  @Override
+  public boolean equals(Object other) {
+    if (other == this) {
+      return true;
+    }
+    if (other instanceof DefaultThreadPoolConf) {
+      DefaultThreadPoolConf rhs = (DefaultThreadPoolConf) other;
+      return rhs.maxThreadNumber == maxThreadNumber
+          && rhs.stageLevelPool == stageLevelPool;
+    }
+    return false;
+  }
+
+  /** Hash over the same two fields that {@code equals} compares. */
+  @Override
+  public int hashCode() {
+    return Objects.hash(maxThreadNumber, stageLevelPool);
+  }
+
+  @Override
+  public String toString() {
+    return new StringBuilder("DefaultThreadPoolConf(")
+        .append(maxThreadNumber).append(',').append(stageLevelPool).append(')').toString();
+  }
+}
diff --git a/sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/DeviceBuffersUtils.java b/sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/DeviceBuffersUtils.java
new file mode 100644
index 00000000000..0e855219887
--- /dev/null
+++ b/sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/DeviceBuffersUtils.java
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) 2024-2026, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.nvidia.spark.rapids;
+
+import ai.rapids.cudf.BaseDeviceMemoryBuffer;
+import ai.rapids.cudf.DeviceMemoryBuffer;
+
+public final class DeviceBuffersUtils {
+ private DeviceBuffersUtils() {} // static helpers only; not instantiable
+
+ public static BaseDeviceMemoryBuffer[] incRefCount(BaseDeviceMemoryBuffer[] bufs) {
+ BaseDeviceMemoryBuffer[] ret = new BaseDeviceMemoryBuffer[bufs.length];
+ int initialized = 0; // number of leading entries of ret that have been filled in
+ try {
+ for (BaseDeviceMemoryBuffer buf : bufs) {
+ buf.incRefCount(); // if this throws, buf is not recorded in ret
+ ret[initialized] = buf;
+ initialized++;
+ }
+ return ret; // same buffer objects as bufs, in the same order
+ } catch (Throwable t) {
+ closeAll(ret, initialized, t); // close only the buffers recorded so far
+ throw t;
+ }
+ }
+
+ public static DeviceMemoryBuffer[] allocateBuffers(long[] bufSizes) {
+ DeviceMemoryBuffer[] ret = new DeviceMemoryBuffer[bufSizes.length];
+ int initialized = 0; // number of slices created so far
+ try (DeviceMemoryBuffer singleBuf = DeviceMemoryBuffer.allocate(sum(bufSizes))) { // one allocation
+ long curPos = 0L; // offset of the next slice within singleBuf
+ for (long len : bufSizes) {
+ ret[initialized] = singleBuf.slice(curPos, len);
+ initialized++;
+ curPos += len;
+ }
+ return ret; // singleBuf is closed on exit; NOTE(review): slices presumably keep it alive — confirm
+ } catch (Throwable t) {
+ closeAll(ret, initialized, t); // close the slices created before the failure
+ throw t;
+ }
+ }
+
+ private static long sum(long[] values) { // plain long addition; overflow is not checked
+ long ret = 0L;
+ for (long value : values) {
+ ret += value;
+ }
+ return ret;
+ }
+
+ private static void closeAll(AutoCloseable[] values, int count, Throwable cause) {
+ for (int i = 0; i < count; i++) { // only the first count entries are visited
+ AutoCloseable value = values[i];
+ if (value != null) {
+ try {
+ value.close();
+ } catch (Throwable t) {
+ cause.addSuppressed(t); // keep closing the rest; report this failure on the original error
+ }
+ }
+ }
+ }
+}
diff --git a/sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/ExecutorCache.java b/sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/ExecutorCache.java
new file mode 100644
index 00000000000..d2dba3fef99
--- /dev/null
+++ b/sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/ExecutorCache.java
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2025-2026, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.nvidia.spark.rapids;
+
+import java.lang.management.ManagementFactory;
+
+import ai.rapids.cudf.Cuda;
+import ai.rapids.cudf.CudaComputeMode;
+
+/**
+ * Caches executor-related information. Each value lives in its own private holder class, so it
+ * is computed lazily on first access, matching the previous Scala object semantics.
+ */
+final class ExecutorCache {
+ private ExecutorCache() { // static accessors only; not instantiable
+ }
+
+ static CudaComputeMode getCurrentDeviceComputeMode() { // first call initializes the holder
+ return CurrentDeviceComputeModeHolder.VALUE;
+ }
+
+ static byte[] getCurrentDeviceUuid() { // returns the cached array itself, not a copy
+ return CurrentDeviceUuidHolder.VALUE;
+ }
+
+ static String getProcessName() { // first call initializes the holder
+ return ProcessNameHolder.VALUE;
+ }
+
+ private static final class CurrentDeviceComputeModeHolder { // initialized by the JVM on first use
+ private static final CudaComputeMode VALUE = Cuda.getComputeMode();
+ }
+
+ private static final class CurrentDeviceUuidHolder { // initialized by the JVM on first use
+ private static final byte[] VALUE = Cuda.getGpuUuid();
+ }
+
+ private static final class ProcessNameHolder { // initialized by the JVM on first use
+ private static final String VALUE = ManagementFactory.getRuntimeMXBean().getName();
+ }
+}
diff --git a/sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/FloatUtils.java b/sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/FloatUtils.java
new file mode 100644
index 00000000000..4dfce5f74d1
--- /dev/null
+++ b/sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/FloatUtils.java
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2020-2026, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.nvidia.spark.rapids;
+
+import ai.rapids.cudf.ColumnVector;
+import ai.rapids.cudf.ColumnView;
+import ai.rapids.cudf.DType;
+import ai.rapids.cudf.Scalar;
+
+public final class FloatUtils {
+ private FloatUtils() {} // static helpers only; not instantiable
+
+ public static ColumnVector nanToZero(ColumnView cv) { // caller owns the returned column
+ if (cv.getType() != DType.FLOAT32 && cv.getType() != DType.FLOAT64) {
+ throw new IllegalArgumentException("Only Floats and Doubles allowed");
+ }
+
+ try (ColumnVector isNan = cv.isNan(); // both temporaries are closed on exit
+ Scalar zero = cv.getType() == DType.FLOAT64
+ ? Scalar.fromDouble(0.0d)
+ : Scalar.fromFloat(0.0f)) { // zero scalar matches the input's float width
+ return isNan.ifElse(zero, cv); // zero where isNan is true, the original value elsewhere
+ }
+ }
+
+ public static Scalar getNanScalar(DType dType) { // only FLOAT64 and FLOAT32 are accepted
+ if (dType == DType.FLOAT64) {
+ return Scalar.fromDouble(Double.NaN);
+ } else if (dType == DType.FLOAT32) {
+ return Scalar.fromFloat(Float.NaN);
+ } else {
+ throw new IllegalArgumentException("NaNs are only supported for Float types");
+ }
+ }
+
+ public static Scalar getPositiveInfinityScalar(DType dType) {
+ if (dType == DType.FLOAT64) {
+ return Scalar.fromDouble(Double.POSITIVE_INFINITY);
+ } else { // every non-FLOAT64 dtype gets the float scalar; no type check, unlike getNanScalar
+ return Scalar.fromFloat(Float.POSITIVE_INFINITY);
+ }
+ }
+
+ public static Scalar getNegativeInfinityScalar(DType dType) {
+ if (dType == DType.FLOAT64) {
+ return Scalar.fromDouble(Double.NEGATIVE_INFINITY);
+ } else { // every non-FLOAT64 dtype gets the float scalar; no type check, unlike getNanScalar
+ return Scalar.fromFloat(Float.NEGATIVE_INFINITY);
+ }
+ }
+
+ public static ColumnVector getInfinityVector(DType dtype) { // two rows: +infinity, -infinity
+ if (dtype == DType.FLOAT64) {
+ return ColumnVector.fromDoubles(Double.POSITIVE_INFINITY, Double.NEGATIVE_INFINITY);
+ } else { // every non-FLOAT64 dtype gets the float column
+ return ColumnVector.fromFloats(Float.POSITIVE_INFINITY, Float.NEGATIVE_INFINITY);
+ }
+ }
+
+ public static ColumnVector infinityToNulls(ColumnVector vec) { // caller owns the returned column
+ try (ColumnVector infinityVector = getInfinityVector(vec.getType()); // values to look for
+ ColumnVector nullVector = getNullVector(vec.getType())) { // one null replacement per value
+ return vec.findAndReplaceAll(infinityVector, nullVector);
+ }
+ }
+
+ private static ColumnVector getNullVector(DType dtype) { // two null rows, same length as getInfinityVector
+ if (dtype == DType.FLOAT64) {
+ return ColumnVector.fromBoxedDoubles((Double) null, (Double) null);
+ } else { // every non-FLOAT64 dtype gets the float column
+ return ColumnVector.fromBoxedFloats((Float) null, (Float) null);
+ }
+ }
+}
diff --git a/sql-plugin/src/main/java/com/nvidia/spark/rapids/GpuColumnVectorBase.java b/sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/GpuColumnVectorBase.java
similarity index 98%
rename from sql-plugin/src/main/java/com/nvidia/spark/rapids/GpuColumnVectorBase.java
rename to sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/GpuColumnVectorBase.java
index 5d707cd9e12..8a0dc00a338 100644
--- a/sql-plugin/src/main/java/com/nvidia/spark/rapids/GpuColumnVectorBase.java
+++ b/sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/GpuColumnVectorBase.java
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2020-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2026, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
diff --git a/sql-plugin/src/main/java/com/nvidia/spark/rapids/GpuCompressedColumnVector.java b/sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/GpuCompressedColumnVector.java
similarity index 90%
rename from sql-plugin/src/main/java/com/nvidia/spark/rapids/GpuCompressedColumnVector.java
rename to sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/GpuCompressedColumnVector.java
index 1dc85cb2031..81abeb11eea 100644
--- a/sql-plugin/src/main/java/com/nvidia/spark/rapids/GpuCompressedColumnVector.java
+++ b/sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/GpuCompressedColumnVector.java
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2020-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2026, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -35,14 +35,6 @@ public final class GpuCompressedColumnVector extends GpuColumnVectorBase
private final DeviceMemoryBuffer buffer;
private final TableMeta tableMeta;
- /**
- * Build a columnar batch from a compressed table.
- * NOTE: The data remains compressed and cannot be accessed directly from the columnar batch.
- */
- public static ColumnarBatch from(CompressedTable compressedTable) {
- return from(compressedTable.buffer(), compressedTable.meta());
- }
-
public static boolean isBatchCompressed(ColumnarBatch batch) {
return batch.numCols() == 1 && batch.column(0) instanceof GpuCompressedColumnVector;
}
diff --git a/sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/GpuListUtils.java b/sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/GpuListUtils.java
new file mode 100644
index 00000000000..a448cdaf7ba
--- /dev/null
+++ b/sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/GpuListUtils.java
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2021-2026, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.nvidia.spark.rapids;
+
+import java.util.Optional;
+
+import ai.rapids.cudf.ColumnView;
+import ai.rapids.cudf.DType;
+import ai.rapids.cudf.BaseDeviceMemoryBuffer;
+
+/** Helpers for common manipulations of cudf LIST (array) columns. */
+public final class GpuListUtils {
+  // Static utility holder; never instantiated.
+  private GpuListUtils() {}
+
+  /**
+   * Builds a LIST view that reuses the offsets and validity of {@code listCol} but points at
+   * {@code newDataCol} as its child data. Only a view is produced, so the caller must keep
+   * both {@code listCol} and {@code newDataCol} alive for as long as the returned view is used.
+   *
+   * @param listCol the list column whose offsets, validity, row count and null count are reused
+   * @param newDataCol the replacement child data column
+   * @return a new LIST ColumnView over {@code newDataCol}
+   * @throws IllegalArgumentException if {@code newDataCol} has a different row count than the
+   *     current child of {@code listCol}
+   */
+  public static ColumnView replaceListDataColumnAsView(
+      ColumnView listCol, ColumnView newDataCol) {
+    assert DType.LIST.equals(listCol.getType());
+    // The existing child view is only needed long enough to read its row count.
+    final long currentDataRows;
+    try (ColumnView currentData = listCol.getChildColumnView(0)) {
+      currentDataRows = currentData.getRowCount();
+    }
+    if (currentDataRows != newDataCol.getRowCount()) {
+      throw new IllegalArgumentException("Mismatch in the number of rows in the data columns");
+    }
+    try (BaseDeviceMemoryBuffer listOffsets = listCol.getOffsets();
+         BaseDeviceMemoryBuffer listValidity = listCol.getValid()) {
+      final ColumnView[] children = { newDataCol };
+      return new ColumnView(DType.LIST, listCol.getRowCount(),
+          Optional.of(listCol.getNullCount()), listValidity, listOffsets, children);
+    }
+  }
+}
diff --git a/sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/GpuOrcTimezoneUtils.java b/sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/GpuOrcTimezoneUtils.java
new file mode 100644
index 00000000000..34cc1ac221f
--- /dev/null
+++ b/sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/GpuOrcTimezoneUtils.java
@@ -0,0 +1,199 @@
+/*
+ * Copyright (c) 2025-2026, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.nvidia.spark.rapids;
+
+import java.time.LocalDateTime;
+import java.time.ZoneId;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Optional;
+
+import ai.rapids.cudf.ColumnVector;
+import ai.rapids.cudf.ColumnView;
+import ai.rapids.cudf.DType;
+import ai.rapids.cudf.Scalar;
+import ai.rapids.cudf.Table;
+
+/**
+ * Shifts TIMESTAMP_MICROSECONDS columns of a cudf {@link Table} by a fixed offset derived from
+ * the JVM default timezone, recursing into nested LIST and STRUCT columns.
+ */
+public final class GpuOrcTimezoneUtils {
+  private static final ZoneId UTC = ZoneId.of("UTC");
+
+  private GpuOrcTimezoneUtils() {
+  }
+
+  /**
+   * Get the offset in microseconds for 2015-01-01 between JVM timezone and UTC timezone.
+   *
+   * @param jvmTz the JVM timezone to calculate the offset for
+   * @return the offset in microseconds between the JVM timezone and UTC timezone
+   */
+  private static long getOffsetForJanuaryFirst2015(ZoneId jvmTz) {
+    long t1 = LocalDateTime.of(2015, 1, 1, 0, 0, 0).atZone(jvmTz).toInstant()
+        .getEpochSecond();
+    long t2 = LocalDateTime.of(2015, 1, 1, 0, 0, 0).atZone(UTC).toInstant()
+        .getEpochSecond();
+    return (t2 - t1) * 1000000L;
+  }
+
+  /**
+   * Records {@code view} in {@code toClose} and returns it, so a resource can be registered for
+   * cleanup in the same expression that creates it.
+   */
+  private static <T extends ColumnView> T addToClose(List<ColumnView> toClose, T view) {
+    toClose.add(view);
+    return view;
+  }
+
+  /**
+   * Recursively rebase timestamp columns in an input column view to the target timezone.
+   * This handles nested list and struct types.
+   *
+   * @param col the column to inspect; returned unchanged when nothing in it needs rebasing
+   * @param toClose collects intermediate views created while recursing; the caller closes them
+   * @param diffMicros the number of microseconds subtracted from every timestamp value
+   * @return {@code col} itself, or a newly created view/vector that the caller must close
+   */
+  private static ColumnView rebaseTimestampRecursively(
+      ColumnView col,
+      List<ColumnView> toClose,
+      long diffMicros) {
+    DType dType = col.getType();
+    if (dType.hasTimeResolution()) {
+      assert dType.equals(DType.TIMESTAMP_MICROSECONDS) :
+          "Only TIMESTAMP_MICROSECONDS is supported, but got " + dType;
+
+      try (ColumnView longs = col.bitCastTo(DType.INT64);
+           Scalar offsetScalar = Scalar.fromLong(diffMicros);
+           ColumnVector rebased = longs.sub(offsetScalar)) {
+        return rebased.castTo(DType.TIMESTAMP_MICROSECONDS);
+      }
+    } else if (DType.LIST.equals(dType)) {
+      ColumnView child = addToClose(toClose, col.getChildColumnView(0));
+      ColumnView newChild = rebaseTimestampRecursively(child, toClose, diffMicros);
+      if (newChild != child) {
+        return col.replaceListChild(addToClose(toClose, newChild));
+      }
+      return col;
+    } else if (DType.STRUCT.equals(dType)) {
+      // NOTE: a STRUCT is always rebuilt as a new view, even when no child changed.
+      ColumnView[] newViews = new ColumnView[col.getNumChildren()];
+      for (int i = 0; i < newViews.length; i++) {
+        ColumnView child = addToClose(toClose, col.getChildColumnView(i));
+        ColumnView newChild = rebaseTimestampRecursively(child, toClose, diffMicros);
+        if (newChild != child) {
+          addToClose(toClose, newChild);
+        }
+        newViews[i] = newChild;
+      }
+      return new ColumnView(col.getType(), col.getRowCount(), Optional.of(col.getNullCount()),
+          col.getValid(), col.getOffsets(), newViews);
+    }
+    return col;
+  }
+
+  /**
+   * Rebase timestamp columns in the input table to the system default timezone. If the system's
+   * default timezone is UTC, this returns the input table as-is. Otherwise the input table is
+   * closed before returning.
+   *
+   * @param input the input table
+   * @return a table with timestamp columns rebased
+   */
+  public static Table rebaseTimeZone(Table input) {
+    ZoneId toZoneId = ZoneId.systemDefault();
+
+    if (UTC.equals(toZoneId)) {
+      return input;
+    }
+
+    long diffMicros = getOffsetForJanuaryFirst2015(toZoneId);
+    try (Table ignored = input) {
+      ColumnVector[] newColumns = new ColumnVector[input.getNumberOfColumns()];
+      try {
+        for (int colIdx = 0; colIdx < newColumns.length; colIdx++) {
+          ColumnVector col = input.getColumn(colIdx);
+          List<ColumnView> toClose = new ArrayList<>();
+          try {
+            ColumnView rebased = rebaseTimestampRecursively(col, toClose, diffMicros);
+            if (col == rebased) {
+              newColumns[colIdx] = col.incRefCount();
+            } else {
+              toClose.add(rebased);
+              newColumns[colIdx] = rebased.copyToColumnVector();
+            }
+          } finally {
+            closeAll(toClose);
+          }
+        }
+        return new Table(newColumns);
+      } finally {
+        // Runs on success and failure alike; presumably the new Table holds its own references.
+        closeAll(newColumns);
+      }
+    }
+  }
+
+  /**
+   * Closes every non-null view in the array. All views are attempted even if one close fails;
+   * the first failure is rethrown with later failures attached as suppressed exceptions.
+   */
+  private static void closeAll(ColumnView[] views) {
+    RuntimeException firstException = null;
+    for (ColumnView view : views) {
+      if (view == null) {
+        continue;
+      }
+      try {
+        view.close();
+      } catch (RuntimeException e) {
+        if (firstException == null) {
+          firstException = e;
+        } else {
+          firstException.addSuppressed(e);
+        }
+      }
+    }
+    if (firstException != null) {
+      throw firstException;
+    }
+  }
+
+  /**
+   * Closes every view in the list. All views are attempted even if one close fails; the first
+   * failure is rethrown with later failures attached as suppressed exceptions.
+   */
+  private static void closeAll(List<ColumnView> views) {
+    RuntimeException firstException = null;
+    for (ColumnView view : views) {
+      try {
+        view.close();
+      } catch (RuntimeException e) {
+        if (firstException == null) {
+          firstException = e;
+        } else {
+          firstException.addSuppressed(e);
+        }
+      }
+    }
+    if (firstException != null) {
+      throw firstException;
+    }
+  }
+}
diff --git a/sql-plugin/src/main/java/com/nvidia/spark/rapids/GpuPackedTableColumn.java b/sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/GpuPackedTableColumn.java
similarity index 98%
rename from sql-plugin/src/main/java/com/nvidia/spark/rapids/GpuPackedTableColumn.java
rename to sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/GpuPackedTableColumn.java
index 7c0b1a5a517..90ca8a5a908 100644
--- a/sql-plugin/src/main/java/com/nvidia/spark/rapids/GpuPackedTableColumn.java
+++ b/sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/GpuPackedTableColumn.java
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2026, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
diff --git a/sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/HostAllocResult.java b/sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/HostAllocResult.java
new file mode 100644
index 00000000000..8cb8cf9f0a0
--- /dev/null
+++ b/sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/HostAllocResult.java
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2023-2026, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.nvidia.spark.rapids;
+
+import java.util.Objects;
+
+import ai.rapids.cudf.HostMemoryBuffer;
+
+/** Immutable pair of a host memory buffer and its {@code isPinned} flag. */
+public class HostAllocResult {
+  public final HostMemoryBuffer buffer;
+  public final boolean isPinned;
+
+  public HostAllocResult(HostMemoryBuffer buffer, boolean isPinned) {
+    this.isPinned = isPinned;
+    this.buffer = buffer;
+  }
+
+  /** Accessor form of the public {@link #buffer} field. */
+  public HostMemoryBuffer buffer() {
+    return this.buffer;
+  }
+
+  /** Accessor form of the public {@link #isPinned} field. */
+  public boolean isPinned() {
+    return this.isPinned;
+  }
+
+  /** Two results are equal when both the flag and the buffer (via {@code equals}) match. */
+  @Override
+  public boolean equals(Object other) {
+    if (other == this) {
+      return true;
+    }
+    if (other instanceof HostAllocResult) {
+      HostAllocResult rhs = (HostAllocResult) other;
+      return rhs.isPinned == isPinned && Objects.equals(buffer, rhs.buffer);
+    }
+    return false;
+  }
+
+  @Override
+  public int hashCode() {
+    return Objects.hash(buffer, isPinned);
+  }
+
+  @Override
+  public String toString() {
+    StringBuilder text = new StringBuilder("HostAllocResult(");
+    text.append(buffer).append(",").append(isPinned).append(")");
+    return text.toString();
+  }
+}
diff --git a/sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/HostByteBufferIterator.java b/sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/HostByteBufferIterator.java
new file mode 100644
index 00000000000..ebd5a0d9997
--- /dev/null
+++ b/sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/HostByteBufferIterator.java
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2023-2026, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.nvidia.spark.rapids;
+
+import java.nio.ByteBuffer;
+
+import ai.rapids.cudf.HostMemoryBuffer;
+
+/**
+ * Iterator that walks a host buffer as a sequence of ByteBuffer instances, sidestepping the 2GB
+ * ByteBuffer size limitation so that the whole address range of a >2GB host buffer can be
+ * covered.
+ *
+ * NOTE: The host buffer's reference count is NOT incremented here, so the caller must make sure
+ * the buffer stays valid for as long as this iterator is in use.
+ */
+public class HostByteBufferIterator extends AbstractHostByteBufferIterator {
+  private final HostMemoryBuffer hostBuffer;
+  private final long totalLength;
+
+  /** @param hostBuffer the buffer to expose; {@code null} is treated as a zero-length buffer */
+  public HostByteBufferIterator(HostMemoryBuffer hostBuffer) {
+    this.hostBuffer = hostBuffer;
+    if (hostBuffer != null) {
+      this.totalLength = hostBuffer.getLength();
+    } else {
+      this.totalLength = 0;
+    }
+  }
+
+  /** Upper bound reported for a single chunk: {@link Integer#MAX_VALUE}. */
+  @Override
+  public long limit() {
+    return Integer.MAX_VALUE;
+  }
+
+  @Override
+  public long totalLength() {
+    return this.totalLength;
+  }
+
+  /** Wraps {@code length} bytes starting at {@code offset}; {@code length} is narrowed to int. */
+  @Override
+  public ByteBuffer getByteBuffer(long offset, long length) {
+    final int chunkSize = (int) length;
+    return hostBuffer.asByteBuffer(offset, chunkSize);
+  }
+}
diff --git a/sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/HostMemoryInputStream.java b/sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/HostMemoryInputStream.java
new file mode 100644
index 00000000000..81c91ce3740
--- /dev/null
+++ b/sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/HostMemoryInputStream.java
@@ -0,0 +1,137 @@
+/*
+ * Copyright (c) 2019-2026, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.nvidia.spark.rapids;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.ByteBuffer;
+
+import ai.rapids.cudf.HostMemoryBuffer;
+
+/**
+ * An implementation of InputStream that reads from a HostMemoryBuffer.
+ *
+ * NOTE: Closing this input stream does NOT close the buffer!
+ */
+public class HostMemoryInputStream extends InputStream {
+  public final HostMemoryBuffer hmb;
+  public final long hmbLength;
+
+  /** Offset of the next byte to read. */
+  protected long pos = 0;
+  /** Position saved by {@link #mark(int)}; -1 until mark is called for the first time. */
+  protected long mark = -1;
+
+  /**
+   * @param hmb the buffer to read from
+   * @param hmbLength the number of bytes of {@code hmb} that this stream exposes
+   */
+  public HostMemoryInputStream(HostMemoryBuffer hmb, long hmbLength) {
+    this.hmb = hmb;
+    this.hmbLength = hmbLength;
+  }
+
+  public HostMemoryBuffer hmb() {
+    return hmb;
+  }
+
+  public long hmbLength() {
+    return hmbLength;
+  }
+
+  /** @return the next byte as a value in 0-255, or -1 once {@code hmbLength} is reached */
+  @Override
+  public int read() {
+    if (pos >= hmbLength) {
+      return -1;
+    }
+    byte result = hmb.getByte(pos);
+    pos += 1;
+    // Java bytes are signed, so mask off the upper bits to avoid returning negative EOF values.
+    return result & 0xFF;
+  }
+
+  /**
+   * Copies up to {@code length} bytes into {@code buffer} starting at {@code offset}.
+   *
+   * @return the number of bytes copied, or -1 if the stream is already at its end
+   */
+  @Override
+  public int read(byte[] buffer, int offset, int length) {
+    if (pos >= hmbLength) {
+      return -1;
+    }
+    int numBytes = Math.min(available(), length);
+    hmb.getBytes(buffer, offset, pos, numBytes);
+    pos += numBytes;
+    return numBytes;
+  }
+
+  /**
+   * Returns a ByteBuffer over the next {@code length} bytes and advances past them. No check
+   * against {@code hmbLength} is made here.
+   */
+  public ByteBuffer readByteBuffer(int length) {
+    ByteBuffer byteBuffer = hmb.asByteBuffer(pos, length);
+    pos += length;
+    return byteBuffer;
+  }
+
+  /** Moves the position by {@code count}, clamped at {@code hmbLength}; returns the change. */
+  @Override
+  public long skip(long count) {
+    long oldPos = pos;
+    pos = Math.min(pos + count, hmbLength);
+    return pos - oldPos;
+  }
+
+  /** @return the remaining byte count, capped at {@link Integer#MAX_VALUE} */
+  @Override
+  public int available() {
+    return (int) Math.min(hmbLength - pos, Integer.MAX_VALUE);
+  }
+
+  /** Remembers the current position; the read-limit argument is ignored. */
+  @Override
+  public void mark(int ignored) {
+    mark = pos;
+  }
+
+  /**
+   * Returns to the position saved by the last {@link #mark(int)} call.
+   *
+   * @throws IOException if mark has never been called
+   */
+  @Override
+  public void reset() throws IOException {
+    // Position 0 is a legitimate mark (e.g. marking a fresh stream); only the initial -1
+    // sentinel means that mark was never called.
+    if (mark < 0) {
+      throw new IOException("reset called before mark");
+    }
+    pos = mark;
+  }
+
+  @Override
+  public boolean markSupported() {
+    return true;
+  }
+
+  public long getPos() {
+    return pos;
+  }
+}
diff --git a/sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/HostMemoryOutputStream.java b/sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/HostMemoryOutputStream.java
new file mode 100644
index 00000000000..25a95874e96
--- /dev/null
+++ b/sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/HostMemoryOutputStream.java
@@ -0,0 +1,116 @@
+/*
+ * Copyright (c) 2019-2026, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.nvidia.spark.rapids;
+
+import java.io.EOFException;
+import java.io.IOException;
+import java.io.OutputStream;
+import java.nio.ByteBuffer;
+import java.nio.channels.ReadableByteChannel;
+
+import ai.rapids.cudf.HostMemoryBuffer;
+
+/**
+ * An implementation of OutputStream that writes to a HostMemoryBuffer.
+ *
+ * NOTE: Closing this output stream does NOT close the buffer!
+ */
+public class HostMemoryOutputStream extends OutputStream {
+  public final HostMemoryBuffer buffer;
+  /** Offset in {@link #buffer} where the next write lands. */
+  protected long pos = 0;
+
+  public HostMemoryOutputStream(HostMemoryBuffer buffer) {
+    this.buffer = buffer;
+  }
+
+  public HostMemoryBuffer buffer() {
+    return this.buffer;
+  }
+
+  /** Stores the low-order byte of {@code i} at the current position. */
+  @Override
+  public void write(int i) {
+    final byte value = (byte) i;
+    buffer.setByte(pos, value);
+    pos++;
+  }
+
+  @Override
+  public void write(byte[] bytes) {
+    final int count = bytes.length;
+    buffer.setBytes(pos, bytes, 0, count);
+    pos += count;
+  }
+
+  @Override
+  public void write(byte[] bytes, int offset, int len) {
+    buffer.setBytes(pos, bytes, offset, len);
+    pos = pos + len;
+  }
+
+  /** Copies all remaining bytes of {@code data} into the buffer at the current position. */
+  public void write(ByteBuffer data) {
+    final int remaining = data.remaining();
+    buffer.asByteBuffer(pos, remaining).put(data);
+    pos += remaining;
+  }
+
+  /**
+   * Hands out a ByteBuffer over the next {@code length} bytes of the buffer and moves the
+   * position past them, leaving it to the caller to fill that region.
+   */
+  public ByteBuffer writeAsByteBuffer(int length) {
+    final ByteBuffer region = buffer.asByteBuffer(pos, length);
+    pos += length;
+    return region;
+  }
+
+  public long getPos() {
+    return pos;
+  }
+
+  /** Repositions the stream; {@code newPos} is not validated. */
+  public void seek(long newPos) {
+    this.pos = newPos;
+  }
+
+  /**
+   * Reads {@code length} bytes from {@code channel} into the buffer, starting at the current
+   * position.
+   *
+   * @throws AssertionError if the copy would run past the end of the buffer
+   * @throws EOFException if the channel ends before {@code length} bytes were read
+   */
+  public void copyFromChannel(ReadableByteChannel channel, long length) throws IOException {
+    final long endPos = pos + length;
+    if (buffer.getLength() < endPos) {
+      throw new AssertionError();
+    }
+    // A single ByteBuffer cannot exceed Integer.MAX_VALUE bytes, so copy in bounded slices.
+    while (pos != endPos) {
+      final int sliceSize = (int) Math.min(endPos - pos, Integer.MAX_VALUE);
+      final ByteBuffer slice = buffer.asByteBuffer(pos, sliceSize);
+      while (slice.hasRemaining()) {
+        if (channel.read(slice) < 0) {
+          throw new EOFException("Unexpected EOF while reading from byte channel");
+        }
+      }
+      pos += sliceSize;
+    }
+  }
+}
diff --git a/sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/MemoryBoundedPoolConf.java b/sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/MemoryBoundedPoolConf.java
new file mode 100644
index 00000000000..781d7949132
--- /dev/null
+++ b/sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/MemoryBoundedPoolConf.java
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) 2024-2026, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.nvidia.spark.rapids;
+
+import java.util.Objects;
+
+/** Immutable {@link ThreadPoolConf} value that adds a memory capacity and a wait timeout. */
+public class MemoryBoundedPoolConf implements ThreadPoolConf {
+  private static final long serialVersionUID = 1L;
+
+  private final int maxThreadNumber;
+  private final boolean stageLevelPool;
+  private final long memoryCapacity;
+  private final long waitMemTimeoutMs;
+
+  public MemoryBoundedPoolConf(int maxThreadNumber, boolean stageLevelPool,
+      long memoryCapacity, long waitMemTimeoutMs) {
+    this.waitMemTimeoutMs = waitMemTimeoutMs;
+    this.memoryCapacity = memoryCapacity;
+    this.stageLevelPool = stageLevelPool;
+    this.maxThreadNumber = maxThreadNumber;
+  }
+
+  @Override
+  public int maxThreadNumber() {
+    return this.maxThreadNumber;
+  }
+
+  @Override
+  public boolean stageLevelPool() {
+    return this.stageLevelPool;
+  }
+
+  public long memoryCapacity() {
+    return this.memoryCapacity;
+  }
+
+  public long waitMemTimeoutMs() {
+    return this.waitMemTimeoutMs;
+  }
+
+  /** Field-by-field equality against another {@code MemoryBoundedPoolConf}. */
+  @Override
+  public boolean equals(Object other) {
+    if (other == this) {
+      return true;
+    }
+    if (other instanceof MemoryBoundedPoolConf) {
+      MemoryBoundedPoolConf rhs = (MemoryBoundedPoolConf) other;
+      return rhs.maxThreadNumber == maxThreadNumber
+          && rhs.stageLevelPool == stageLevelPool
+          && rhs.memoryCapacity == memoryCapacity
+          && rhs.waitMemTimeoutMs == waitMemTimeoutMs;
+    }
+    return false;
+  }
+
+  @Override
+  public int hashCode() {
+    return Objects.hash(maxThreadNumber, stageLevelPool, memoryCapacity, waitMemTimeoutMs);
+  }
+
+  @Override
+  public String toString() {
+    StringBuilder text = new StringBuilder("MemoryBoundedPoolConf(");
+    text.append(maxThreadNumber).append(",").append(stageLevelPool).append(",");
+    text.append(memoryCapacity).append(",").append(waitMemTimeoutMs).append(")");
+    return text.toString();
+  }
+}
diff --git a/sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/MemoryBufferToHostByteBufferIterator.java b/sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/MemoryBufferToHostByteBufferIterator.java
new file mode 100644
index 00000000000..3c6693767c2
--- /dev/null
+++ b/sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/MemoryBufferToHostByteBufferIterator.java
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2023-2026, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.nvidia.spark.rapids;
+
+import java.nio.ByteBuffer;
+
+import ai.rapids.cudf.Cuda;
+import ai.rapids.cudf.HostMemoryBuffer;
+import ai.rapids.cudf.MemoryBuffer;
+
+/**
+ * Iterator that emits ByteBuffer instances sequentially, working around the 2GB ByteBuffer size
+ * limitation by copying a MemoryBuffer piecewise into a host-backed bounce buffer.
+ *
+ * NOTE: The reference count of memoryBuffer is NOT incremented here, so the caller must make
+ * sure this iterator does not outlive memoryBuffer.
+ */
+public class MemoryBufferToHostByteBufferIterator extends AbstractHostByteBufferIterator {
+  private final MemoryBuffer memoryBuffer;
+  private final HostMemoryBuffer bounceBuffer;
+  private final Cuda.Stream stream;
+  private final long totalLength;
+  private final long limit;
+
+  /**
+   * @param memoryBuffer the source buffer; {@code null} is treated as zero-length
+   * @param bounceBuffer host staging buffer whose length caps the size of each chunk
+   * @param stream the stream on which the copies are issued
+   */
+  public MemoryBufferToHostByteBufferIterator(
+      MemoryBuffer memoryBuffer,
+      HostMemoryBuffer bounceBuffer,
+      Cuda.Stream stream) {
+    this.memoryBuffer = memoryBuffer;
+    this.bounceBuffer = bounceBuffer;
+    this.stream = stream;
+    if (memoryBuffer != null) {
+      this.totalLength = memoryBuffer.getLength();
+    } else {
+      this.totalLength = 0;
+    }
+    this.limit = Math.min(bounceBuffer.getLength(), Integer.MAX_VALUE);
+  }
+
+  @Override
+  public long totalLength() {
+    return this.totalLength;
+  }
+
+  @Override
+  public long limit() {
+    return this.limit;
+  }
+
+  /**
+   * Copies {@code length} bytes at {@code offset} of the source into the start of the bounce
+   * buffer, waits for the stream, and returns a ByteBuffer over the copied bytes. Every call
+   * writes to offset 0 of the same bounce buffer.
+   */
+  @Override
+  public ByteBuffer getByteBuffer(long offset, long length) {
+    bounceBuffer.copyFromMemoryBufferAsync(0, memoryBuffer, offset, length, stream);
+    stream.sync();
+    final int chunkSize = (int) length;
+    return bounceBuffer.asByteBuffer(0, chunkSize);
+  }
+}
diff --git a/sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/MetricsBatchIterator.java b/sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/MetricsBatchIterator.java
new file mode 100644
index 00000000000..7795165e5fa
--- /dev/null
+++ b/sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/MetricsBatchIterator.java
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2022-2026, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.nvidia.spark.rapids;
+
+import scala.collection.Iterator;
+
+import org.apache.spark.TaskContext;
+import org.apache.spark.executor.InputMetrics;
+import org.apache.spark.sql.vectorized.ColumnarBatch;
+
+/**
+ * Wraps an iterator of batches and adds each returned batch's row count to the input metrics
+ * obtained from {@code TaskContext.get()}, which the constructor dereferences immediately.
+ */
+public class MetricsBatchIterator implements Iterator<ColumnarBatch> {
+  private final Iterator<ColumnarBatch> iter;
+  private final InputMetrics inputMetrics;
+
+  /** @param iter the underlying batch iterator to delegate to */
+  public MetricsBatchIterator(Iterator<ColumnarBatch> iter) {
+    this.iter = iter;
+    this.inputMetrics = TaskContext.get().taskMetrics().inputMetrics();
+  }
+
+  @Override
+  public boolean hasNext() {
+    return iter.hasNext();
+  }
+
+  /** Returns the next batch after recording its row count as records read. */
+  @Override
+  public ColumnarBatch next() {
+    ColumnarBatch batch = iter.next();
+    inputMetrics.incRecordsRead(batch.numRows());
+    return batch;
+  }
+}
diff --git a/sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/MutableBlockInfo.java b/sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/MutableBlockInfo.java
new file mode 100644
index 00000000000..c2ed469f14a
--- /dev/null
+++ b/sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/MutableBlockInfo.java
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2022-2026, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.nvidia.spark.rapids;
+
+import java.io.Serializable;
+import java.util.Objects;
+
+/** Mutable Avro block metadata for iterator reuse. */
+public final class MutableBlockInfo implements Serializable {
+  private static final long serialVersionUID = 1L;
+
+  private long blockSize;
+  private long dataSize;
+  private long count;
+
+  public MutableBlockInfo(long blockSize, long dataSize, long count) {
+    this.count = count;
+    this.dataSize = dataSize;
+    this.blockSize = blockSize;
+  }
+
+  public long blockSize() {
+    return this.blockSize;
+  }
+
+  public void setBlockSize(long blockSize) {
+    this.blockSize = blockSize;
+  }
+
+  public long dataSize() {
+    return this.dataSize;
+  }
+
+  public void setDataSize(long dataSize) {
+    this.dataSize = dataSize;
+  }
+
+  public long count() {
+    return this.count;
+  }
+
+  public void setCount(long count) {
+    this.count = count;
+  }
+
+  /**
+   * Compares the three values as they are at the time of the call; since they are mutable, the
+   * result (and {@link #hashCode()}) can change after a setter is invoked.
+   */
+  @Override
+  public boolean equals(Object obj) {
+    if (obj == this) {
+      return true;
+    }
+    if (obj instanceof MutableBlockInfo) {
+      MutableBlockInfo rhs = (MutableBlockInfo) obj;
+      return rhs.blockSize == blockSize && rhs.dataSize == dataSize && rhs.count == count;
+    }
+    return false;
+  }
+
+  @Override
+  public int hashCode() {
+    return Objects.hash(blockSize, dataSize, count);
+  }
+
+  @Override
+  public String toString() {
+    StringBuilder text = new StringBuilder("MutableBlockInfo(");
+    text.append(blockSize).append(",").append(dataSize).append(",").append(count).append(")");
+    return text.toString();
+  }
+}
diff --git a/sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/NullHostMemoryOutputStream.java b/sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/NullHostMemoryOutputStream.java
new file mode 100644
index 00000000000..09144654539
--- /dev/null
+++ b/sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/NullHostMemoryOutputStream.java
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2019-2026, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.nvidia.spark.rapids;
+
+import java.nio.ByteBuffer;
+import java.nio.channels.ReadableByteChannel;
+
+/**
+ * A HostMemoryOutputStream that only counts the written bytes; nothing is actually written.
+ * The backing buffer is {@code null}, so every inherited method that touches the buffer must be
+ * overridden here. {@code writeAsByteBuffer} is not overridden and cannot be used on this class.
+ */
+public final class NullHostMemoryOutputStream extends HostMemoryOutputStream {
+  public NullHostMemoryOutputStream() {
+    super(null);
+  }
+
+  @Override
+  public void write(int i) {
+    pos += 1;
+  }
+
+  @Override
+  public void write(byte[] bytes) {
+    pos += bytes.length;
+  }
+
+  @Override
+  public void write(byte[] bytes, int offset, int len) {
+    pos += len;
+  }
+
+  /**
+   * Counts the remaining bytes of {@code data} without copying them; the position of
+   * {@code data} is left untouched. Without this override the inherited implementation would
+   * dereference the null backing buffer.
+   */
+  @Override
+  public void write(ByteBuffer data) {
+    pos += data.remaining();
+  }
+
+  /** Advances the position by {@code length}; {@code channel} is never read. */
+  @Override
+  public void copyFromChannel(ReadableByteChannel channel, long length) {
+    pos += length;
+  }
+}
diff --git a/sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/PartitionRowData.java b/sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/PartitionRowData.java
new file mode 100644
index 00000000000..c5ae84db178
--- /dev/null
+++ b/sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/PartitionRowData.java
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2023-2026, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.nvidia.spark.rapids;
+
+import java.util.Objects;
+
+import org.apache.spark.sql.catalyst.InternalRow;
+
+/** Partition value and replication count. */
+public final class PartitionRowData {
+  private final InternalRow rowValue;
+  private final int rowNum;
+
+  public PartitionRowData(InternalRow rowValue, int rowNum) {
+    this.rowNum = rowNum;
+    this.rowValue = rowValue;
+  }
+
+  public InternalRow rowValue() {
+    return this.rowValue;
+  }
+
+  public int rowNum() {
+    return this.rowNum;
+  }
+
+  /**
+   * Pairs up the two arrays element by element. When the arrays differ in length, the extra
+   * trailing entries of the longer one are ignored.
+   */
+  public static PartitionRowData[] from(InternalRow[] rowValues, int[] rowNums) {
+    final PartitionRowData[] paired =
+        new PartitionRowData[Math.min(rowValues.length, rowNums.length)];
+    int idx = 0;
+    while (idx < paired.length) {
+      paired[idx] = new PartitionRowData(rowValues[idx], rowNums[idx]);
+      idx++;
+    }
+    return paired;
+  }
+
+  /**
+   * Same pairing as the {@code int[]} overload, narrowing each count to an int.
+   *
+   * @throws IllegalArgumentException if a paired count is larger than {@link Integer#MAX_VALUE}
+   */
+  public static PartitionRowData[] from(InternalRow[] rowValues, long[] rowNums) {
+    final PartitionRowData[] paired =
+        new PartitionRowData[Math.min(rowValues.length, rowNums.length)];
+    for (int idx = 0; idx < paired.length; idx++) {
+      final long count = rowNums[idx];
+      if (count > Integer.MAX_VALUE) {
+        throw new IllegalArgumentException(
+            "Row number " + count + " exceeds max value of an integer.");
+      }
+      paired[idx] = new PartitionRowData(rowValues[idx], (int) count);
+    }
+    return paired;
+  }
+
+  @Override
+  public boolean equals(Object obj) {
+    if (obj == this) {
+      return true;
+    }
+    if (obj instanceof PartitionRowData) {
+      PartitionRowData rhs = (PartitionRowData) obj;
+      return rhs.rowNum == rowNum && Objects.equals(rowValue, rhs.rowValue);
+    }
+    return false;
+  }
+
+  @Override
+  public int hashCode() {
+    return Objects.hash(rowValue, rowNum);
+  }
+
+  @Override
+  public String toString() {
+    StringBuilder text = new StringBuilder("PartitionRowData(");
+    text.append(rowValue).append(",").append(rowNum).append(")");
+    return text.toString();
+  }
+}
diff --git a/sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/ProfileEndMsg.java b/sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/ProfileEndMsg.java
new file mode 100644
index 00000000000..c19651fa101
--- /dev/null
+++ b/sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/ProfileEndMsg.java
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2024-2026, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.nvidia.spark.rapids;
+
+import java.util.Objects;
+
+/** A {@link ProfileMsg} holding an executor ID and a path, compared by value. */
+public class ProfileEndMsg implements ProfileMsg {
+  private static final long serialVersionUID = 1L;
+
+  private final String executorId;
+  private final String path;
+
+  public ProfileEndMsg(String executorId, String path) {
+    this.path = path;
+    this.executorId = executorId;
+  }
+
+  /** Returns the executor ID given to the constructor. */
+  public String executorId() {
+    return this.executorId;
+  }
+
+  /** Returns the path given to the constructor. */
+  public String path() {
+    return this.path;
+  }
+
+  @Override
+  public boolean equals(Object other) {
+    if (!(other instanceof ProfileEndMsg)) {
+      return false;
+    }
+    ProfileEndMsg rhs = (ProfileEndMsg) other;
+    return this == rhs
+        || (Objects.equals(executorId, rhs.executorId) && Objects.equals(path, rhs.path));
+  }
+
+  @Override
+  public int hashCode() {
+    return Objects.hash(executorId, path);
+  }
+
+  @Override
+  public String toString() {
+    return "ProfileEndMsg(" + String.join(",", executorId, path) + ")";
+  }
+}
diff --git a/sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/ProfileErrorMsg.java b/sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/ProfileErrorMsg.java
new file mode 100644
index 00000000000..7aeeffc5c9d
--- /dev/null
+++ b/sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/ProfileErrorMsg.java
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2024-2026, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.nvidia.spark.rapids;
+
+import java.util.Objects;
+
+/** A {@link ProfileMsg} holding an executor ID and a message string, compared by value. */
+public class ProfileErrorMsg implements ProfileMsg {
+  private static final long serialVersionUID = 1L;
+
+  private final String executorId;
+  private final String msg;
+
+  public ProfileErrorMsg(String executorId, String msg) {
+    this.msg = msg;
+    this.executorId = executorId;
+  }
+
+  /** Returns the executor ID given to the constructor. */
+  public String executorId() {
+    return this.executorId;
+  }
+
+  /** Returns the message string given to the constructor. */
+  public String msg() {
+    return this.msg;
+  }
+
+  @Override
+  public boolean equals(Object other) {
+    if (!(other instanceof ProfileErrorMsg)) {
+      return false;
+    }
+    ProfileErrorMsg rhs = (ProfileErrorMsg) other;
+    return this == rhs
+        || (Objects.equals(executorId, rhs.executorId) && Objects.equals(msg, rhs.msg));
+  }
+
+  @Override
+  public int hashCode() {
+    return Objects.hash(executorId, msg);
+  }
+
+  @Override
+  public String toString() {
+    return "ProfileErrorMsg(" + String.join(",", executorId, msg) + ")";
+  }
+}
diff --git a/sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/ProfileInitMsg.java b/sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/ProfileInitMsg.java
new file mode 100644
index 00000000000..55a3b7627a7
--- /dev/null
+++ b/sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/ProfileInitMsg.java
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2024-2026, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.nvidia.spark.rapids;
+
+import java.util.Objects;
+
+/** A {@link ProfileMsg} holding an executor ID and a path, compared by value. */
+public class ProfileInitMsg implements ProfileMsg {
+  private static final long serialVersionUID = 1L;
+
+  private final String executorId;
+  private final String path;
+
+  public ProfileInitMsg(String executorId, String path) {
+    this.path = path;
+    this.executorId = executorId;
+  }
+
+  /** Returns the executor ID given to the constructor. */
+  public String executorId() {
+    return this.executorId;
+  }
+
+  /** Returns the path given to the constructor. */
+  public String path() {
+    return this.path;
+  }
+
+  @Override
+  public boolean equals(Object other) {
+    if (!(other instanceof ProfileInitMsg)) {
+      return false;
+    }
+    ProfileInitMsg rhs = (ProfileInitMsg) other;
+    return this == rhs
+        || (Objects.equals(executorId, rhs.executorId) && Objects.equals(path, rhs.path));
+  }
+
+  @Override
+  public int hashCode() {
+    return Objects.hash(executorId, path);
+  }
+
+  @Override
+  public String toString() {
+    return "ProfileInitMsg(" + String.join(",", executorId, path) + ")";
+  }
+}
diff --git a/sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/ProfileJobStageQueryMsg.java b/sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/ProfileJobStageQueryMsg.java
new file mode 100644
index 00000000000..dc2a8666a31
--- /dev/null
+++ b/sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/ProfileJobStageQueryMsg.java
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2024-2026, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.nvidia.spark.rapids;
+
+import java.util.Objects;
+
+/** A {@link ProfileMsg} holding two int arrays that are compared by content. */
+public class ProfileJobStageQueryMsg implements ProfileMsg {
+  private static final long serialVersionUID = 1L;
+
+  private final int[] activeJobs;
+  private final int[] activeStages;
+
+  public ProfileJobStageQueryMsg(int[] activeJobs, int[] activeStages) {
+    this.activeJobs = activeJobs;
+    this.activeStages = activeStages;
+  }
+
+  public int[] activeJobs() {
+    return activeJobs;
+  }
+
+  public int[] activeStages() {
+    return activeStages;
+  }
+
+  // Content comparison: Objects.equals on arrays would only compare references.
+  @Override
+  public boolean equals(Object other) {
+    if (!(other instanceof ProfileJobStageQueryMsg)) {
+      return false;
+    }
+    ProfileJobStageQueryMsg that = (ProfileJobStageQueryMsg) other;
+    return java.util.Arrays.equals(activeJobs, that.activeJobs)
+        && java.util.Arrays.equals(activeStages, that.activeStages);
+  }
+
+  @Override
+  public int hashCode() {
+    return 31 * java.util.Arrays.hashCode(activeJobs) + java.util.Arrays.hashCode(activeStages);
+  }
+
+  @Override
+  public String toString() {
+    return "ProfileJobStageQueryMsg(" + java.util.Arrays.toString(activeJobs) + ","
+        + java.util.Arrays.toString(activeStages) + ")";
+  }
+}
diff --git a/sql-plugin/src/main/spark330/scala/org/apache/spark/sql/errors/ConvUtils.scala b/sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/ProfileMsg.java
similarity index 57%
rename from sql-plugin/src/main/spark330/scala/org/apache/spark/sql/errors/ConvUtils.scala
rename to sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/ProfileMsg.java
index 745d878f141..1cbeababfbb 100644
--- a/sql-plugin/src/main/spark330/scala/org/apache/spark/sql/errors/ConvUtils.scala
+++ b/sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/ProfileMsg.java
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2025-2026, NVIDIA CORPORATION.
+ * Copyright (c) 2024-2026, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -14,19 +14,9 @@
* limitations under the License.
*/
+package com.nvidia.spark.rapids;
-/*** spark-rapids-shim-json-lines
-{"spark": "330"}
-{"spark": "330db"}
-{"spark": "331"}
-{"spark": "332"}
-{"spark": "332db"}
-{"spark": "333"}
-{"spark": "334"}
-spark-rapids-shim-json-lines ***/
-package org.apache.spark.sql.errors
+import java.io.Serializable;
-object ConvUtils {
- // only Spark versions >= 340 support this function
- def overflowInConvError(): Unit = throw new UnsupportedOperationException()
+public interface ProfileMsg extends Serializable {
}
diff --git a/sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/ProfileStatusMsg.java b/sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/ProfileStatusMsg.java
new file mode 100644
index 00000000000..b5db1431c36
--- /dev/null
+++ b/sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/ProfileStatusMsg.java
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2024-2026, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.nvidia.spark.rapids;
+
+import java.util.Objects;
+
+/** A {@link ProfileMsg} holding an executor ID and a message string, compared by value. */
+public class ProfileStatusMsg implements ProfileMsg {
+  private static final long serialVersionUID = 1L;
+
+  private final String executorId;
+  private final String msg;
+
+  public ProfileStatusMsg(String executorId, String msg) {
+    this.msg = msg;
+    this.executorId = executorId;
+  }
+
+  /** Returns the executor ID given to the constructor. */
+  public String executorId() {
+    return this.executorId;
+  }
+
+  /** Returns the message string given to the constructor. */
+  public String msg() {
+    return this.msg;
+  }
+
+  @Override
+  public boolean equals(Object other) {
+    if (!(other instanceof ProfileStatusMsg)) {
+      return false;
+    }
+    ProfileStatusMsg rhs = (ProfileStatusMsg) other;
+    return this == rhs
+        || (Objects.equals(executorId, rhs.executorId) && Objects.equals(msg, rhs.msg));
+  }
+
+  @Override
+  public int hashCode() {
+    return Objects.hash(executorId, msg);
+  }
+
+  @Override
+  public String toString() {
+    return "ProfileStatusMsg(" + String.join(",", executorId, msg) + ")";
+  }
+}
diff --git a/sql-plugin/src/main/java/com/nvidia/spark/rapids/RapidsHostColumnBuilder.java b/sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/RapidsHostColumnBuilder.java
similarity index 99%
rename from sql-plugin/src/main/java/com/nvidia/spark/rapids/RapidsHostColumnBuilder.java
rename to sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/RapidsHostColumnBuilder.java
index ee4ee81a386..51ba9b62888 100644
--- a/sql-plugin/src/main/java/com/nvidia/spark/rapids/RapidsHostColumnBuilder.java
+++ b/sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/RapidsHostColumnBuilder.java
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2024-2025, NVIDIA CORPORATION.
+ * Copyright (c) 2024-2026, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
diff --git a/sql-plugin/src/main/java/com/nvidia/spark/rapids/RapidsHostColumnVector.java b/sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/RapidsHostColumnVector.java
similarity index 98%
rename from sql-plugin/src/main/java/com/nvidia/spark/rapids/RapidsHostColumnVector.java
rename to sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/RapidsHostColumnVector.java
index c7913cd93e5..eae18071639 100644
--- a/sql-plugin/src/main/java/com/nvidia/spark/rapids/RapidsHostColumnVector.java
+++ b/sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/RapidsHostColumnVector.java
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2026, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
diff --git a/sql-plugin/src/main/java/com/nvidia/spark/rapids/RapidsHostColumnVectorCore.java b/sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/RapidsHostColumnVectorCore.java
similarity index 99%
rename from sql-plugin/src/main/java/com/nvidia/spark/rapids/RapidsHostColumnVectorCore.java
rename to sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/RapidsHostColumnVectorCore.java
index 87d92724e95..0f9900f1987 100644
--- a/sql-plugin/src/main/java/com/nvidia/spark/rapids/RapidsHostColumnVectorCore.java
+++ b/sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/RapidsHostColumnVectorCore.java
@@ -1,6 +1,6 @@
/*
- * Copyright (c) 2020-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2026, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
diff --git a/sql-plugin/src/main/java/com/nvidia/spark/rapids/RapidsNullSafeHostColumnVector.java b/sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/RapidsNullSafeHostColumnVector.java
similarity index 96%
rename from sql-plugin/src/main/java/com/nvidia/spark/rapids/RapidsNullSafeHostColumnVector.java
rename to sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/RapidsNullSafeHostColumnVector.java
index 1eb1b1f66d7..1f79dd19596 100644
--- a/sql-plugin/src/main/java/com/nvidia/spark/rapids/RapidsNullSafeHostColumnVector.java
+++ b/sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/RapidsNullSafeHostColumnVector.java
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2026, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
diff --git a/sql-plugin/src/main/java/com/nvidia/spark/rapids/RapidsNullSafeHostColumnVectorCore.java b/sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/RapidsNullSafeHostColumnVectorCore.java
similarity index 99%
rename from sql-plugin/src/main/java/com/nvidia/spark/rapids/RapidsNullSafeHostColumnVectorCore.java
rename to sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/RapidsNullSafeHostColumnVectorCore.java
index d35bc9b96c5..459bc5af7f7 100644
--- a/sql-plugin/src/main/java/com/nvidia/spark/rapids/RapidsNullSafeHostColumnVectorCore.java
+++ b/sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/RapidsNullSafeHostColumnVectorCore.java
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2026, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
diff --git a/sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/ShuffleBufferId.java b/sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/ShuffleBufferId.java
new file mode 100644
index 00000000000..95b6d2d17d4
--- /dev/null
+++ b/sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/ShuffleBufferId.java
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2020-2026, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.nvidia.spark.rapids;
+
+import java.io.Serializable;
+import java.util.Objects;
+
+import org.apache.spark.storage.ShuffleBlockId;
+
+/** Identifies a shuffle buffer that holds the data for a table. */
+public final class ShuffleBufferId implements Serializable {
+  private static final long serialVersionUID = 0L;
+
+  private final ShuffleBlockId blockId;
+  private final int tableId;
+  private final int shuffleId;
+  private final long mapId;
+
+  /** Also caches shuffleId and mapId as read from the given block ID. */
+  public ShuffleBufferId(ShuffleBlockId blockId, int tableId) {
+    this.tableId = tableId;
+    this.blockId = blockId;
+    this.shuffleId = blockId.shuffleId();
+    this.mapId = blockId.mapId();
+  }
+
+  public ShuffleBlockId blockId() {
+    return this.blockId;
+  }
+
+  public int tableId() {
+    return this.tableId;
+  }
+
+  public int shuffleId() {
+    return this.shuffleId;
+  }
+
+  public long mapId() {
+    return this.mapId;
+  }
+
+  /** Compares blockId and tableId only; shuffleId and mapId are copied from blockId. */
+  @Override
+  public boolean equals(Object other) {
+    if (!(other instanceof ShuffleBufferId)) {
+      return false;
+    }
+    ShuffleBufferId that = (ShuffleBufferId) other;
+    return this == that || (tableId == that.tableId && Objects.equals(blockId, that.blockId));
+  }
+
+  /** Hashes the same two fields that equals compares. */
+  @Override
+  public int hashCode() {
+    return Objects.hash(blockId, tableId);
+  }
+
+  @Override
+  public String toString() {
+    return "ShuffleBufferId(" + blockId + "," + tableId + ")";
+  }
+}
diff --git a/sql-plugin/src/main/java/com/nvidia/spark/rapids/SlicedGpuColumnVector.java b/sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/SlicedGpuColumnVector.java
similarity index 99%
rename from sql-plugin/src/main/java/com/nvidia/spark/rapids/SlicedGpuColumnVector.java
rename to sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/SlicedGpuColumnVector.java
index 295eb182cd7..626a8bc6d14 100644
--- a/sql-plugin/src/main/java/com/nvidia/spark/rapids/SlicedGpuColumnVector.java
+++ b/sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/SlicedGpuColumnVector.java
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2025, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2026, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
diff --git a/sql-plugin/src/main/java/com/nvidia/spark/rapids/SlicedSerializedColumnVector.java b/sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/SlicedSerializedColumnVector.java
similarity index 98%
rename from sql-plugin/src/main/java/com/nvidia/spark/rapids/SlicedSerializedColumnVector.java
rename to sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/SlicedSerializedColumnVector.java
index b8f4be5cd76..9dae9f0523e 100644
--- a/sql-plugin/src/main/java/com/nvidia/spark/rapids/SlicedSerializedColumnVector.java
+++ b/sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/SlicedSerializedColumnVector.java
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2025, NVIDIA CORPORATION.
+ * Copyright (c) 2025-2026, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
diff --git a/sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/SpillPriorities.java b/sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/SpillPriorities.java
new file mode 100644
index 00000000000..10376cb7249
--- /dev/null
+++ b/sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/SpillPriorities.java
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2020-2026, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.nvidia.spark.rapids;
+
+/**
+ * Constants and helpers for spillable buffer priorities.
+ * Buffer types are assigned ranges of the spill priority number space, and those ranges
+ * may overlap.
+ */
+public final class SpillPriorities {
+  /** Priority for task output buffers that are intended for shuffle. */
+  public static final long OUTPUT_FOR_SHUFFLE_INITIAL_TASK_PRIORITY = Long.MIN_VALUE;
+
+  /**
+   * Priority for buffers received from shuffle. A task is about to read these, so they should
+   * be spilled only when there is no other choice; the value stops short of the top of the
+   * range so that other priorities can be placed after it.
+   */
+  public static final long INPUT_FROM_SHUFFLE_PRIORITY = Long.MAX_VALUE - 1000;
+
+  /**
+   * Priority for data held while waiting for next to be called, i.e. between calls to
+   * {@code hasNext} and {@code next} or between different calls to {@code next}.
+   */
+  public static final long ACTIVE_ON_DECK_PRIORITY = INPUT_FROM_SHUFFLE_PRIORITY + 1;
+
+  /** Priority for multiple buffers being buffered within a single call to next. */
+  public static final long ACTIVE_BATCHING_PRIORITY = ACTIVE_ON_DECK_PRIORITY + 100;
+
+  /** Offset applied to the priority of host memory buffers for spilling. */
+  public static final long HOST_MEMORY_BUFFER_SPILL_OFFSET = 0;
+
+  private SpillPriorities() {
+  }
+
+  /**
+   * Adds an offset to a priority, clamping the result instead of letting it wrap around.
+   *
+   * @param originalPriority the priority to adjust
+   * @param offset the amount to add, which may be negative
+   * @return the adjusted priority, clamped to Long.MIN_VALUE or Long.MAX_VALUE when needed
+   */
+  public static long applyPriorityOffset(long originalPriority, long offset) {
+    // Each bound is rearranged so that the comparison itself cannot overflow.
+    boolean underflows = offset < 0 && originalPriority < Long.MIN_VALUE - offset;
+    if (underflows) {
+      return Long.MIN_VALUE;
+    }
+    boolean overflows = offset > 0 && originalPriority > Long.MAX_VALUE - offset;
+    return overflows ? Long.MAX_VALUE : originalPriority + offset;
+  }
+}
diff --git a/sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/TableCompressionCodecConfig.java b/sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/TableCompressionCodecConfig.java
new file mode 100644
index 00000000000..62fef8edbc3
--- /dev/null
+++ b/sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/TableCompressionCodecConfig.java
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2020-2026, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.nvidia.spark.rapids;
+
+import java.io.Serializable;
+import java.util.Objects;
+
+/**
+ * Holds the table compression settings that are specific to each codec.
+ */
+public class TableCompressionCodecConfig implements Serializable {
+  private static final long serialVersionUID = 1L;
+
+  private final long lz4ChunkSize;
+  private final long zstdChunkSize;
+
+  public TableCompressionCodecConfig(long lz4ChunkSize, long zstdChunkSize) {
+    this.zstdChunkSize = zstdChunkSize;
+    this.lz4ChunkSize = lz4ChunkSize;
+  }
+
+  /** Returns the LZ4 chunk size given to the constructor. */
+  public long lz4ChunkSize() {
+    return this.lz4ChunkSize;
+  }
+
+  /** Returns the ZSTD chunk size given to the constructor. */
+  public long zstdChunkSize() {
+    return this.zstdChunkSize;
+  }
+
+  /** Two configs are equal when both chunk sizes match. */
+  @Override
+  public boolean equals(Object other) {
+    if (!(other instanceof TableCompressionCodecConfig)) {
+      return false;
+    }
+    TableCompressionCodecConfig rhs = (TableCompressionCodecConfig) other;
+    return lz4ChunkSize == rhs.lz4ChunkSize && zstdChunkSize == rhs.zstdChunkSize;
+  }
+
+  @Override
+  public int hashCode() {
+    return Objects.hash(lz4ChunkSize, zstdChunkSize);
+  }
+
+  @Override
+  public String toString() {
+    return "TableCompressionCodecConfig(" + lz4ChunkSize + "," + zstdChunkSize + ")";
+  }
+}
diff --git a/sql-plugin/src/main/spark400/scala/com/nvidia/spark/rapids/shims/DateTimeUtilsShims.scala b/sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/ThreadPoolConf.java
similarity index 62%
rename from sql-plugin/src/main/spark400/scala/com/nvidia/spark/rapids/shims/DateTimeUtilsShims.scala
rename to sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/ThreadPoolConf.java
index 21254c4b39a..b2bed218f95 100644
--- a/sql-plugin/src/main/spark400/scala/com/nvidia/spark/rapids/shims/DateTimeUtilsShims.scala
+++ b/sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/ThreadPoolConf.java
@@ -14,18 +14,18 @@
* limitations under the License.
*/
+package com.nvidia.spark.rapids;
-/*** spark-rapids-shim-json-lines
-{"spark": "400"}
-{"spark": "400db173"}
-{"spark": "401"}
-{"spark": "402"}
-{"spark": "411"}
-spark-rapids-shim-json-lines ***/
-package com.nvidia.spark.rapids.shims
+import java.io.Serializable;
-import org.apache.spark.sql.catalyst.util.SparkDateTimeUtils
+public interface ThreadPoolConf extends Serializable {
+ /**
+ * The maximum number of threads used by the thread pool, not necessarily the final number.
+ */
+ int maxThreadNumber();
-object DateTimeUtilsShims {
- def currentTimestamp: Long = SparkDateTimeUtils.instantToMicros(java.time.Instant.now())
-}
\ No newline at end of file
+ /**
+ * Whether to create pools for each Spark stage, only for testing for now.
+ */
+ boolean stageLevelPool();
+}
diff --git a/sql-plugin/src/main/java/com/nvidia/spark/rapids/TypeConverter.java b/sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/TypeConverter.java
similarity index 100%
rename from sql-plugin/src/main/java/com/nvidia/spark/rapids/TypeConverter.java
rename to sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/TypeConverter.java
diff --git a/sql-plugin/src/main/java/com/nvidia/spark/rapids/WithTableBuffer.java b/sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/WithTableBuffer.java
similarity index 94%
rename from sql-plugin/src/main/java/com/nvidia/spark/rapids/WithTableBuffer.java
rename to sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/WithTableBuffer.java
index eb1e1db5397..54359b96240 100644
--- a/sql-plugin/src/main/java/com/nvidia/spark/rapids/WithTableBuffer.java
+++ b/sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/WithTableBuffer.java
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2026, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
diff --git a/sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/io/async/AsyncMetrics.java b/sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/io/async/AsyncMetrics.java
new file mode 100644
index 00000000000..bea87901510
--- /dev/null
+++ b/sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/io/async/AsyncMetrics.java
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2026, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.nvidia.spark.rapids.io.async;
+
+import java.io.Serializable;
+import java.util.Objects;
+
+/**
+ * Timing pair for an async task: its schedule time and its execution time.
+ */
+public class AsyncMetrics implements Serializable {
+  private static final long serialVersionUID = 1L;
+
+  private final long scheduleTimeMs;
+  private final long executionTimeMs;
+
+  public AsyncMetrics(long scheduleTimeMs, long executionTimeMs) {
+    this.executionTimeMs = executionTimeMs;
+    this.scheduleTimeMs = scheduleTimeMs;
+  }
+
+  /** Returns the schedule time given to the constructor (milliseconds, per the name). */
+  public long scheduleTimeMs() {
+    return this.scheduleTimeMs;
+  }
+
+  /** Returns the execution time given to the constructor (milliseconds, per the name). */
+  public long executionTimeMs() {
+    return this.executionTimeMs;
+  }
+
+  /** Two instances are equal when both timings match. */
+  @Override
+  public boolean equals(Object other) {
+    if (!(other instanceof AsyncMetrics)) {
+      return false;
+    }
+    AsyncMetrics rhs = (AsyncMetrics) other;
+    return scheduleTimeMs == rhs.scheduleTimeMs && executionTimeMs == rhs.executionTimeMs;
+  }
+
+  /** Hashes the same two timings that equals compares. */
+  @Override
+  public int hashCode() {
+    return Objects.hash(scheduleTimeMs, executionTimeMs);
+  }
+
+  @Override
+  public String toString() {
+    return "AsyncMetrics(" + scheduleTimeMs + "," + executionTimeMs + ")";
+  }
+}
diff --git a/sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/io/async/ThrottlingExecutorStats.java b/sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/io/async/ThrottlingExecutorStats.java
new file mode 100644
index 00000000000..40b86a6ad3e
--- /dev/null
+++ b/sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/io/async/ThrottlingExecutorStats.java
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2026, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.nvidia.spark.rapids.io.async;
+
+import java.io.Serializable;
+import java.util.Objects;
+
+/**
+ * Throttling counters that are mutable; the counters are updated by ThrottlingExecutor.
+ */
+public class ThrottlingExecutorStats implements Serializable {
+  private static final long serialVersionUID = 1L;
+
+  // Public, non-final fields: callers read and update the counters directly.
+  public int numTasksScheduled;
+  public long accumulatedThrottleTimeNs;
+  public long minThrottleTimeNs;
+  public long maxThrottleTimeNs;
+
+  /** Creates stats with the given initial counter values. */
+  public ThrottlingExecutorStats(int numTasksScheduled, long accumulatedThrottleTimeNs,
+      long minThrottleTimeNs, long maxThrottleTimeNs) {
+    this.maxThrottleTimeNs = maxThrottleTimeNs;
+    this.minThrottleTimeNs = minThrottleTimeNs;
+    this.accumulatedThrottleTimeNs = accumulatedThrottleTimeNs;
+    this.numTasksScheduled = numTasksScheduled;
+  }
+
+  /** Compares all four counters as they are at the time of the call. */
+  @Override
+  public boolean equals(Object other) {
+    if (other == this) {
+      return true;
+    }
+    if (other instanceof ThrottlingExecutorStats) {
+      ThrottlingExecutorStats rhs = (ThrottlingExecutorStats) other;
+      return maxThrottleTimeNs == rhs.maxThrottleTimeNs
+          && minThrottleTimeNs == rhs.minThrottleTimeNs
+          && accumulatedThrottleTimeNs == rhs.accumulatedThrottleTimeNs
+          && numTasksScheduled == rhs.numTasksScheduled;
+    }
+    return false;
+  }
+
+  /** Depends on the mutable counters, so the value changes whenever a counter does. */
+  @Override
+  public int hashCode() {
+    return Objects.hash(
+        numTasksScheduled, accumulatedThrottleTimeNs, minThrottleTimeNs, maxThrottleTimeNs);
+  }
+
+  @Override
+  public String toString() {
+    return "ThrottlingExecutorStats(" + numTasksScheduled + "," + accumulatedThrottleTimeNs
+        + "," + minThrottleTimeNs + "," + maxThrottleTimeNs + ")";
+  }
+}
diff --git a/sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/shuffle/BlockRange.java b/sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/shuffle/BlockRange.java
new file mode 100644
index 00000000000..1dcc97b5165
--- /dev/null
+++ b/sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/shuffle/BlockRange.java
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2020-2026, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.nvidia.spark.rapids.shuffle;
+
+import java.util.Objects;
+
+/** Byte range for a block. */
+public final class BlockRange {
+ private final T block;
+ private final long rangeStart;
+ private final long rangeEnd;
+
+ public BlockRange(T block, long rangeStart, long rangeEnd) {
+ if (rangeStart >= rangeEnd) {
+ throw new IllegalArgumentException(
+ "requirement failed: Instantiated a BlockRange with invalid boundaries: " +
+ rangeStart + " to " + rangeEnd);
+ }
+ this.block = block;
+ this.rangeStart = rangeStart;
+ this.rangeEnd = rangeEnd;
+ }
+
+ public T block() {
+ return block;
+ }
+
+ public long rangeStart() {
+ return rangeStart;
+ }
+
+ public long rangeEnd() {
+ return rangeEnd;
+ }
+
+ public long rangeSize() {
+ return rangeEnd - rangeStart;
+ }
+
+ public boolean isComplete() {
+ return rangeEnd == block.size();
+ }
+
+ @Override
+ public boolean equals(Object obj) {
+ if (this == obj) {
+ return true;
+ }
+ if (!(obj instanceof BlockRange)) {
+ return false;
+ }
+ BlockRange> other = (BlockRange>) obj;
+ return rangeStart == other.rangeStart &&
+ rangeEnd == other.rangeEnd &&
+ Objects.equals(block, other.block);
+ }
+
+ @Override
+ public int hashCode() {
+ return Objects.hash(block, rangeStart, rangeEnd);
+ }
+
+ @Override
+ public String toString() {
+ return "BlockRange(" + block + "," + rangeStart + "," + rangeEnd + ")";
+ }
+}
diff --git a/sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/shuffle/BlockWithSize.java b/sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/shuffle/BlockWithSize.java
new file mode 100644
index 00000000000..312f8198fef
--- /dev/null
+++ b/sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/shuffle/BlockWithSize.java
@@ -0,0 +1,22 @@
+/*
+ * Copyright (c) 2020-2026, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.nvidia.spark.rapids.shuffle;
+
+/** Block-like value that can report its size in bytes. */
+public interface BlockWithSize {
+ long size(); // BlockRange.isComplete() checks a range's end against this value
+}
diff --git a/sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/shuffle/TransactionStats.java b/sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/shuffle/TransactionStats.java
new file mode 100644
index 00000000000..75f4a0cef6e
--- /dev/null
+++ b/sql-plugin-columnar/src/main/java/com/nvidia/spark/rapids/shuffle/TransactionStats.java
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) 2020-2026, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.nvidia.spark.rapids.shuffle;
+
+import java.util.Objects;
+
+/** Statistics for a shuffle transaction. */
+public final class TransactionStats {
+ private final double txTimeMs;
+ private final long sendSize;
+ private final long receiveSize;
+ private final double sendThroughput;
+ private final double recvThroughput;
+
+ public TransactionStats(double txTimeMs, long sendSize, long receiveSize,
+ double sendThroughput, double recvThroughput) {
+ this.txTimeMs = txTimeMs;
+ this.sendSize = sendSize;
+ this.receiveSize = receiveSize;
+ this.sendThroughput = sendThroughput;
+ this.recvThroughput = recvThroughput;
+ }
+
+ public double txTimeMs() {
+ return txTimeMs;
+ }
+
+ public long sendSize() {
+ return sendSize;
+ }
+
+ public long receiveSize() {
+ return receiveSize;
+ }
+
+ public double sendThroughput() {
+ return sendThroughput;
+ }
+
+ public double recvThroughput() {
+ return recvThroughput;
+ }
+
+ @Override
+ public boolean equals(Object obj) {
+ if (this == obj) {
+ return true;
+ }
+ if (!(obj instanceof TransactionStats)) {
+ return false;
+ }
+ TransactionStats other = (TransactionStats) obj;
+ return Double.compare(txTimeMs, other.txTimeMs) == 0 &&
+ sendSize == other.sendSize &&
+ receiveSize == other.receiveSize &&
+ Double.compare(sendThroughput, other.sendThroughput) == 0 &&
+ Double.compare(recvThroughput, other.recvThroughput) == 0;
+ }
+
+ @Override
+ public int hashCode() {
+ return Objects.hash(txTimeMs, sendSize, receiveSize, sendThroughput, recvThroughput);
+ }
+
+ @Override
+ public String toString() {
+ return "TransactionStats(" + txTimeMs + "," + sendSize + "," + receiveSize + "," +
+ sendThroughput + "," + recvThroughput + ")";
+ }
+}
diff --git a/sql-plugin-columnar/src/main/java/org/apache/spark/sql/rapids/BasicColumnarWriteTaskStats.java b/sql-plugin-columnar/src/main/java/org/apache/spark/sql/rapids/BasicColumnarWriteTaskStats.java
new file mode 100644
index 00000000000..7579ea33958
--- /dev/null
+++ b/sql-plugin-columnar/src/main/java/org/apache/spark/sql/rapids/BasicColumnarWriteTaskStats.java
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2019-2026, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.rapids;
+
+import java.util.Objects;
+
+import org.apache.spark.sql.catalyst.InternalRow;
+import org.apache.spark.sql.execution.datasources.WriteTaskStats;
+
+import scala.collection.Seq;
+
+/**
+ * Simple metrics collected during an instance of GpuFileFormatDataWriter.
+ * These were first introduced in https://github.com/apache/spark/pull/18159 (SPARK-20703).
+ */
+public final class BasicColumnarWriteTaskStats implements WriteTaskStats {
+ private static final long serialVersionUID = 0L;
+
+ private final Seq partitions;
+ private final int numFiles;
+ private final int numWriters;
+ private final long numBytes;
+ private final long numRows;
+
+ public BasicColumnarWriteTaskStats(
+ Seq partitions,
+ int numFiles,
+ int numWriters,
+ long numBytes,
+ long numRows) {
+ this.partitions = partitions;
+ this.numFiles = numFiles;
+ this.numWriters = numWriters;
+ this.numBytes = numBytes;
+ this.numRows = numRows;
+ }
+
+ public Seq partitions() {
+ return partitions;
+ }
+
+ public int numFiles() {
+ return numFiles;
+ }
+
+ public int numWriters() {
+ return numWriters;
+ }
+
+ public long numBytes() {
+ return numBytes;
+ }
+
+ public long numRows() {
+ return numRows;
+ }
+
+ @Override
+ public boolean equals(Object other) {
+ if (this == other) {
+ return true;
+ }
+ if (!(other instanceof BasicColumnarWriteTaskStats)) {
+ return false;
+ }
+ BasicColumnarWriteTaskStats that = (BasicColumnarWriteTaskStats) other;
+ return numFiles == that.numFiles
+ && numWriters == that.numWriters
+ && numBytes == that.numBytes
+ && numRows == that.numRows
+ && Objects.equals(partitions, that.partitions);
+ }
+
+ @Override
+ public int hashCode() {
+ return Objects.hash(partitions, numFiles, numWriters, numBytes, numRows);
+ }
+
+ @Override
+ public String toString() {
+ return "BasicColumnarWriteTaskStats(" + partitions + "," + numFiles + ","
+ + numWriters + "," + numBytes + "," + numRows + ")";
+ }
+}
diff --git a/sql-plugin-columnar/src/main/java/org/apache/spark/sql/rapids/NanoTime.java b/sql-plugin-columnar/src/main/java/org/apache/spark/sql/rapids/NanoTime.java
new file mode 100644
index 00000000000..d862ae90372
--- /dev/null
+++ b/sql-plugin-columnar/src/main/java/org/apache/spark/sql/rapids/NanoTime.java
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2023-2026, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.rapids;
+
+import java.io.Serializable;
+import java.util.Locale;
+import java.util.Objects;
+import java.util.concurrent.TimeUnit;
+
+/** Wraps a nanosecond count; toString() formats it as hours:minutes:seconds.millis. */
+public final class NanoTime implements Serializable {
+  private static final long serialVersionUID = 1L;
+
+  private final Long value;
+
+  public NanoTime(Long value) {
+    this.value = value;
+  }
+
+  public Long value() {
+    return value;
+  }
+
+  @Override
+  public String toString() {
+    if (value == null) {
+      return "null"; // a null value is allowed by the constructor; avoid an unboxing NPE
+    }
+    long hours = TimeUnit.NANOSECONDS.toHours(value);
+    long remaining = value - TimeUnit.HOURS.toNanos(hours);
+    long minutes = TimeUnit.NANOSECONDS.toMinutes(remaining);
+    remaining -= TimeUnit.MINUTES.toNanos(minutes);
+    double seconds = ((double) remaining) / TimeUnit.SECONDS.toNanos(1);
+    return String.format(Locale.US, "%02d:%02d:%06.3f", hours, minutes, seconds);
+  }
+
+  @Override
+  public boolean equals(Object obj) {
+    if (this == obj) {
+      return true;
+    }
+    return obj instanceof NanoTime && Objects.equals(value, ((NanoTime) obj).value);
+  }
+
+  @Override
+  public int hashCode() {
+    return Objects.hash(value);
+  }
+}
diff --git a/sql-plugin-columnar/src/main/java/org/apache/spark/sql/rapids/SizeInBytes.java b/sql-plugin-columnar/src/main/java/org/apache/spark/sql/rapids/SizeInBytes.java
new file mode 100644
index 00000000000..7fb6c83f5e8
--- /dev/null
+++ b/sql-plugin-columnar/src/main/java/org/apache/spark/sql/rapids/SizeInBytes.java
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2023-2026, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.rapids;
+
+import java.io.Serializable;
+import java.util.Objects;
+
+/** Wraps a byte count; toString() scales it by powers of 1024 into B through EB units. */
+public final class SizeInBytes implements Serializable {
+  private static final long serialVersionUID = 1L;
+
+  private static final String[] SIZE_UNIT_NAMES = {"B", "KB", "MB", "GB", "TB", "PB", "EB"};
+
+  private final Long value;
+
+  public SizeInBytes(Long value) {
+    this.value = value;
+  }
+
+  public Long value() {
+    return value;
+  }
+
+  @Override
+  public String toString() {
+    if (value == null) {
+      return "null"; // a null value is allowed by the constructor; avoid an unboxing NPE
+    }
+    long unitVal = value;
+    long remainVal = 0;
+    int unitIndex = 0;
+    while (unitIndex < SIZE_UNIT_NAMES.length - 1 && unitVal >= 1024) {
+      remainVal = unitVal % 1024;
+      unitVal /= 1024;
+      unitIndex += 1;
+    }
+    // Locale.US keeps '.' as the decimal separator whatever the JVM default locale is.
+    String finalVal = String.format(java.util.Locale.US, "%.2f", unitVal + (remainVal / 1024.0));
+    return finalVal + SIZE_UNIT_NAMES[unitIndex] + " (" + value + " bytes)";
+  }
+
+  @Override
+  public boolean equals(Object obj) {
+    if (this == obj) {
+      return true;
+    }
+    return obj instanceof SizeInBytes && Objects.equals(value, ((SizeInBytes) obj).value);
+  }
+
+  @Override
+  public int hashCode() {
+    return Objects.hash(value);
+  }
+}
diff --git a/sql-plugin-columnar/src/main/java/org/apache/spark/sql/rapids/execution/JoinCardinalityStats.java b/sql-plugin-columnar/src/main/java/org/apache/spark/sql/rapids/execution/JoinCardinalityStats.java
new file mode 100644
index 00000000000..8cd8a0c4cfc
--- /dev/null
+++ b/sql-plugin-columnar/src/main/java/org/apache/spark/sql/rapids/execution/JoinCardinalityStats.java
@@ -0,0 +1,128 @@
+/*
+ * Copyright (c) 2026, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.rapids.execution;
+
+import java.io.Serializable;
+import java.util.Objects;
+
+import org.apache.spark.sql.types.DataType;
+
+import scala.collection.Seq;
+
+/** Statistics for join cardinality logging to help diagnose performance issues. */
+public final class JoinCardinalityStats implements Serializable { // immutable: every field is final
+ private static final long serialVersionUID = 0L;
+
+ private final long leftRowCount;
+ private final long rightRowCount;
+ private final long leftDistinctCount;
+ private final long rightDistinctCount;
+ private final Seq> leftNullCounts;
+ private final Seq> rightNullCounts;
+ private final Seq leftKeyTypes;
+ private final Seq rightKeyTypes;
+
+ public JoinCardinalityStats(
+ long leftRowCount,
+ long rightRowCount,
+ long leftDistinctCount,
+ long rightDistinctCount,
+ Seq> leftNullCounts,
+ Seq> rightNullCounts,
+ Seq leftKeyTypes,
+ Seq rightKeyTypes) {
+ this.leftRowCount = leftRowCount;
+ this.rightRowCount = rightRowCount;
+ this.leftDistinctCount = leftDistinctCount;
+ this.rightDistinctCount = rightDistinctCount;
+ this.leftNullCounts = leftNullCounts;
+ this.rightNullCounts = rightNullCounts;
+ this.leftKeyTypes = leftKeyTypes;
+ this.rightKeyTypes = rightKeyTypes;
+ }
+
+ public long leftRowCount() {
+ return leftRowCount;
+ }
+
+ public long rightRowCount() {
+ return rightRowCount;
+ }
+
+ public long leftDistinctCount() {
+ return leftDistinctCount;
+ }
+
+ public long rightDistinctCount() {
+ return rightDistinctCount;
+ }
+
+ public Seq> leftNullCounts() {
+ return leftNullCounts;
+ }
+
+ public Seq> rightNullCounts() {
+ return rightNullCounts;
+ }
+
+ public Seq leftKeyTypes() {
+ return leftKeyTypes;
+ }
+
+ public Seq rightKeyTypes() {
+ return rightKeyTypes;
+ }
+
+ @Override
+ public boolean equals(Object other) { // value equality across all eight fields
+ if (this == other) {
+ return true;
+ }
+ if (!(other instanceof JoinCardinalityStats)) {
+ return false;
+ }
+ JoinCardinalityStats that = (JoinCardinalityStats) other;
+ return leftRowCount == that.leftRowCount
+ && rightRowCount == that.rightRowCount
+ && leftDistinctCount == that.leftDistinctCount
+ && rightDistinctCount == that.rightDistinctCount
+ && Objects.equals(leftNullCounts, that.leftNullCounts)
+ && Objects.equals(rightNullCounts, that.rightNullCounts)
+ && Objects.equals(leftKeyTypes, that.leftKeyTypes)
+ && Objects.equals(rightKeyTypes, that.rightKeyTypes);
+ }
+
+ @Override
+ public int hashCode() { // hashes the same eight fields that equals() compares
+ return Objects.hash(
+ leftRowCount,
+ rightRowCount,
+ leftDistinctCount,
+ rightDistinctCount,
+ leftNullCounts,
+ rightNullCounts,
+ leftKeyTypes,
+ rightKeyTypes);
+ }
+
+ @Override
+ public String toString() {
+ return "JoinCardinalityStats(" + leftRowCount + "," + rightRowCount + ","
+ + leftDistinctCount + "," + rightDistinctCount + "," + leftNullCounts + ","
+ + rightNullCounts + "," + leftKeyTypes + "," + rightKeyTypes + ")";
+ }
+}
diff --git a/sql-plugin-columnar/src/main/java/org/apache/spark/sql/rapids/execution/JoinOptions.java b/sql-plugin-columnar/src/main/java/org/apache/spark/sql/rapids/execution/JoinOptions.java
new file mode 100644
index 00000000000..be0487ecde7
--- /dev/null
+++ b/sql-plugin-columnar/src/main/java/org/apache/spark/sql/rapids/execution/JoinOptions.java
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2026, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.rapids.execution;
+
+import java.io.Serializable;
+import java.util.Objects;
+
+import scala.Enumeration.Value;
+
+/** Options to control join behavior. */
+public final class JoinOptions implements Serializable {
+ private static final long serialVersionUID = 0L;
+
+ private final Value strategy;
+ private final Value buildSideSelection;
+ private final long targetSize;
+ private final boolean logCardinalityEnabled;
+ private final double sizeEstimateThreshold;
+
+ public JoinOptions(
+ Value strategy,
+ Value buildSideSelection,
+ long targetSize,
+ boolean logCardinalityEnabled,
+ double sizeEstimateThreshold) {
+ this.strategy = strategy;
+ this.buildSideSelection = buildSideSelection;
+ this.targetSize = targetSize;
+ this.logCardinalityEnabled = logCardinalityEnabled;
+ this.sizeEstimateThreshold = sizeEstimateThreshold;
+ }
+
+ public Value strategy() {
+ return strategy;
+ }
+
+ public Value buildSideSelection() {
+ return buildSideSelection;
+ }
+
+ public long targetSize() {
+ return targetSize;
+ }
+
+ public boolean logCardinalityEnabled() {
+ return logCardinalityEnabled;
+ }
+
+ public double sizeEstimateThreshold() {
+ return sizeEstimateThreshold;
+ }
+
+ @Override
+ public boolean equals(Object other) {
+ if (this == other) {
+ return true;
+ }
+ if (!(other instanceof JoinOptions)) {
+ return false;
+ }
+ JoinOptions that = (JoinOptions) other;
+ return targetSize == that.targetSize
+ && logCardinalityEnabled == that.logCardinalityEnabled
+ && Double.compare(that.sizeEstimateThreshold, sizeEstimateThreshold) == 0
+ && Objects.equals(strategy, that.strategy)
+ && Objects.equals(buildSideSelection, that.buildSideSelection);
+ }
+
+ @Override
+ public int hashCode() {
+ return Objects.hash(
+ strategy, buildSideSelection, targetSize, logCardinalityEnabled, sizeEstimateThreshold);
+ }
+
+ @Override
+ public String toString() {
+ return "JoinOptions(" + strategy + "," + buildSideSelection + "," + targetSize + ","
+ + logCardinalityEnabled + "," + sizeEstimateThreshold + ")";
+ }
+}
diff --git a/sql-plugin/src/main/java/org/apache/spark/sql/vectorized/rapids/AccessibleArrowColumnVector.java b/sql-plugin-columnar/src/main/java/org/apache/spark/sql/vectorized/rapids/AccessibleArrowColumnVector.java
similarity index 99%
rename from sql-plugin/src/main/java/org/apache/spark/sql/vectorized/rapids/AccessibleArrowColumnVector.java
rename to sql-plugin-columnar/src/main/java/org/apache/spark/sql/vectorized/rapids/AccessibleArrowColumnVector.java
index 514f11316af..78fb986b307 100644
--- a/sql-plugin/src/main/java/org/apache/spark/sql/vectorized/rapids/AccessibleArrowColumnVector.java
+++ b/sql-plugin-columnar/src/main/java/org/apache/spark/sql/vectorized/rapids/AccessibleArrowColumnVector.java
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2026, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
diff --git a/sql-plugin-fileio/pom.xml b/sql-plugin-fileio/pom.xml
new file mode 100644
index 00000000000..e45d88fa79e
--- /dev/null
+++ b/sql-plugin-fileio/pom.xml
@@ -0,0 +1,118 @@
+
+
+
+ 4.0.0
+
+
+ com.nvidia
+ rapids-4-spark-shim-deps-parent_2.12
+ 26.08.0-SNAPSHOT
+ ../shim-deps/pom.xml
+
+
+ rapids-4-spark-sql-plugin-fileio_2.12
+ Java-only file I/O runtime plumbing for the RAPIDS SQL plugin
+ 26.08.0-SNAPSHOT
+
+
+ sql-plugin-fileio
+ false
+ **/*
+ package
+ true
+
+
+
+
+ com.nvidia
+ spark-rapids-jni
+ ${jni.classifier}
+
+
+ org.apache.spark
+ spark-core_${scala.binary.version}
+ ${spark.version}
+ provided
+
+
+
+
+
+
+ org.apache.maven.plugins
+ maven-compiler-plugin
+
+
+ default-compile
+ compile
+
+ compile
+
+
+
+ default-testCompile
+ test-compile
+
+ testCompile
+
+
+
+
+ ${java.major.version}
+
+ -Xlint:all,-serial,-path,-try,-processing
+
+
+
+
+ net.alchim31.maven
+ scala-maven-plugin
+
+
+ eclipse-add-source
+ none
+
+
+ scala-compile-first
+ none
+
+
+ scala-test-compile-first
+ none
+
+
+ attach-scaladocs
+ none
+
+
+
+
+ org.apache.rat
+ apache-rat-plugin
+
+
+ org.apache.maven.plugins
+ maven-surefire-plugin
+
+ true
+
+
+
+
+
diff --git a/sql-plugin-fileio/src/main/java/com/nvidia/spark/rapids/FileUtils.java b/sql-plugin-fileio/src/main/java/com/nvidia/spark/rapids/FileUtils.java
new file mode 100644
index 00000000000..06059b11e50
--- /dev/null
+++ b/sql-plugin-fileio/src/main/java/com/nvidia/spark/rapids/FileUtils.java
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2019-2026, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.nvidia.spark.rapids;
+
+import java.io.IOException;
+import java.util.Random;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FSDataOutputStream;
+import org.apache.hadoop.fs.FileAlreadyExistsException;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+
+public final class FileUtils { // static helpers only; the private constructor is never called
+  private FileUtils() {}
+
+  public static final class TempFile { // an open output stream plus the path it writes to
+    private final FSDataOutputStream outputStream;
+    private final Path path;
+
+    TempFile(FSDataOutputStream outputStream, Path path) {
+      this.outputStream = outputStream;
+      this.path = path;
+    }
+
+    public FSDataOutputStream getOutputStream() {
+      return outputStream;
+    }
+
+    public Path getPath() {
+      return path;
+    }
+  }
+
+  public static TempFile createTempFile(
+      Configuration conf, String pathPrefix, String pathSuffix) throws IOException {
+    String suffix = pathSuffix == null ? "" : pathSuffix;
+    FileSystem fileSystem = new Path(pathPrefix).getFileSystem(conf);
+    Random random = new Random();
+    for (;;) { // keep drawing random names until a file is created
+      Path candidate = new Path(pathPrefix + random.nextInt(Integer.MAX_VALUE) + suffix);
+      if (!fileSystem.exists(candidate)) {
+        try {
+          return new TempFile(fileSystem.create(candidate, false), candidate);
+        } catch (FileAlreadyExistsException e) {
+          // Lost the race between exists() and create(): fall through and try a new name.
+        }
+      }
+    }
+  }
+}
diff --git a/sql-plugin-fileio/src/main/java/com/nvidia/spark/rapids/fileio/RapidsInputFiles.java b/sql-plugin-fileio/src/main/java/com/nvidia/spark/rapids/fileio/RapidsInputFiles.java
new file mode 100644
index 00000000000..407bdd03fa6
--- /dev/null
+++ b/sql-plugin-fileio/src/main/java/com/nvidia/spark/rapids/fileio/RapidsInputFiles.java
@@ -0,0 +1,116 @@
+/*
+ * Copyright (c) 2026, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.nvidia.spark.rapids.fileio;
+
+import ai.rapids.cudf.HostMemoryBuffer;
+import com.nvidia.spark.rapids.jni.fileio.RapidsInputFile;
+import org.apache.hadoop.conf.Configuration;
+
+import java.io.IOException;
+import java.net.URI;
+import java.util.List;
+import java.util.Objects;
+
+/**
+ * Static helpers that {@link RapidsInputFile} implementations share.
+ */
+public final class RapidsInputFiles {
+  /**
+   * Bridge through which Java code reaches the S3 PerfIO integration. It is implemented in
+   * sql-plugin because the implementation depends on private Scala PerfIO state.
+   */
+  public interface S3PerfReader {
+    boolean isEnabled();
+
+    boolean readVectored(
+        Configuration hadoopConf,
+        URI fileUri,
+        HostMemoryBuffer output,
+        List copyRanges) throws IOException;
+
+    boolean readTail(
+        Configuration hadoopConf,
+        URI fileUri,
+        HostMemoryBuffer output,
+        long length,
+        long outputOffset) throws IOException;
+  }
+
+  private static final S3PerfReader DISABLED_S3_PERF_READER = new S3PerfReader() {
+    @Override
+    public boolean readTail(
+        Configuration hadoopConf,
+        URI fileUri,
+        HostMemoryBuffer output,
+        long length,
+        long outputOffset) {
+      return false;
+    }
+
+    @Override
+    public boolean readVectored(
+        Configuration hadoopConf,
+        URI fileUri,
+        HostMemoryBuffer output,
+        List copyRanges) {
+      return false;
+    }
+
+    @Override
+    public boolean isEnabled() {
+      return false;
+    }
+  };
+
+  private static volatile S3PerfReader s3PerfReader = DISABLED_S3_PERF_READER;
+
+  private RapidsInputFiles() {}
+
+  public static void setS3PerfReader(S3PerfReader reader) {
+    s3PerfReader = Objects.requireNonNull(reader, "reader can't be null");
+  }
+
+  public static void resetS3PerfReader() {
+    s3PerfReader = DISABLED_S3_PERF_READER;
+  }
+
+  /**
+   * Reports whether the currently registered reader has the S3 PerfIO path enabled.
+   * Until a reader is registered the disabled default answers false, so callers
+   * take the non-PerfIO path during early bring-up.
+   */
+  public static boolean isS3PerfEnabled() {
+    return s3PerfReader.isEnabled();
+  }
+
+  public static boolean readS3Vectored(
+      Configuration hadoopConf,
+      URI fileUri,
+      HostMemoryBuffer output,
+      List copyRanges) throws IOException {
+    return s3PerfReader.readVectored(hadoopConf, fileUri, output, copyRanges);
+  }
+
+  public static boolean readS3Tail(
+      Configuration hadoopConf,
+      URI fileUri,
+      HostMemoryBuffer output,
+      long length,
+      long outputOffset) throws IOException {
+    return s3PerfReader.readTail(hadoopConf, fileUri, output, length, outputOffset);
+  }
+}
diff --git a/sql-plugin/src/main/java/com/nvidia/spark/rapids/fileio/hadoop/HadoopFileIO.java b/sql-plugin-fileio/src/main/java/com/nvidia/spark/rapids/fileio/hadoop/HadoopFileIO.java
similarity index 75%
rename from sql-plugin/src/main/java/com/nvidia/spark/rapids/fileio/hadoop/HadoopFileIO.java
rename to sql-plugin-fileio/src/main/java/com/nvidia/spark/rapids/fileio/hadoop/HadoopFileIO.java
index dd9da173280..f17730211a7 100644
--- a/sql-plugin/src/main/java/com/nvidia/spark/rapids/fileio/hadoop/HadoopFileIO.java
+++ b/sql-plugin-fileio/src/main/java/com/nvidia/spark/rapids/fileio/hadoop/HadoopFileIO.java
@@ -16,12 +16,10 @@
package com.nvidia.spark.rapids.fileio.hadoop;
-import com.nvidia.spark.rapids.fileio.RapidsInputFiles;
import com.nvidia.spark.rapids.jni.fileio.RapidsFileIO;
import com.nvidia.spark.rapids.jni.fileio.RapidsInputFile;
import com.nvidia.spark.rapids.jni.fileio.RapidsOutputFile;
import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.spark.util.SerializableConfiguration;
@@ -34,9 +32,18 @@
*/
public class HadoopFileIO implements RapidsFileIO {
private final SerializableConfiguration hadoopConf;
+ private final HadoopInputFileFactory inputFileFactory;
public HadoopFileIO(Configuration hadoopConf) {
Objects.requireNonNull(hadoopConf, "hadoopConf can't be null");
+ this.inputFileFactory = null;
+ this.hadoopConf = new SerializableConfiguration(hadoopConf);
+ }
+
+ public HadoopFileIO(Configuration hadoopConf, HadoopInputFileFactory inputFileFactory) {
+ Objects.requireNonNull(hadoopConf, "hadoopConf can't be null");
+ this.inputFileFactory = Objects.requireNonNull(
+ inputFileFactory, "inputFileFactory can't be null");
this.hadoopConf = new SerializableConfiguration(hadoopConf);
}
@@ -47,9 +54,9 @@ public RapidsInputFile newInputFile(String path) throws IOException {
@Override
public RapidsInputFile newInputFile(Path path) throws IOException {
- String scheme = path.toUri().getScheme();
- if (scheme != null && scheme.startsWith("s3") && RapidsInputFiles.isS3PerfEnabled()) {
- return S3InputFile.create(path, hadoopConf.value());
+ Objects.requireNonNull(path, "path can't be null");
+ if (inputFileFactory != null) {
+ return inputFileFactory.create(path, hadoopConf.value());
}
return HadoopInputFile.create(path, hadoopConf.value());
}
diff --git a/sql-plugin/src/main/java/com/nvidia/spark/rapids/fileio/hadoop/HadoopInputFile.java b/sql-plugin-fileio/src/main/java/com/nvidia/spark/rapids/fileio/hadoop/HadoopInputFile.java
similarity index 98%
rename from sql-plugin/src/main/java/com/nvidia/spark/rapids/fileio/hadoop/HadoopInputFile.java
rename to sql-plugin-fileio/src/main/java/com/nvidia/spark/rapids/fileio/hadoop/HadoopInputFile.java
index a1688b50be3..25ab03e2a7b 100644
--- a/sql-plugin/src/main/java/com/nvidia/spark/rapids/fileio/hadoop/HadoopInputFile.java
+++ b/sql-plugin-fileio/src/main/java/com/nvidia/spark/rapids/fileio/hadoop/HadoopInputFile.java
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2025, NVIDIA CORPORATION.
+ * Copyright (c) 2025-2026, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
diff --git a/sql-plugin/src/main/spark400/scala/com/nvidia/spark/rapids/shims/ReplaceDataExecShim.scala b/sql-plugin-fileio/src/main/java/com/nvidia/spark/rapids/fileio/hadoop/HadoopInputFileFactory.java
similarity index 50%
rename from sql-plugin/src/main/spark400/scala/com/nvidia/spark/rapids/shims/ReplaceDataExecShim.scala
rename to sql-plugin-fileio/src/main/java/com/nvidia/spark/rapids/fileio/hadoop/HadoopInputFileFactory.java
index e361720f76a..972a2ce103c 100644
--- a/sql-plugin/src/main/spark400/scala/com/nvidia/spark/rapids/shims/ReplaceDataExecShim.scala
+++ b/sql-plugin-fileio/src/main/java/com/nvidia/spark/rapids/fileio/hadoop/HadoopInputFileFactory.java
@@ -13,24 +13,21 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-/*** spark-rapids-shim-json-lines
-{"spark": "400"}
-{"spark": "401"}
-{"spark": "402"}
-{"spark": "411"}
-spark-rapids-shim-json-lines ***/
-package com.nvidia.spark.rapids.shims
-import com.nvidia.spark.rapids.{GpuExec, GpuWrite}
+package com.nvidia.spark.rapids.fileio.hadoop;
-import org.apache.spark.sql.execution.SparkPlan
-import org.apache.spark.sql.execution.datasources.v2.{GpuReplaceDataExec, ReplaceDataExec}
+import com.nvidia.spark.rapids.jni.fileio.RapidsInputFile;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
-object ReplaceDataExecShim {
- def convertToGpu(
- cpuExec: ReplaceDataExec,
- childPlan: SparkPlan,
- gpuWrite: GpuWrite): GpuExec = {
- GpuReplaceDataExec(childPlan, cpuExec.refreshCache, cpuExec.projections, gpuWrite)
- }
+import java.io.IOException;
+import java.io.Serializable;
+
+/**
+ * Serializable extension point for callers that want to replace Hadoop input
+ * files with an optimized implementation for selected paths.
+ */
+@FunctionalInterface // single method: create(path, conf) returns the file to read
+public interface HadoopInputFileFactory extends Serializable {
+ RapidsInputFile create(Path path, Configuration conf) throws IOException;
}
diff --git a/sql-plugin/src/main/java/com/nvidia/spark/rapids/fileio/hadoop/HadoopInputStream.java b/sql-plugin-fileio/src/main/java/com/nvidia/spark/rapids/fileio/hadoop/HadoopInputStream.java
similarity index 97%
rename from sql-plugin/src/main/java/com/nvidia/spark/rapids/fileio/hadoop/HadoopInputStream.java
rename to sql-plugin-fileio/src/main/java/com/nvidia/spark/rapids/fileio/hadoop/HadoopInputStream.java
index 289e6dc0355..18b3c464da8 100644
--- a/sql-plugin/src/main/java/com/nvidia/spark/rapids/fileio/hadoop/HadoopInputStream.java
+++ b/sql-plugin-fileio/src/main/java/com/nvidia/spark/rapids/fileio/hadoop/HadoopInputStream.java
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2025, NVIDIA CORPORATION.
+ * Copyright (c) 2025-2026, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
diff --git a/sql-plugin/src/main/java/com/nvidia/spark/rapids/fileio/hadoop/HadoopOutputFile.java b/sql-plugin-fileio/src/main/java/com/nvidia/spark/rapids/fileio/hadoop/HadoopOutputFile.java
similarity index 97%
rename from sql-plugin/src/main/java/com/nvidia/spark/rapids/fileio/hadoop/HadoopOutputFile.java
rename to sql-plugin-fileio/src/main/java/com/nvidia/spark/rapids/fileio/hadoop/HadoopOutputFile.java
index 3ed1146eefa..f24f77d0fe5 100644
--- a/sql-plugin/src/main/java/com/nvidia/spark/rapids/fileio/hadoop/HadoopOutputFile.java
+++ b/sql-plugin-fileio/src/main/java/com/nvidia/spark/rapids/fileio/hadoop/HadoopOutputFile.java
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2025, NVIDIA CORPORATION.
+ * Copyright (c) 2025-2026, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
diff --git a/sql-plugin/src/main/java/com/nvidia/spark/rapids/fileio/hadoop/HadoopOutputStream.java b/sql-plugin-fileio/src/main/java/com/nvidia/spark/rapids/fileio/hadoop/HadoopOutputStream.java
similarity index 97%
rename from sql-plugin/src/main/java/com/nvidia/spark/rapids/fileio/hadoop/HadoopOutputStream.java
rename to sql-plugin-fileio/src/main/java/com/nvidia/spark/rapids/fileio/hadoop/HadoopOutputStream.java
index 301570fba1f..9b628bc826f 100644
--- a/sql-plugin/src/main/java/com/nvidia/spark/rapids/fileio/hadoop/HadoopOutputStream.java
+++ b/sql-plugin-fileio/src/main/java/com/nvidia/spark/rapids/fileio/hadoop/HadoopOutputStream.java
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2025, NVIDIA CORPORATION.
+ * Copyright (c) 2025-2026, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
diff --git a/sql-plugin-fileio/src/main/java/com/nvidia/spark/rapids/fileio/hadoop/PerfIOHadoopInputFileFactory.java b/sql-plugin-fileio/src/main/java/com/nvidia/spark/rapids/fileio/hadoop/PerfIOHadoopInputFileFactory.java
new file mode 100644
index 00000000000..2fcd6d896af
--- /dev/null
+++ b/sql-plugin-fileio/src/main/java/com/nvidia/spark/rapids/fileio/hadoop/PerfIOHadoopInputFileFactory.java
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2026, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.nvidia.spark.rapids.fileio.hadoop;
+
+import com.nvidia.spark.rapids.fileio.RapidsInputFiles;
+import com.nvidia.spark.rapids.jni.fileio.RapidsInputFile;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+
+import java.io.IOException;
+
+/** Hadoop input factory that routes S3 paths through the registered PerfIO bridge. */
+public final class PerfIOHadoopInputFileFactory implements HadoopInputFileFactory {
+ public static final PerfIOHadoopInputFileFactory INSTANCE = new PerfIOHadoopInputFileFactory();
+
+ private PerfIOHadoopInputFileFactory() {} // not constructible by callers; use INSTANCE
+
+ @Override // S3InputFile for "s3*" schemes when S3 perf is enabled, else HadoopInputFile
+ public RapidsInputFile create(Path path, Configuration conf) throws IOException {
+ String scheme = path.toUri().getScheme(); // null when the path has no scheme
+ if (scheme != null && scheme.startsWith("s3") && RapidsInputFiles.isS3PerfEnabled()) {
+ return S3InputFile.create(path, conf); // prefix test also matches e.g. s3a, s3n
+ }
+ return HadoopInputFile.create(path, conf); // every other path: plain Hadoop file
+ }
+
+ private Object readResolve() { // Java serialization hook
+ return INSTANCE; // deserialization yields the singleton instead of a new copy
+ }
+}
diff --git a/sql-plugin-fileio/src/main/java/com/nvidia/spark/rapids/fileio/hadoop/S3InputFile.java b/sql-plugin-fileio/src/main/java/com/nvidia/spark/rapids/fileio/hadoop/S3InputFile.java
new file mode 100644
index 00000000000..d9932bfda4e
--- /dev/null
+++ b/sql-plugin-fileio/src/main/java/com/nvidia/spark/rapids/fileio/hadoop/S3InputFile.java
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2026, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.nvidia.spark.rapids.fileio.hadoop;
+
+import ai.rapids.cudf.HostMemoryBuffer;
+import com.nvidia.spark.rapids.fileio.RapidsInputFiles;
+import com.nvidia.spark.rapids.jni.fileio.RapidsInputFile;
+import com.nvidia.spark.rapids.jni.fileio.SeekableInputStream;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+
+import java.io.IOException;
+import java.net.URI;
+import java.util.List;
+import java.util.OptionalLong;
+
+/**
+ * S3-backed {@link RapidsInputFile} for Hadoop-conf-driven (non-Iceberg) reads.
+ * {@code readVectored} issues batched byte-range GETs through the optimized
+ * vectored-read path and {@code readTail} issues one suffix-range GET; the
+ * other operations delegate to the standard {@link HadoopInputFile}.
+ */
+public class S3InputFile implements RapidsInputFile {
+ private final HadoopInputFile delegate; // serves path, length, mtime and open()
+ private final URI fileUri; // passed to the RapidsInputFiles S3 helpers
+ private final Configuration hadoopConf; // passed to the RapidsInputFiles S3 helpers
+
+ public static S3InputFile create(Path filePath, Configuration conf) throws IOException {
+ return new S3InputFile(HadoopInputFile.create(filePath, conf), filePath.toUri(), conf);
+ }
+
+ private S3InputFile(HadoopInputFile delegate, URI fileUri, Configuration hadoopConf) {
+ this.delegate = delegate;
+ this.fileUri = fileUri;
+ this.hadoopConf = hadoopConf;
+ }
+
+ @Override
+ public String path() {
+ return delegate.path();
+ }
+
+ @Override
+ public long getLength() throws IOException {
+ return delegate.getLength();
+ }
+
+ @Override
+ public OptionalLong getLastModificationTime() throws IOException {
+ return delegate.getLastModificationTime();
+ }
+
+ @Override
+ public SeekableInputStream open() throws IOException {
+ return delegate.open();
+ }
+
+ @Override // no fallback: a false result from the S3 helper is raised as an error
+ public void readVectored(HostMemoryBuffer output, List copyRanges)
+ throws IOException {
+ if (!RapidsInputFiles.readS3Vectored(hadoopConf, fileUri, output, copyRanges)) {
+ throw new IllegalArgumentException("expected to use PerfIO to read");
+ }
+ }
+
+ /**
+ * Issue a single suffix-range {@code GetObject} ({@code Range: bytes=-N}) for
+ * the last {@code length} bytes. Avoids the {@code getLength()} round-trip the
+ * default {@link RapidsInputFile#readTail} would make.
+ */
+ @Override
+ public void readTail(long length, HostMemoryBuffer output) throws IOException {
+ if (length == 0) {
+ return; // nothing requested: output is left untouched
+ }
+ if (length < 0) {
+ throw new IllegalArgumentException("length must be non-negative");
+ }
+ if (!RapidsInputFiles.readS3Tail(hadoopConf, fileUri, output, length, 0L)) {
+ throw new IllegalArgumentException("expected to use PerfIO to read"); // no fallback
+ }
+ }
+}
diff --git a/sql-plugin-format/pom.xml b/sql-plugin-format/pom.xml
new file mode 100644
index 00000000000..8535478f125
--- /dev/null
+++ b/sql-plugin-format/pom.xml
@@ -0,0 +1,111 @@
+
+
+
+ 4.0.0
+
+
+ com.nvidia
+ rapids-4-spark-shim-deps-parent_2.12
+ 26.08.0-SNAPSHOT
+ ../shim-deps/pom.xml
+
+
+ rapids-4-spark-sql-plugin-format_2.12
+ Java-only FlatBuffers format classes for the RAPIDS SQL plugin
+ 26.08.0-SNAPSHOT
+
+
+ sql-plugin-format
+ false
+ **/*
+ package
+ true
+
+
+
+
+ com.google.flatbuffers
+ flatbuffers-java
+
+
+
+
+
+
+ org.apache.maven.plugins
+ maven-compiler-plugin
+
+
+ default-compile
+ compile
+
+ compile
+
+
+
+ default-testCompile
+ test-compile
+
+ testCompile
+
+
+
+
+ ${java.major.version}
+
+ -Xlint:all,-serial,-path,-try,-processing
+
+
+
+
+ net.alchim31.maven
+ scala-maven-plugin
+
+
+ eclipse-add-source
+ none
+
+
+ scala-compile-first
+ none
+
+
+ scala-test-compile-first
+ none
+
+
+ attach-scaladocs
+ none
+
+
+
+
+ org.apache.rat
+ apache-rat-plugin
+
+
+ org.apache.maven.plugins
+ maven-surefire-plugin
+
+ true
+
+
+
+
+
diff --git a/sql-plugin/src/main/java/com/nvidia/spark/rapids/format/BlockIdMeta.java b/sql-plugin-format/src/main/java/com/nvidia/spark/rapids/format/BlockIdMeta.java
similarity index 83%
rename from sql-plugin/src/main/java/com/nvidia/spark/rapids/format/BlockIdMeta.java
rename to sql-plugin-format/src/main/java/com/nvidia/spark/rapids/format/BlockIdMeta.java
index ed28904a876..b43473f0b14 100644
--- a/sql-plugin/src/main/java/com/nvidia/spark/rapids/format/BlockIdMeta.java
+++ b/sql-plugin-format/src/main/java/com/nvidia/spark/rapids/format/BlockIdMeta.java
@@ -1,3 +1,19 @@
+/*
+ * Copyright (c) 2026, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
// automatically generated by the FlatBuffers compiler, do not modify
package com.nvidia.spark.rapids.format;
@@ -49,4 +65,3 @@ public static int endBlockIdMeta(FlatBufferBuilder builder) {
return o;
}
}
-
diff --git a/sql-plugin/src/main/java/com/nvidia/spark/rapids/format/BufferMeta.java b/sql-plugin-format/src/main/java/com/nvidia/spark/rapids/format/BufferMeta.java
similarity index 85%
rename from sql-plugin/src/main/java/com/nvidia/spark/rapids/format/BufferMeta.java
rename to sql-plugin-format/src/main/java/com/nvidia/spark/rapids/format/BufferMeta.java
index ea8ea94ce93..0e040e9da42 100644
--- a/sql-plugin/src/main/java/com/nvidia/spark/rapids/format/BufferMeta.java
+++ b/sql-plugin-format/src/main/java/com/nvidia/spark/rapids/format/BufferMeta.java
@@ -1,3 +1,19 @@
+/*
+ * Copyright (c) 2026, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
// automatically generated by the FlatBuffers compiler, do not modify
package com.nvidia.spark.rapids.format;
@@ -61,4 +77,3 @@ public static int endBufferMeta(FlatBufferBuilder builder) {
return o;
}
}
-
diff --git a/sql-plugin/src/main/java/com/nvidia/spark/rapids/format/BufferTransferRequest.java b/sql-plugin-format/src/main/java/com/nvidia/spark/rapids/format/BufferTransferRequest.java
similarity index 74%
rename from sql-plugin/src/main/java/com/nvidia/spark/rapids/format/BufferTransferRequest.java
rename to sql-plugin-format/src/main/java/com/nvidia/spark/rapids/format/BufferTransferRequest.java
index 38d8e0995dd..71f4dfff4ce 100644
--- a/sql-plugin/src/main/java/com/nvidia/spark/rapids/format/BufferTransferRequest.java
+++ b/sql-plugin-format/src/main/java/com/nvidia/spark/rapids/format/BufferTransferRequest.java
@@ -1,3 +1,19 @@
+/*
+ * Copyright (c) 2026, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
// automatically generated by the FlatBuffers compiler, do not modify
package com.nvidia.spark.rapids.format;
@@ -34,4 +50,3 @@ public static int endBufferTransferRequest(FlatBufferBuilder builder) {
return o;
}
}
-
diff --git a/sql-plugin/src/main/java/com/nvidia/spark/rapids/format/BufferTransferResponse.java b/sql-plugin-format/src/main/java/com/nvidia/spark/rapids/format/BufferTransferResponse.java
similarity index 82%
rename from sql-plugin/src/main/java/com/nvidia/spark/rapids/format/BufferTransferResponse.java
rename to sql-plugin-format/src/main/java/com/nvidia/spark/rapids/format/BufferTransferResponse.java
index a3cbda453d4..8a074fcc127 100644
--- a/sql-plugin/src/main/java/com/nvidia/spark/rapids/format/BufferTransferResponse.java
+++ b/sql-plugin-format/src/main/java/com/nvidia/spark/rapids/format/BufferTransferResponse.java
@@ -1,3 +1,19 @@
+/*
+ * Copyright (c) 2026, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
// automatically generated by the FlatBuffers compiler, do not modify
package com.nvidia.spark.rapids.format;
@@ -50,4 +66,3 @@ public static int endBufferTransferResponse(FlatBufferBuilder builder) {
return o;
}
}
-
diff --git a/sql-plugin/src/main/java/com/nvidia/spark/rapids/format/CodecBufferDescriptor.java b/sql-plugin-format/src/main/java/com/nvidia/spark/rapids/format/CodecBufferDescriptor.java
similarity index 87%
rename from sql-plugin/src/main/java/com/nvidia/spark/rapids/format/CodecBufferDescriptor.java
rename to sql-plugin-format/src/main/java/com/nvidia/spark/rapids/format/CodecBufferDescriptor.java
index 681cc7ab31d..3bb1a2bd4ad 100644
--- a/sql-plugin/src/main/java/com/nvidia/spark/rapids/format/CodecBufferDescriptor.java
+++ b/sql-plugin-format/src/main/java/com/nvidia/spark/rapids/format/CodecBufferDescriptor.java
@@ -1,3 +1,19 @@
+/*
+ * Copyright (c) 2026, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
// automatically generated by the FlatBuffers compiler, do not modify
package com.nvidia.spark.rapids.format;
@@ -71,4 +87,3 @@ public static int endCodecBufferDescriptor(FlatBufferBuilder builder) {
return o;
}
}
-
diff --git a/sql-plugin/src/main/java/com/nvidia/spark/rapids/format/CodecType.java b/sql-plugin-format/src/main/java/com/nvidia/spark/rapids/format/CodecType.java
similarity index 55%
rename from sql-plugin/src/main/java/com/nvidia/spark/rapids/format/CodecType.java
rename to sql-plugin-format/src/main/java/com/nvidia/spark/rapids/format/CodecType.java
index 1451e1221a4..762773e618d 100644
--- a/sql-plugin/src/main/java/com/nvidia/spark/rapids/format/CodecType.java
+++ b/sql-plugin-format/src/main/java/com/nvidia/spark/rapids/format/CodecType.java
@@ -1,3 +1,19 @@
+/*
+ * Copyright (c) 2026, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
// automatically generated by the FlatBuffers compiler, do not modify
package com.nvidia.spark.rapids.format;
@@ -25,4 +41,3 @@ private CodecType() { }
public static String name(int e) { return names[e - COPY]; }
}
-
diff --git a/sql-plugin/src/main/java/com/nvidia/spark/rapids/format/MetadataRequest.java b/sql-plugin-format/src/main/java/com/nvidia/spark/rapids/format/MetadataRequest.java
similarity index 80%
rename from sql-plugin/src/main/java/com/nvidia/spark/rapids/format/MetadataRequest.java
rename to sql-plugin-format/src/main/java/com/nvidia/spark/rapids/format/MetadataRequest.java
index 301d1a9a682..84f2b9f4cd1 100644
--- a/sql-plugin/src/main/java/com/nvidia/spark/rapids/format/MetadataRequest.java
+++ b/sql-plugin-format/src/main/java/com/nvidia/spark/rapids/format/MetadataRequest.java
@@ -1,3 +1,19 @@
+/*
+ * Copyright (c) 2026, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
// automatically generated by the FlatBuffers compiler, do not modify
package com.nvidia.spark.rapids.format;
@@ -42,4 +58,3 @@ public static int endMetadataRequest(FlatBufferBuilder builder) {
public static void finishMetadataRequestBuffer(FlatBufferBuilder builder, int offset) { builder.finish(offset); }
public static void finishSizePrefixedMetadataRequestBuffer(FlatBufferBuilder builder, int offset) { builder.finishSizePrefixed(offset); }
}
-
diff --git a/sql-plugin/src/main/java/com/nvidia/spark/rapids/format/MetadataResponse.java b/sql-plugin-format/src/main/java/com/nvidia/spark/rapids/format/MetadataResponse.java
similarity index 80%
rename from sql-plugin/src/main/java/com/nvidia/spark/rapids/format/MetadataResponse.java
rename to sql-plugin-format/src/main/java/com/nvidia/spark/rapids/format/MetadataResponse.java
index 63d2cdd311b..bb9fdd0a0a3 100644
--- a/sql-plugin/src/main/java/com/nvidia/spark/rapids/format/MetadataResponse.java
+++ b/sql-plugin-format/src/main/java/com/nvidia/spark/rapids/format/MetadataResponse.java
@@ -1,3 +1,19 @@
+/*
+ * Copyright (c) 2026, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
// automatically generated by the FlatBuffers compiler, do not modify
package com.nvidia.spark.rapids.format;
@@ -42,4 +58,3 @@ public static int endMetadataResponse(FlatBufferBuilder builder) {
public static void finishMetadataResponseBuffer(FlatBufferBuilder builder, int offset) { builder.finish(offset); }
public static void finishSizePrefixedMetadataResponseBuffer(FlatBufferBuilder builder, int offset) { builder.finishSizePrefixed(offset); }
}
-
diff --git a/sql-plugin/src/main/java/com/nvidia/spark/rapids/format/TableMeta.java b/sql-plugin-format/src/main/java/com/nvidia/spark/rapids/format/TableMeta.java
similarity index 84%
rename from sql-plugin/src/main/java/com/nvidia/spark/rapids/format/TableMeta.java
rename to sql-plugin-format/src/main/java/com/nvidia/spark/rapids/format/TableMeta.java
index 03b37241215..f8ce12ec918 100644
--- a/sql-plugin/src/main/java/com/nvidia/spark/rapids/format/TableMeta.java
+++ b/sql-plugin-format/src/main/java/com/nvidia/spark/rapids/format/TableMeta.java
@@ -1,3 +1,19 @@
+/*
+ * Copyright (c) 2026, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
// automatically generated by the FlatBuffers compiler, do not modify
package com.nvidia.spark.rapids.format;
@@ -58,4 +74,3 @@ public static int endTableMeta(FlatBufferBuilder builder) {
return o;
}
}
-
diff --git a/sql-plugin/src/main/java/com/nvidia/spark/rapids/format/TransferRequest.java b/sql-plugin-format/src/main/java/com/nvidia/spark/rapids/format/TransferRequest.java
similarity index 83%
rename from sql-plugin/src/main/java/com/nvidia/spark/rapids/format/TransferRequest.java
rename to sql-plugin-format/src/main/java/com/nvidia/spark/rapids/format/TransferRequest.java
index 4cb5a2f2547..58f8f838906 100644
--- a/sql-plugin/src/main/java/com/nvidia/spark/rapids/format/TransferRequest.java
+++ b/sql-plugin-format/src/main/java/com/nvidia/spark/rapids/format/TransferRequest.java
@@ -1,3 +1,19 @@
+/*
+ * Copyright (c) 2026, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
// automatically generated by the FlatBuffers compiler, do not modify
package com.nvidia.spark.rapids.format;
@@ -50,4 +66,3 @@ public static int endTransferRequest(FlatBufferBuilder builder) {
public static void finishTransferRequestBuffer(FlatBufferBuilder builder, int offset) { builder.finish(offset); }
public static void finishSizePrefixedTransferRequestBuffer(FlatBufferBuilder builder, int offset) { builder.finishSizePrefixed(offset); }
}
-
diff --git a/sql-plugin/src/main/java/com/nvidia/spark/rapids/format/TransferResponse.java b/sql-plugin-format/src/main/java/com/nvidia/spark/rapids/format/TransferResponse.java
similarity index 81%
rename from sql-plugin/src/main/java/com/nvidia/spark/rapids/format/TransferResponse.java
rename to sql-plugin-format/src/main/java/com/nvidia/spark/rapids/format/TransferResponse.java
index b1a03c05ba3..208cceb04ff 100644
--- a/sql-plugin/src/main/java/com/nvidia/spark/rapids/format/TransferResponse.java
+++ b/sql-plugin-format/src/main/java/com/nvidia/spark/rapids/format/TransferResponse.java
@@ -1,3 +1,19 @@
+/*
+ * Copyright (c) 2026, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
// automatically generated by the FlatBuffers compiler, do not modify
package com.nvidia.spark.rapids.format;
@@ -42,4 +58,3 @@ public static int endTransferResponse(FlatBufferBuilder builder) {
public static void finishTransferResponseBuffer(FlatBufferBuilder builder, int offset) { builder.finish(offset); }
public static void finishSizePrefixedTransferResponseBuffer(FlatBufferBuilder builder, int offset) { builder.finishSizePrefixed(offset); }
}
-
diff --git a/sql-plugin-format/src/main/java/com/nvidia/spark/rapids/format/TransferState.java b/sql-plugin-format/src/main/java/com/nvidia/spark/rapids/format/TransferState.java
new file mode 100644
index 00000000000..41dad2caa6e
--- /dev/null
+++ b/sql-plugin-format/src/main/java/com/nvidia/spark/rapids/format/TransferState.java
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2026, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// automatically generated by the FlatBuffers compiler, do not modify
+
+package com.nvidia.spark.rapids.format;
+
+public final class TransferState {
+ private TransferState() { }
+ /**
+ * UCX transfer initiated on sender-side
+ */
+ public static final byte STARTED = 0;
+ /**
+ * Data has been compressed and requires meta update
+ */
+ public static final byte BUFFER_META_UPDATED = 1;
+
+ public static final String[] names = { "STARTED", "BUFFER_META_UPDATED", };
+
+ public static String name(int e) { return names[e]; }
+}
diff --git a/sql-plugin-shims/pom.xml b/sql-plugin-shims/pom.xml
new file mode 100644
index 00000000000..b1f200b2356
--- /dev/null
+++ b/sql-plugin-shims/pom.xml
@@ -0,0 +1,68 @@
+
+
+
+ 4.0.0
+
+
+ com.nvidia
+ rapids-4-spark-shim-deps-parent_2.12
+ 26.08.0-SNAPSHOT
+ ../shim-deps/pom.xml
+
+ rapids-4-spark-sql-shims_2.12
+ RAPIDS Accelerator for Apache Spark SQL Plugin Shims
+ Compile-time isolated SQL plugin shims
+ 26.08.0-SNAPSHOT
+
+
+ sql-plugin-shims
+ false
+ **/*
+ package
+
+
+
+
+ com.nvidia
+ rapids-4-spark-sql-plugin-api_${scala.binary.version}
+ ${project.version}
+ ${spark.version.classifier}
+
+
+ org.scala-lang
+ scala-library
+
+
+
+
+
+
+ net.alchim31.maven
+ scala-maven-plugin
+
+
+ maven-antrun-plugin
+
+
+ org.apache.rat
+ apache-rat-plugin
+
+
+
+
diff --git a/sql-plugin-shims/src/main/scala/org/apache/spark/sql/errors/ConvUtils.scala b/sql-plugin-shims/src/main/scala/org/apache/spark/sql/errors/ConvUtils.scala
new file mode 100644
index 00000000000..abd23b7f158
--- /dev/null
+++ b/sql-plugin-shims/src/main/scala/org/apache/spark/sql/errors/ConvUtils.scala
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2025-2026, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.spark.sql.errors
+
+import java.lang.reflect.InvocationTargetException
+
+object ConvUtils { // reflective bridge to QueryExecutionErrors.overflowInConvError
+ private val queryExecutionErrorsCompanion =
+ "org.apache.spark.sql.errors.QueryExecutionErrors$" // class name of the Scala object
+
+ def overflowInConvError(): Unit = { // never returns normally: every path throws
+ try { // find the error builder reflectively, then throw the Throwable it returns
+ val companion = Class.forName(queryExecutionErrorsCompanion).getField("MODULE$").get(null)
+ val method = companion.getClass.getMethods.find { method =>
+ method.getName == "overflowInConvError" && method.getParameterCount == 1
+ }.getOrElse { // no one-argument overload was found
+ throw new UnsupportedOperationException()
+ }
+ throw method.invoke(companion, null.asInstanceOf[AnyRef]).asInstanceOf[Throwable]
+ } catch {
+ case _: ClassNotFoundException | _: NoSuchFieldException => // object not reachable
+ throw new UnsupportedOperationException()
+ case e: InvocationTargetException => // unwrap so callers see the callee's error
+ throw e.getCause
+ }
+ }
+}
diff --git a/sql-plugin/src/main/spark400/scala/org/apache/spark/sql/rapids/shims/SparkSessionUtils.scala b/sql-plugin-shims/src/main/scala/org/apache/spark/sql/rapids/shims/SparkSessionUtils.scala
similarity index 65%
rename from sql-plugin/src/main/spark400/scala/org/apache/spark/sql/rapids/shims/SparkSessionUtils.scala
rename to sql-plugin-shims/src/main/scala/org/apache/spark/sql/rapids/shims/SparkSessionUtils.scala
index 6de8f1d6165..a68f2d6bafd 100644
--- a/sql-plugin/src/main/spark400/scala/org/apache/spark/sql/rapids/shims/SparkSessionUtils.scala
+++ b/sql-plugin-shims/src/main/scala/org/apache/spark/sql/rapids/shims/SparkSessionUtils.scala
@@ -14,24 +14,29 @@
* limitations under the License.
*/
-/*** spark-rapids-shim-json-lines
-{"spark": "400"}
-{"spark": "400db173"}
-{"spark": "401"}
-{"spark": "402"}
-{"spark": "411"}
-spark-rapids-shim-json-lines ***/
package org.apache.spark.sql.rapids.shims
-import org.apache.spark.sql.classic.SparkSession
+import java.lang.reflect.InvocationTargetException
+
+import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.execution.SparkPlan
+/**
+ * Accessors that reach `session` and `leafNodeDefaultParallelism` through Java
+ * reflection, looking each method up by name on the runtime class of the target.
+ */
object SparkSessionUtils {
+
+ /** Invokes the no-arg `session` method on `plan` and casts the result to SparkSession. */
def sessionFromPlan(plan: SparkPlan): SparkSession = {
- plan.session
+ invokeNoArg(plan, "session").asInstanceOf[SparkSession]
}
+ /** Invokes the no-arg `leafNodeDefaultParallelism` method on `ss` and returns it as Int. */
def leafNodeDefaultParallelism(ss: SparkSession): Int = {
- ss.leafNodeDefaultParallelism
+ invokeNoArg(ss, "leafNodeDefaultParallelism").asInstanceOf[Int]
+ }
+
+ /**
+ * Resolves the public no-arg method `methodName` on the runtime class of `target`
+ * and invokes it, returning the (boxed) result.
+ *
+ * An InvocationTargetException is unwrapped so the caller sees the exception raised
+ * by the invoked method. Other reflection failures, such as the
+ * NoSuchMethodException from `getMethod`, are not caught here.
+ */
+ private def invokeNoArg(target: AnyRef, methodName: String): AnyRef = {
+ try {
+ target.getClass.getMethod(methodName).invoke(target)
+ } catch {
+ case e: InvocationTargetException =>
+ // Rethrow the underlying exception rather than the reflection wrapper.
+ throw e.getCause
+ }
}
}
diff --git a/sql-plugin/src/main/spark330/scala/com/nvidia/spark/rapids/shims/TryModeShim.scala b/sql-plugin-shims/src/main/spark330/scala/com/nvidia/spark/rapids/shims/CreateDataSourceTableAsSelectRules.scala
similarity index 62%
rename from sql-plugin/src/main/spark330/scala/com/nvidia/spark/rapids/shims/TryModeShim.scala
rename to sql-plugin-shims/src/main/spark330/scala/com/nvidia/spark/rapids/shims/CreateDataSourceTableAsSelectRules.scala
index 5e2a2eac009..6e7f9d737ed 100644
--- a/sql-plugin/src/main/spark330/scala/com/nvidia/spark/rapids/shims/TryModeShim.scala
+++ b/sql-plugin-shims/src/main/spark330/scala/com/nvidia/spark/rapids/shims/CreateDataSourceTableAsSelectRules.scala
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2025-2026, NVIDIA CORPORATION.
+ * Copyright (c) 2026, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -16,6 +16,7 @@
/*** spark-rapids-shim-json-lines
{"spark": "330"}
+{"spark": "330db"}
{"spark": "331"}
{"spark": "332"}
{"spark": "333"}
@@ -23,15 +24,12 @@
spark-rapids-shim-json-lines ***/
package com.nvidia.spark.rapids.shims
-import org.apache.spark.sql.catalyst.expressions.Expression
+import com.nvidia.spark.rapids.ShimDataWritingCommandRule
-object TryModeShim {
- /**
- * Expression is wrapped under TryEval during query planning which is not supported on GPU.
- * Example: for try_add(col1, col2) it would be tryeval((col1#0 + col2#1))
- * So the return value from this function does not matter.
- */
- def isTryMode(expr: Expression): Boolean = {
- false
- }
+import org.apache.spark.sql.execution.command.CreateDataSourceTableAsSelectCommand
+
+/**
+ * Holds the ShimDataWritingCommandRule instance built for
+ * CreateDataSourceTableAsSelectCommand in this shim source tree.
+ */
+object CreateDataSourceTableAsSelectRules {
+ // NOTE(review): the string argument reads like a human-readable description of the
+ // rule; confirm against the ShimDataWritingCommandRule definition.
+ val dataWriteCmd: ShimDataWritingCommandRule[CreateDataSourceTableAsSelectCommand] =
+ ShimDataWritingCommandRule[CreateDataSourceTableAsSelectCommand](
+ "Create table with select command")
}
diff --git a/sql-plugin/src/main/spark330/scala/com/nvidia/spark/rapids/shims/SequenceSizeTooLongErrorBuilder.scala b/sql-plugin-shims/src/main/spark330/scala/com/nvidia/spark/rapids/shims/SequenceSizeTooLongErrorBuilder.scala
similarity index 100%
rename from sql-plugin/src/main/spark330/scala/com/nvidia/spark/rapids/shims/SequenceSizeTooLongErrorBuilder.scala
rename to sql-plugin-shims/src/main/spark330/scala/com/nvidia/spark/rapids/shims/SequenceSizeTooLongErrorBuilder.scala
diff --git a/sql-plugin/src/main/spark330/scala/com/nvidia/spark/rapids/shims/spark330/SparkShimServiceProvider.scala b/sql-plugin-shims/src/main/spark330/scala/com/nvidia/spark/rapids/shims/spark330/SparkShimServiceProvider.scala
similarity index 96%
rename from sql-plugin/src/main/spark330/scala/com/nvidia/spark/rapids/shims/spark330/SparkShimServiceProvider.scala
rename to sql-plugin-shims/src/main/spark330/scala/com/nvidia/spark/rapids/shims/spark330/SparkShimServiceProvider.scala
index e50335d71a8..4e209fe7450 100644
--- a/sql-plugin/src/main/spark330/scala/com/nvidia/spark/rapids/shims/spark330/SparkShimServiceProvider.scala
+++ b/sql-plugin-shims/src/main/spark330/scala/com/nvidia/spark/rapids/shims/spark330/SparkShimServiceProvider.scala
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2026, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
diff --git a/sql-plugin/src/main/spark330/scala/org/apache/spark/sql/rapids/ShuffleManagerShims.scala b/sql-plugin-shims/src/main/spark330/scala/org/apache/spark/sql/rapids/ShuffleManagerShims.scala
similarity index 100%
rename from sql-plugin/src/main/spark330/scala/org/apache/spark/sql/rapids/ShuffleManagerShims.scala
rename to sql-plugin-shims/src/main/spark330/scala/org/apache/spark/sql/rapids/ShuffleManagerShims.scala
diff --git a/sql-plugin/src/main/spark330/scala/org/apache/spark/sql/rapids/shims/FileCommitProtocolShims.scala b/sql-plugin-shims/src/main/spark330/scala/org/apache/spark/sql/rapids/shims/FileCommitProtocolShims.scala
similarity index 100%
rename from sql-plugin/src/main/spark330/scala/org/apache/spark/sql/rapids/shims/FileCommitProtocolShims.scala
rename to sql-plugin-shims/src/main/spark330/scala/org/apache/spark/sql/rapids/shims/FileCommitProtocolShims.scala
diff --git a/sql-plugin/src/main/spark330/scala/org/apache/spark/sql/rapids/shims/OriginContextShim.scala b/sql-plugin-shims/src/main/spark330/scala/org/apache/spark/sql/rapids/shims/OriginContextShim.scala
similarity index 100%
rename from sql-plugin/src/main/spark330/scala/org/apache/spark/sql/rapids/shims/OriginContextShim.scala
rename to sql-plugin-shims/src/main/spark330/scala/org/apache/spark/sql/rapids/shims/OriginContextShim.scala
diff --git a/sql-plugin/src/main/spark330/scala/org/apache/spark/sql/rapids/shims/SparkUpgradeExceptionShims.scala b/sql-plugin-shims/src/main/spark330/scala/org/apache/spark/sql/rapids/shims/SparkUpgradeExceptionShims.scala
similarity index 96%
rename from sql-plugin/src/main/spark330/scala/org/apache/spark/sql/rapids/shims/SparkUpgradeExceptionShims.scala
rename to sql-plugin-shims/src/main/spark330/scala/org/apache/spark/sql/rapids/shims/SparkUpgradeExceptionShims.scala
index 960384cd4e9..b0cab9169e4 100644
--- a/sql-plugin/src/main/spark330/scala/org/apache/spark/sql/rapids/shims/SparkUpgradeExceptionShims.scala
+++ b/sql-plugin-shims/src/main/spark330/scala/org/apache/spark/sql/rapids/shims/SparkUpgradeExceptionShims.scala
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2026, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
diff --git a/sql-plugin/src/main/spark330/scala/org/apache/spark/sql/rapids/shims/TrampolineConnectShims.scala b/sql-plugin-shims/src/main/spark330/scala/org/apache/spark/sql/rapids/shims/TrampolineConnectShims.scala
similarity index 100%
rename from sql-plugin/src/main/spark330/scala/org/apache/spark/sql/rapids/shims/TrampolineConnectShims.scala
rename to sql-plugin-shims/src/main/spark330/scala/org/apache/spark/sql/rapids/shims/TrampolineConnectShims.scala
diff --git a/sql-plugin/src/main/spark330/scala/org/apache/spark/storage/ShuffleClientShims.scala b/sql-plugin-shims/src/main/spark330/scala/org/apache/spark/storage/ShuffleClientShims.scala
similarity index 100%
rename from sql-plugin/src/main/spark330/scala/org/apache/spark/storage/ShuffleClientShims.scala
rename to sql-plugin-shims/src/main/spark330/scala/org/apache/spark/storage/ShuffleClientShims.scala
diff --git a/sql-plugin/src/main/spark330db/scala/com/nvidia/spark/rapids/DatabricksShimServiceProvider.scala b/sql-plugin-shims/src/main/spark330db/scala/com/nvidia/spark/rapids/DatabricksShimServiceProvider.scala
similarity index 100%
rename from sql-plugin/src/main/spark330db/scala/com/nvidia/spark/rapids/DatabricksShimServiceProvider.scala
rename to sql-plugin-shims/src/main/spark330db/scala/com/nvidia/spark/rapids/DatabricksShimServiceProvider.scala
diff --git a/sql-plugin/src/main/spark330db/scala/com/nvidia/spark/rapids/shims/spark330db/SparkShimServiceProvider.scala b/sql-plugin-shims/src/main/spark330db/scala/com/nvidia/spark/rapids/shims/spark330db/SparkShimServiceProvider.scala
similarity index 96%
rename from sql-plugin/src/main/spark330db/scala/com/nvidia/spark/rapids/shims/spark330db/SparkShimServiceProvider.scala
rename to sql-plugin-shims/src/main/spark330db/scala/com/nvidia/spark/rapids/shims/spark330db/SparkShimServiceProvider.scala
index 0db1385cda5..149c45b7dab 100644
--- a/sql-plugin/src/main/spark330db/scala/com/nvidia/spark/rapids/shims/spark330db/SparkShimServiceProvider.scala
+++ b/sql-plugin-shims/src/main/spark330db/scala/com/nvidia/spark/rapids/shims/spark330db/SparkShimServiceProvider.scala
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022-2025, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2026, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
diff --git a/sql-plugin/src/main/spark330db/scala/org/apache/spark/sql/rapids/shims/OriginContextShim.scala b/sql-plugin-shims/src/main/spark330db/scala/org/apache/spark/sql/rapids/shims/OriginContextShim.scala
similarity index 100%
rename from sql-plugin/src/main/spark330db/scala/org/apache/spark/sql/rapids/shims/OriginContextShim.scala
rename to sql-plugin-shims/src/main/spark330db/scala/org/apache/spark/sql/rapids/shims/OriginContextShim.scala
diff --git a/sql-plugin/src/main/spark330db/scala/org/apache/spark/sql/rapids/shims/SparkDateTimeExceptionShims.scala b/sql-plugin-shims/src/main/spark330db/scala/org/apache/spark/sql/rapids/shims/SparkDateTimeExceptionShims.scala
similarity index 95%
rename from sql-plugin/src/main/spark330db/scala/org/apache/spark/sql/rapids/shims/SparkDateTimeExceptionShims.scala
rename to sql-plugin-shims/src/main/spark330db/scala/org/apache/spark/sql/rapids/shims/SparkDateTimeExceptionShims.scala
index 4f59c12c985..a33ac76fdc5 100644
--- a/sql-plugin/src/main/spark330db/scala/org/apache/spark/sql/rapids/shims/SparkDateTimeExceptionShims.scala
+++ b/sql-plugin-shims/src/main/spark330db/scala/org/apache/spark/sql/rapids/shims/SparkDateTimeExceptionShims.scala
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2026, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
diff --git a/sql-plugin/src/main/spark330db/scala/org/apache/spark/sql/rapids/shims/SparkUpgradeExceptionShims.scala b/sql-plugin-shims/src/main/spark330db/scala/org/apache/spark/sql/rapids/shims/SparkUpgradeExceptionShims.scala
similarity index 96%
rename from sql-plugin/src/main/spark330db/scala/org/apache/spark/sql/rapids/shims/SparkUpgradeExceptionShims.scala
rename to sql-plugin-shims/src/main/spark330db/scala/org/apache/spark/sql/rapids/shims/SparkUpgradeExceptionShims.scala
index 2ae55c73057..feedbd11cf9 100644
--- a/sql-plugin/src/main/spark330db/scala/org/apache/spark/sql/rapids/shims/SparkUpgradeExceptionShims.scala
+++ b/sql-plugin-shims/src/main/spark330db/scala/org/apache/spark/sql/rapids/shims/SparkUpgradeExceptionShims.scala
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2026, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
diff --git a/sql-plugin/src/main/spark331/scala/com/nvidia/spark/rapids/shims/spark331/SparkShimServiceProvider.scala b/sql-plugin-shims/src/main/spark331/scala/com/nvidia/spark/rapids/shims/spark331/SparkShimServiceProvider.scala
similarity index 96%
rename from sql-plugin/src/main/spark331/scala/com/nvidia/spark/rapids/shims/spark331/SparkShimServiceProvider.scala
rename to sql-plugin-shims/src/main/spark331/scala/com/nvidia/spark/rapids/shims/spark331/SparkShimServiceProvider.scala
index db631bdfb63..0483b4ffc1d 100644
--- a/sql-plugin/src/main/spark331/scala/com/nvidia/spark/rapids/shims/spark331/SparkShimServiceProvider.scala
+++ b/sql-plugin-shims/src/main/spark331/scala/com/nvidia/spark/rapids/shims/spark331/SparkShimServiceProvider.scala
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2026, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
diff --git a/sql-plugin/src/main/spark332/scala/com/nvidia/spark/rapids/shims/spark332/SparkShimServiceProvider.scala b/sql-plugin-shims/src/main/spark332/scala/com/nvidia/spark/rapids/shims/spark332/SparkShimServiceProvider.scala
similarity index 96%
rename from sql-plugin/src/main/spark332/scala/com/nvidia/spark/rapids/shims/spark332/SparkShimServiceProvider.scala
rename to sql-plugin-shims/src/main/spark332/scala/com/nvidia/spark/rapids/shims/spark332/SparkShimServiceProvider.scala
index 06be70cb21b..57403b1848a 100644
--- a/sql-plugin/src/main/spark332/scala/com/nvidia/spark/rapids/shims/spark332/SparkShimServiceProvider.scala
+++ b/sql-plugin-shims/src/main/spark332/scala/com/nvidia/spark/rapids/shims/spark332/SparkShimServiceProvider.scala
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2026, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
diff --git a/sql-plugin/src/main/spark340/scala/org/apache/spark/sql/errors/ConvUtils.scala b/sql-plugin-shims/src/main/spark332db/scala/com/nvidia/spark/rapids/shims/CreateDataSourceTableAsSelectRules.scala
similarity index 68%
rename from sql-plugin/src/main/spark340/scala/org/apache/spark/sql/errors/ConvUtils.scala
rename to sql-plugin-shims/src/main/spark332db/scala/com/nvidia/spark/rapids/shims/CreateDataSourceTableAsSelectRules.scala
index d9a669771a4..704c5188af0 100644
--- a/sql-plugin/src/main/spark340/scala/org/apache/spark/sql/errors/ConvUtils.scala
+++ b/sql-plugin-shims/src/main/spark332db/scala/com/nvidia/spark/rapids/shims/CreateDataSourceTableAsSelectRules.scala
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2025-2026, NVIDIA CORPORATION.
+ * Copyright (c) 2026, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -15,6 +15,7 @@
*/
/*** spark-rapids-shim-json-lines
+{"spark": "332db"}
{"spark": "340"}
{"spark": "341"}
{"spark": "341db"}
@@ -37,8 +38,14 @@
{"spark": "402"}
{"spark": "411"}
spark-rapids-shim-json-lines ***/
-package org.apache.spark.sql.errors
+package com.nvidia.spark.rapids.shims
-object ConvUtils {
- def overflowInConvError(): Unit = throw QueryExecutionErrors.overflowInConvError(null)
+import com.nvidia.spark.rapids.ShimRunnableCommandRule
+
+import org.apache.spark.sql.execution.command.CreateDataSourceTableAsSelectCommand
+
+/**
+ * Holds the ShimRunnableCommandRule instance built for
+ * CreateDataSourceTableAsSelectCommand in this shim source tree.
+ */
+object CreateDataSourceTableAsSelectRules {
+ // NOTE(review): the string argument reads like a human-readable description of the
+ // rule; confirm against the ShimRunnableCommandRule definition.
+ val runnableCmd: ShimRunnableCommandRule[CreateDataSourceTableAsSelectCommand] =
+ ShimRunnableCommandRule[CreateDataSourceTableAsSelectCommand](
+ "Write to a data source")
}
diff --git a/sql-plugin/src/main/spark330/scala/com/nvidia/spark/rapids/shims/BloomFilterConstantsShims.scala b/sql-plugin-shims/src/main/spark332db/scala/com/nvidia/spark/rapids/shims/WriteFilesExecShims.scala
similarity index 81%
rename from sql-plugin/src/main/spark330/scala/com/nvidia/spark/rapids/shims/BloomFilterConstantsShims.scala
rename to sql-plugin-shims/src/main/spark332db/scala/com/nvidia/spark/rapids/shims/WriteFilesExecShims.scala
index 21cf24ddfc2..f320286ff58 100644
--- a/sql-plugin/src/main/spark330/scala/com/nvidia/spark/rapids/shims/BloomFilterConstantsShims.scala
+++ b/sql-plugin-shims/src/main/spark332db/scala/com/nvidia/spark/rapids/shims/WriteFilesExecShims.scala
@@ -15,13 +15,7 @@
*/
/*** spark-rapids-shim-json-lines
-{"spark": "330"}
-{"spark": "330db"}
-{"spark": "331"}
-{"spark": "332"}
{"spark": "332db"}
-{"spark": "333"}
-{"spark": "334"}
{"spark": "340"}
{"spark": "341"}
{"spark": "341db"}
@@ -42,9 +36,14 @@
{"spark": "400db173"}
{"spark": "401"}
{"spark": "402"}
+{"spark": "411"}
spark-rapids-shim-json-lines ***/
package com.nvidia.spark.rapids.shims
-object BloomFilterConstantsShims {
- val BLOOM_FILTER_FORMAT_VERSION: Int = 1
+import com.nvidia.spark.rapids.ShimExecRule
+
+import org.apache.spark.sql.execution.datasources.WriteFilesExec
+
+/** Holds the ShimExecRule instance built for WriteFilesExec in this shim source tree. */
+object WriteFilesExecShims {
+ // NOTE(review): the string argument reads like a human-readable description of the
+ // rule; confirm against the ShimExecRule definition.
+ val exec: ShimExecRule[WriteFilesExec] = ShimExecRule[WriteFilesExec]("v1 write files")
}
diff --git a/sql-plugin/src/main/spark332db/scala/com/nvidia/spark/rapids/shims/spark332db/SparkShimServiceProvider.scala b/sql-plugin-shims/src/main/spark332db/scala/com/nvidia/spark/rapids/shims/spark332db/SparkShimServiceProvider.scala
similarity index 96%
rename from sql-plugin/src/main/spark332db/scala/com/nvidia/spark/rapids/shims/spark332db/SparkShimServiceProvider.scala
rename to sql-plugin-shims/src/main/spark332db/scala/com/nvidia/spark/rapids/shims/spark332db/SparkShimServiceProvider.scala
index be448c2d4ba..4af95229ee3 100644
--- a/sql-plugin/src/main/spark332db/scala/com/nvidia/spark/rapids/shims/spark332db/SparkShimServiceProvider.scala
+++ b/sql-plugin-shims/src/main/spark332db/scala/com/nvidia/spark/rapids/shims/spark332db/SparkShimServiceProvider.scala
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2023-2025, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2026, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
diff --git a/sql-plugin/src/main/spark332db/scala/org/apache/spark/sql/rapids/shims/SparkDateTimeExceptionShims.scala b/sql-plugin-shims/src/main/spark332db/scala/org/apache/spark/sql/rapids/shims/SparkDateTimeExceptionShims.scala
similarity index 100%
rename from sql-plugin/src/main/spark332db/scala/org/apache/spark/sql/rapids/shims/SparkDateTimeExceptionShims.scala
rename to sql-plugin-shims/src/main/spark332db/scala/org/apache/spark/sql/rapids/shims/SparkDateTimeExceptionShims.scala
diff --git a/sql-plugin/src/main/spark332db/scala/org/apache/spark/sql/rapids/shims/SparkUpgradeExceptionShims.scala b/sql-plugin-shims/src/main/spark332db/scala/org/apache/spark/sql/rapids/shims/SparkUpgradeExceptionShims.scala
similarity index 100%
rename from sql-plugin/src/main/spark332db/scala/org/apache/spark/sql/rapids/shims/SparkUpgradeExceptionShims.scala
rename to sql-plugin-shims/src/main/spark332db/scala/org/apache/spark/sql/rapids/shims/SparkUpgradeExceptionShims.scala
diff --git a/sql-plugin/src/main/spark333/scala/com/nvidia/spark/rapids/shims/spark333/SparkShimServiceProvider.scala b/sql-plugin-shims/src/main/spark333/scala/com/nvidia/spark/rapids/shims/spark333/SparkShimServiceProvider.scala
similarity index 96%
rename from sql-plugin/src/main/spark333/scala/com/nvidia/spark/rapids/shims/spark333/SparkShimServiceProvider.scala
rename to sql-plugin-shims/src/main/spark333/scala/com/nvidia/spark/rapids/shims/spark333/SparkShimServiceProvider.scala
index f329546de6a..a1c902dcbb6 100644
--- a/sql-plugin/src/main/spark333/scala/com/nvidia/spark/rapids/shims/spark333/SparkShimServiceProvider.scala
+++ b/sql-plugin-shims/src/main/spark333/scala/com/nvidia/spark/rapids/shims/spark333/SparkShimServiceProvider.scala
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2026, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
diff --git a/sql-plugin/src/main/spark334/scala/com/nvidia/spark/rapids/shims/spark334/SparkShimServiceProvider.scala b/sql-plugin-shims/src/main/spark334/scala/com/nvidia/spark/rapids/shims/spark334/SparkShimServiceProvider.scala
similarity index 96%
rename from sql-plugin/src/main/spark334/scala/com/nvidia/spark/rapids/shims/spark334/SparkShimServiceProvider.scala
rename to sql-plugin-shims/src/main/spark334/scala/com/nvidia/spark/rapids/shims/spark334/SparkShimServiceProvider.scala
index 9742399c693..f917857bd36 100644
--- a/sql-plugin/src/main/spark334/scala/com/nvidia/spark/rapids/shims/spark334/SparkShimServiceProvider.scala
+++ b/sql-plugin-shims/src/main/spark334/scala/com/nvidia/spark/rapids/shims/spark334/SparkShimServiceProvider.scala
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2026, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
diff --git a/sql-plugin/src/main/spark334/scala/org/apache/spark/sql/rapids/shims/SequenceSizeTooLongUnsuccessfulErrorBuilder.scala b/sql-plugin-shims/src/main/spark334/scala/org/apache/spark/sql/rapids/shims/SequenceSizeTooLongUnsuccessfulErrorBuilder.scala
similarity index 96%
rename from sql-plugin/src/main/spark334/scala/org/apache/spark/sql/rapids/shims/SequenceSizeTooLongUnsuccessfulErrorBuilder.scala
rename to sql-plugin-shims/src/main/spark334/scala/org/apache/spark/sql/rapids/shims/SequenceSizeTooLongUnsuccessfulErrorBuilder.scala
index b7b01b388b9..c2530d5d1c8 100644
--- a/sql-plugin/src/main/spark334/scala/org/apache/spark/sql/rapids/shims/SequenceSizeTooLongUnsuccessfulErrorBuilder.scala
+++ b/sql-plugin-shims/src/main/spark334/scala/org/apache/spark/sql/rapids/shims/SequenceSizeTooLongUnsuccessfulErrorBuilder.scala
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2024-2025, NVIDIA CORPORATION.
+ * Copyright (c) 2024-2026, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
diff --git a/sql-plugin/src/main/spark340/scala/com/nvidia/spark/rapids/shims/spark340/SparkShimServiceProvider.scala b/sql-plugin-shims/src/main/spark340/scala/com/nvidia/spark/rapids/shims/spark340/SparkShimServiceProvider.scala
similarity index 96%
rename from sql-plugin/src/main/spark340/scala/com/nvidia/spark/rapids/shims/spark340/SparkShimServiceProvider.scala
rename to sql-plugin-shims/src/main/spark340/scala/com/nvidia/spark/rapids/shims/spark340/SparkShimServiceProvider.scala
index 23581ba9a28..38694beb937 100644
--- a/sql-plugin/src/main/spark340/scala/com/nvidia/spark/rapids/shims/spark340/SparkShimServiceProvider.scala
+++ b/sql-plugin-shims/src/main/spark340/scala/com/nvidia/spark/rapids/shims/spark340/SparkShimServiceProvider.scala
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2026, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
diff --git a/sql-plugin/src/main/spark340/scala/org/apache/spark/sql/rapids/shims/OriginContextShim.scala b/sql-plugin-shims/src/main/spark340/scala/org/apache/spark/sql/rapids/shims/OriginContextShim.scala
similarity index 100%
rename from sql-plugin/src/main/spark340/scala/org/apache/spark/sql/rapids/shims/OriginContextShim.scala
rename to sql-plugin-shims/src/main/spark340/scala/org/apache/spark/sql/rapids/shims/OriginContextShim.scala
diff --git a/sql-plugin/src/main/spark341/scala/com/nvidia/spark/rapids/shims/spark341/SparkShimServiceProvider.scala b/sql-plugin-shims/src/main/spark341/scala/com/nvidia/spark/rapids/shims/spark341/SparkShimServiceProvider.scala
similarity index 96%
rename from sql-plugin/src/main/spark341/scala/com/nvidia/spark/rapids/shims/spark341/SparkShimServiceProvider.scala
rename to sql-plugin-shims/src/main/spark341/scala/com/nvidia/spark/rapids/shims/spark341/SparkShimServiceProvider.scala
index 38f9fd0307f..9c9c844145d 100644
--- a/sql-plugin/src/main/spark341/scala/com/nvidia/spark/rapids/shims/spark341/SparkShimServiceProvider.scala
+++ b/sql-plugin-shims/src/main/spark341/scala/com/nvidia/spark/rapids/shims/spark341/SparkShimServiceProvider.scala
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2026, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
diff --git a/sql-plugin/src/main/spark341db/scala/com/nvidia/spark/rapids/shims/spark341db/SparkShimServiceProvider.scala b/sql-plugin-shims/src/main/spark341db/scala/com/nvidia/spark/rapids/shims/spark341db/SparkShimServiceProvider.scala
similarity index 96%
rename from sql-plugin/src/main/spark341db/scala/com/nvidia/spark/rapids/shims/spark341db/SparkShimServiceProvider.scala
rename to sql-plugin-shims/src/main/spark341db/scala/com/nvidia/spark/rapids/shims/spark341db/SparkShimServiceProvider.scala
index 72ed2c7c067..6c607615c7d 100644
--- a/sql-plugin/src/main/spark341db/scala/com/nvidia/spark/rapids/shims/spark341db/SparkShimServiceProvider.scala
+++ b/sql-plugin-shims/src/main/spark341db/scala/com/nvidia/spark/rapids/shims/spark341db/SparkShimServiceProvider.scala
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2023-2025, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2026, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
diff --git a/sql-plugin/src/main/spark342/scala/com/nvidia/spark/rapids/shims/spark342/SparkShimServiceProvider.scala b/sql-plugin-shims/src/main/spark342/scala/com/nvidia/spark/rapids/shims/spark342/SparkShimServiceProvider.scala
similarity index 96%
rename from sql-plugin/src/main/spark342/scala/com/nvidia/spark/rapids/shims/spark342/SparkShimServiceProvider.scala
rename to sql-plugin-shims/src/main/spark342/scala/com/nvidia/spark/rapids/shims/spark342/SparkShimServiceProvider.scala
index 6b00a7a762e..7833fc477c6 100644
--- a/sql-plugin/src/main/spark342/scala/com/nvidia/spark/rapids/shims/spark342/SparkShimServiceProvider.scala
+++ b/sql-plugin-shims/src/main/spark342/scala/com/nvidia/spark/rapids/shims/spark342/SparkShimServiceProvider.scala
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2026, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
diff --git a/sql-plugin/src/main/spark343/scala/com/nvidia/spark/rapids/shims/spark343/SparkShimServiceProvider.scala b/sql-plugin-shims/src/main/spark343/scala/com/nvidia/spark/rapids/shims/spark343/SparkShimServiceProvider.scala
similarity index 96%
rename from sql-plugin/src/main/spark343/scala/com/nvidia/spark/rapids/shims/spark343/SparkShimServiceProvider.scala
rename to sql-plugin-shims/src/main/spark343/scala/com/nvidia/spark/rapids/shims/spark343/SparkShimServiceProvider.scala
index ff35d06a9c7..dc028c33f24 100644
--- a/sql-plugin/src/main/spark343/scala/com/nvidia/spark/rapids/shims/spark343/SparkShimServiceProvider.scala
+++ b/sql-plugin-shims/src/main/spark343/scala/com/nvidia/spark/rapids/shims/spark343/SparkShimServiceProvider.scala
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
+ * Copyright (c) 2024-2026, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
diff --git a/sql-plugin/src/main/spark344/scala/com/nvidia/spark/rapids/shims/spark344/SparkShimServiceProvider.scala b/sql-plugin-shims/src/main/spark344/scala/com/nvidia/spark/rapids/shims/spark344/SparkShimServiceProvider.scala
similarity index 96%
rename from sql-plugin/src/main/spark344/scala/com/nvidia/spark/rapids/shims/spark344/SparkShimServiceProvider.scala
rename to sql-plugin-shims/src/main/spark344/scala/com/nvidia/spark/rapids/shims/spark344/SparkShimServiceProvider.scala
index 80f042e0ee7..13fd8106a47 100644
--- a/sql-plugin/src/main/spark344/scala/com/nvidia/spark/rapids/shims/spark344/SparkShimServiceProvider.scala
+++ b/sql-plugin-shims/src/main/spark344/scala/com/nvidia/spark/rapids/shims/spark344/SparkShimServiceProvider.scala
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
+ * Copyright (c) 2024-2026, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
diff --git a/sql-plugin/src/main/spark350/scala/com/nvidia/spark/rapids/shims/spark350/SparkShimServiceProvider.scala b/sql-plugin-shims/src/main/spark350/scala/com/nvidia/spark/rapids/shims/spark350/SparkShimServiceProvider.scala
similarity index 96%
rename from sql-plugin/src/main/spark350/scala/com/nvidia/spark/rapids/shims/spark350/SparkShimServiceProvider.scala
rename to sql-plugin-shims/src/main/spark350/scala/com/nvidia/spark/rapids/shims/spark350/SparkShimServiceProvider.scala
index 91fcf7cf40a..c70f14547d1 100644
--- a/sql-plugin/src/main/spark350/scala/com/nvidia/spark/rapids/shims/spark350/SparkShimServiceProvider.scala
+++ b/sql-plugin-shims/src/main/spark350/scala/com/nvidia/spark/rapids/shims/spark350/SparkShimServiceProvider.scala
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2026, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
diff --git a/sql-plugin/src/main/spark350db143/scala/com/nvidia/spark/rapids/shims/spark350db143/SparkShimServiceProvider.scala b/sql-plugin-shims/src/main/spark350db143/scala/com/nvidia/spark/rapids/shims/spark350db143/SparkShimServiceProvider.scala
similarity index 100%
rename from sql-plugin/src/main/spark350db143/scala/com/nvidia/spark/rapids/shims/spark350db143/SparkShimServiceProvider.scala
rename to sql-plugin-shims/src/main/spark350db143/scala/com/nvidia/spark/rapids/shims/spark350db143/SparkShimServiceProvider.scala
diff --git a/sql-plugin/src/main/spark350db143/scala/org/apache/spark/sql/rapids/shims/SequenceSizeExceededLimitErrorBuilder.scala b/sql-plugin-shims/src/main/spark350db143/scala/org/apache/spark/sql/rapids/shims/SequenceSizeExceededLimitErrorBuilder.scala
similarity index 100%
rename from sql-plugin/src/main/spark350db143/scala/org/apache/spark/sql/rapids/shims/SequenceSizeExceededLimitErrorBuilder.scala
rename to sql-plugin-shims/src/main/spark350db143/scala/org/apache/spark/sql/rapids/shims/SequenceSizeExceededLimitErrorBuilder.scala
diff --git a/sql-plugin/src/main/spark351/scala/com/nvidia/spark/rapids/shims/spark351/SparkShimServiceProvider.scala b/sql-plugin-shims/src/main/spark351/scala/com/nvidia/spark/rapids/shims/spark351/SparkShimServiceProvider.scala
similarity index 96%
rename from sql-plugin/src/main/spark351/scala/com/nvidia/spark/rapids/shims/spark351/SparkShimServiceProvider.scala
rename to sql-plugin-shims/src/main/spark351/scala/com/nvidia/spark/rapids/shims/spark351/SparkShimServiceProvider.scala
index 8b1bace8aa1..a1a1f5a51a9 100644
--- a/sql-plugin/src/main/spark351/scala/com/nvidia/spark/rapids/shims/spark351/SparkShimServiceProvider.scala
+++ b/sql-plugin-shims/src/main/spark351/scala/com/nvidia/spark/rapids/shims/spark351/SparkShimServiceProvider.scala
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2026, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
diff --git a/sql-plugin/src/main/spark352/scala/com/nvidia/spark/rapids/shims/spark352/SparkShimServiceProvider.scala b/sql-plugin-shims/src/main/spark352/scala/com/nvidia/spark/rapids/shims/spark352/SparkShimServiceProvider.scala
similarity index 96%
rename from sql-plugin/src/main/spark352/scala/com/nvidia/spark/rapids/shims/spark352/SparkShimServiceProvider.scala
rename to sql-plugin-shims/src/main/spark352/scala/com/nvidia/spark/rapids/shims/spark352/SparkShimServiceProvider.scala
index 5a6ba4d43c9..a6cb8368ba2 100644
--- a/sql-plugin/src/main/spark352/scala/com/nvidia/spark/rapids/shims/spark352/SparkShimServiceProvider.scala
+++ b/sql-plugin-shims/src/main/spark352/scala/com/nvidia/spark/rapids/shims/spark352/SparkShimServiceProvider.scala
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
+ * Copyright (c) 2024-2026, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
diff --git a/sql-plugin/src/main/spark353/scala/com/nvidia/spark/rapids/shims/spark353/SparkShimServiceProvider.scala b/sql-plugin-shims/src/main/spark353/scala/com/nvidia/spark/rapids/shims/spark353/SparkShimServiceProvider.scala
similarity index 96%
rename from sql-plugin/src/main/spark353/scala/com/nvidia/spark/rapids/shims/spark353/SparkShimServiceProvider.scala
rename to sql-plugin-shims/src/main/spark353/scala/com/nvidia/spark/rapids/shims/spark353/SparkShimServiceProvider.scala
index 8eaf51f7177..2945458b56f 100644
--- a/sql-plugin/src/main/spark353/scala/com/nvidia/spark/rapids/shims/spark353/SparkShimServiceProvider.scala
+++ b/sql-plugin-shims/src/main/spark353/scala/com/nvidia/spark/rapids/shims/spark353/SparkShimServiceProvider.scala
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
+ * Copyright (c) 2024-2026, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
diff --git a/sql-plugin/src/main/spark354/scala/com/nvidia/spark/rapids/shims/spark354/SparkShimServiceProvider.scala b/sql-plugin-shims/src/main/spark354/scala/com/nvidia/spark/rapids/shims/spark354/SparkShimServiceProvider.scala
similarity index 96%
rename from sql-plugin/src/main/spark354/scala/com/nvidia/spark/rapids/shims/spark354/SparkShimServiceProvider.scala
rename to sql-plugin-shims/src/main/spark354/scala/com/nvidia/spark/rapids/shims/spark354/SparkShimServiceProvider.scala
index 46d9887b0c8..7002ba891cd 100644
--- a/sql-plugin/src/main/spark354/scala/com/nvidia/spark/rapids/shims/spark354/SparkShimServiceProvider.scala
+++ b/sql-plugin-shims/src/main/spark354/scala/com/nvidia/spark/rapids/shims/spark354/SparkShimServiceProvider.scala
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2025, NVIDIA CORPORATION.
+ * Copyright (c) 2025-2026, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
diff --git a/sql-plugin/src/main/spark355/scala/com/nvidia/spark/rapids/shims/spark355/SparkShimServiceProvider.scala b/sql-plugin-shims/src/main/spark355/scala/com/nvidia/spark/rapids/shims/spark355/SparkShimServiceProvider.scala
similarity index 96%
rename from sql-plugin/src/main/spark355/scala/com/nvidia/spark/rapids/shims/spark355/SparkShimServiceProvider.scala
rename to sql-plugin-shims/src/main/spark355/scala/com/nvidia/spark/rapids/shims/spark355/SparkShimServiceProvider.scala
index 7175d0eb948..776871d176c 100644
--- a/sql-plugin/src/main/spark355/scala/com/nvidia/spark/rapids/shims/spark355/SparkShimServiceProvider.scala
+++ b/sql-plugin-shims/src/main/spark355/scala/com/nvidia/spark/rapids/shims/spark355/SparkShimServiceProvider.scala
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2025, NVIDIA CORPORATION.
+ * Copyright (c) 2025-2026, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
diff --git a/sql-plugin/src/main/spark356/scala/com/nvidia/spark/rapids/shims/spark356/SparkShimServiceProvider.scala b/sql-plugin-shims/src/main/spark356/scala/com/nvidia/spark/rapids/shims/spark356/SparkShimServiceProvider.scala
similarity index 96%
rename from sql-plugin/src/main/spark356/scala/com/nvidia/spark/rapids/shims/spark356/SparkShimServiceProvider.scala
rename to sql-plugin-shims/src/main/spark356/scala/com/nvidia/spark/rapids/shims/spark356/SparkShimServiceProvider.scala
index 9f55ea238cd..f9ce0d6aeee 100644
--- a/sql-plugin/src/main/spark356/scala/com/nvidia/spark/rapids/shims/spark356/SparkShimServiceProvider.scala
+++ b/sql-plugin-shims/src/main/spark356/scala/com/nvidia/spark/rapids/shims/spark356/SparkShimServiceProvider.scala
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2025, NVIDIA CORPORATION.
+ * Copyright (c) 2025-2026, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
diff --git a/sql-plugin/src/main/spark357/scala/com/nvidia/spark/rapids/shims/spark357/SparkShimServiceProvider.scala b/sql-plugin-shims/src/main/spark357/scala/com/nvidia/spark/rapids/shims/spark357/SparkShimServiceProvider.scala
similarity index 96%
rename from sql-plugin/src/main/spark357/scala/com/nvidia/spark/rapids/shims/spark357/SparkShimServiceProvider.scala
rename to sql-plugin-shims/src/main/spark357/scala/com/nvidia/spark/rapids/shims/spark357/SparkShimServiceProvider.scala
index 971678a51bf..bd376eee633 100644
--- a/sql-plugin/src/main/spark357/scala/com/nvidia/spark/rapids/shims/spark357/SparkShimServiceProvider.scala
+++ b/sql-plugin-shims/src/main/spark357/scala/com/nvidia/spark/rapids/shims/spark357/SparkShimServiceProvider.scala
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2025, NVIDIA CORPORATION.
+ * Copyright (c) 2025-2026, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
diff --git a/sql-plugin/src/main/spark358/scala/com/nvidia/spark/rapids/shims/spark358/SparkShimServiceProvider.scala b/sql-plugin-shims/src/main/spark358/scala/com/nvidia/spark/rapids/shims/spark358/SparkShimServiceProvider.scala
similarity index 100%
rename from sql-plugin/src/main/spark358/scala/com/nvidia/spark/rapids/shims/spark358/SparkShimServiceProvider.scala
rename to sql-plugin-shims/src/main/spark358/scala/com/nvidia/spark/rapids/shims/spark358/SparkShimServiceProvider.scala
diff --git a/sql-plugin/src/main/spark400/scala/com/nvidia/spark/rapids/shims/spark400/SparkShimServiceProvider.scala b/sql-plugin-shims/src/main/spark400/scala/com/nvidia/spark/rapids/shims/spark400/SparkShimServiceProvider.scala
similarity index 100%
rename from sql-plugin/src/main/spark400/scala/com/nvidia/spark/rapids/shims/spark400/SparkShimServiceProvider.scala
rename to sql-plugin-shims/src/main/spark400/scala/com/nvidia/spark/rapids/shims/spark400/SparkShimServiceProvider.scala
diff --git a/sql-plugin/src/main/spark400/scala/org/apache/spark/sql/rapids/shims/OriginContextShim.scala b/sql-plugin-shims/src/main/spark400/scala/org/apache/spark/sql/rapids/shims/OriginContextShim.scala
similarity index 100%
rename from sql-plugin/src/main/spark400/scala/org/apache/spark/sql/rapids/shims/OriginContextShim.scala
rename to sql-plugin-shims/src/main/spark400/scala/org/apache/spark/sql/rapids/shims/OriginContextShim.scala
diff --git a/sql-plugin/src/main/spark400/scala/org/apache/spark/sql/rapids/shims/TrampolineConnectShims.scala b/sql-plugin-shims/src/main/spark400/scala/org/apache/spark/sql/rapids/shims/TrampolineConnectShims.scala
similarity index 100%
rename from sql-plugin/src/main/spark400/scala/org/apache/spark/sql/rapids/shims/TrampolineConnectShims.scala
rename to sql-plugin-shims/src/main/spark400/scala/org/apache/spark/sql/rapids/shims/TrampolineConnectShims.scala
diff --git a/sql-plugin/src/main/spark400db173/scala/com/nvidia/spark/rapids/shims/spark400db173/SparkShimServiceProvider.scala b/sql-plugin-shims/src/main/spark400db173/scala/com/nvidia/spark/rapids/shims/spark400db173/SparkShimServiceProvider.scala
similarity index 100%
rename from sql-plugin/src/main/spark400db173/scala/com/nvidia/spark/rapids/shims/spark400db173/SparkShimServiceProvider.scala
rename to sql-plugin-shims/src/main/spark400db173/scala/com/nvidia/spark/rapids/shims/spark400db173/SparkShimServiceProvider.scala
diff --git a/sql-plugin/src/main/spark400db173/scala/org/apache/spark/sql/rapids/ShuffleManagerShims.scala b/sql-plugin-shims/src/main/spark400db173/scala/org/apache/spark/sql/rapids/ShuffleManagerShims.scala
similarity index 100%
rename from sql-plugin/src/main/spark400db173/scala/org/apache/spark/sql/rapids/ShuffleManagerShims.scala
rename to sql-plugin-shims/src/main/spark400db173/scala/org/apache/spark/sql/rapids/ShuffleManagerShims.scala
diff --git a/sql-plugin/src/main/spark400db173/scala/org/apache/spark/storage/ShuffleClientShims.scala b/sql-plugin-shims/src/main/spark400db173/scala/org/apache/spark/storage/ShuffleClientShims.scala
similarity index 99%
rename from sql-plugin/src/main/spark400db173/scala/org/apache/spark/storage/ShuffleClientShims.scala
rename to sql-plugin-shims/src/main/spark400db173/scala/org/apache/spark/storage/ShuffleClientShims.scala
index aedfb30f7e9..12a6ac402df 100644
--- a/sql-plugin/src/main/spark400db173/scala/org/apache/spark/storage/ShuffleClientShims.scala
+++ b/sql-plugin-shims/src/main/spark400db173/scala/org/apache/spark/storage/ShuffleClientShims.scala
@@ -34,4 +34,3 @@ object ShuffleClientShims {
client.diagnoseCorruption(host, port, execId, blockId.name, checksum, algorithm)
}
}
-
diff --git a/sql-plugin/src/main/spark401/scala/com/nvidia/spark/rapids/shims/spark401/SparkShimServiceProvider.scala b/sql-plugin-shims/src/main/spark401/scala/com/nvidia/spark/rapids/shims/spark401/SparkShimServiceProvider.scala
similarity index 100%
rename from sql-plugin/src/main/spark401/scala/com/nvidia/spark/rapids/shims/spark401/SparkShimServiceProvider.scala
rename to sql-plugin-shims/src/main/spark401/scala/com/nvidia/spark/rapids/shims/spark401/SparkShimServiceProvider.scala
diff --git a/sql-plugin/src/main/spark402/scala/com/nvidia/spark/rapids/shims/spark402/SparkShimServiceProvider.scala b/sql-plugin-shims/src/main/spark402/scala/com/nvidia/spark/rapids/shims/spark402/SparkShimServiceProvider.scala
similarity index 100%
rename from sql-plugin/src/main/spark402/scala/com/nvidia/spark/rapids/shims/spark402/SparkShimServiceProvider.scala
rename to sql-plugin-shims/src/main/spark402/scala/com/nvidia/spark/rapids/shims/spark402/SparkShimServiceProvider.scala
diff --git a/sql-plugin/src/main/spark411/scala/com/nvidia/spark/rapids/shims/spark411/SparkShimServiceProvider.scala b/sql-plugin-shims/src/main/spark411/scala/com/nvidia/spark/rapids/shims/spark411/SparkShimServiceProvider.scala
similarity index 100%
rename from sql-plugin/src/main/spark411/scala/com/nvidia/spark/rapids/shims/spark411/SparkShimServiceProvider.scala
rename to sql-plugin-shims/src/main/spark411/scala/com/nvidia/spark/rapids/shims/spark411/SparkShimServiceProvider.scala
diff --git a/sql-plugin/src/main/spark411/scala/org/apache/spark/sql/rapids/shims/FileCommitProtocolShims.scala b/sql-plugin-shims/src/main/spark411/scala/org/apache/spark/sql/rapids/shims/FileCommitProtocolShims.scala
similarity index 100%
rename from sql-plugin/src/main/spark411/scala/org/apache/spark/sql/rapids/shims/FileCommitProtocolShims.scala
rename to sql-plugin-shims/src/main/spark411/scala/org/apache/spark/sql/rapids/shims/FileCommitProtocolShims.scala
diff --git a/sql-plugin/pom.xml b/sql-plugin/pom.xml
index de478137c32..42de0833323 100644
--- a/sql-plugin/pom.xml
+++ b/sql-plugin/pom.xml
@@ -54,12 +54,37 @@
${spark-rapids-private.version}${spark.version.classifier}
+
+ com.nvidia
+ rapids-4-spark-sql-plugin-format_${scala.binary.version}
+ ${project.version}
+ ${spark.version.classifier}
+
+
+ com.nvidia
+ rapids-4-spark-sql-plugin-fileio_${scala.binary.version}
+ ${project.version}
+ ${spark.version.classifier}
+
+
+ com.nvidia
+ rapids-4-spark-sql-plugin-columnar_${scala.binary.version}
+ ${project.version}
+ ${spark.version.classifier}
+ com.nvidiarapids-4-spark-sql-plugin-api_${scala.binary.version}${project.version}${spark.version.classifier}
+
+ com.nvidia
+ rapids-4-spark-sql-shims_${scala.binary.version}
+ ${project.version}
+ ${spark.version.classifier}
+ provided
+ org.scala-langscala-library
@@ -219,6 +244,27 @@
net.alchim31.mavenscala-maven-plugin
+
+ org.apache.maven.plugins
+ maven-dependency-plugin
+ 3.6.1
+
+
+ unpack-sql-plugin-shims
+ prepare-package
+
+ unpack-dependencies
+
+
+ com.nvidia
+ rapids-4-spark-sql-shims_${scala.binary.version}
+ true
+ **/*.class
+ ${project.build.outputDirectory}
+
+
+
+ org.apache.ratapache-rat-plugin
diff --git a/sql-plugin/src/main/java/com/nvidia/spark/rapids/InternalRowToColumnarBatchIterator.java b/sql-plugin/src/main/java/com/nvidia/spark/rapids/InternalRowToColumnarBatchIterator.java
index 400b54626d8..dbf27a96c41 100644
--- a/sql-plugin/src/main/java/com/nvidia/spark/rapids/InternalRowToColumnarBatchIterator.java
+++ b/sql-plugin/src/main/java/com/nvidia/spark/rapids/InternalRowToColumnarBatchIterator.java
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2020-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2026, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -228,7 +228,7 @@ private HostMemoryBuffer[] getHostBuffersWithRetry(
try {
hBuf = HostAlloc$.MODULE$.alloc((dataBytes + offsetBytes),true);
SpillableHostBuffer sBuf = SpillableHostBuffer$.MODULE$.apply(hBuf, hBuf.getLength(),
- SpillPriorities$.MODULE$.ACTIVE_ON_DECK_PRIORITY());
+ SpillPriorities.ACTIVE_ON_DECK_PRIORITY);
hBuf = null; // taken over by spillable host buffer
return Tuple2.apply(sBuf, numRowsWrapper);
} finally {
@@ -258,7 +258,7 @@ private SpillableColumnarBatch makeSpillableBatch(ColumnVector devColumn) {
new ColumnarBatch(
new org.apache.spark.sql.vectorized.ColumnVector[]{gpuCV},
(int)gpuCV.getRowCount()),
- SpillPriorities.ACTIVE_ON_DECK_PRIORITY());
+ SpillPriorities.ACTIVE_ON_DECK_PRIORITY);
}
/**
diff --git a/sql-plugin/src/main/java/com/nvidia/spark/rapids/SpillableKudoTable.java b/sql-plugin/src/main/java/com/nvidia/spark/rapids/SpillableKudoTable.java
new file mode 100644
index 00000000000..a2585829858
--- /dev/null
+++ b/sql-plugin/src/main/java/com/nvidia/spark/rapids/SpillableKudoTable.java
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2025-2026, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.nvidia.spark.rapids;
+
+import ai.rapids.cudf.HostMemoryBuffer;
+import com.nvidia.spark.rapids.jni.kudo.KudoTable;
+import com.nvidia.spark.rapids.jni.kudo.KudoTableHeader;
+
+public class SpillableKudoTable implements AutoCloseable { // Holds a KudoTableHeader plus an optional SpillableHostBuffer; close() closes that wrapper.
+ public final KudoTableHeader header; // header handed to every KudoTable built by makeKudoTable()
+ public final long length; // from() sets this to buffer.getLength(), or 0 when the buffer is null
+ private final SpillableHostBuffer shb; // null when there is no backing buffer
+
+ public SpillableKudoTable(KudoTableHeader header, long length, SpillableHostBuffer shb) { // stores the three values as-is; no validation
+ this.header = header;
+ this.length = length;
+ this.shb = shb;
+ }
+
+ public static SpillableKudoTable from(KudoTableHeader header, HostMemoryBuffer buffer) { // factory: wraps buffer (may be null) in a SpillableHostBuffer
+ if (buffer == null) {
+ return new SpillableKudoTable(header, 0, null); // no buffer: length 0 and no spillable wrapper
+ } else {
+ return new SpillableKudoTable(
+ header,
+ buffer.getLength(),
+ SpillableHostBuffer.apply( // NOTE(review): presumably hands buffer over to the spillable wrapper - confirm
+ buffer,
+ buffer.getLength(),
+ SpillPriorities.ACTIVE_BATCHING_PRIORITY));
+ }
+ }
+
+ public KudoTable makeKudoTable() { // builds a new KudoTable from header and the current host buffer (null buffer when shb is null)
+ if (shb == null) {
+ return new KudoTable(header, null);
+ } else {
+ return new KudoTable(header, shb.getHostBuffer()); // NOTE(review): who closes the buffer returned by getHostBuffer() is not visible here - confirm
+ }
+ }
+
+ @Override
+ public String toString() { // reports header and shb only; length is not included
+ return "SpillableKudoTable{header=" + header + ", shb=" + shb + '}';
+ }
+
+ @Override
+ public void close() { // safe to call when there is no buffer
+ if (shb != null) {
+ shb.close();
+ }
+ }
+}
diff --git a/sql-plugin/src/main/java/com/nvidia/spark/rapids/fileio/RapidsInputFiles.java b/sql-plugin/src/main/java/com/nvidia/spark/rapids/fileio/RapidsInputFiles.java
deleted file mode 100644
index e30bbce09b3..00000000000
--- a/sql-plugin/src/main/java/com/nvidia/spark/rapids/fileio/RapidsInputFiles.java
+++ /dev/null
@@ -1,41 +0,0 @@
-/*
- * Copyright (c) 2026, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package com.nvidia.spark.rapids.fileio;
-
-import com.nvidia.spark.rapids.PerfIOConf;
-import org.apache.spark.SparkEnv;
-
-/**
- * Static helpers shared by {@link com.nvidia.spark.rapids.jni.fileio.RapidsInputFile}
- * implementations.
- */
-public final class RapidsInputFiles {
- private RapidsInputFiles() {}
-
- /**
- * True iff {@code spark.rapids.perfio.s3.enabled} is set to {@code true} on
- * the active SparkConf. Returns false when no {@link SparkEnv} is initialized
- * (e.g. before driver bring-up) so callers default to the non-PerfIO path.
- */
- public static boolean isS3PerfEnabled() {
- SparkEnv env = SparkEnv.get();
- if (env == null) {
- return false;
- }
- return env.conf().getBoolean(PerfIOConf.S3PERF_ENABLED().key(), false);
- }
-}
diff --git a/sql-plugin/src/main/java/com/nvidia/spark/rapids/fileio/hadoop/PerfIOS3Reader.java b/sql-plugin/src/main/java/com/nvidia/spark/rapids/fileio/hadoop/PerfIOS3Reader.java
new file mode 100644
index 00000000000..5277c382ee0
--- /dev/null
+++ b/sql-plugin/src/main/java/com/nvidia/spark/rapids/fileio/hadoop/PerfIOS3Reader.java
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2026, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.nvidia.spark.rapids.fileio.hadoop;
+
+import ai.rapids.cudf.HostMemoryBuffer;
+import com.nvidia.spark.rapids.IntRangeWithOffset;
+import com.nvidia.spark.rapids.PerfIO$;
+import com.nvidia.spark.rapids.PerfIOConf;
+import com.nvidia.spark.rapids.RangeWithOffset;
+import com.nvidia.spark.rapids.SuffixRangeWithOffset;
+import com.nvidia.spark.rapids.fileio.RapidsInputFiles.S3PerfReader;
+import com.nvidia.spark.rapids.jni.fileio.RapidsInputFile;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.spark.SparkEnv;
+import scala.Option;
+import scala.collection.JavaConverters;
+
+import java.io.IOException;
+import java.net.URI;
+import java.util.ArrayList;
+import java.util.List;
+
+/** SQL-plugin bridge from Java-only file I/O classes to private Scala PerfIO state. */
+public final class PerfIOS3Reader implements S3PerfReader {
+ public static final PerfIOS3Reader INSTANCE = new PerfIOS3Reader();
+
+ private PerfIOS3Reader() {}
+
+ @Override
+ public boolean isEnabled() {
+ SparkEnv env = SparkEnv.get();
+ if (env == null) {
+ return false;
+ }
+ return env.conf().getBoolean(PerfIOConf.S3PERF_ENABLED().key(), false);
+ }
+
+ @Override
+ public boolean readVectored(
+ Configuration hadoopConf,
+ URI fileUri,
+ HostMemoryBuffer output,
+ List copyRanges) throws IOException {
+ List ranges = new ArrayList<>(copyRanges.size());
+ for (RapidsInputFile.CopyRange range : copyRanges) {
+ ranges.add(new IntRangeWithOffset(
+ range.getInputOffset(), range.getLength(), range.getOutputOffset()));
+ }
+ return readToHostMemory(hadoopConf, fileUri, output, ranges);
+ }
+
+ @Override
+ public boolean readTail(
+ Configuration hadoopConf,
+ URI fileUri,
+ HostMemoryBuffer output,
+ long length,
+ long outputOffset) throws IOException {
+ List ranges = new ArrayList<>(1);
+ ranges.add(new SuffixRangeWithOffset(length, outputOffset));
+ return readToHostMemory(hadoopConf, fileUri, output, ranges);
+ }
+
+ private boolean readToHostMemory(
+ Configuration hadoopConf,
+ URI fileUri,
+ HostMemoryBuffer output,
+ List ranges) {
+ Option