projectglow · ramGoli · Apr 9, 2026 · Apr 9, 2026 · Apr 9, 2026 · Apr 9, 2026
diff --git a/.github/workflows/production-release.yml b/.github/workflows/production-release.yml
@@ -6,10 +6,10 @@ on:
             description: "Git tag for release"
             required: true
         spark-version:
-            description: "Spark version to build against (only used to decide the artifact name)"
+            description: "Spark version (3.5.1 for Spark 3 release, 4.1.0 for Spark 4 release)"
             default: "3.5.1"
         java-version:
-            description: "Java version to use for running sbt"
+            description: "Java version (8 for Spark 3, 17 for Spark 4)"
             default: "8"
         push-python:
             description: "If true, Python artifacts will be pushed to Test PyPI"

diff --git a/.github/workflows/staging-release.yml b/.github/workflows/staging-release.yml
@@ -6,13 +6,13 @@ on:
         description: "Git tag for release"
         required: true
       spark-version:
-        description: "Spark version to build against"
+        description: "Spark version to build against. Use 3.5.1 for Spark 3 release, 4.1.0 for Spark 4 release."
         default: "3.5.1"
       scala-version:
-        description: "Scala version to use when building Glow"
+        description: "Scala version. Use 2.12.19 for Spark 3, 2.13.15 for Spark 4."
         default: "2.12.19"
       java-version:
-        description: "Java version to use when building Glow"
+        description: "Java version. Use 8 for Spark 3, 17 for Spark 4."
         default: "8"
       push-python:
         description: "If true, Python artifacts will be pushed to Test PyPI"

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
@@ -146,27 +146,26 @@ jobs:
         uses: codecov/codecov-action@v4
         with:
           files: ./coverage.xml, *scoverage.xml
-          fail_ci_if_error: true
+          fail_ci_if_error: false
           token: ${{ secrets.CODECOV_TOKEN }}
-          flags: unittests
+          flags: spark3
           verbose: true
 
   # Dummy job so that required statuses don't need to change with Spark / scala matrix
   spark-tests-success:
     runs-on: ubuntu-latest
-    needs: spark-tests
+    needs: [spark-tests, spark-4-tests]
     steps:
       - run: echo "Spark tests passed!"
 
   spark-4-tests:
     runs-on: ubuntu-latest
-    if: contains(github.event.pull_request.title, '[SPARK4]')
     defaults:
       run:
         shell: bash -el {0}
     env:
-      SPARK_VERSION: 4.0.0-SNAPSHOT
-      SCALA_VERSION: 2.13.14
+      SPARK_VERSION: 4.1.0
+      SCALA_VERSION: 2.13.15
     steps:
       - name: Checkout
         uses: actions/checkout@v4
@@ -209,17 +208,11 @@ jobs:
         run: conda env update -n glow-spark4 -f python/spark-4-environment.yml
         if: steps.cache.outputs.cache-hit != 'true'
 
-      - name: Clone Spark (for PySpark source)
-        run: (cd $HOME && git clone https://github.com/apache/spark.git)
-
       - name: Scala tests
-        run: sbt core/test exit
-
-      - name: Uninstall PySpark
-        run: pip uninstall -y pyspark
+        run: sbt compile coverage core/test core/coverageReport exit
 
       - name: Python tests
-        run: EXTRA_PYTHON_PATH=$HOME/spark/python sbt python/test exit
+        run: sbt python/test exit
 
       # Temporarily disabled due to sybil/pytest compatibility issues
       # - name: Docs tests
@@ -253,7 +246,7 @@ jobs:
         uses: codecov/codecov-action@v4
         with:
           files: ./coverage.xml, *scoverage.xml
-          fail_ci_if_error: true
+          fail_ci_if_error: false
           token: ${{ secrets.CODECOV_TOKEN }}
-          flags: unittests
+          flags: spark4
           verbose: true
diff --git a/build.sbt b/build.sbt
@@ -9,10 +9,10 @@ import sbt.nio.Keys._
 
 // Scala version used by DBR 13.3 LTS and 14.0
 lazy val scala212 = "2.12.19"
-lazy val scala213 = "2.13.14"
+lazy val scala213 = "2.13.15"
 
 lazy val spark3 = "3.5.1"
-lazy val spark4 = "4.0.0-SNAPSHOT"
+lazy val spark4 = "4.1.0"
 
 lazy val sparkVersion = settingKey[String]("sparkVersion")
 ThisBuild / sparkVersion := sys.env.getOrElse("SPARK_VERSION", spark3)
@@ -35,6 +35,15 @@ def majorMinorVersion(version: String): String = {
   }
 }
 
+// Resolves the shim directory name for a Spark version.
+// Spark 3.x keeps one shim per minor release (3.4, 3.5); every other major
+// version (4) shares a single shim, so only the major component is used.
+def shimVersion(version: String): String = {
+  val major = majorVersion(version)
+  if (major == "3") majorMinorVersion(version)
+  else major
+}
+
 val defaultScalaVersion = Map("3" -> scala212, "4" -> scala213)
 
 ThisBuild / scalaVersion := sys
@@ -113,12 +122,25 @@ lazy val commonSettings = Seq(
   assembly / assemblyMergeStrategy := {
     case p if p.toLowerCase.contains("manifest.mf") =>
       MergeStrategy.discard
+    case p if p.toLowerCase.endsWith(".sf") || p.toLowerCase.endsWith(".dsa") || p.toLowerCase.endsWith(".rsa") =>
+      MergeStrategy.discard
+    case p if p.startsWith("com/fasterxml/jackson/") =>
+      MergeStrategy.discard
+    case "META-INF/services/java.net.spi.InetAddressResolverProvider" =>
+      MergeStrategy.discard
     case _ =>
       // Be permissive for other files
       MergeStrategy.first
   },
-  scalacOptions += "-target:jvm-1.8",
-  resolvers += "Apache Snapshots" at "https://repository.apache.org/snapshots/"
+  scalacOptions ++= {
+    if (majorVersion(sparkVersion.value) == "3") Seq("-target:jvm-1.8")
+    else Seq("-release", "17")
+  },
+  resolvers ++= {
+    if (sparkVersion.value.contains("SNAPSHOT"))
+      Seq("Apache Snapshots" at "https://repository.apache.org/snapshots/")
+    else Seq.empty
+  },
 )
 
 lazy val functionsYml = settingKey[File]("functionsYml")
@@ -165,7 +187,7 @@ ThisBuild / testCoreDependencies := Seq(
   majorVersion((ThisBuild / sparkVersion).value) match {
     case "3" => "org.scalatest" %% "scalatest" % "3.2.18" % "test"
     case "4" => "org.scalatest" %% "scalatest" % "3.2.17" % "test"
-    case _ => throw new IllegalArgumentException("Only Spark 3 is supported")
+    case _ => throw new IllegalArgumentException("Only Spark 3 and 4 are supported")
   },
   "org.mockito" % "mockito-all" % "1.10.19" % "test",
   "org.apache.spark" %% "spark-catalyst" % sparkVersion.value % "test" classifier "tests",
@@ -176,23 +198,31 @@ ThisBuild / testCoreDependencies := Seq(
 )
 
 lazy val coreDependencies = settingKey[Seq[ModuleID]]("coreDependencies")
-ThisBuild / coreDependencies := (providedSparkDependencies.value ++ testCoreDependencies.value ++ Seq(
-  "org.seqdoop" % "hadoop-bam" % "7.10.0",
-  "org.slf4j" % "slf4j-api" % "2.0.12",
-  "org.jdbi" % "jdbi" % "2.78",
-  "com.github.broadinstitute" % "picard" % "2.27.5",
-  "org.apache.commons" % "commons-lang3" % "3.14.0",
-  // Fix versions of libraries that are depended on multiple times
-  "org.apache.hadoop" % "hadoop-client" % "3.3.6",
-  "io.netty" % "netty-all" % "4.1.96.Final",
-  "io.netty" % "netty-handler" % "4.1.96.Final",
-  "io.netty" % "netty-transport-native-epoll" % "4.1.96.Final",
-  "com.github.samtools" % "htsjdk" % "3.0.5",
-  "org.yaml" % "snakeyaml" % "2.2",
-  "com.univocity" % "univocity-parsers" % "2.9.1",
-  // Fix CVE: Upgrade Avro to 1.11.4+ to fix Arbitrary Code Execution vulnerability
-  "org.apache.avro" % "avro" % "1.11.4"
-)).map(_.exclude("com.google.code.findbugs", "jsr305"))
+ThisBuild / coreDependencies := {
+  val isSpark3 = majorVersion(sparkVersion.value) == "3"
+  // Pins that differ between Spark 3 and Spark 4, as (hadoop, netty, avro).
+  // Avro stays at 1.11.4 or newer: that release fixes an Arbitrary Code Execution CVE.
+  val (hadoopVersion, nettyVersion, avroVersion) =
+    if (isSpark3) ("3.3.6", "4.1.96.Final", "1.11.4") else ("3.4.2", "4.2.7.Final", "1.12.0")
+  val pinned = Seq(
+    "org.seqdoop" % "hadoop-bam" % "7.10.0",
+    "org.slf4j" % "slf4j-api" % "2.0.12",
+    "org.jdbi" % "jdbi" % "2.78",
+    "com.github.broadinstitute" % "picard" % "2.27.5",
+    "org.apache.commons" % "commons-lang3" % "3.14.0",
+    // Libraries pulled in transitively more than once get an explicit version
+    "org.apache.hadoop" % "hadoop-client" % hadoopVersion,
+    "io.netty" % "netty-all" % nettyVersion,
+    "io.netty" % "netty-handler" % nettyVersion,
+    "io.netty" % "netty-transport-native-epoll" % nettyVersion,
+    "com.github.samtools" % "htsjdk" % "3.0.5",
+    "org.yaml" % "snakeyaml" % "2.2",
+    "com.univocity" % "univocity-parsers" % "2.9.1",
+    "org.apache.avro" % "avro" % avroVersion
+  )
+  val unfiltered = providedSparkDependencies.value ++ testCoreDependencies.value ++ pinned
+  unfiltered.map(_.exclude("com.google.code.findbugs", "jsr305"))
+}
 
 lazy val root = (project in file(".")).aggregate(core, python, docs)
 
@@ -214,7 +244,7 @@ lazy val core = (project in file("core"))
     Compile / packageBin / packageOptions += Package.ManifestAttributes(
       "Git-Release-Hash" -> currentGitHash(baseDirectory.value)),
     libraryDependencies ++= coreDependencies.value :+ scalaLoggingDependency.value,
-    Compile / unmanagedSourceDirectories += baseDirectory.value / "src" / "main" / "shim" / majorMinorVersion(
+    Compile / unmanagedSourceDirectories += baseDirectory.value / "src" / "main" / "shim" / shimVersion(
       sparkVersion.value),
     Compile / unmanagedSourceDirectories += {
       val sourceDir = (Compile / sourceDirectory).value
@@ -223,7 +253,7 @@ lazy val core = (project in file("core"))
         case _ => sourceDir / "scala-2.13-"
       }
     },
-    Test / unmanagedSourceDirectories += baseDirectory.value / "src" / "test" / "shim" / majorMinorVersion(
+    Test / unmanagedSourceDirectories += baseDirectory.value / "src" / "test" / "shim" / shimVersion(
       sparkVersion.value),
     functionsTemplate := baseDirectory.value / "functions.scala.TEMPLATE",
     generatedFunctionsOutput := (Compile / scalaSource).value / "io" / "projectglow" / "functions.scala",
@@ -368,7 +398,7 @@ lazy val stagedRelease = (project in file("core/src/test"))
     commonSettings,
     Test / resourceDirectory := baseDirectory.value / "resources",
     Test / scalaSource := baseDirectory.value / "scala",
-    Test / unmanagedSourceDirectories += baseDirectory.value / "shim" / majorMinorVersion(
+    Test / unmanagedSourceDirectories += baseDirectory.value / "shim" / shimVersion(
       sparkVersion.value),
     libraryDependencies ++= testSparkDependencies.value ++ testCoreDependencies.value :+ "io.projectglow" %% s"glow-spark${majorVersion(
       sparkVersion.value)}" % stableVersion.value % "test",

diff --git a/codecov.yml b/codecov.yml
@@ -4,3 +4,12 @@ coverage:
       default:
         target: 93%    # the required coverage value
         threshold: 2%  # the leniency in hitting the target
+    patch:
+      default:
+        target: 60%
+
+flags:
+  spark3:
+    carryforward: true
+  spark4:
+    carryforward: true
diff --git a/conftest.py b/conftest.py
@@ -32,9 +32,9 @@ def _spark_builder():
     'docs/source/tertiary/regression-tests.rst'
 ]
 
-def pytest_ignore_collect(path):
+def pytest_ignore_collect(collection_path):
     major_version = SPARK_VERSION.split('.')[0]
-    if int(major_version) < 3 and any([str(path).endswith(p) for p in SPARK3_PLUS_FILES]):
+    if int(major_version) < 3 and any([str(collection_path).endswith(p) for p in SPARK3_PLUS_FILES]):
         return True
 
 

diff --git a/core/functions.scala.TEMPLATE b/core/functions.scala.TEMPLATE
@@ -17,6 +17,7 @@
 package io.projectglow
 
 import org.apache.spark.sql.Column
+import org.apache.spark.sql.SQLUtils
 import org.apache.spark.sql.catalyst.expressions.{Expression, LambdaFunction, Literal, UnresolvedNamedLambdaVariable}
 
 import io.projectglow.sql.expressions.ExpressionHelper
@@ -32,19 +33,19 @@ import io.projectglow.sql.expressions.ExpressionHelper
  */
 object functions {
   private def withExpr(expr: Expression): Column = {
-    new Column(ExpressionHelper.wrapAggregate(ExpressionHelper.rewrite(expr)))
+    SQLUtils.exprToColumn(ExpressionHelper.wrapAggregate(ExpressionHelper.rewrite(expr)))
   }
 
   private def createLambda(f: Column => Column) = {
     val x = UnresolvedNamedLambdaVariable(Seq("x"))
-    val function = f(new Column(x)).expr
+    val function = SQLUtils.columnToExpr(f(SQLUtils.exprToColumn(x)))
     LambdaFunction(function, Seq(x))
   }
 
   private def createLambda(f: (Column, Column) => Column) = {
     val x = UnresolvedNamedLambdaVariable(Seq("x"))
     val y = UnresolvedNamedLambdaVariable(Seq("y"))
-    val function = f(new Column(x), new Column(y)).expr
+    val function = SQLUtils.columnToExpr(f(SQLUtils.exprToColumn(x), SQLUtils.exprToColumn(y)))
     LambdaFunction(function, Seq(x, y))
   }
   {% for group_name, group in groups.items() %}