WIP spark

Jolanrensen · Jolanrensen · commit 211412a494c5 · 2025-06-04T14:18:21.000+02:00
diff --git a/examples/idea-examples/unsupported-data-sources/build.gradle.kts b/examples/idea-examples/unsupported-data-sources/build.gradle.kts
@@ -27,6 +27,10 @@ dependencies {
     implementation(libs.exposed.jdbc)
     implementation(libs.exposed.json)
     implementation(libs.exposed.money)
+
+    // (kotlin) spark support
+    implementation(libs.kotlin.spark)
+    compileOnly(libs.spark)
 }
 
 tasks.withType<KotlinCompile> {
diff --git a/examples/idea-examples/unsupported-data-sources/src/main/kotlin/org/jetbrains/kotlinx/dataframe/examples/kotlinSpark/typedDataset.kt b/examples/idea-examples/unsupported-data-sources/src/main/kotlin/org/jetbrains/kotlinx/dataframe/examples/kotlinSpark/typedDataset.kt
@@ -0,0 +1,50 @@
+package org.jetbrains.kotlinx.dataframe.examples.kotlinSpark
+
+import org.apache.spark.sql.Dataset
+import org.jetbrains.kotlinx.dataframe.annotations.DataSchema
+import org.jetbrains.kotlinx.dataframe.api.print
+import org.jetbrains.kotlinx.dataframe.api.toDataFrame
+import org.jetbrains.kotlinx.spark.api.withSpark
+
+@DataSchema
+data class Name(val firstName: String, val lastName: String)
+
+@DataSchema
+data class Person(
+    val name: Name,
+    val age: Int,
+    val city: String?,
+    val weight: Int?,
+    val isHappy: Boolean,
+)
+
+object TypedDataset {
+
+    /**
+     * With the Kotlin Spark API, normal Kotlin data classes are supported,
+     * meaning we can reuse the same class for Spark and DataFrame!
+     *
+     */
+    @JvmStatic
+    fun main(args: Array<String>): Unit =
+        withSpark {
+            // Spark Dataset
+            val rawDataset: Dataset<Person> = listOf(
+                Person(Name("Alice", "Cooper"), 15, "London", 54, true),
+                Person(Name("Bob", "Dylan"), 45, "Dubai", 87, true),
+                Person(Name("Charlie", "Daniels"), 20, "Moscow", null, false),
+                Person(Name("Charlie", "Chaplin"), 40, "Milan", null, true),
+                Person(Name("Bob", "Marley"), 30, "Tokyo", 68, true),
+                Person(Name("Alice", "Wolf"), 20, null, 55, false),
+                Person(Name("Charlie", "Byrd"), 30, "Moscow", 90, true),
+            ).toDS()
+
+            // we can perform large operations in Spark
+            val dataset = rawDataset.filter { it.age > 17 }
+
+            // and convert to DataFrame
+            val dataframe = dataset.collectAsList().toDataFrame()
+
+            dataframe.print(columnTypes = true, borders = true)
+        }
+}
diff --git a/examples/idea-examples/unsupported-data-sources/src/main/kotlin/org/jetbrains/kotlinx/dataframe/examples/kotlinSpark/untypedDataset.kt b/examples/idea-examples/unsupported-data-sources/src/main/kotlin/org/jetbrains/kotlinx/dataframe/examples/kotlinSpark/untypedDataset.kt
@@ -0,0 +1,5 @@
+package org.jetbrains.kotlinx.dataframe.examples.kotlinSpark
+
+fun main() {
+
+}
diff --git a/gradle/libs.versions.toml b/gradle/libs.versions.toml
@@ -64,6 +64,10 @@ jts = "1.20.0"
 kandy = "0.8.1-dev-66"
 exposed = "1.0.0-beta-2"
 
+# check the versions down in the [libraries] section too!
+kotlin-spark = "1.2.4"
+spark = "3.3.2"
+
 [libraries]
 ksp-gradle = { group = "com.google.devtools.ksp", name = "symbol-processing-gradle-plugin", version.ref = "ksp" }
 ksp-api = { group = "com.google.devtools.ksp", name = "symbol-processing-api", version.ref = "ksp" }
@@ -157,14 +161,17 @@ kotlin-jupyter-test-kit = { group = "org.jetbrains.kotlinx", name = "kotlin-jupy
 kotlinx-benchmark-runtime = { group = "org.jetbrains.kotlinx", name = "kotlinx-benchmark-runtime", version.ref = "benchmark" }
 dataframe-symbol-processor = { group = "org.jetbrains.kotlinx.dataframe", name = "symbol-processor-all" }
 
-duckdb-jdbc = { group = "org.duckdb", name = "duckdb_jdbc", version.ref= "duckdb"}
+duckdb-jdbc = { group = "org.duckdb", name = "duckdb_jdbc", version.ref = "duckdb" }
 
 exposed-core = { group = "org.jetbrains.exposed", name = "exposed-core", version.ref = "exposed" }
 exposed-jdbc = { group = "org.jetbrains.exposed", name = "exposed-jdbc", version.ref = "exposed" }
 exposed-kotlin-datetime = { group = "org.jetbrains.exposed", name = "exposed-kotlin-datetime", version.ref = "exposed" }
 exposed-json = { group = "org.jetbrains.exposed", name = "exposed-json", version.ref = "exposed" }
 exposed-money = { group = "org.jetbrains.exposed", name = "exposed-money", version.ref = "exposed" }
 
+kotlin-spark = { group = "org.jetbrains.kotlinx.spark", name = "kotlin-spark-api_3.3.2_2.13", version.ref = "kotlin-spark" }
+spark = { group = "org.apache.spark", name = "spark-sql_2.13", version.ref = "spark" }
+
 [plugins]
 jupyter-api = { id = "org.jetbrains.kotlin.jupyter.api", version.ref = "kotlinJupyter" }
 ksp = { id = "com.google.devtools.ksp", version.ref = "ksp" }

-Original file line number
+Diff line change
@@ @@ -0,0 +1,5 @@ @@
 +package org.jetbrains.kotlinx.dataframe.examples.kotlinSpark
++
 +fun main() {
++
 +}