Kotlin
diff --git a/‎build.gradle.kts
Lines changed: 1 addition & 1 deletion b/‎build.gradle.kts
Lines changed: 1 addition & 1 deletion
diff --git a/‎examples/idea-examples/unsupported-data-sources/build.gradle.kts
Lines changed: 18 additions & 2 deletions b/‎examples/idea-examples/unsupported-data-sources/build.gradle.kts
Lines changed: 18 additions & 2 deletions
diff --git a/‎examples/idea-examples/unsupported-data-sources/src/main/kotlin/org/jetbrains/kotlinx/dataframe/examples/kotlinSpark/compatibilityLayer.kt
Lines changed: 313 additions & 0 deletions b/‎examples/idea-examples/unsupported-data-sources/src/main/kotlin/org/jetbrains/kotlinx/dataframe/examples/kotlinSpark/compatibilityLayer.kt
Lines changed: 313 additions & 0 deletions
@@ -196,7 +196,7 @@ allprojects {
             logger.warn("Could not set ktlint config on :${this.name}")
         }
 
-        // set the java toolchain version to 11 for all subprojects for CI stability
+        // set the java toolchain version to 21 for all subprojects for CI stability
         extensions.findByType<KotlinJvmProjectExtension>()?.jvmToolchain(21)
 
         // Attempts to configure buildConfig for each sub-project that uses it
 
@@ -31,8 +31,24 @@ dependencies {
     // (kotlin) spark support
     implementation(libs.kotlin.spark)
     compileOnly(libs.spark)
+    implementation(libs.log4j.core)
+    implementation(libs.log4j.api)
 }
 
-tasks.withType<KotlinCompile> {
-    compilerOptions.jvmTarget = JvmTarget.JVM_1_8
+/**
+ * Registers a [JavaExec] task that launches the kotlinSpark/typedDataset example
+ * (main class `TypedDatasetKt`) with a Java 11 toolchain launcher,
+ * using the runtime classpath of the `main` source set.
+ */
+val runKotlinSparkTypedDataset by tasks.registering(JavaExec::class) {
+    mainClass = "org.jetbrains.kotlinx.dataframe.examples.kotlinSpark.TypedDatasetKt"
+    classpath = sourceSets["main"].runtimeClasspath
+    javaLauncher = javaToolchains.launcherFor {
+        languageVersion = JavaLanguageVersion.of(11)
+    }
+}
+
+/**
+ * Registers a [JavaExec] task that launches the kotlinSpark/untypedDataset example
+ * (main class `UntypedDatasetKt`) with a Java 11 toolchain launcher,
+ * using the runtime classpath of the `main` source set.
+ */
+val runKotlinSparkUntypedDataset by tasks.registering(JavaExec::class) {
+    mainClass = "org.jetbrains.kotlinx.dataframe.examples.kotlinSpark.UntypedDatasetKt"
+    classpath = sourceSets["main"].runtimeClasspath
+    javaLauncher = javaToolchains.launcherFor {
+        languageVersion = JavaLanguageVersion.of(11)
+    }
 }
@@ -0,0 +1,313 @@
+package org.jetbrains.kotlinx.dataframe.examples.kotlinSpark
+
+import org.apache.spark.api.java.JavaSparkContext
+import org.apache.spark.sql.Dataset
+import org.apache.spark.sql.Row
+import org.apache.spark.sql.RowFactory
+import org.apache.spark.sql.SparkSession
+import org.apache.spark.sql.types.ArrayType
+import org.apache.spark.sql.types.DataType
+import org.apache.spark.sql.types.DataTypes
+import org.apache.spark.sql.types.Decimal
+import org.apache.spark.sql.types.DecimalType
+import org.apache.spark.sql.types.MapType
+import org.apache.spark.sql.types.StructType
+import org.apache.spark.unsafe.types.CalendarInterval
+import org.jetbrains.kotlinx.dataframe.AnyFrame
+import org.jetbrains.kotlinx.dataframe.DataColumn
+import org.jetbrains.kotlinx.dataframe.DataFrame
+import org.jetbrains.kotlinx.dataframe.DataRow
+import org.jetbrains.kotlinx.dataframe.api.rows
+import org.jetbrains.kotlinx.dataframe.api.schema
+import org.jetbrains.kotlinx.dataframe.api.toDataFrame
+import org.jetbrains.kotlinx.dataframe.columns.ColumnGroup
+import org.jetbrains.kotlinx.dataframe.columns.TypeSuggestion
+import org.jetbrains.kotlinx.dataframe.schema.ColumnSchema
+import org.jetbrains.kotlinx.dataframe.schema.DataFrameSchema
+import org.jetbrains.kotlinx.spark.api.toRDD
+import java.math.BigDecimal
+import java.math.BigInteger
+import java.sql.Date
+import java.sql.Timestamp
+import java.time.Instant
+import java.time.LocalDate
+import kotlin.reflect.KType
+import kotlin.reflect.KTypeProjection
+import kotlin.reflect.full.createType
+import kotlin.reflect.full.isSubtypeOf
+import kotlin.reflect.full.withNullability
+import kotlin.reflect.typeOf
+
+// region Spark to DataFrame
+
+/**
+ * Converts an untyped Spark [Dataset] (Dataframe) to a Kotlin [DataFrame], letting DataFrame infer
+ * the type of every leaf column from its collected values.
+ *
+ * [StructTypes][StructType] are converted to [ColumnGroups][ColumnGroup] by recursing into the struct
+ * with the struct's field name appended to [prefix]. Every other field is selected by its full path,
+ * collected, and turned into a column using [TypeSuggestion.Infer].
+ *
+ * Each path segment is quoted with backticks before it is passed to [Dataset.select], so a field name
+ * that contains dots or other special characters is treated as one name instead of a nested path.
+ *
+ * This is usually fine for smaller data sets, but when working with larger datasets a type map might be
+ * a good idea. See [convertToDataFrame] for more information.
+ *
+ * @param schema The (sub)schema whose fields are converted; defaults to the schema of this [Dataset].
+ * @param prefix The names of the enclosing struct fields, outermost first; empty at the top level.
+ * @return A [DataFrame] with one column per field of [schema].
+ */
+fun Dataset<Row>.convertToDataFrameByInference(
+    schema: StructType = schema(),
+    prefix: List<String> = emptyList(),
+): AnyFrame {
+    val columns = schema.fields().map { field ->
+        val name = field.name()
+        when (val dataType = field.dataType()) {
+            is StructType ->
+                DataColumn.createColumnGroup(
+                    name = name,
+                    df = convertToDataFrameByInference(dataType, prefix + name),
+                )
+
+            else -> {
+                // Quote every segment (doubling embedded backticks) so Spark resolves the path
+                // segment by segment instead of splitting a field name on the dots it contains.
+                val columnPath = (prefix + name).joinToString(".") { "`${it.replace("`", "``")}`" }
+                DataColumn.createByInference(
+                    name = name,
+                    values = select(columnPath)
+                        .collectAsList()
+                        .map { it[0] },
+                    suggestedType = TypeSuggestion.Infer,
+                    nullable = field.nullable(),
+                )
+            }
+        }
+    }
+    return columns.toDataFrame()
+}
+
+/**
+ * Converts an untyped Spark [Dataset] (Dataframe) to a Kotlin [DataFrame].
+ * [StructTypes][StructType] are converted to [ColumnGroups][ColumnGroup] by recursing into the struct
+ * with the struct's field name appended to [prefix].
+ *
+ * This version uses a [type-map][DataType.convertToDataFrame] to convert the schemas with a fallback to
+ * inference for Spark types the type-map does not know.
+ * For smaller data sets, inference is usually fine too.
+ * See [convertToDataFrameByInference] for more information.
+ *
+ * Each path segment is quoted with backticks before it is passed to [Dataset.select], so a field name
+ * that contains dots or other special characters is treated as one name instead of a nested path.
+ *
+ * @param schema The (sub)schema whose fields are converted; defaults to the schema of this [Dataset].
+ * @param prefix The names of the enclosing struct fields, outermost first; empty at the top level.
+ * @return A [DataFrame] with one column per field of [schema].
+ */
+fun Dataset<Row>.convertToDataFrame(schema: StructType = schema(), prefix: List<String> = emptyList()): AnyFrame {
+    val columns = schema.fields().map { field ->
+        val name = field.name()
+        when (val dataType = field.dataType()) {
+            is StructType ->
+                DataColumn.createColumnGroup(
+                    name = name,
+                    df = convertToDataFrame(dataType, prefix + name),
+                )
+
+            else -> {
+                // Quote every segment (doubling embedded backticks) so Spark resolves the path
+                // segment by segment instead of splitting a field name on the dots it contains.
+                val columnPath = (prefix + name).joinToString(".") { "`${it.replace("`", "``")}`" }
+                DataColumn.createByInference(
+                    name = name,
+                    values = select(columnPath)
+                        .collectAsList()
+                        .map { it[0] },
+                    suggestedType =
+                        dataType.convertToDataFrame()
+                            ?.let(TypeSuggestion::Use)
+                            ?: TypeSuggestion.Infer, // fallback to inference if needed
+                    nullable = field.nullable(),
+                )
+            }
+        }
+    }
+    return columns.toDataFrame()
+}
+
+/**
+ * Maps a Spark [DataType] to the Kotlin [KType] used for the matching DataFrame column.
+ *
+ * Handles the atomic Spark types listed below, arrays of short, int, long, float, double and boolean
+ * elements (mapped to the corresponding Kotlin primitive arrays) and maps whose key and value types
+ * can themselves be mapped. This list may be incomplete, but it can at least give you a good start.
+ *
+ * @return The [KType] that corresponds to the Spark DataType, or `null` if no matching KType is found.
+ */
+fun DataType.convertToDataFrame(): KType? =
+    when (this) {
+        DataTypes.ByteType -> typeOf<Byte>()
+        DataTypes.ShortType -> typeOf<Short>()
+        DataTypes.IntegerType -> typeOf<Int>()
+        DataTypes.LongType -> typeOf<Long>()
+        DataTypes.BooleanType -> typeOf<Boolean>()
+        DataTypes.FloatType -> typeOf<Float>()
+        DataTypes.DoubleType -> typeOf<Double>()
+        DataTypes.StringType -> typeOf<String>()
+        DataTypes.DateType -> typeOf<Date>()
+        DataTypes.TimestampType -> typeOf<Timestamp>()
+        is DecimalType -> typeOf<Decimal>()
+        DataTypes.CalendarIntervalType -> typeOf<CalendarInterval>()
+        DataTypes.NullType -> nullableNothingType
+        DataTypes.BinaryType -> typeOf<ByteArray>()
+
+        // Only element types with a Kotlin primitive-array counterpart are mapped.
+        is ArrayType ->
+            when (elementType()) {
+                DataTypes.ShortType -> typeOf<ShortArray>()
+                DataTypes.IntegerType -> typeOf<IntArray>()
+                DataTypes.LongType -> typeOf<LongArray>()
+                DataTypes.FloatType -> typeOf<FloatArray>()
+                DataTypes.DoubleType -> typeOf<DoubleArray>()
+                DataTypes.BooleanType -> typeOf<BooleanArray>()
+                else -> null
+            }
+
+        is MapType -> {
+            // A map can only be mapped when both its key and its value type can be.
+            val keyKType = keyType().convertToDataFrame() ?: return null
+            val valueKType = valueType().convertToDataFrame() ?: return null
+            val typeArguments = listOf(
+                KTypeProjection.invariant(keyKType),
+                KTypeProjection.invariant(valueKType.withNullability(valueContainsNull())),
+            )
+            Map::class.createType(typeArguments)
+        }
+
+        else -> null
+    }
+
+// endregion
+
+// region DataFrame to Spark
+
+/**
+ * Turns this [DataFrame] into a Spark [Dataset] of [Rows][Row].
+ *
+ * Spark needs both the data and the schema to create a correct [Dataset]: every row is converted to a
+ * Spark [Row] and put in an RDD, and the DataFrame schema is converted to a Spark [StructType].
+ *
+ * @param spark The SparkSession object to use for creating the DataFrame.
+ * @param sc The JavaSparkContext object to use for converting the DataFrame to RDD.
+ * @return A Dataset of Rows representing the converted DataFrame.
+ */
+fun DataFrame<*>.convertToSpark(spark: SparkSession, sc: JavaSparkContext): Dataset<Row> {
+    val sparkRows = rows().map { dataRow -> dataRow.convertToSpark() }
+    val rowRdd = sc.toRDD(sparkRows)
+    val sparkSchema = schema().convertToSpark()
+    return spark.createDataFrame(rowRdd, sparkSchema)
+}
+
+/**
+ * Builds a Spark [Row] from the values of this [DataRow].
+ *
+ * Values that are themselves a [DataRow] (the cells of a column group) are converted recursively,
+ * all other values are passed to Spark as they are.
+ *
+ * @return The converted Spark Row.
+ */
+fun DataRow<*>.convertToSpark(): Row {
+    val cells = values().map { cell ->
+        if (cell is DataRow<*>) cell.convertToSpark() else cell
+    }
+    return RowFactory.create(*cells.toTypedArray())
+}
+
+/**
+ * Builds a Spark [StructType] from this [DataFrameSchema].
+ *
+ * Every column becomes one struct field that carries the column's name, its converted Spark
+ * [DataType] and its nullability.
+ *
+ * @return The converted Spark StructType.
+ */
+fun DataFrameSchema.convertToSpark(): StructType {
+    val structFields = columns.map { (columnName, columnSchema) ->
+        DataTypes.createStructField(columnName, columnSchema.convertToSpark(), columnSchema.nullable)
+    }
+    return DataTypes.createStructType(structFields)
+}
+
+/**
+ * Converts a ColumnSchema object to Spark DataType.
+ *
+ * Value columns are mapped through [KType.convertToSpark] and column groups become a [StructType]
+ * built from the group's nested schema. Frame columns (nested dataframes) are not supported.
+ *
+ * @return The Spark DataType corresponding to the given ColumnSchema object.
+ * @throws IllegalStateException if the value type has no known Spark counterpart, if the column is a
+ *   frame column, or if the column kind is unknown.
+ */
+fun ColumnSchema.convertToSpark(): DataType =
+    when (this) {
+        is ColumnSchema.Value -> type.convertToSpark() ?: error("unknown data type: $type")
+        is ColumnSchema.Group -> schema.convertToSpark()
+        is ColumnSchema.Frame -> error("nested dataframes are not supported")
+        else -> error("unknown column kind: $this")
+    }
+
+/**
+ * Returns the corresponding Spark DataType for a given Kotlin type.
+ *
+ * Nullable and non-nullable variants of a type map to the same Spark type; nullability itself is
+ * carried by the struct field or map that contains the type.
+ *
+ * This list may be incomplete, but it can at least give you a good start.
+ *
+ * @return The Spark DataType that corresponds to the Kotlin type, or null if no matching DataType is found.
+ *   A [Map] whose key or value type is a star projection or has no matching DataType also yields null.
+ * @throws IllegalStateException for non-primitive arrays, lists and sets, which are not supported.
+ */
+fun KType.convertToSpark(): DataType? =
+    when {
+        // `Nothing?` is a subtype of every nullable type, so this check has to come first;
+        // otherwise the `Byte?` branch below would already match it.
+        isSubtypeOf(nullableNothingType) -> DataTypes.NullType
+
+        isSubtypeOf(typeOf<Byte?>()) -> DataTypes.ByteType
+
+        isSubtypeOf(typeOf<Short?>()) -> DataTypes.ShortType
+
+        isSubtypeOf(typeOf<Int?>()) -> DataTypes.IntegerType
+
+        isSubtypeOf(typeOf<Long?>()) -> DataTypes.LongType
+
+        isSubtypeOf(typeOf<Boolean?>()) -> DataTypes.BooleanType
+
+        isSubtypeOf(typeOf<Float?>()) -> DataTypes.FloatType
+
+        isSubtypeOf(typeOf<Double?>()) -> DataTypes.DoubleType
+
+        isSubtypeOf(typeOf<String?>()) -> DataTypes.StringType
+
+        isSubtypeOf(typeOf<LocalDate?>()) -> DataTypes.DateType
+
+        isSubtypeOf(typeOf<Date?>()) -> DataTypes.DateType
+
+        isSubtypeOf(typeOf<Timestamp?>()) -> DataTypes.TimestampType
+
+        isSubtypeOf(typeOf<Instant?>()) -> DataTypes.TimestampType
+
+        isSubtypeOf(typeOf<Decimal?>()) -> DecimalType.SYSTEM_DEFAULT()
+
+        isSubtypeOf(typeOf<BigDecimal?>()) -> DecimalType.SYSTEM_DEFAULT()
+
+        isSubtypeOf(typeOf<BigInteger?>()) -> DecimalType.SYSTEM_DEFAULT()
+
+        isSubtypeOf(typeOf<CalendarInterval?>()) -> DataTypes.CalendarIntervalType
+
+        isSubtypeOf(typeOf<ByteArray?>()) -> DataTypes.BinaryType
+
+        isSubtypeOf(typeOf<ShortArray?>()) -> DataTypes.createArrayType(DataTypes.ShortType, false)
+
+        isSubtypeOf(typeOf<IntArray?>()) -> DataTypes.createArrayType(DataTypes.IntegerType, false)
+
+        isSubtypeOf(typeOf<LongArray?>()) -> DataTypes.createArrayType(DataTypes.LongType, false)
+
+        isSubtypeOf(typeOf<FloatArray?>()) -> DataTypes.createArrayType(DataTypes.FloatType, false)
+
+        isSubtypeOf(typeOf<DoubleArray?>()) -> DataTypes.createArrayType(DataTypes.DoubleType, false)
+
+        isSubtypeOf(typeOf<BooleanArray?>()) -> DataTypes.createArrayType(DataTypes.BooleanType, false)
+
+        isSubtypeOf(typeOf<Array<*>>()) ->
+            error("non-primitive arrays are not supported for now, you can add it yourself")
+
+        isSubtypeOf(typeOf<List<*>>()) -> error("lists are not supported for now, you can add it yourself")
+
+        isSubtypeOf(typeOf<Set<*>>()) -> error("sets are not supported for now, you can add it yourself")
+
+        classifier == Map::class -> {
+            val (key, value) = arguments
+            // A star projection has no type, and a type without a Spark counterpart converts to null;
+            // in both cases the map as a whole has no matching Spark type.
+            val keyType = key.type?.convertToSpark() ?: return null
+            val valueKType = value.type ?: return null
+            val valueType = valueKType.convertToSpark() ?: return null
+            DataTypes.createMapType(keyType, valueType, valueKType.isMarkedNullable)
+        }
+
+        else -> null
+    }
+
+/**
+ * The [KType] of `Nothing?`, taken from the single type argument of `List<Nothing?>`.
+ *
+ * The `!!` is safe here: the argument is a regular (non-star) projection, so it always has a type.
+ */
+private val nullableNothingType: KType = typeOf<List<Nothing?>>().arguments.single().type!!
+
+// endregion
Original file line number	Diff line number	Diff line change
`@@ -196,7 +196,7 @@ allprojects {`
`196`	`196`	`logger.warn("Could not set ktlint config on :${this.name}")`
`197`	`197`	`}`
`198`	`198`
`199`		`- // set the java toolchain version to 11 for all subprojects for CI stability`
	`199`	`+ // set the java toolchain version to 21 for all subprojects for CI stability`
`200`	`200`	`extensions.findByType<KotlinJvmProjectExtension>()?.jvmToolchain(21)`
`201`	`201`
`202`	`202`	`// Attempts to configure buildConfig for each sub-project that uses it`