Commit dfd7740: introduced incremental schema evolution
1 parent dd97de2

9 files changed (+1128, -29 lines)
New file: SchemaEvolution.java (+22 lines)

/*
 * Copyright 2023 Celonis SE
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.celonis.kafka.connect.schema;

import org.apache.kafka.connect.data.Schema;

public interface SchemaEvolution {
  Schema evolve(Schema currentSchema, Schema recordSchema) throws SchemaEvolutionException;
}
New file: SchemaEvolutionException.java (+22 lines)

/* Apache License 2.0 header, identical to SchemaEvolution.java above */
package com.celonis.kafka.connect.schema;

public final class SchemaEvolutionException extends RuntimeException {
  public SchemaEvolutionException(String message) {
    super(message);
  }
}
New file: SchemaUtils.java (+36 lines)

/* Apache License 2.0 header, identical to SchemaEvolution.java above */
package com.celonis.kafka.connect.schema;

import org.apache.kafka.connect.data.Schema;
import org.apache.kafka.connect.data.SchemaBuilder;

import java.util.Optional;

public class SchemaUtils {
  public static SchemaBuilder withMetadata(SchemaBuilder builder, Schema schema) {
    Optional.ofNullable(schema.parameters())
        .filter(params -> !params.isEmpty())
        .ifPresent(builder::parameters);

    Optional.ofNullable(schema.name()).ifPresent(builder::name);
    Optional.ofNullable(schema.doc()).ifPresent(builder::doc);
    Optional.ofNullable(schema.defaultValue()).ifPresent(builder::defaultValue);
    Optional.ofNullable(schema.version()).ifPresent(builder::version);

    return builder;
  }
}
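A short usage sketch may help here; the schema name and field below are assumptions for illustration, not part of the commit. withMetadata lets a caller rebuild a schema on a fresh builder without losing its name, doc, version, default value, or parameters:

import com.celonis.kafka.connect.schema.SchemaUtils;
import org.apache.kafka.connect.data.Schema;
import org.apache.kafka.connect.data.SchemaBuilder;

public class SchemaUtilsExample {           // hypothetical demo class, not in the commit
  public static void main(String[] args) {
    Schema original = SchemaBuilder.struct()
        .name("com.example.Order")          // assumed example name
        .doc("An example order event")      // assumed example doc
        .version(1)
        .field("id", Schema.STRING_SCHEMA)
        .build();

    // Copy name/doc/version/parameters onto a fresh builder, then rebuild.
    Schema rebuilt = SchemaUtils.withMetadata(SchemaBuilder.struct(), original)
        .field("id", Schema.STRING_SCHEMA)
        .build();

    System.out.println(rebuilt.name());     // com.example.Order
    System.out.println(rebuilt.version());  // 1
  }
}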
New file: StructSchemaAlignment.java (+76 lines)

/* Apache License 2.0 header, identical to SchemaEvolution.java above */
package com.celonis.kafka.connect.schema;

import org.apache.kafka.connect.data.Schema;
import org.apache.kafka.connect.data.Struct;

import java.util.Collection;
import java.util.Map;
import java.util.stream.Collectors;

public final class StructSchemaAlignment {
  /**
   * Aligns an object to the schema accumulated by incrementally calling SchemaEvolution#evolve.
   *
   * <p>NOTE: this is needed to avoid unnecessarily frequent flushes of output Parquet files,
   * caused by JSON schema inference producing different record schemas for what is effectively
   * the same data (see `EmsOutputRecordSink#put`).
   *
   * @param evolvedSchema the schema this value should align to. Must be a superset of the
   *     supplied struct's schema.
   * @param value the current SinkRecord input struct
   * @return a struct with the evolved schema
   */
  public static Struct alignTo(Schema evolvedSchema, Struct value) {
    return (Struct) align(evolvedSchema, value);
  }

  private static Object align(Schema evolvedSchema, Object value) {
    switch (evolvedSchema.type()) {
      case ARRAY:
        final var collection = (Collection<?>) value;
        return collection.stream()
            .map(item -> align(evolvedSchema.valueSchema(), item))
            .collect(Collectors.toList());

      case MAP:
        final var map = (Map<?, ?>) value;
        return map.entrySet().stream()
            .collect(
                Collectors.toMap(
                    entry -> align(evolvedSchema.keySchema(), entry.getKey()),
                    entry -> align(evolvedSchema.valueSchema(), entry.getValue())));

      case STRUCT:
        final var structValue = (Struct) value;
        if (structValue.schema() == evolvedSchema) return structValue;

        // Copy only the fields present in the input struct; fields added by
        // evolution are left unset (null) in the aligned struct.
        final var newStruct = new Struct(evolvedSchema);
        for (final var evolvedField : evolvedSchema.fields()) {
          if (structValue.schema().field(evolvedField.name()) != null) {
            newStruct.put(
                evolvedField.name(),
                align(evolvedField.schema(), structValue.get(evolvedField.name())));
          }
        }
        return newStruct;

      default:
        return value;
    }
  }
}
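To make the alignment concrete, here is a hedged sketch; the schemas and field names are invented for the example. A struct captured under an earlier, narrower schema is rewritten against the evolved superset schema, and the field it never carried stays unset:

import com.celonis.kafka.connect.schema.StructSchemaAlignment;
import org.apache.kafka.connect.data.Schema;
import org.apache.kafka.connect.data.SchemaBuilder;
import org.apache.kafka.connect.data.Struct;

public class AlignmentExample {             // hypothetical demo class, not in the commit
  public static void main(String[] args) {
    // A record arrives with a narrower schema than the evolved one.
    Schema v1 = SchemaBuilder.struct().field("id", Schema.STRING_SCHEMA).build();
    Schema evolved = SchemaBuilder.struct()
        .field("id", Schema.STRING_SCHEMA)
        .field("amount", Schema.OPTIONAL_FLOAT64_SCHEMA)
        .build();

    Struct record = new Struct(v1).put("id", "A-1");

    // Rewrite the struct against the evolved superset schema;
    // the field it never had stays unset (null).
    Struct aligned = StructSchemaAlignment.alignTo(evolved, record);
    System.out.println(aligned.schema() == evolved);  // true
    System.out.println(aligned.get("amount"));        // null
  }
}

Note that fields introduced by evolution end up null in the aligned struct, so they should be optional (or carry defaults) if Struct#validate runs downstream.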
New file: StructSchemaEvolution.java (+127 lines)

/* Apache License 2.0 header, identical to SchemaEvolution.java above */
package com.celonis.kafka.connect.schema;

import org.apache.kafka.connect.data.Schema;
import org.apache.kafka.connect.data.SchemaBuilder;

import java.util.Objects;

/**
 * StructSchemaEvolution recursively merges the existing schema with the schema of each new
 * record, evolving the accumulated schema as records arrive from the connector.
 */
public class StructSchemaEvolution implements SchemaEvolution {

  /**
   * Merges two top-level Kafka Connect Struct schemas.
   *
   * @param currentSchema existing schema, must be of type Struct
   * @param recordSchema schema of the new record, must be of type Struct
   * @return the schema resulting from merging the existing and new schemas recursively
   */
  @Override
  public Schema evolve(Schema currentSchema, Schema recordSchema) throws SchemaEvolutionException {
    if (currentSchema == recordSchema) return currentSchema;

    // RecordTransformer ensures that the top-level schemas are of type Struct.
    return mergeSchemas(null, currentSchema, recordSchema);
  }

  /**
   * Merges two Kafka Connect schemas.
   *
   * @param fieldName current field name, `null` when the recursion starts
   * @param currentSchema existing schema (accepted types: MAP, ARRAY, STRUCT)
   * @param recordSchema schema of the new record (accepted types: MAP, ARRAY, STRUCT)
   * @return the schema resulting from merging the existing and new schemas recursively
   */
  private Schema mergeSchemas(String fieldName, Schema currentSchema, Schema recordSchema) {
    // Validate types before merging.
    validateSchemasTypes(fieldName, currentSchema, recordSchema);

    switch (currentSchema.type()) {
      case STRUCT:
        return mergeStructs(currentSchema, recordSchema);
      case ARRAY:
        return SchemaBuilder.array(
                mergeSchemas(fieldName, currentSchema.valueSchema(), recordSchema.valueSchema()))
            .build();
      case MAP:
        var keySchema =
            mergeSchemas(fieldName, currentSchema.keySchema(), recordSchema.keySchema());
        var valueSchema =
            mergeSchemas(fieldName, currentSchema.valueSchema(), recordSchema.valueSchema());
        return SchemaBuilder.map(keySchema, valueSchema).build();
      default:
        return currentSchema;
    }
  }

  private Schema mergeStructs(Schema currentSchema, Schema recordSchema)
      throws SchemaEvolutionException {
    SchemaBuilder result = SchemaUtils.withMetadata(SchemaBuilder.struct(), currentSchema);

    // Current schema fields first: keep fields absent from the record schema
    // unchanged, and recursively merge the ones present in both.
    currentSchema
        .fields()
        .forEach(
            currentSchemaField -> {
              final var recordSchemaField = recordSchema.field(currentSchemaField.name());
              if (recordSchemaField == null) {
                result.field(currentSchemaField.name(), currentSchemaField.schema());
              } else {
                result.field(
                    currentSchemaField.name(),
                    mergeSchemas(
                        currentSchemaField.name(),
                        currentSchemaField.schema(),
                        recordSchemaField.schema()));
              }
            });

    // Then add fields that appear only in the record schema, as they are.
    recordSchema.fields().stream()
        .filter(rf -> currentSchema.field(rf.name()) == null)
        .forEach(rf -> result.field(rf.name(), rf.schema()));

    return result.build();
  }

  private void validateSchemasTypes(String fieldName, Schema currentSchema, Schema recordSchema) {
    if (bothPrimitives(currentSchema, recordSchema) && !sameLogicalType(currentSchema, recordSchema)
        || !currentSchema.type().equals(recordSchema.type())) {

      throw new SchemaEvolutionException(
          String.format(
              "New schema has field '%s' with a different type! "
                  + "previous type: %s, current type: %s",
              fieldName, currentSchema, recordSchema));
    }
  }

  private boolean bothPrimitives(Schema s1, Schema s2) {
    return s1.type().isPrimitive() && s2.type().isPrimitive();
  }

  private boolean sameLogicalType(Schema s1, Schema s2) {
    return Objects.equals(s1.type(), s2.type())
        && Objects.equals(s1.name(), s2.name())
        && Objects.equals(s1.version(), s2.version())
        && Objects.equals(s1.parameters(), s2.parameters());
  }
}
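Taken together, the merge rules are easiest to see with a small example. The following sketch (field names are invented, not from the commit) shows a merge producing the union of fields, and a primitive type change being rejected:

import com.celonis.kafka.connect.schema.SchemaEvolution;
import com.celonis.kafka.connect.schema.SchemaEvolutionException;
import com.celonis.kafka.connect.schema.StructSchemaEvolution;
import org.apache.kafka.connect.data.Schema;
import org.apache.kafka.connect.data.SchemaBuilder;

public class EvolutionExample {             // hypothetical demo class, not in the commit
  public static void main(String[] args) {
    SchemaEvolution evolution = new StructSchemaEvolution();

    Schema first = SchemaBuilder.struct().field("id", Schema.STRING_SCHEMA).build();
    Schema second = SchemaBuilder.struct()
        .field("id", Schema.STRING_SCHEMA)
        .field("amount", Schema.FLOAT64_SCHEMA)
        .build();

    // Merging yields the union of fields: id, amount.
    Schema merged = evolution.evolve(first, second);
    System.out.println(merged.fields());

    // A primitive type change for an existing field is rejected.
    Schema conflicting = SchemaBuilder.struct().field("id", Schema.INT64_SCHEMA).build();
    try {
      evolution.evolve(merged, conflicting);
    } catch (SchemaEvolutionException e) {
      System.out.println(e.getMessage());   // "... field 'id' with a different type ..."
    }
  }
}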

Diff for: connector/src/main/scala/com/celonis/kafka/connect/transform/RecordTransformer.scala (+44 -29)

@@ -25,12 +25,14 @@ import com.celonis.kafka.connect.ems.errors.FailedObfuscationException
 import com.celonis.kafka.connect.ems.model._
 import com.celonis.kafka.connect.ems.obfuscation.ObfuscationUtils._
 import com.celonis.kafka.connect.ems.storage.PrimaryKeysValidator
+import com.celonis.kafka.connect.schema.{StructSchemaAlignment, StructSchemaEvolution}
 import com.celonis.kafka.connect.transform.conversion.ConnectConversion
 import com.celonis.kafka.connect.transform.fields.EmbeddedKafkaMetadata
 import com.celonis.kafka.connect.transform.fields.FieldInserter
 import com.celonis.kafka.connect.transform.flatten.Flattener
 import com.typesafe.scalalogging.StrictLogging
 import org.apache.avro.generic.GenericRecord
+import org.apache.kafka.connect.data.{Schema, SchemaBuilder, Struct}
 import org.apache.kafka.connect.sink.SinkRecord

 /** The main business transformation.
@@ -45,9 +47,13 @@ final class RecordTransformer(
   obfuscation: Option[ObfuscationConfig],
   inserter: FieldInserter,
 ) extends StrictLogging {
+
+  private var targetSchema: Schema = SchemaBuilder.struct().build();
+  private val schemaEvolution = new StructSchemaEvolution();
+
   def transform(sinkRecord: SinkRecord): IO[GenericRecord] = {
     val (convertedValue, convertedSchema) = preConversion.convert(sinkRecord.value(), Option(sinkRecord.valueSchema()))
-    val flattenedValue = flattener.flatten(convertedValue, convertedSchema)
+    val flattenedValue                     = flattener.flatten(convertedValue, convertedSchema)

     for {
       transformedValue <- IO(
@@ -56,7 +62,8 @@
           EmbeddedKafkaMetadata(sinkRecord.kafkaPartition(), sinkRecord.kafkaOffset(), sinkRecord.timestamp()),
         ),
       )
-      v <- IO.fromEither(DataConverter.apply(transformedValue))
+      schemaAlignedValue = evolveSchemaAndAlignValue(transformedValue)
+      v                 <- IO.fromEither(DataConverter.apply(schemaAlignedValue))
       _ <- IO(logger.debug("[{}] EmsSinkTask:put obfuscation={}", sinkName, obfuscation))
       value <- obfuscation.fold(IO.pure(v)) { o =>
         IO.fromEither(v.obfuscate(o).leftMap(FailedObfuscationException))
@@ -68,34 +75,42 @@
       _ <- IO.fromEither(pksValidator.validate(value, metadata))
     } yield value
   }
+
+  private def evolveSchemaAndAlignValue(value: Any): Any =
+    value match {
+      case struct: Struct =>
+        targetSchema = schemaEvolution.evolve(targetSchema, struct.schema())
+        StructSchemaAlignment.alignTo(targetSchema, struct)
+      case _ => value
+    }
 }

The remainder of this hunk removes and re-adds the RecordTransformer companion object with changed indentation only; its content is unchanged:

 object RecordTransformer {
   def fromConfig(
     sinkName: String,
     preConversionConfig: PreConversionConfig,
     flattenerConfig: Option[FlattenerConfig],
     primaryKeys: List[String],
     obfuscation: Option[ObfuscationConfig],
     allowNullsAsPks: Boolean,
     inserter: FieldInserter): RecordTransformer =
     new RecordTransformer(
       sinkName,
       ConnectConversion.fromConfig(preConversionConfig),
       Flattener.fromConfig(flattenerConfig),
       new PrimaryKeysValidator(primaryKeys, allowNullsAsPks),
       obfuscation,
       inserter,
     )

   def fromConfig(config: EmsSinkConfig): RecordTransformer =
     fromConfig(
       config.sinkName,
       config.preConversionConfig,
       config.flattenerConfig,
       config.primaryKeys,
       config.obfuscation,
       config.allowNullsAsPks,
       FieldInserter.embeddedKafkaMetadata(config.embedKafkaMetadata, config.orderField.name),
     )
 }

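The Scala diff above gives RecordTransformer a mutable targetSchema and routes every struct through an evolve-then-align step. As a cross-language illustration only, here is a minimal Java sketch of that per-record loop; the demo class and record contents are invented:

import com.celonis.kafka.connect.schema.SchemaEvolution;
import com.celonis.kafka.connect.schema.StructSchemaAlignment;
import com.celonis.kafka.connect.schema.StructSchemaEvolution;
import org.apache.kafka.connect.data.Schema;
import org.apache.kafka.connect.data.SchemaBuilder;
import org.apache.kafka.connect.data.Struct;

import java.util.List;

public class EvolveThenAlignSketch {        // hypothetical demo class, not in the commit
  private Schema targetSchema = SchemaBuilder.struct().build();
  private final SchemaEvolution evolution = new StructSchemaEvolution();

  // Mirrors RecordTransformer#evolveSchemaAndAlignValue: grow the accumulated
  // schema, then rewrite the struct against it so equal data keeps equal schemas.
  Struct evolveAndAlign(Struct struct) {
    targetSchema = evolution.evolve(targetSchema, struct.schema());
    return StructSchemaAlignment.alignTo(targetSchema, struct);
  }

  public static void main(String[] args) {
    var sketch = new EvolveThenAlignSketch();
    Schema s = SchemaBuilder.struct().field("id", Schema.STRING_SCHEMA).build();
    for (Struct in : List.of(new Struct(s).put("id", "A-1"), new Struct(s).put("id", "A-2"))) {
      Struct out = sketch.evolveAndAlign(in);
      System.out.println(out.schema() == sketch.targetSchema); // same accumulated schema
    }
  }
}

Because every struct is realigned to the single accumulated schema, consecutive records share schema identity, which is what lets the sink avoid flushing a new Parquet file for each inferred schema variant.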