snowpipe: support processors when evolving the schema

rockwotj · rockwotj · commit a8b4a346d765 · 2025-01-23T18:31:05.000Z
diff --git a/docs/modules/components/pages/outputs/snowflake_streaming.adoc b/docs/modules/components/pages/outputs/snowflake_streaming.adoc
@@ -62,6 +62,7 @@ output:
           this == "timestamp" => "TIMESTAMP"
           _ => "VARIANT"
         }
+      processors: [] # No default (optional)
     batching:
       count: 0
       byte_size: 0
@@ -103,6 +104,7 @@ output:
           this == "timestamp" => "TIMESTAMP"
           _ => "VARIANT"
         }
+      processors: [] # No default (optional)
     build_options:
       parallelism: 1
       chunk_size: 50000
@@ -463,13 +465,23 @@ Whether schema evolution is enabled.
 
 The mapping function from Redpanda Connect type to column type in Snowflake. Overriding this can allow for customization of the datatype if there is specific information that you know about the data types in use. This mapping should result in the `root` variable being assigned a string with the data type for the new column in Snowflake.
 
-The input to this mapping is an object with the value and the name of the new column, for example: `{"value": 42.3, "name":"new_data_field"}"
+The input to this mapping is the output of `processors` if specified; otherwise it is an object with the value and the name of the new column, the original message, and the table being written to. The metadata is unchanged from the original message that caused the schema to change. For example: `{"value": 42.3, "name":"new_data_field", "message": {"existing_data_field": 42, "new_data_field": "foo"}, "db": "MY_DATABASE", "schema": "MY_SCHEMA", "table": "MY_TABLE"}`
 
 
 *Type*: `string`
 
 *Default*: `"root = match this.value.type() {\n  this == \"string\" =\u003e \"STRING\"\n  this == \"bytes\" =\u003e \"BINARY\"\n  this == \"number\" =\u003e \"DOUBLE\"\n  this == \"bool\" =\u003e \"BOOLEAN\"\n  this == \"timestamp\" =\u003e \"TIMESTAMP\"\n  _ =\u003e \"VARIANT\"\n}"`
 
+=== `schema_evolution.processors`
+
+A series of processors to execute when new columns are added to the table. Specifying this can support running side effects when the schema evolves or enriching the message with additional information to guide the schema changes. For example, one could read the schema the message was produced with from the schema registry and use that to decide which type the new column in Snowflake should be.
+
+The input to these processors is an object with the value and the name of the new column, the original message, and the table being written to. The metadata is unchanged from the original message that caused the schema to change. For example: `{"value": 42.3, "name":"new_data_field", "message": {"existing_data_field": 42, "new_data_field": "foo"}, "db": "MY_DATABASE", "schema": "MY_SCHEMA", "table": "MY_TABLE"}`
+
+
+*Type*: `array`
+
+
 === `build_options`
 
 Options to optimize the time to build output data that is sent to Snowflake. The metric to watch to see if you need to change this is `snowflake_build_output_latency_ns`.
diff --git a/internal/impl/snowflake/output_snowflake_streaming.go b/internal/impl/snowflake/output_snowflake_streaming.go
@@ -49,6 +49,7 @@ const (
 	ssoFieldSchemaEvolution                     = "schema_evolution"
 	ssoFieldSchemaEvolutionEnabled              = "enabled"
 	ssoFieldSchemaEvolutionNewColumnTypeMapping = "new_column_type_mapping"
+	ssoFieldSchemaEvolutionProcessors           = "processors"
 
 	defaultSchemaEvolutionNewColumnMapping = `root = match this.value.type() {
   this == "string" => "STRING"
@@ -117,7 +118,11 @@ ALTER TABLE t1 ADD COLUMN a2 NUMBER;
 				service.NewBloblangField(ssoFieldSchemaEvolutionNewColumnTypeMapping).Description(`
 The mapping function from Redpanda Connect type to column type in Snowflake. Overriding this can allow for customization of the datatype if there is specific information that you know about the data types in use. This mapping should result in the `+"`root`"+` variable being assigned a string with the data type for the new column in Snowflake.
 
-The input to this mapping is an object with the value and the name of the new column, for example: `+"`"+`{"value": 42.3, "name":"new_data_field"}`+`"`).Default(defaultSchemaEvolutionNewColumnMapping),
+The input to this mapping is the output of `+"`processors`"+` if specified; otherwise it is an object with the value and the name of the new column, the original message, and the table being written to. The metadata is unchanged from the original message that caused the schema to change. For example: `+"`"+`{"value": 42.3, "name":"new_data_field", "message": {"existing_data_field": 42, "new_data_field": "foo"}, "db": "MY_DATABASE", "schema": "MY_SCHEMA", "table": "MY_TABLE"}`+"`").Default(defaultSchemaEvolutionNewColumnMapping),
+				service.NewProcessorListField(ssoFieldSchemaEvolutionProcessors).Description(`
+A series of processors to execute when new columns are added to the table. Specifying this can support running side effects when the schema evolves or enriching the message with additional information to guide the schema changes. For example, one could read the schema the message was produced with from the schema registry and use that to decide which type the new column in Snowflake should be.
+
+The input to these processors is an object with the value and the name of the new column, the original message, and the table being written to. The metadata is unchanged from the original message that caused the schema to change. For example: `+"`"+`{"value": 42.3, "name":"new_data_field", "message": {"existing_data_field": 42, "new_data_field": "foo"}, "db": "MY_DATABASE", "schema": "MY_SCHEMA", "table": "MY_TABLE"}`+"`").Optional(),
 			).Description(`Options to control schema evolution within the pipeline as new columns are added to the pipeline.`).Optional(),
 			service.NewIntField(ssoFieldBuildParallelism).Description("The maximum amount of parallelism to use when building the output for Snowflake. The metric to watch to see if you need to change this is `snowflake_build_output_latency_ns`.").Optional().Advanced().Deprecated(),
 			service.NewObjectField(ssoFieldBuildOpts,
@@ -398,14 +403,25 @@ func newSnowflakeStreamer(
 		}
 	}
 	var schemaEvolutionMapping *bloblang.Executor
+	var schemaEvolutionProcessors []*service.OwnedProcessor
 	if conf.Contains(ssoFieldSchemaEvolution, ssoFieldSchemaEvolutionEnabled) {
-		enabled, err := conf.FieldBool(ssoFieldSchemaEvolution, ssoFieldSchemaEvolutionEnabled)
-		if err == nil && enabled {
-			schemaEvolutionMapping, err = conf.FieldBloblang(ssoFieldSchemaEvolution, ssoFieldSchemaEvolutionNewColumnTypeMapping)
-		}
+		seConf := conf.Namespace(ssoFieldSchemaEvolution)
+		enabled, err := seConf.FieldBool(ssoFieldSchemaEvolutionEnabled)
 		if err != nil {
 			return nil, err
 		}
+		if enabled {
+			schemaEvolutionMapping, err = seConf.FieldBloblang(ssoFieldSchemaEvolutionNewColumnTypeMapping)
+			if err != nil {
+				return nil, err
+			}
+		}
+		if seConf.Contains(ssoFieldSchemaEvolutionProcessors) {
+			schemaEvolutionProcessors, err = seConf.FieldProcessorList(ssoFieldSchemaEvolutionProcessors)
+			if err != nil {
+				return nil, err
+			}
+		}
 	}
 
 	var buildOpts streaming.BuildOptions
@@ -512,6 +528,7 @@ func newSnowflakeStreamer(
 		if schemaEvolutionMapping != nil {
 			schemaEvolver = &snowpipeSchemaEvolver{
 				schemaEvolutionMapping: schemaEvolutionMapping,
+				pipeline:               schemaEvolutionProcessors,
 				restClient:             restClient,
 				logger:                 mgr.Logger(),
 				db:                     db,
diff --git a/internal/impl/snowflake/schema_evolution.go b/internal/impl/snowflake/schema_evolution.go
@@ -67,18 +67,46 @@ func asSchemaMigrationError(err error) (*schemaMigrationNeededError, bool) {
 
 type snowpipeSchemaEvolver struct {
 	schemaEvolutionMapping *bloblang.Executor
+	pipeline               []*service.OwnedProcessor
 	logger                 *service.Logger
 	// The evolver does not close nor own this rest client.
 	restClient              *streaming.SnowflakeRestClient
 	db, schema, table, role string
 }
 
-func (o *snowpipeSchemaEvolver) ComputeMissingColumnType(col *streaming.MissingColumnError) (string, error) {
-	msg := service.NewMessage(nil)
+func (o *snowpipeSchemaEvolver) ComputeMissingColumnType(ctx context.Context, col *streaming.MissingColumnError) (string, error) {
+	msg := col.Message().Copy()
+	original, err := msg.AsStructuredMut()
+	if err != nil {
+		// This should never happen, we had to get the data as structured to be able to know it was a missing column type
+		return "", fmt.Errorf("unable to extract JSON data from message that caused schema evolution: %w", err)
+	}
+	msg.SetError(nil) // Clear error
 	msg.SetStructuredMut(map[string]any{
-		"name":  col.RawName(),
-		"value": col.Value(),
+		"name":    col.RawName(),
+		"value":   col.Value(),
+		"message": original,
+		"db":      o.db,
+		"schema":  o.schema,
+		"table":   o.table,
 	})
+	if len(o.pipeline) > 0 {
+		batches, err := service.ExecuteProcessors(ctx, o.pipeline, service.MessageBatch{msg})
+		if err != nil {
+			return "", fmt.Errorf("failure to execute %s.%s prior to schema evolution: %w", ssoFieldSchemaEvolution, ssoFieldSchemaEvolutionProcessors, err)
+		}
+		if len(batches) != 1 {
+			return "", fmt.Errorf("expected a single batch output from %s.%s, got: %d", ssoFieldSchemaEvolution, ssoFieldSchemaEvolutionProcessors, len(batches))
+		}
+		batch := batches[0]
+		if len(batch) != 1 {
+			return "", fmt.Errorf("expected a single message output from %s.%s, got: %d", ssoFieldSchemaEvolution, ssoFieldSchemaEvolutionProcessors, len(batch))
+		}
+		msg = batch[0]
+		if err := msg.GetError(); err != nil {
+			return "", fmt.Errorf("message failure executing %s.%s prior to schema evolution: %w", ssoFieldSchemaEvolution, ssoFieldSchemaEvolutionProcessors, err)
+		}
+	}
 	out, err := msg.BloblangQuery(o.schemaEvolutionMapping)
 	if err != nil {
 		return "", fmt.Errorf("unable to compute new column type for %s: %w", col.ColumnName(), err)
@@ -95,7 +123,7 @@ func (o *snowpipeSchemaEvolver) ComputeMissingColumnType(col *streaming.MissingC
 }
 
 func (o *snowpipeSchemaEvolver) MigrateMissingColumn(ctx context.Context, col *streaming.MissingColumnError) error {
-	columnType, err := o.ComputeMissingColumnType(col)
+	columnType, err := o.ComputeMissingColumnType(ctx, col)
 	if err != nil {
 		return err
 	}
@@ -154,7 +182,7 @@ func (o *snowpipeSchemaEvolver) CreateOutputTable(ctx context.Context, batch ser
 	columns := []string{}
 	for k, v := range row {
 		col := streaming.NewMissingColumnError(msg, k, v)
-		colType, err := o.ComputeMissingColumnType(col)
+		colType, err := o.ComputeMissingColumnType(ctx, col)
 		if err != nil {
 			return err
 		}