SeldonIO · lc525 · Oct 7, 2025 · Oct 7, 2025 · Oct 7, 2025 · Oct 7, 2025
@@ -9061,6 +9061,44 @@ spec:
                           x-kubernetes-int-or-string: true
                         type: object
                     type: object
+                  scalingConfig:
+                    description: Control scaling parameters for various components
+                    properties:
+                      models:
+                        properties:
+                          enabled:
+                            type: boolean
+                        type: object
+                      pipelines:
+                        description: Scaling config impacting pipeline-gateway, dataflow-engine
+                          and model-gateway
+                        properties:
+                          maxShardCountMultiplier:
+                            description: |-
+                              MaxShardCountMultiplier influences the way the inferencing workload is sharded over the
+                              replicas of pipeline components.
+
+                              - For each of pipeline-gateway and dataflow-engine, the max number of replicas is
+                                `maxShardCountMultiplier * number of pipelines`
+                              - For model-gateway, the max number of replicas is
+                                `maxShardCountMultiplier * number of consumers`
+
+                              It doesn't make sense to set this to a value larger than the number of partitions for Kafka
+                              topics used in the Core 2 install.
+                            format: int32
+                            type: integer
+                        type: object
+                      servers:
+                        properties:
+                          enabled:
+                            type: boolean
+                          scaleDownPackingEnabled:
+                            type: boolean
+                          scaleDownPackingPercentage:
+                            format: int32
+                            type: integer
+                        type: object
+                    type: object
                   serviceConfig:
                     properties:
                       grpcServicePrefix:
@@ -9187,6 +9225,44 @@ spec:
                           x-kubernetes-int-or-string: true
                         type: object
                     type: object
+                  scalingConfig:
+                    description: Control scaling parameters for various components
+                    properties:
+                      models:
+                        properties:
+                          enabled:
+                            type: boolean
+                        type: object
+                      pipelines:
+                        description: Scaling config impacting pipeline-gateway, dataflow-engine
+                          and model-gateway
+                        properties:
+                          maxShardCountMultiplier:
+                            description: |-
+                              MaxShardCountMultiplier influences the way the inferencing workload is sharded over the
+                              replicas of pipeline components.
+
+                              - For each of pipeline-gateway and dataflow-engine, the max number of replicas is
+                                `maxShardCountMultiplier * number of pipelines`
+                              - For model-gateway, the max number of replicas is
+                                `maxShardCountMultiplier * number of consumers`
+
+                              It doesn't make sense to set this to a value larger than the number of partitions for Kafka
+                              topics used in the Core 2 install.
+                            format: int32
+                            type: integer
+                        type: object
+                      servers:
+                        properties:
+                          enabled:
+                            type: boolean
+                          scaleDownPackingEnabled:
+                            type: boolean
+                          scaleDownPackingPercentage:
+                            format: int32
+                            type: integer
+                        type: object
+                    type: object
                   serviceConfig:
                     properties:
                       grpcServicePrefix:

@@ -4,32 +4,33 @@ seldonConfig: default
 hodometer:
   disable: false
   replicas: 1
-  
+
 scheduler:
   disable: false
   replicas: 1
   # controlplane exposure
-  serviceType: LoadBalancer 
-  
+  serviceType: LoadBalancer
+
 envoy:
   disable: false
   replicas: 1
   # dataplane exposure
-  serviceType: LoadBalancer 
-  
+  serviceType: LoadBalancer
+
 dataflow:
   disable: false
   replicas: 1
-  
+
 modelgateway:
   disable: false
   replicas: 1
-  
+
 pipelinegateway:
   disable: false
   replicas: 1
 
 config:
+  scalingConfig:
   agentConfig:
     rclone:
       configSecrets:
@@ -48,4 +49,4 @@ config:
   serviceConfig:
     serviceGRPCPrefix:
     serviceType:
-  
+
@@ -511,6 +511,7 @@ spec:
         - --db-path=/mnt/scheduler/db
         - --allow-plaintxt=$(ALLOW_PLAINTXT)
         - --kafka-config-path=/mnt/kafka/kafka.json
+        - --scaling-config-path=/mnt/scaling/scaling.yaml
         - --scheduler-ready-timeout-seconds=$(SCHEDULER_READY_TIMEOUT_SECONDS)
         - --server-packing-enabled=$(SERVER_PACKING_ENABLED)
         - --server-packing-percentage=$(SERVER_PACKING_PERCENTAGE)
@@ -639,6 +640,8 @@ spec:
         volumeMounts:
         - mountPath: /mnt/kafka
           name: kafka-config-volume
+        - mountPath: /mnt/scaling
+          name: scaling-config-volume
         - mountPath: /mnt/tracing
           name: tracing-config-volume
         - mountPath: /mnt/scheduler
@@ -652,6 +655,9 @@ spec:
       serviceAccountName: seldon-scheduler
       terminationGracePeriodSeconds: 5
       volumes:
+      - configMap:
+          name: seldon-scaling
+        name: scaling-config-volume
       - configMap:
           name: seldon-kafka
         name: kafka-config-volume
@@ -1306,6 +1312,18 @@ spec:
       topics:
         numPartitions: '{{ .Values.kafka.topics.numPartitions }}'
         replicationFactor: '{{ .Values.kafka.topics.replicationFactor }}'
+    scalingConfig:
+      models:
+        enabled: {{ .Values.autoscaling.autoscalingModelEnabled }}
+      pipelines:
+        maxShardCountMultiplier: {{ .Values.kafka.topics.numPartitions
+          }}
+      servers:
+        enabled: {{ .Values.autoscaling.autoscalingServerEnabled }}
+        scaleDownPackingEnabled: {{ .Values.autoscaling.serverPackingEnabled
+          }}
+        scaleDownPackingPercentage: {{ .Values.autoscaling.serverPackingPercentage
+          }}
     serviceConfig:
       grpcServicePrefix: '{{ .Values.services.serviceGRPCPrefix }}'
       serviceType: '{{ .Values.services.defaultServiceType }}'

@@ -511,6 +511,7 @@ spec:
         - --db-path=/mnt/scheduler/db
         - --allow-plaintxt=$(ALLOW_PLAINTXT)
         - --kafka-config-path=/mnt/kafka/kafka.json
+        - --scaling-config-path=/mnt/scaling/scaling.yaml
         - --scheduler-ready-timeout-seconds=$(SCHEDULER_READY_TIMEOUT_SECONDS)
         - --server-packing-enabled=$(SERVER_PACKING_ENABLED)
         - --server-packing-percentage=$(SERVER_PACKING_PERCENTAGE)
@@ -639,6 +640,8 @@ spec:
         volumeMounts:
         - mountPath: /mnt/kafka
           name: kafka-config-volume
+        - mountPath: /mnt/scaling
+          name: scaling-config-volume
         - mountPath: /mnt/tracing
           name: tracing-config-volume
         - mountPath: /mnt/scheduler
@@ -652,6 +655,9 @@ spec:
       serviceAccountName: seldon-scheduler
       terminationGracePeriodSeconds: 5
       volumes:
+      - configMap:
+          name: seldon-scaling
+        name: scaling-config-volume
       - configMap:
           name: seldon-kafka
         name: kafka-config-volume
@@ -1306,6 +1312,18 @@ spec:
       topics:
         numPartitions: '{{ .Values.kafka.topics.numPartitions }}'
         replicationFactor: '{{ .Values.kafka.topics.replicationFactor }}'
+    scalingConfig:
+      models:
+        enabled: {{ .Values.autoscaling.autoscalingModelEnabled }}
+      pipelines:
+        maxShardCountMultiplier: {{ .Values.kafka.topics.numPartitions
+          }}
+      servers:
+        enabled: {{ .Values.autoscaling.autoscalingServerEnabled }}
+        scaleDownPackingEnabled: {{ .Values.autoscaling.serverPackingEnabled
+          }}
+        scaleDownPackingPercentage: {{ .Values.autoscaling.serverPackingPercentage
+          }}
     serviceConfig:
       grpcServicePrefix: '{{ .Values.services.serviceGRPCPrefix }}'
       serviceType: '{{ .Values.services.defaultServiceType }}'

@@ -85,7 +85,7 @@ opentelemetry:
 
 # logging
 # this is a global setting, used in case an individual component's logLevel is not set
-# Users should set a value from: 
+# Users should set a value from:
 # fatal, error, warn, info, debug, trace
 # if also used for .rclone.logLevel, the allowed set reduces to:
 # debug, info, error
@@ -245,7 +245,7 @@ scheduler:
     runAsGroup: 1000
     runAsNonRoot: true
   schedulerReadyTimeoutSeconds: 600
-  
+
 autoscaling:
   autoscalingModelEnabled: false
   autoscalingServerEnabled: true

@@ -85,7 +85,7 @@ opentelemetry:
 
 # logging
 # this is a global setting, used in case an individual component's logLevel is not set
-# Users should set a value from: 
+# Users should set a value from:
 # fatal, error, warn, info, debug, trace
 # if also used for .rclone.logLevel, the allowed set reduces to:
 # debug, info, error
@@ -245,7 +245,7 @@ scheduler:
     runAsGroup: 1000
     runAsNonRoot: true
   schedulerReadyTimeoutSeconds: 600
-  
+
 autoscaling:
   autoscalingModelEnabled: false
   autoscalingServerEnabled: true

@@ -18,6 +18,7 @@ patchesStrategicMerge:
 - ../../kustomize/helm-components-sc/patch_scheduler.yaml
 - ../../kustomize/helm-components-sc/patch_kafkaconfig.yaml
 - ../../kustomize/helm-components-sc/patch_tracingconfig.yaml
+- ../../kustomize/helm-components-sc/patch_scalingconfig.yaml
 - ../../kustomize/helm-components-sc/patch_agentconfig.yaml
 - ../../kustomize/helm-components-sc/patch_serviceconfig.yaml
 - patch_mlserver.yaml
@@ -59,6 +60,11 @@ patches:
     version: v1alpha1
     kind: SeldonConfig
     name: default
+- path: ../../kustomize/helm-components-sc/patch_scalingconfig_json6902.yaml
+  target:
+    version: v1alpha1
+    kind: SeldonConfig
+    name: default
 - path: ../../kustomize/helm-components-sc/patch_pipelinegateway_json6902.yaml
   target:
     version: v1alpha1

@@ -21,6 +21,7 @@ patchesStrategicMerge:
 - patch_kafkaconfig.yaml
 - patch_tracingconfig.yaml
 - patch_agentconfig.yaml
+- patch_scalingconfig.yaml
 - patch_serviceconfig.yaml
 
 patches:
@@ -59,6 +60,11 @@ patches:
     version: v1alpha1
     kind: SeldonConfig
     name: default
+- path: patch_scalingconfig_json6902.yaml
+  target:
+    version: v1alpha1
+    kind: SeldonConfig
+    name: default
 - path: patch_pipelinegateway_json6902.yaml
   target:
     version: v1alpha1

@@ -0,0 +1,15 @@
+apiVersion: mlops.seldon.io/v1alpha1
+kind: SeldonConfig
+metadata:
+  name: default
+spec:
+  config:
+    scalingConfig:
+      models:
+        enabled:
+      servers:
+        enabled:
+        scaleDownPackingEnabled:
+        scaleDownPackingPercentage:
+      pipelines:
+        maxShardCountMultiplier:
@@ -0,0 +1,15 @@
+- op: add
+  path: /spec/config/scalingConfig/models/enabled
+  value: HACK_REMOVE_ME{{ .Values.autoscaling.autoscalingModelEnabled }}
+- op: add
+  path: /spec/config/scalingConfig/servers/enabled
+  value: HACK_REMOVE_ME{{ .Values.autoscaling.autoscalingServerEnabled }}
+- op: add
+  path: /spec/config/scalingConfig/servers/scaleDownPackingEnabled
+  value: HACK_REMOVE_ME{{ .Values.autoscaling.serverPackingEnabled }}
+- op: add
+  path: /spec/config/scalingConfig/servers/scaleDownPackingPercentage
+  value: HACK_REMOVE_ME{{ .Values.autoscaling.serverPackingPercentage }}
+- op: add
+  path: /spec/config/scalingConfig/pipelines/maxShardCountMultiplier
+  value: HACK_REMOVE_ME{{ .Values.kafka.topics.numPartitions }}
@@ -358,6 +358,7 @@ spec:
         - --db-path=/mnt/scheduler/db
         - --allow-plaintxt=$(ALLOW_PLAINTXT)
         - --kafka-config-path=/mnt/kafka/kafka.json
+        - --scaling-config-path=/mnt/scaling/scaling.yaml
         - --scheduler-ready-timeout-seconds=$(SCHEDULER_READY_TIMEOUT_SECONDS)
         - --server-packing-enabled=$(SERVER_PACKING_ENABLED)
         - --server-packing-percentage=$(SERVER_PACKING_PERCENTAGE)
@@ -481,6 +482,8 @@ spec:
         volumeMounts:
         - mountPath: /mnt/kafka
           name: kafka-config-volume
+        - mountPath: /mnt/scaling
+          name: scaling-config-volume
         - mountPath: /mnt/tracing
           name: tracing-config-volume
         - mountPath: /mnt/scheduler
@@ -493,6 +496,9 @@ spec:
       serviceAccountName: seldon-scheduler
       terminationGracePeriodSeconds: 5
       volumes:
+      - configMap:
+          name: seldon-scaling
+        name: scaling-config-volume
       - configMap:
           name: seldon-kafka
         name: kafka-config-volume
@@ -1127,6 +1133,15 @@ spec:
       topics:
         numPartitions: '1'
         replicationFactor: '1'
+    scalingConfig:
+      models:
+        enabled: false
+      pipelines:
+        maxShardCountMultiplier: 1
+      servers:
+        enabled: true
+        scaleDownPackingEnabled: false
+        scaleDownPackingPercentage: 0
     serviceConfig:
       grpcServicePrefix: ''
       serviceType: 'LoadBalancer'