From bca1b9fdf3d6c2af693a370acf41c96a280e0c70 Mon Sep 17 00:00:00 2001 From: WHOIM1205 Date: Wed, 25 Mar 2026 13:33:20 -0700 Subject: [PATCH 1/4] Add KEDA autoscaling support for ModelServing via Prometheus - Populate .status.labelSelector in controller to enable HPA pod discovery - Add ScaledObject targeting ModelServing custom resource - Add RBAC for KEDA to scale ModelServing resources - Add ServiceMonitor and test deployment manifests Signed-off-by: WHOIM1205 --- keda-rbac.yaml | 30 +++++ modelserving.yaml | 38 ++++++ .../controller/model_serving_controller.go | 19 ++- scaledobject.yaml | 20 ++++ servicemonitor.yaml | 16 +++ test-deployment.yaml | 113 ++++++++++++++++++ 6 files changed, 235 insertions(+), 1 deletion(-) create mode 100644 keda-rbac.yaml create mode 100644 modelserving.yaml create mode 100644 scaledobject.yaml create mode 100644 servicemonitor.yaml create mode 100644 test-deployment.yaml diff --git a/keda-rbac.yaml b/keda-rbac.yaml new file mode 100644 index 000000000..195298c09 --- /dev/null +++ b/keda-rbac.yaml @@ -0,0 +1,30 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: keda-modelserving-scaling +rules: + - apiGroups: + - workload.serving.volcano.sh + resources: + - modelservings + - modelservings/scale + - modelservings/status + verbs: + - get + - list + - watch + - update + - patch +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: keda-modelserving-scaling +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: keda-modelserving-scaling +subjects: + - kind: ServiceAccount + name: keda-operator + namespace: keda diff --git a/modelserving.yaml b/modelserving.yaml new file mode 100644 index 000000000..e82d28c98 --- /dev/null +++ b/modelserving.yaml @@ -0,0 +1,38 @@ +apiVersion: workload.serving.volcano.sh/v1alpha1 +kind: ModelServing +metadata: + name: test-model + namespace: default +spec: + replicas: 1 + template: + roles: + - name: entry + workerReplicas: 1 + entryTemplate: + metadata: {} + spec: + containers: + - name: entry + image: nginx + workerTemplate: + metadata: {} + spec: + containers: + - name: worker + image: nginx + + - name: worker + workerReplicas: 1 + entryTemplate: + metadata: {} + spec: + containers: + - name: entry + image: nginx + workerTemplate: + metadata: {} + spec: + containers: + - name: worker + image: nginx diff --git a/pkg/model-serving-controller/controller/model_serving_controller.go b/pkg/model-serving-controller/controller/model_serving_controller.go index 8d7599eeb..6a71f5d6c 100644 --- a/pkg/model-serving-controller/controller/model_serving_controller.go +++ b/pkg/model-serving-controller/controller/model_serving_controller.go @@ -1617,9 +1617,14 @@ func (c *ModelServingController) UpdateModelServingStatus(ms *workloadv1alpha1.M // If no groups exist, handle gracefully by setting revisions to the new revision if errors.Is(err, datastore.ErrServingGroupNotFound) { copy := latestMS.DeepCopy() - if copy.Status.CurrentRevision != revision || copy.Status.UpdateRevision != revision { + selector := labels.Set{ + workloadv1alpha1.ModelServingNameLabelKey: latestMS.Name, + }.String() + needsUpdate := copy.Status.CurrentRevision != revision || copy.Status.UpdateRevision != revision || copy.Status.LabelSelector != selector + if needsUpdate { copy.Status.CurrentRevision = revision copy.Status.UpdateRevision = revision + copy.Status.LabelSelector = selector _, updateErr := 
c.modelServingClient.WorkloadV1alpha1().ModelServings(copy.GetNamespace()).UpdateStatus(context.TODO(), copy, metav1.UpdateOptions{}) return updateErr } @@ -1745,6 +1750,18 @@ func (c *ModelServingController) UpdateModelServingStatus(ms *workloadv1alpha1.M copy.Status.ObservedGeneration = latestMS.Generation } + // Set labelSelector so the scale subresource can report it to HPA. + // Without this, HPA fails with "selector is required" because it cannot + // determine which pods belong to this ModelServing. + // The selector matches the label applied to all pods by createBasePod(). + selector := labels.Set{ + workloadv1alpha1.ModelServingNameLabelKey: latestMS.Name, + }.String() + if copy.Status.LabelSelector != selector { + shouldUpdate = true + copy.Status.LabelSelector = selector + } + if shouldUpdate { _, err := c.modelServingClient.WorkloadV1alpha1().ModelServings(copy.GetNamespace()).UpdateStatus(context.TODO(), copy, metav1.UpdateOptions{}) if err != nil { diff --git a/scaledobject.yaml b/scaledobject.yaml new file mode 100644 index 000000000..5c967e851 --- /dev/null +++ b/scaledobject.yaml @@ -0,0 +1,20 @@ +apiVersion: keda.sh/v1alpha1 +kind: ScaledObject +metadata: + name: modelserving-scaler + namespace: default +spec: + scaleTargetRef: + apiVersion: workload.serving.volcano.sh/v1alpha1 + kind: ModelServing + name: test-model + minReplicaCount: 1 + maxReplicaCount: 5 + pollingInterval: 15 + cooldownPeriod: 60 + triggers: + - type: prometheus + metadata: + serverAddress: http://prometheus-kube-prometheus-prometheus.monitoring.svc.cluster.local:9090 + query: sum(rate(process_cpu_seconds_total[1m])) + threshold: "0.01" diff --git a/servicemonitor.yaml b/servicemonitor.yaml new file mode 100644 index 000000000..0673063ee --- /dev/null +++ b/servicemonitor.yaml @@ -0,0 +1,16 @@ +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: kthena-router + namespace: monitoring +spec: + namespaceSelector: + matchNames: + - default + selector: + matchLabels: + app.kubernetes.io/component: kthena-router + endpoints: + - port: http + path: /metrics + interval: 15s diff --git a/test-deployment.yaml b/test-deployment.yaml new file mode 100644 index 000000000..eb30827f6 --- /dev/null +++ b/test-deployment.yaml @@ -0,0 +1,113 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: kthena-router + labels: + app.kubernetes.io/component: kthena-router +spec: + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/component: kthena-router + template: + metadata: + labels: + app.kubernetes.io/component: kthena-router + spec: + containers: + - name: kthena-router + image: nginx:alpine + ports: + - containerPort: 8080 + volumeMounts: + - name: nginx-config + mountPath: /etc/nginx/conf.d + volumes: + - name: nginx-config + configMap: + name: kthena-router-nginx-config +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: kthena-router-nginx-config +data: + default.conf: | + server { + listen 8080; + + location /metrics { + default_type text/plain; + return 200 '# HELP kthena_router_active_downstream_requests Number of active downstream requests\n# TYPE kthena_router_active_downstream_requests gauge\nkthena_router_active_downstream_requests 3\n# HELP kthena_router_requests_total Total requests\n# TYPE kthena_router_requests_total counter\nkthena_router_requests_total 100\n'; + } + + location / { + return 200 'kthena-router ok\n'; + } + } +--- +apiVersion: v1 +kind: Service +metadata: + name: kthena-router + labels: + app.kubernetes.io/component: kthena-router +spec: + 
selector: + app.kubernetes.io/component: kthena-router + ports: + - name: http + port: 80 + targetPort: 8080 + protocol: TCP +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: dummy-inference-vllm + labels: + modelserving.volcano.sh/name: test-model + modelserving.volcano.sh/entry: "true" +spec: + replicas: 1 + selector: + matchLabels: + modelserving.volcano.sh/name: test-model + modelserving.volcano.sh/entry: "true" + template: + metadata: + labels: + modelserving.volcano.sh/name: test-model + modelserving.volcano.sh/entry: "true" + spec: + containers: + - name: dummy-vllm + image: nginx:alpine + ports: + - containerPort: 8000 + volumeMounts: + - name: nginx-config + mountPath: /etc/nginx/conf.d + volumes: + - name: nginx-config + configMap: + name: dummy-vllm-nginx-config +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: dummy-vllm-nginx-config +data: + default.conf: | + server { + listen 8000; + + location /metrics { + default_type text/plain; + return 200 '# HELP vllm_num_requests_running Number of running requests\n# TYPE vllm_num_requests_running gauge\nvllm_num_requests_running 2\n# HELP vllm_num_requests_waiting Number of waiting requests\n# TYPE vllm_num_requests_waiting gauge\nvllm_num_requests_waiting 0\n# HELP vllm_gpu_cache_usage_perc GPU cache usage percentage\n# TYPE vllm_gpu_cache_usage_perc gauge\nvllm_gpu_cache_usage_perc 0.45\n'; + } + + location / { + return 200 'dummy-vllm ok\n'; + } + } From 75d78ee51b25aca17f5c4ed13fc628a916592a22 Mon Sep 17 00:00:00 2001 From: WHOIM1205 Date: Sun, 29 Mar 2026 03:36:14 -0700 Subject: [PATCH 2/4] Trim KEDA RBAC and remove redundant example - Remove modelservings/status from KEDA ClusterRole (not needed) - Restrict modelservings base resource to read-only verbs - Delete redundant modelserving.yaml test file Signed-off-by: WHOIM1205 --- keda-rbac.yaml | 8 ++++++-- modelserving.yaml | 38 -------------------------------------- 2 files changed, 6 insertions(+), 40 deletions(-) delete mode 100644 modelserving.yaml diff --git a/keda-rbac.yaml b/keda-rbac.yaml index 195298c09..22a028cac 100644 --- a/keda-rbac.yaml +++ b/keda-rbac.yaml @@ -7,12 +7,16 @@ rules: - workload.serving.volcano.sh resources: - modelservings - - modelservings/scale - - modelservings/status verbs: - get - list - watch + - apiGroups: + - workload.serving.volcano.sh + resources: + - modelservings/scale + verbs: + - get - update - patch --- diff --git a/modelserving.yaml b/modelserving.yaml deleted file mode 100644 index e82d28c98..000000000 --- a/modelserving.yaml +++ /dev/null @@ -1,38 +0,0 @@ -apiVersion: workload.serving.volcano.sh/v1alpha1 -kind: ModelServing -metadata: - name: test-model - namespace: default -spec: - replicas: 1 - template: - roles: - - name: entry - workerReplicas: 1 - entryTemplate: - metadata: {} - spec: - containers: - - name: entry - image: nginx - workerTemplate: - metadata: {} - spec: - containers: - - name: worker - image: nginx - - - name: worker - workerReplicas: 1 - entryTemplate: - metadata: {} - spec: - containers: - - name: entry - image: nginx - workerTemplate: - metadata: {} - spec: - containers: - - name: worker - image: nginx From fc2b6516c5fbae0f8850b80cdc48058ffca8d0dd Mon Sep 17 00:00:00 2001 From: WHOIM1205 Date: Tue, 31 Mar 2026 12:17:13 -0700 Subject: [PATCH 3/4] Add regression tests for labelSelector in UpdateModelServingStatus Signed-off-by: WHOIM1205 --- .../model_serving_controller_test.go | 103 ++++++++++++++++++ 1 file changed, 103 insertions(+) diff --git 
a/pkg/model-serving-controller/controller/model_serving_controller_test.go b/pkg/model-serving-controller/controller/model_serving_controller_test.go index 3a9416053..cc623f453 100644 --- a/pkg/model-serving-controller/controller/model_serving_controller_test.go +++ b/pkg/model-serving-controller/controller/model_serving_controller_test.go @@ -3431,6 +3431,109 @@ func TestScaleUpServingGroups_TemplateRecovery(t *testing.T) { // TestUpdateModelServingStatusRevisionFields tests the CurrentRevision and UpdateRevision logic // following StatefulSet's behavior +func TestUpdateModelServingStatusLabelSelector(t *testing.T) { + tests := []struct { + name string + msName string + existingGroups map[int]string // ordinal -> revision; nil means no groups (ErrServingGroupNotFound path) + revision string + }{ + { + name: "no ServingGroups yet — labelSelector is set on empty status", + msName: "my-llm", + existingGroups: nil, + revision: "rev-1", + }, + { + name: "existing ServingGroups — labelSelector is set consistently", + msName: "my-llm", + existingGroups: map[int]string{ + 0: "rev-1", + 1: "rev-1", + }, + revision: "rev-1", + }, + { + name: "name with special characters — selector encodes correctly", + msName: "serving-gpt-4o-mini", + existingGroups: map[int]string{ + 0: "rev-abc", + }, + revision: "rev-abc", + }, + } + + for idx, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + kubeClient := kubefake.NewSimpleClientset() + kthenaClient := kthenafake.NewSimpleClientset() + volcanoClient := volcanofake.NewSimpleClientset() + apiextClient := apiextfake.NewSimpleClientset() + + controller, err := NewModelServingController(kubeClient, kthenaClient, volcanoClient, apiextClient) + assert.NoError(t, err) + + replicas := int32(len(tt.existingGroups)) + if tt.existingGroups == nil { + replicas = 1 + } + ms := &workloadv1alpha1.ModelServing{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: "default", + Name: tt.msName, + }, + Spec: workloadv1alpha1.ModelServingSpec{ + Replicas: ptr.To(replicas), + SchedulerName: "volcano", + Template: workloadv1alpha1.ServingGroup{ + Roles: []workloadv1alpha1.Role{ + { + Name: "prefill", + Replicas: ptr.To[int32](1), + EntryTemplate: workloadv1alpha1.PodTemplateSpec{ + Spec: corev1.PodSpec{ + Containers: []corev1.Container{ + {Name: "c", Image: "img:latest"}, + }, + }, + }, + }, + }, + }, + RecoveryPolicy: workloadv1alpha1.RoleRecreate, + }, + } + + _, err = kthenaClient.WorkloadV1alpha1().ModelServings("default").Create(context.Background(), ms, metav1.CreateOptions{}) + assert.NoError(t, err) + err = controller.modelServingsInformer.GetIndexer().Add(ms) + assert.NoError(t, err) + + // Populate store only when groups exist; nil means the "not found" path. 
+ if tt.existingGroups != nil { + for ordinal, rev := range tt.existingGroups { + controller.store.AddServingGroup(utils.GetNamespaceName(ms), ordinal, rev) + groupName := utils.GenerateServingGroupName(tt.msName, ordinal) + controller.store.UpdateServingGroupStatus(utils.GetNamespaceName(ms), groupName, datastore.ServingGroupRunning) + } + } + + err = controller.UpdateModelServingStatus(ms, tt.revision) + assert.NoError(t, err, "case %d: UpdateModelServingStatus should not error", idx) + + updated, err := kthenaClient.WorkloadV1alpha1().ModelServings("default").Get(context.Background(), tt.msName, metav1.GetOptions{}) + assert.NoError(t, err) + + expectedSelector := labels.Set{ + workloadv1alpha1.ModelServingNameLabelKey: tt.msName, + }.String() + + assert.Equal(t, expectedSelector, updated.Status.LabelSelector, + "case %d: status.labelSelector must be %q", idx, expectedSelector) + }) + } +} + func TestUpdateModelServingStatusRevisionFields(t *testing.T) { tests := []struct { name string From cdeb40f1571b6e0d0c2f4553795c077062e797aa Mon Sep 17 00:00:00 2001 From: WHOIM1205 Date: Thu, 2 Apr 2026 04:44:33 -0700 Subject: [PATCH 4/4] Move KEDA examples to examples/keda-autoscaling/ and fix reviewer feedback - Remove loose test-deployment.yaml and scaledobject.yaml from repo root - Add proper ModelServing example (test-model) with entry + worker roles - Add ScaledObject with real Prometheus query (kthena_router_active_downstream_requests) - Add README explaining usage and groups-vs-pods scaling design Signed-off-by: WHOIM1205 --- examples/keda-autoscaling/README.md | 27 +++++ examples/keda-autoscaling/modelserving.yaml | 29 +++++ .../keda-autoscaling/scaledobject.yaml | 4 +- test-deployment.yaml | 113 ------------------ 4 files changed, 58 insertions(+), 115 deletions(-) create mode 100644 examples/keda-autoscaling/README.md create mode 100644 examples/keda-autoscaling/modelserving.yaml rename scaledobject.yaml => examples/keda-autoscaling/scaledobject.yaml (84%) delete mode 100644 test-deployment.yaml diff --git a/examples/keda-autoscaling/README.md b/examples/keda-autoscaling/README.md new file mode 100644 index 000000000..b1d7dd8be --- /dev/null +++ b/examples/keda-autoscaling/README.md @@ -0,0 +1,27 @@ +# KEDA Autoscaling for ModelServing + +This example demonstrates how to autoscale a ModelServing resource using [KEDA](https://keda.sh/) with Prometheus metrics. + +## Prerequisites + +- KEDA installed in the cluster (`keda` namespace) +- Prometheus stack deployed (`monitoring` namespace) +- Kthena router exposing metrics via ServiceMonitor +- KEDA RBAC for ModelServing (see `keda-rbac.yaml` in the repo root) + +## Usage + +```bash +kubectl apply -f modelserving.yaml +kubectl apply -f scaledobject.yaml +``` + +## How it works + +- `modelserving.yaml` creates a ModelServing resource named `test-model` with entry and worker roles. +- `scaledobject.yaml` creates a KEDA ScaledObject that targets the ModelServing resource and scales based on Prometheus metrics from the kthena-router. +- KEDA queries Prometheus for `kthena_router_active_downstream_requests` and adjusts `spec.replicas` (the number of serving groups) accordingly. + +## Scaling: groups vs pods + +ModelServing scales at the **group** level (`spec.replicas`), not at the individual pod level. Each group may contain multiple pods (entry + workers). The Prometheus query uses `sum()` to aggregate metrics across all pods, and the threshold is set relative to group capacity. 
This ensures the scaling decision correctly maps to the number of groups, even though the actual pod count is a multiple of the group count. diff --git a/examples/keda-autoscaling/modelserving.yaml b/examples/keda-autoscaling/modelserving.yaml new file mode 100644 index 000000000..342a8233c --- /dev/null +++ b/examples/keda-autoscaling/modelserving.yaml @@ -0,0 +1,29 @@ +apiVersion: workload.serving.volcano.sh/v1alpha1 +kind: ModelServing +metadata: + name: test-model + namespace: default +spec: + schedulerName: volcano + replicas: 1 + template: + roles: + - name: entry + replicas: 1 + entryTemplate: + spec: + containers: + - name: entry + image: nginx:alpine + ports: + - containerPort: 8080 + - name: worker + replicas: 1 + workerReplicas: 1 + workerTemplate: + spec: + containers: + - name: worker + image: nginx:alpine + ports: + - containerPort: 8080 diff --git a/scaledobject.yaml b/examples/keda-autoscaling/scaledobject.yaml similarity index 84% rename from scaledobject.yaml rename to examples/keda-autoscaling/scaledobject.yaml index 5c967e851..126c767b2 100644 --- a/scaledobject.yaml +++ b/examples/keda-autoscaling/scaledobject.yaml @@ -16,5 +16,5 @@ spec: - type: prometheus metadata: serverAddress: http://prometheus-kube-prometheus-prometheus.monitoring.svc.cluster.local:9090 - query: sum(rate(process_cpu_seconds_total[1m])) - threshold: "0.01" + query: sum(kthena_router_active_downstream_requests) + threshold: "10" diff --git a/test-deployment.yaml b/test-deployment.yaml deleted file mode 100644 index eb30827f6..000000000 --- a/test-deployment.yaml +++ /dev/null @@ -1,113 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: kthena-router - labels: - app.kubernetes.io/component: kthena-router -spec: - replicas: 1 - selector: - matchLabels: - app.kubernetes.io/component: kthena-router - template: - metadata: - labels: - app.kubernetes.io/component: kthena-router - spec: - containers: - - name: kthena-router - image: nginx:alpine - ports: - - containerPort: 8080 - volumeMounts: - - name: nginx-config - mountPath: /etc/nginx/conf.d - volumes: - - name: nginx-config - configMap: - name: kthena-router-nginx-config ---- -apiVersion: v1 -kind: ConfigMap -metadata: - name: kthena-router-nginx-config -data: - default.conf: | - server { - listen 8080; - - location /metrics { - default_type text/plain; - return 200 '# HELP kthena_router_active_downstream_requests Number of active downstream requests\n# TYPE kthena_router_active_downstream_requests gauge\nkthena_router_active_downstream_requests 3\n# HELP kthena_router_requests_total Total requests\n# TYPE kthena_router_requests_total counter\nkthena_router_requests_total 100\n'; - } - - location / { - return 200 'kthena-router ok\n'; - } - } ---- -apiVersion: v1 -kind: Service -metadata: - name: kthena-router - labels: - app.kubernetes.io/component: kthena-router -spec: - selector: - app.kubernetes.io/component: kthena-router - ports: - - name: http - port: 80 - targetPort: 8080 - protocol: TCP ---- -apiVersion: apps/v1 -kind: Deployment -metadata: - name: dummy-inference-vllm - labels: - modelserving.volcano.sh/name: test-model - modelserving.volcano.sh/entry: "true" -spec: - replicas: 1 - selector: - matchLabels: - modelserving.volcano.sh/name: test-model - modelserving.volcano.sh/entry: "true" - template: - metadata: - labels: - modelserving.volcano.sh/name: test-model - modelserving.volcano.sh/entry: "true" - spec: - containers: - - name: dummy-vllm - image: nginx:alpine - ports: - - containerPort: 8000 - volumeMounts: - - name: 
nginx-config - mountPath: /etc/nginx/conf.d - volumes: - - name: nginx-config - configMap: - name: dummy-vllm-nginx-config ---- -apiVersion: v1 -kind: ConfigMap -metadata: - name: dummy-vllm-nginx-config -data: - default.conf: | - server { - listen 8000; - - location /metrics { - default_type text/plain; - return 200 '# HELP vllm_num_requests_running Number of running requests\n# TYPE vllm_num_requests_running gauge\nvllm_num_requests_running 2\n# HELP vllm_num_requests_waiting Number of waiting requests\n# TYPE vllm_num_requests_waiting gauge\nvllm_num_requests_waiting 0\n# HELP vllm_gpu_cache_usage_perc GPU cache usage percentage\n# TYPE vllm_gpu_cache_usage_perc gauge\nvllm_gpu_cache_usage_perc 0.45\n'; - } - - location / { - return 200 'dummy-vllm ok\n'; - } - }
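
For the `.status.labelSelector` populated in patch 1 to reach HPA and KEDA at all, the ModelServing CRD must declare a scale subresource whose `labelSelectorPath` points at that field. The sketch below shows the expected wiring; the field paths and the permissive schema stub are assumptions inferred from the controller change, not copied from the Kthena CRD:

```yaml
# Sketch only: the scale subresource wiring this series relies on.
apiVersion: apiextensions.k8s.io/v1
kind: CustomResourceDefinition
metadata:
  name: modelservings.workload.serving.volcano.sh
spec:
  group: workload.serving.volcano.sh
  names:
    kind: ModelServing
    plural: modelservings
    singular: modelserving
  scope: Namespaced
  versions:
    - name: v1alpha1
      served: true
      storage: true
      schema:
        openAPIV3Schema:
          type: object
          x-kubernetes-preserve-unknown-fields: true  # placeholder; the real CRD has a full schema
      subresources:
        status: {}
        scale:
          specReplicasPath: .spec.replicas          # the group count KEDA adjusts
          statusReplicasPath: .status.replicas      # assumed name of the status replica field
          labelSelectorPath: .status.labelSelector  # what HPA's "selector is required" check reads
```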
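
The verb split from patch 2 — read-only on `modelservings`, write access only on `modelservings/scale` — can be checked with impersonation. This assumes KEDA runs as the `keda-operator` service account in the `keda` namespace, as the ClusterRoleBinding expects, and that no other binding grants it extra access:

```bash
# Both should print "yes": KEDA may read the resource and write its scale subresource.
kubectl auth can-i list modelservings.workload.serving.volcano.sh \
  --as=system:serviceaccount:keda:keda-operator
kubectl auth can-i patch modelservings.workload.serving.volcano.sh \
  --subresource=scale --as=system:serviceaccount:keda:keda-operator

# Should print "no": patch 2 leaves the base resource read-only for KEDA.
kubectl auth can-i patch modelservings.workload.serving.volcano.sh \
  --as=system:serviceaccount:keda:keda-operator
```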
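
On the threshold in `scaledobject.yaml`: KEDA's prometheus trigger defaults to an `AverageValue` metric, so the HPA it creates asks for roughly `ceil(metric / threshold)` replicas, clamped to the ScaledObject's bounds. A toy Go illustration of that arithmetic for this example — not KEDA's actual code:

```go
package main

import (
	"fmt"
	"math"
)

// desiredGroups mirrors the default AverageValue rule the HPA applies to a
// KEDA prometheus trigger: ceil(metric/threshold), clamped to the
// ScaledObject's minReplicaCount/maxReplicaCount.
func desiredGroups(metric, threshold float64, min, max int32) int32 {
	d := int32(math.Ceil(metric / threshold))
	if d < min {
		return min
	}
	if d > max {
		return max
	}
	return d
}

func main() {
	// sum(kthena_router_active_downstream_requests) = 35 with threshold "10",
	// minReplicaCount 1 and maxReplicaCount 5 => 4 serving groups.
	fmt.Println(desiredGroups(35, 10, 1, 5))
}
```

As the README's groups-vs-pods note explains, those 4 replicas are serving groups: with the example's one entry pod and one worker pod per group, 4 groups means roughly 8 pods.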
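
The regression test from patch 3 can be run on its own, assuming the repo's standard `go test` layout:

```bash
go test ./pkg/model-serving-controller/controller/ \
  -run TestUpdateModelServingStatusLabelSelector -v
```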