kubernetes · k8s-ci-robot · Oct 14, 2025 · Sep 25, 2025
diff --git a/config/jobs/kubernetes/sig-scalability/DRA/sig-scalability-periodic-dra.yaml b/config/jobs/kubernetes/sig-scalability/DRA/sig-scalability-periodic-dra.yaml
@@ -405,3 +405,99 @@ periodics:
               value: "true"
             - name: NODE_PRELOAD_IMAGES
               value: "gcr.io/k8s-staging-perf-tests/sleep:v0.0.3"
+
+  - name: ci-kubernetes-e2e-gce-5000-node-dra-with-workload
+    cluster: k8s-infra-prow-build
+    tags:
+      - "perfDashPrefix: gce-dra-5000Nodes-with-workload"
+      - "perfDashBuildsCount: 270"
+      - "perfDashJobType: performance"
+    # Runs at 17:01 UTC (9:01 PST) on even days of the month (2, 4, ..., 30), matching the cron below.
+    # NOTE(review): intended to alternate with the release-blocking test, which would then have to run on odd days; confirm that job's schedule.
+    cron: '1 17 2-30/2 * *'  # minute 1, hour 17, every 2nd day-of-month from 2 through 30
+    labels:
+      preset-service-account: "true"
+      preset-k8s-ssh: "true"
+      preset-e2e-scalability-common: "true"
+      preset-e2e-scalability-periodics: "true"
+      preset-e2e-scalability-periodics-master: "true"
+    annotations:
+      testgrid-dashboards: sig-scalability-dra
+      testgrid-tab-name: gce-dra-with-workload-master-scalability-5000
+      testgrid-alert-email: [email protected], [email protected]  # NOTE(review): looks like redacted placeholders, and a value starting with '[' parses as a flow sequence; restore the real addresses
+      testgrid-num-failures-to-alert: '2'
+      description: "Uses kubetest to run k8s.io/perf-tests/run-e2e.sh against a 5000-node cluster with DRA enabled"
+    decorate: true
+    decoration_config:
+      timeout: 8h  # longer than the 420m (7h) --timeout passed in args below
+    extra_refs:
+      - org: kubernetes
+        repo: kubernetes
+        base_ref: master
+        path_alias: k8s.io/kubernetes
+      - org: kubernetes
+        repo: perf-tests
+        base_ref: master
+        path_alias: k8s.io/perf-tests
+    spec:
+      containers:
+        - image: gcr.io/k8s-staging-test-infra/kubekins-e2e:v20250925-95b5a2c7a5-master
+          command:
+            - runner.sh
+            - /workspace/scenarios/kubernetes_e2e.py
+          args:
+            - --cluster=gce-scale-cluster
+            - --env=HEAPSTER_MACHINE_TYPE=e2-standard-32
+            # TODO(mborsz): Adjust or remove this override once the coredns
+            # memory usage regression is understood.
+            - --env=KUBE_DNS_MEMORY_LIMIT=300Mi
+            - --extract=ci/fast/latest-fast
+            - --gcp-nodes=5000
+            - --gcp-project-type=scalability-scale-project
+            - --gcp-zone=us-east1-b
+            - --provider=gce
+            - --metadata-sources=cl2-metadata.json
+            - --env=KUBE_FEATURE_GATES=DynamicResourceAllocation=true
+            - --runtime-config=api/all=true
+            - --test=false
+            - --test-cmd=$GOPATH/src/k8s.io/perf-tests/run-e2e.sh
+            - --test-cmd-args=cluster-loader2
+            - --test-cmd-args=--nodes=5000
+            - --test-cmd-args=--provider=gce
+            - --test-cmd-args=--enable-prometheus-server=true
+            - --test-cmd-args=--prometheus-scrape-node-exporter
+            - --test-cmd-args=--experimental-gcp-snapshot-prometheus-disk=true
+            - --test-cmd-args=--experimental-prometheus-disk-snapshot-name=$(JOB_NAME)-$(BUILD_ID)
+            - --test-cmd-args=--experimental-prometheus-snapshot-to-report-dir=true
+            - --test-cmd-args=--testconfig=testing/dra/config.yaml
+            - --test-cmd-args=--report-dir=$(ARTIFACTS)
+            - --test-cmd-args=--testoverrides=./testing/overrides/5000_nodes.yaml
+            - --test-cmd-name=ClusterLoaderV2
+            - --timeout=420m
+            - --use-logexporter
+            - --logexporter-gcs-path=gs://k8s-infra-scalability-tests-logs/$(JOB_NAME)/$(BUILD_ID)
+          resources:
+            requests:
+              cpu: 6
+              memory: "16Gi"
+            limits:
+              cpu: 6
+              memory: "16Gi"
+          env:
+            - name: CL2_MODE
+              value: "Indexed"
+            - name: CL2_NODES_PER_NAMESPACE
+              value: "2500"
+            - name: CL2_JOB_RUNNING_TIME
+              value: "3s"
+            - name: CL2_LONG_JOB_RUNNING_TIME
+              value: "240m"
+#            TODO: the 100-node GCE test is run with 10 and 5 for the two settings below; they can be increased if needed.
+#            - name: CL2_LOAD_TEST_THROUGHPUT
+#              value: "50"
+#            - name: CL2_STEADY_STATE_QPS
+#              value: "25"
+            - name: PROMETHEUS_SCRAPE_KUBELETS
+              value: "true"
+            - name: NODE_PRELOAD_IMAGES
+              value: "gcr.io/k8s-staging-perf-tests/sleep:v0.0.3"