diff --git a/config/jobs/kubernetes/sig-scalability/DRA/sig-scalability-periodic-dra.yaml b/config/jobs/kubernetes/sig-scalability/DRA/sig-scalability-periodic-dra.yaml
index a7ebb5f390e9..237a0fbd8e6e 100644
--- a/config/jobs/kubernetes/sig-scalability/DRA/sig-scalability-periodic-dra.yaml
+++ b/config/jobs/kubernetes/sig-scalability/DRA/sig-scalability-periodic-dra.yaml
@@ -405,3 +405,99 @@ periodics:
               value: "true"
             - name: NODE_PRELOAD_IMAGES
               value: "gcr.io/k8s-staging-perf-tests/sleep:v0.0.3"
+
+  - name: ci-kubernetes-e2e-gce-5000-node-dra-with-workload
+    cluster: k8s-infra-prow-build
+    tags:
+      - "perfDashPrefix: gce-dra-5000Nodes-with-workload"
+      - "perfDashBuildsCount: 270"
+      - "perfDashJobType: performance"
+    # Runs at 17:01 UTC (9:01 PST) on even days of the month (2, 4, ..., 30).
+    # NOTE(review): confirm which days the release-blocking scale test runs on so the two jobs do not overlap.
+    cron: '1 17 2-30/2 * *'
+    labels:
+      preset-service-account: "true"
+      preset-k8s-ssh: "true"
+      preset-e2e-scalability-common: "true"
+      preset-e2e-scalability-periodics: "true"
+      preset-e2e-scalability-periodics-master: "true"
+    annotations:
+      testgrid-dashboards: sig-scalability-dra
+      testgrid-tab-name: gce-dra-with-workload-master-scalability-5000
+      testgrid-alert-email: kubernetes-sig-scale@googlegroups.com, kubernetes-scalability-tickets@google.com
+      testgrid-num-failures-to-alert: '2'
+      description: "Uses kubetest to run k8s.io/perf-tests/run-e2e.sh against a 5000-node cluster with DRA enabled"
+    decorate: true
+    decoration_config:
+      timeout: 8h
+    extra_refs:
+      - org: kubernetes
+        repo: kubernetes
+        base_ref: master
+        path_alias: k8s.io/kubernetes
+      - org: kubernetes
+        repo: perf-tests
+        base_ref: master
+        path_alias: k8s.io/perf-tests
+    spec:
+      containers:
+        - image: gcr.io/k8s-staging-test-infra/kubekins-e2e:v20250925-95b5a2c7a5-master
+          command:
+            - runner.sh
+            - /workspace/scenarios/kubernetes_e2e.py
+          args:
+            - --cluster=gce-scale-cluster
+            - --env=HEAPSTER_MACHINE_TYPE=e2-standard-32
+            # TODO(mborsz): Adjust or remove this change once we understand coredns
+            # memory usage regression.
+            - --env=KUBE_DNS_MEMORY_LIMIT=300Mi
+            - --extract=ci/fast/latest-fast
+            - --gcp-nodes=5000
+            - --gcp-project-type=scalability-scale-project
+            - --gcp-zone=us-east1-b
+            - --provider=gce
+            - --metadata-sources=cl2-metadata.json
+            - --env=KUBE_FEATURE_GATES=DynamicResourceAllocation=true
+            - --runtime-config=api/all=true
+            - --test=false
+            - --test-cmd=$GOPATH/src/k8s.io/perf-tests/run-e2e.sh
+            - --test-cmd-args=cluster-loader2
+            - --test-cmd-args=--nodes=5000
+            - --test-cmd-args=--provider=gce
+            - --test-cmd-args=--enable-prometheus-server=true
+            - --test-cmd-args=--prometheus-scrape-node-exporter
+            - --test-cmd-args=--experimental-gcp-snapshot-prometheus-disk=true
+            - --test-cmd-args=--experimental-prometheus-disk-snapshot-name=$(JOB_NAME)-$(BUILD_ID)
+            - --test-cmd-args=--experimental-prometheus-snapshot-to-report-dir=true
+            - --test-cmd-args=--testconfig=testing/dra/config.yaml
+            - --test-cmd-args=--report-dir=$(ARTIFACTS)
+            - --test-cmd-args=--testoverrides=./testing/overrides/5000_nodes.yaml
+            - --test-cmd-name=ClusterLoaderV2
+            - --timeout=420m
+            - --use-logexporter
+            - --logexporter-gcs-path=gs://k8s-infra-scalability-tests-logs/$(JOB_NAME)/$(BUILD_ID)
+          resources:
+            requests:
+              cpu: 6
+              memory: "16Gi"
+            limits:
+              cpu: 6
+              memory: "16Gi"
+          env:
+            - name: CL2_MODE
+              value: "Indexed"
+            - name: CL2_NODES_PER_NAMESPACE
+              value: "2500"
+            - name: CL2_JOB_RUNNING_TIME
+              value: "3s"
+            - name: CL2_LONG_JOB_RUNNING_TIME
+              value: "240m"
+# TODO: 100 node gce test run with 10 and 5, can increase if needed
+#            - name: CL2_LOAD_TEST_THROUGHPUT
+#              value: "50"
+#            - name: CL2_STEADY_STATE_QPS
+#              value: "25"
+            - name: PROMETHEUS_SCRAPE_KUBELETS
+              value: "true"
+            - name: NODE_PRELOAD_IMAGES
+              value: "gcr.io/k8s-staging-perf-tests/sleep:v0.0.3"