Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 0 additions & 6 deletions clusterloader2/pkg/dependency/dra/dra.go
Original file line number Diff line number Diff line change
Expand Up @@ -167,12 +167,6 @@ func (d *draDependency) isDRADriverReady(config *dependency.Config, daemonsetNam
}

func isResourceSlicesPublished(config *dependency.Config, namespace string) (bool, error) {
// Get a list of all nodes
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

not sure why changes in this file are needed.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I noticed that I hadn't deleted the commented-out code in the previous PR :/

// nodes, err := getReadyNodesCount(config)
// if err != nil {
// return false, fmt.Errorf("failed to list nodes: %v", err)
// }

driverPluginPods, err := getDriverPluginPods(config, namespace, draDaemonsetName)
if err != nil {
return false, fmt.Errorf("failed to list driverPluginPods: %v", err)
Expand Down
4 changes: 2 additions & 2 deletions clusterloader2/testing/dra/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,8 @@ export CL2_MODE=Indexed
export CL2_NODES_PER_NAMESPACE=1
export CL2_LOAD_TEST_THROUGHPUT=20 # Fast initial fill
export CL2_STEADY_STATE_QPS=5 # Controlled rate for measurement
export CL2_JOB_RUNNING_TIME=30s # Short-lived pods runtime
export CL2_LONG_JOB_RUNNING_TIME=1h # Long-running pods runtime (for cluster fill)
export CL2_SHORT_LIVED_JOB_RUNNING_TIME=30s # Short-lived pods runtime
export CL2_LONG_LIVED_JOB_RUNNING_TIME=1h # Long-running pods runtime (for cluster fill)
export CL2_GPUS_PER_NODE=8 # GPUs per node
export CL2_FILL_PERCENTAGE=90 # Cluster fill percentage
```
Expand Down
77 changes: 53 additions & 24 deletions clusterloader2/testing/dra/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,28 +2,47 @@
{{$NODES_PER_NAMESPACE := MinInt .Nodes (DefaultParam .CL2_NODES_PER_NAMESPACE 100)}}
{{$LOAD_TEST_THROUGHPUT := DefaultParam .CL2_LOAD_TEST_THROUGHPUT 10}}
{{$STEADY_STATE_QPS := DefaultParam .CL2_STEADY_STATE_QPS 5}}
{{$RESOURCE_SLICES_PER_NODE := DefaultParam .CL2_RESOURCE_SLICES_PER_NODE 1}}
{{$LONG_LIVED_JOBS_STARTUP_PERC50_THRESHOLD := DefaultParam .CL2_LONG_LIVED_JOBS_STARTUP_PERC50_THRESHOLD "5s"}}
{{$LONG_LIVED_JOBS_STARTUP_PERC90_THRESHOLD := DefaultParam .CL2_LONG_LIVED_JOBS_STARTUP_PERC90_THRESHOLD "5s"}}
{{$LONG_LIVED_JOBS_STARTUP_PERC100_THRESHOLD := DefaultParam .CL2_LONG_LIVED_JOBS_STARTUP_PERC100_THRESHOLD "5s"}}
{{$CHURN_JOBS_WAIT_PERC50_THRESHOLD := DefaultParam .CL2_CHURN_JOBS_WAIT_PERC50_THRESHOLD "5s"}}
{{$CHURN_JOBS_WAIT_PERC90_THRESHOLD := DefaultParam .CL2_CHURN_JOBS_WAIT_PERC90_THRESHOLD "5s"}}
{{$CHURN_JOBS_WAIT_PERC99_THRESHOLD := DefaultParam .CL2_CHURN_JOBS_WAIT_PERC99_THRESHOLD "5s"}}
{{$SHORT_LIVED_JOBS_WAIT_THRESHOLD := DefaultParam .CL2_SHORT_LIVED_JOBS_WAIT_THRESHOLD "10m"}}
{{$LONG_LIVED_JOBS_WAIT_THRESHOLD := DefaultParam .CL2_LONG_LIVED_JOBS_WAIT_THRESHOLD "10m"}}
{{$RUNNING_JOBS_OPERATION_THRESHOLD := DefaultParam .CL2_RUNNING_JOBS_OPERATION_THRESHOLD "120s"}}
{{$token := .CL2_TOKEN }}

{{$namespaces := DivideInt .Nodes $NODES_PER_NAMESPACE}}

# DRA driver manifests and daemonset name
{{$draManifests := DefaultParam .CL2_DRA_MANIFESTS "dra-example-driver"}}
{{$draDaemonsetName := DefaultParam .CL2_DRA_DAEMONSET_NAME "dra-example-driver-kubeletplugin"}}

# Node resource configuration
{{$gpusPerNode := DefaultParam .CL2_GPUS_PER_NODE 8}}
{{$resourceSlicesPerNode := DefaultParam .CL2_RESOURCE_SLICES_PER_NODE 1}}
{{$totalResourceSliceCount := MultiplyInt $resourceSlicesPerNode .Nodes}}
{{$totalGPUs := MultiplyInt $gpusPerNode .Nodes}}

# Fast-fill job configuration - used for the initial cluster fill
{{$fillPercentage := DefaultParam .CL2_FILL_PERCENTAGE 90}}
{{$fillPodsCount := DivideInt (MultiplyInt $totalGPUs $fillPercentage) 100}}
{{$fillPodsPerNamespace := DivideInt $fillPodsCount $namespaces}}
{{$longJobSize := 1}}
{{$longJobRunningTime := DefaultParam .CL2_LONG_JOB_RUNNING_TIME "1h"}}
{{$longLivedJobSize := 1}}
{{$longLivedJobRunningTime := DefaultParam .CL2_LONG_LIVED_JOB_RUNNING_TIME "1h"}}

# churn job configuration for steady state
{{$smallJobPodsCount := SubtractInt $totalGPUs (MultiplyInt $fillPodsPerNamespace $namespaces)}}
{{$smallJobsPerNamespace := DivideInt $smallJobPodsCount $namespaces}}
{{$smallJobSize := 1}}
{{$smallJobCompletions := 10}}
{{$jobRunningTime := DefaultParam .CL2_JOB_RUNNING_TIME "30s"}}
{{$shortLivedJobPodsCount := SubtractInt $totalGPUs (MultiplyInt $fillPodsPerNamespace $namespaces)}}
{{$calculatedSJPN := DivideInt $shortLivedJobPodsCount $namespaces}}
{{$maxSJPN := DefaultParam .CL2_MAX_SHORT_LIVED_JOBS_PER_NAMESPACE 999999}}
{{$shortLivedJobsPerNamespace := MinInt $calculatedSJPN $maxSJPN}}
{{$shortLivedJobSize := 1}}
{{$shortLivedJobCompletions := DefaultParam .CL2_SHORT_LIVED_JOB_COMPLETIONS 10}}
{{$shortLivedJobRunningTime := DefaultParam .CL2_SHORT_LIVED_JOB_RUNNING_TIME "30s"}}
{{$ENABLE_EXTENDED_RESOURCES := DefaultParam .CL2_ENABLE_EXTENDED_RESOURCES false}}
{{$deviceClassName := DefaultParam .CL2_DEVICE_CLASS_NAME "gpu.example.com"}}

{{$extendedResourceName := ""}}
{{if $ENABLE_EXTENDED_RESOURCES}}
Expand All @@ -49,7 +68,9 @@ dependencies:
- name: Install dra-example-driver for test
Method: DRATestDriver
Params:
WorkerNodeCount: {{.Nodes}}
WorkerNodeCount: {{$totalResourceSliceCount}}
DaemonsetName: {{$draDaemonsetName}}
Manifests: {{$draManifests}}
{{if $ENABLE_EXTENDED_RESOURCES}}
ExtendedResourceName: {{$extendedResourceName}}
{{end}}
Expand All @@ -70,12 +91,15 @@ steps:
apiVersion: batch/v1
kind: Job
labelSelector: job-type = long-running
operationTimeout: 120s
operationTimeout: {{$RUNNING_JOBS_OPERATION_THRESHOLD}}
- Identifier: FastFillPodStartupLatency
Method: PodStartupLatency
Params:
action: start
labelSelector: job-type = long-running
perc50Threshold: {{$LONG_LIVED_JOBS_STARTUP_PERC50_THRESHOLD}}
perc90Threshold: {{$LONG_LIVED_JOBS_STARTUP_PERC90_THRESHOLD}}
threshold: {{$LONG_LIVED_JOBS_STARTUP_PERC100_THRESHOLD}}
- Identifier: FastFillClaimAllocationLatency
Method: ResourceClaimAllocationLatency
Params:
Expand Down Expand Up @@ -111,6 +135,8 @@ steps:
objectBundle:
- basename: single-gpu
objectTemplatePath: "resourceclaimtemplate.yaml"
templateFillMap:
DeviceClassName: {{$deviceClassName}}
{{end}}
- name: Fill cluster to {{$fillPercentage}}% utilization
phases:
Expand All @@ -123,9 +149,9 @@ steps:
- basename: long-running
objectTemplatePath: "long-running-job.yaml"
templateFillMap:
Replicas: {{$longJobSize}}
Replicas: {{$longLivedJobSize}}
Mode: {{$MODE}}
Sleep: {{$longJobRunningTime}}
Sleep: {{$longLivedJobRunningTime}}
ExtendedResource: {{ $ENABLE_EXTENDED_RESOURCES }}
- name: Wait for fill pods to be running
measurements:
Expand All @@ -134,7 +160,7 @@ steps:
Params:
action: gather
labelSelector: job-type = long-running
timeout: 15m
timeout: {{$LONG_LIVED_JOBS_WAIT_THRESHOLD}}
- name: Gather measurements for long running pods
measurements:
- Identifier: FastFillSchedulingMetrics
Expand All @@ -145,6 +171,9 @@ steps:
Method: PodStartupLatency
Params:
action: gather
perc50Threshold: {{$LONG_LIVED_JOBS_STARTUP_PERC50_THRESHOLD}}
perc90Threshold: {{$LONG_LIVED_JOBS_STARTUP_PERC90_THRESHOLD}}
threshold: {{$LONG_LIVED_JOBS_STARTUP_PERC100_THRESHOLD}}
- Identifier: FastFillClaimAllocationLatency
Method: ResourceClaimAllocationLatency
Params:
Expand All @@ -164,9 +193,9 @@ steps:
Params:
action: start
labelSelector: job-type = short-lived
perc50Threshold: 40s
perc90Threshold: 60s
perc99Threshold: 80s
perc50Threshold: {{$CHURN_JOBS_WAIT_PERC50_THRESHOLD}}
perc90Threshold: {{$CHURN_JOBS_WAIT_PERC90_THRESHOLD}}
perc99Threshold: {{$CHURN_JOBS_WAIT_PERC99_THRESHOLD}}
- Identifier: ChurnClaimAllocationLatency
Method: ResourceClaimAllocationLatency
Params:
Expand All @@ -192,16 +221,16 @@ steps:
- namespaceRange:
min: 1
max: {{$namespaces}}
replicasPerNamespace: {{$smallJobsPerNamespace}}
replicasPerNamespace: {{$shortLivedJobsPerNamespace}}
tuningSet: SteadyState
objectBundle:
- basename: small
objectTemplatePath: "job.yaml"
templateFillMap:
Replicas: {{$smallJobSize}}
CompletionReplicas: {{$smallJobCompletions}}
Replicas: {{$shortLivedJobSize}}
CompletionReplicas: {{$shortLivedJobCompletions}}
Mode: {{$MODE}}
Sleep: {{$jobRunningTime}}
Sleep: {{$shortLivedJobRunningTime}}
ExtendedResource: {{ $ENABLE_EXTENDED_RESOURCES }}
- name: Wait for short-lived jobs to finish
measurements:
Expand All @@ -210,7 +239,7 @@ steps:
Params:
action: gather
labelSelector: job-type = short-lived
timeout: 15m
timeout: {{$SHORT_LIVED_JOBS_WAIT_THRESHOLD}}
- name: Measure scheduler metrics
measurements:
- Identifier: ChurnSchedulingMetrics
Expand All @@ -221,14 +250,14 @@ steps:
Method: PodStartupLatency
Params:
action: gather
perc50Threshold: 40s
perc90Threshold: 60s
perc99Threshold: 80s
perc50Threshold: {{$CHURN_JOBS_WAIT_PERC50_THRESHOLD}}
perc90Threshold: {{$CHURN_JOBS_WAIT_PERC90_THRESHOLD}}
perc99Threshold: {{$CHURN_JOBS_WAIT_PERC99_THRESHOLD}}
- Identifier: ChurnClaimAllocationLatency
Method: ResourceClaimAllocationLatency
Params:
action: gather
- Identifier: ChurnDRAMetrics
Method: GenericPrometheusQuery
Params:
action: gather
action: gather
4 changes: 3 additions & 1 deletion clusterloader2/testing/dra/job.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,9 @@ spec:
parallelism: {{.Replicas}}
completions: {{.CompletionReplicas}}
completionMode: {{.Mode}}
ttlSecondsAfterFinished: 300
# In tests involving a large number of sequentially created, short-lived jobs, the spin-up time may be significant.
# A TTL of 1 hour should be sufficient to retain the jobs long enough for measurement checks.
ttlSecondsAfterFinished: 3600 # 1 hour
template:
metadata:
labels:
Expand Down
2 changes: 1 addition & 1 deletion clusterloader2/testing/dra/resourceclaimtemplate.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,4 +8,4 @@ spec:
requests:
- name: gpu
exactly:
deviceClassName: gpu.example.com
deviceClassName: {{.DeviceClassName}}