Skip to content

Commit 7673d03

Browse files
committed
Update DRA testing to stable API version and prepare it to test more types of drivers
1 parent 147e565 commit 7673d03

File tree

11 files changed

+157
-53
lines changed

11 files changed

+157
-53
lines changed

clusterloader2/pkg/dependency/dra/dra.go

Lines changed: 111 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -20,8 +20,10 @@ import (
2020
"context"
2121
"embed"
2222
"fmt"
23+
"strings"
2324
"time"
2425

26+
corev1 "k8s.io/api/core/v1"
2527
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
2628
"k8s.io/apimachinery/pkg/util/wait"
2729
"k8s.io/klog/v2"
@@ -31,16 +33,16 @@ import (
3133
)
3234

3335
const (
34-
draDependencyName = "DRATestDriver"
35-
//TODO: this needs to be converted into a parameter. We will not need this until the partitionable devices test
36+
draDependencyName = "DRATestDriver"
3637
draNamespace = "dra-example-driver"
38+
draManifests = "dra-example-driver"
3739
defaultWorkerNodeCount = "100"
3840
draDaemonsetName = "dra-example-driver-kubeletplugin"
3941
checkDRAReadyInterval = 30 * time.Second
4042
defaultDRATimeout = 10 * time.Minute
4143
)
4244

43-
//go:embed manifests/*.yaml
45+
//go:embed manifests/**/*.yaml
4446
var manifestsFS embed.FS
4547

4648
func init() {
@@ -57,13 +59,24 @@ type draDependency struct{}
5759

5860
func (d *draDependency) Setup(config *dependency.Config) error {
5961
klog.V(2).Infof("%s: Installing DRA example driver", d)
60-
if err := client.CreateNamespace(config.ClusterFramework.GetClientSets().GetClient(), draNamespace); err != nil {
61-
return fmt.Errorf("namespace %s creation error: %v", draNamespace, err)
62+
63+
namespace, err := getNamespace(config)
64+
if err != nil {
65+
return err
6266
}
6367

64-
namespace, ok := config.Params["Namespace"]
65-
if !ok {
66-
namespace = draNamespace
68+
if err := client.CreateNamespace(config.ClusterFramework.GetClientSets().GetClient(), namespace); err != nil {
69+
return fmt.Errorf("namespace %s creation error: %v", namespace, err)
70+
}
71+
72+
manifests, err := getManifests(config)
73+
if err != nil {
74+
return err
75+
}
76+
77+
daemonsetName, err := getDaemonset(config)
78+
if err != nil {
79+
return err
6780
}
6881

6982
mapping := map[string]interface{}{
@@ -72,7 +85,7 @@ func (d *draDependency) Setup(config *dependency.Config) error {
7285
}
7386
if err := config.ClusterFramework.ApplyTemplatedManifests(
7487
manifestsFS,
75-
"manifests/*.yaml",
88+
manifests,
7689
mapping,
7790
client.Retry(client.IsRetryableAPIError),
7891
); err != nil {
@@ -82,8 +95,8 @@ func (d *draDependency) Setup(config *dependency.Config) error {
8295
if err != nil {
8396
return err
8497
}
85-
klog.V(2).Infof("%s: checking if DRA driver %s is healthy", d, draDaemonsetName)
86-
if err := d.waitForDRADriverToBeHealthy(config, timeout); err != nil {
98+
klog.V(2).Infof("%s: checking if DRA driver %s is healthy", d, daemonsetName)
99+
if err := d.waitForDRADriverToBeHealthy(config, timeout, daemonsetName, namespace); err != nil {
87100
return err
88101
}
89102

@@ -94,60 +107,76 @@ func (d *draDependency) Setup(config *dependency.Config) error {
94107
func (d *draDependency) Teardown(config *dependency.Config) error {
95108
klog.V(2).Infof("%s: Tearing down DRA example driver", d)
96109

110+
namespace, err := getNamespace(config)
111+
if err != nil {
112+
return err
113+
}
114+
97115
// Delete namespace (this will delete all resources in it)
98-
if err := client.DeleteNamespace(config.ClusterFramework.GetClientSets().GetClient(), draNamespace); err != nil {
99-
return fmt.Errorf("deleting %s namespace error: %v", draNamespace, err)
116+
if err := client.DeleteNamespace(config.ClusterFramework.GetClientSets().GetClient(), namespace); err != nil {
117+
return fmt.Errorf("deleting %s namespace error: %v", namespace, err)
100118
}
101119

102-
if err := client.WaitForDeleteNamespace(config.ClusterFramework.GetClientSets().GetClient(), draNamespace, client.DefaultNamespaceDeletionTimeout); err != nil {
120+
if err := client.WaitForDeleteNamespace(config.ClusterFramework.GetClientSets().GetClient(), namespace, client.DefaultNamespaceDeletionTimeout); err != nil {
103121
return err
104122
}
105123

106124
klog.V(2).Infof("%s: DRA example driver uninstalled successfully", d)
107125
return nil
108126
}
109127

110-
func (d *draDependency) waitForDRADriverToBeHealthy(config *dependency.Config, timeout time.Duration) error {
128+
func (d *draDependency) waitForDRADriverToBeHealthy(config *dependency.Config, timeout time.Duration, daemonsetName string, namespace string) error {
111129
if err := wait.PollImmediate(
112130
checkDRAReadyInterval,
113131
timeout,
114132
func() (done bool, err error) {
115-
return d.isDRADriverReady(config)
133+
return d.isDRADriverReady(config, daemonsetName, namespace)
116134
}); err != nil {
117135
return err
118136
}
119137
if err := wait.PollImmediate(
120138
checkDRAReadyInterval,
121139
timeout,
122140
func() (done bool, err error) {
123-
return isResourceSlicesPublished(config)
141+
return isResourceSlicesPublished(config, namespace)
124142
}); err != nil {
125143
return err
126144
}
127145
return nil
128146
}
129147

130-
func (d *draDependency) isDRADriverReady(config *dependency.Config) (done bool, err error) {
148+
func (d *draDependency) isDRADriverReady(config *dependency.Config, daemonsetName string, namespace string) (done bool, err error) {
131149
ds, err := config.ClusterFramework.GetClientSets().
132150
GetClient().
133151
AppsV1().
134-
DaemonSets(draNamespace).
135-
Get(context.Background(), draDaemonsetName, metav1.GetOptions{})
152+
DaemonSets(namespace).
153+
Get(context.Background(), daemonsetName, metav1.GetOptions{})
136154
if err != nil {
137-
return false, fmt.Errorf("failed to get %s: %v", draDaemonsetName, err)
155+
return false, fmt.Errorf("failed to get %s: %v", daemonsetName, err)
138156
}
139157
ready := ds.Status.NumberReady == ds.Status.DesiredNumberScheduled
140158
if !ready {
141159
klog.V(2).Infof("%s is not ready, "+
142-
"DesiredNumberScheduled: %d, NumberReady: %d", draDaemonsetName, ds.Status.DesiredNumberScheduled, ds.Status.NumberReady)
160+
"DesiredNumberScheduled: %d, NumberReady: %d", daemonsetName, ds.Status.DesiredNumberScheduled, ds.Status.NumberReady)
143161
}
144162
return ready, nil
145163
}
146164

147-
func isResourceSlicesPublished(config *dependency.Config) (bool, error) {
148-
workerCount := int(getWorkerCount(config).(float64))
165+
func isResourceSlicesPublished(config *dependency.Config, namespace string) (bool, error) {
166+
// Get a list of all nodes
167+
// nodes, err := getReadyNodesCount(config)
168+
// if err != nil {
169+
// return false, fmt.Errorf("failed to list nodes: %v", err)
170+
// }
149171

150-
resourceSlices, err := config.ClusterFramework.GetClientSets().GetClient().ResourceV1beta1().ResourceSlices().List(context.Background(), metav1.ListOptions{})
172+
driverPluginPods, err := getDriverPluginPods(config, namespace, draDaemonsetName)
173+
if err != nil {
174+
return false, fmt.Errorf("failed to list driverPluginPods: %v", err)
175+
}
176+
177+
workerCount := driverPluginPods
178+
179+
resourceSlices, err := config.ClusterFramework.GetClientSets().GetClient().ResourceV1().ResourceSlices().List(context.Background(), metav1.ListOptions{})
151180
if err != nil {
152181
return false, fmt.Errorf("failed to list resourceslices: %v", err)
153182
}
@@ -159,6 +188,26 @@ func isResourceSlicesPublished(config *dependency.Config) (bool, error) {
159188
return true, nil
160189
}
161190

191+
func getDriverPluginPods(config *dependency.Config, namespace string, namePrefix string) (int, error) {
192+
pods, err := config.ClusterFramework.GetClientSets().GetClient().CoreV1().Pods(namespace).List(context.Background(), metav1.ListOptions{})
193+
if err != nil {
194+
return 0, fmt.Errorf("failed to list pods in namespace %s: %w", namespace, err)
195+
}
196+
197+
runningPods := 0
198+
for _, pod := range pods.Items {
199+
if !strings.HasPrefix(pod.Name, namePrefix) {
200+
continue
201+
}
202+
203+
if pod.Status.Phase == corev1.PodRunning {
204+
runningPods++
205+
}
206+
}
207+
208+
return runningPods, nil
209+
}
210+
162211
func getWorkerCount(config *dependency.Config) interface{} {
163212
workerCount, ok := config.Params["WorkerNodeCount"]
164213
if !ok {
@@ -167,6 +216,43 @@ func getWorkerCount(config *dependency.Config) interface{} {
167216
return workerCount
168217
}
169218

219+
func getNamespace(config *dependency.Config) (string, error) {
220+
namespace, ok := config.Params["Namespace"]
221+
if !ok {
222+
namespace = draNamespace
223+
}
224+
namespaceString, ok := namespace.(string)
225+
226+
if !ok {
227+
return "", fmt.Errorf("namespace parameter is not a string: %v", namespace)
228+
}
229+
return namespaceString, nil
230+
}
231+
232+
func getManifests(config *dependency.Config) (string, error) {
233+
manifests, ok := config.Params["Manifests"]
234+
if !ok {
235+
manifests = draManifests
236+
}
237+
manifestsString, ok := manifests.(string)
238+
if !ok {
239+
return "", fmt.Errorf("manifests parameter is not a string: %v", manifests)
240+
}
241+
return "manifests/" + manifestsString + "/*.yaml", nil
242+
}
243+
244+
func getDaemonset(config *dependency.Config) (string, error) {
245+
daemonsetName, ok := config.Params["DaemonsetName"]
246+
if !ok {
247+
daemonsetName = draDaemonsetName
248+
}
249+
daemonsetNameString, ok := daemonsetName.(string)
250+
if !ok {
251+
return "", fmt.Errorf("DaemonsetName parameter is not a string: %v", daemonsetName)
252+
}
253+
return daemonsetNameString, nil
254+
}
255+
170256
// String returns string representation of this dependency.
171257
func (d *draDependency) String() string {
172258
return draDependencyName

clusterloader2/pkg/dependency/dra/manifests/clusterrole.yaml renamed to clusterloader2/pkg/dependency/dra/manifests/dra-example-driver/clusterrole.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,4 +13,4 @@ rules:
1313
verbs: ["get"]
1414
- apiGroups: ["resource.k8s.io"]
1515
resources: ["resourceslices"]
16-
verbs: ["get", "list", "watch", "create", "update", "patch", "delete"]
16+
verbs: ["get", "list", "watch", "create", "update", "patch", "delete"]

clusterloader2/pkg/dependency/dra/manifests/clusterrolebinding.yaml renamed to clusterloader2/pkg/dependency/dra/manifests/dra-example-driver/clusterrolebinding.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,4 +11,4 @@ subjects:
1111
roleRef:
1212
kind: ClusterRole
1313
name: dra-example-driver-role
14-
apiGroup: rbac.authorization.k8s.io
14+
apiGroup: rbac.authorization.k8s.io
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,10 @@
11
---
22
# Source: dra-example-driver/templates/deviceclass.yaml
3-
apiVersion: resource.k8s.io/v1beta1
3+
apiVersion: resource.k8s.io/v1
44
kind: DeviceClass
55
metadata:
66
name: gpu.example.com
77
spec:
88
selectors:
99
- cel:
10-
expression: "device.driver == 'gpu.example.com'"
10+
expression: "device.driver == 'gpu.example.com'"

clusterloader2/pkg/dependency/dra/manifests/kubeletplugin.yaml renamed to clusterloader2/pkg/dependency/dra/manifests/dra-example-driver/kubeletplugin.yaml

Lines changed: 27 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -6,10 +6,11 @@ metadata:
66
name: dra-example-driver-kubeletplugin
77
namespace: {{.Namespace}}
88
labels:
9-
helm.sh/chart: dra-example-driver-0.1.3
9+
helm.sh/chart: dra-example-driver-0.0.0-dev
1010
app.kubernetes.io/name: dra-example-driver
1111
app.kubernetes.io/instance: dra-example-driver
12-
app.kubernetes.io/version: "v0.1.0"
12+
app.kubernetes.io/version: "v0.2.0"
13+
app.kubernetes.io/managed-by: Helm
1314
app.kubernetes.io/component: kubeletplugin
1415
spec:
1516
selector:
@@ -26,22 +27,33 @@ spec:
2627
app.kubernetes.io/instance: dra-example-driver
2728
app.kubernetes.io/component: kubeletplugin
2829
spec:
29-
priorityClassName: system-node-critical
3030
serviceAccountName: dra-example-driver-service-account
3131
securityContext:
3232
{}
3333
containers:
3434
- name: plugin
3535
securityContext:
3636
privileged: true
37-
image: registry.k8s.io/dra-example-driver/dra-example-driver:v0.1.0
38-
imagePullPolicy: IfNotPresent
37+
# image: /:v0.2.0
38+
image: registry.k8s.io/dra-example-driver/dra-example-driver:v0.2.0
39+
imagePullPolicy: Always
3940
command: ["dra-example-kubeletplugin"]
4041
resources:
4142
{}
43+
44+
livenessProbe:
45+
grpc:
46+
port: 51515
47+
service: liveness
48+
failureThreshold: 3
49+
periodSeconds: 10
4250
env:
4351
- name: CDI_ROOT
4452
value: /var/run/cdi
53+
- name: KUBELET_REGISTRAR_DIRECTORY_PATH
54+
value: "/var/lib/kubelet/plugins_registry"
55+
- name: KUBELET_PLUGINS_DIRECTORY_PATH
56+
value: "/var/lib/kubelet/plugins"
4557
- name: NODE_NAME
4658
valueFrom:
4759
fieldRef:
@@ -53,20 +65,26 @@ spec:
5365
# Simulated number of devices the example driver will pretend to have.
5466
- name: NUM_DEVICES
5567
value: "8"
68+
- name: HEALTHCHECK_PORT
69+
value: "51515"
5670
volumeMounts:
5771
- name: plugins-registry
58-
mountPath: /var/lib/kubelet/plugins_registry
72+
mountPath: "/var/lib/kubelet/plugins_registry"
5973
- name: plugins
60-
mountPath: /var/lib/kubelet/plugins
74+
mountPath: "/var/lib/kubelet/plugins"
6175
- name: cdi
6276
mountPath: /var/run/cdi
6377
volumes:
6478
- name: plugins-registry
6579
hostPath:
66-
path: /var/lib/kubelet/plugins_registry
80+
path: "/var/lib/kubelet/plugins_registry"
6781
- name: plugins
6882
hostPath:
69-
path: /var/lib/kubelet/plugins
83+
path: "/var/lib/kubelet/plugins"
7084
- name: cdi
7185
hostPath:
7286
path: /var/run/cdi
87+
tolerations:
88+
- effect: NoSchedule
89+
key: google.com/tpu
90+
operator: Exists

clusterloader2/pkg/dependency/dra/manifests/serviceaccount.yaml renamed to clusterloader2/pkg/dependency/dra/manifests/dra-example-driver/serviceaccount.yaml

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,8 @@ metadata:
66
name: dra-example-driver-service-account
77
namespace: {{.Namespace}}
88
labels:
9-
helm.sh/chart: dra-example-driver-0.1.3
9+
helm.sh/chart: dra-example-driver-0.0.0-dev
1010
app.kubernetes.io/name: dra-example-driver
1111
app.kubernetes.io/instance: dra-example-driver
12-
app.kubernetes.io/version: "v0.1.0"
12+
app.kubernetes.io/version: "v0.2.0"
13+
app.kubernetes.io/managed-by: Helm

clusterloader2/pkg/dependency/dra/manifests/validatingadmissionpolicy.yaml renamed to clusterloader2/pkg/dependency/dra/manifests/dra-example-driver/validatingadmissionpolicy.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ spec:
99
matchConstraints:
1010
resourceRules:
1111
- apiGroups: ["resource.k8s.io"]
12-
apiVersions: ["v1beta1"]
12+
apiVersions: ["v1"]
1313
operations: ["CREATE", "UPDATE", "DELETE"]
1414
resources: ["resourceslices"]
1515
matchConditions:
@@ -30,4 +30,4 @@ spec:
3030
- expression: variables.userNodeName == variables.objectNodeName
3131
messageExpression: >-
3232
"this user running on node '"+variables.userNodeName+"' may not modify " +
33-
(variables.objectNodeName == "" ?"cluster resourceslices" : "resourceslices on node '"+variables.objectNodeName+"'")
33+
(variables.objectNodeName == "" ?"cluster resourceslices" : "resourceslices on node '"+variables.objectNodeName+"'")

0 commit comments

Comments
 (0)