Skip to content

Commit 1e6a587

Browse files
committed
Allow the DRA driver for GPUs to be force installed if desired
Signed-off-by: Kevin Klues <[email protected]>
1 parent da50104 commit 1e6a587

8 files changed

+74
-6
lines changed

deployments/helm/nvidia-dra-driver-gpu/templates/clusterrole.yaml

+3
Original file line number | Diff line number | Diff line change
@@ -29,6 +29,9 @@ rules:
2929
- apiGroups: ["apps"]
3030
resources: ["daemonsets"]
3131
verbs: ["get", "list", "watch", "create", "update", "patch", "delete"]
32+
- apiGroups: ["apps"]
33+
resources: ["deployments"]
34+
verbs: ["get", "list", "watch", "create", "update", "patch", "delete"]
3235
- apiGroups: [""]
3336
resources: ["nodes"]
3437
verbs: ["get", "list", "watch", "create", "update", "patch", "delete"]

deployments/helm/nvidia-dra-driver-gpu/templates/deviceclass-compute-domain-daemon.yaml

+2
Original file line number | Diff line number | Diff line change
@@ -1,3 +1,4 @@
1+
{{- if .Values.resources.computeDomains.enabled }}
12
---
23
apiVersion: resource.k8s.io/v1beta1
34
kind: DeviceClass
@@ -7,3 +8,4 @@ spec:
78
selectors:
89
- cel:
910
expression: "device.driver == 'compute-domain.nvidia.com' && device.attributes['compute-domain.nvidia.com'].type == 'daemon'"
11+
{{- end }}

deployments/helm/nvidia-dra-driver-gpu/templates/deviceclass-compute-domain-default-channel.yaml

+2-1
Original file line number | Diff line number | Diff line change
@@ -1,3 +1,4 @@
1+
{{- if .Values.resources.computeDomains.enabled }}
12
---
23
apiVersion: resource.k8s.io/v1beta1
34
kind: DeviceClass
@@ -7,4 +8,4 @@ spec:
78
selectors:
89
- cel:
910
expression: "device.driver == 'compute-domain.nvidia.com' && device.attributes['compute-domain.nvidia.com'].type == 'channel' && device.attributes['compute-domain.nvidia.com'].id == 0"
10-
11+
{{- end }}
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,11 @@
1+
{{- if .Values.resources.gpus.enabled }}
2+
---
3+
apiVersion: resource.k8s.io/v1beta1
4+
kind: DeviceClass
5+
metadata:
6+
name: gpu.nvidia.com
7+
spec:
8+
selectors:
9+
- cel:
10+
expression: "device.driver == 'gpu.nvidia.com' && device.attributes['gpu.nvidia.com'].type == 'gpu'"
11+
{{- end }}
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,11 @@
1+
{{- if .Values.resources.gpus.enabled }}
2+
---
3+
apiVersion: resource.k8s.io/v1beta1
4+
kind: DeviceClass
5+
metadata:
6+
name: mig.nvidia.com
7+
spec:
8+
selectors:
9+
- cel:
10+
expression: "device.driver == 'gpu.nvidia.com' && device.attributes['gpu.nvidia.com'].type == 'mig'"
11+
{{- end }}

deployments/helm/nvidia-dra-driver-gpu/templates/kubeletplugin.yaml

+37-3
Original file line number | Diff line number | Diff line change
@@ -50,15 +50,49 @@ spec:
5050
{{- toYaml .Values.kubeletPlugin.podSecurityContext | nindent 8 }}
5151
containers:
5252
{{- if .Values.resources.computeDomains.enabled }}
53-
- name: compute-domain
53+
- name: compute-domains
5454
securityContext:
55-
{{- toYaml .Values.kubeletPlugin.containers.computeDomain.securityContext | nindent 10 }}
55+
{{- toYaml .Values.kubeletPlugin.containers.computeDomains.securityContext | nindent 10 }}
5656
image: {{ include "nvidia-dra-driver-gpu.fullimage" . }}
5757
imagePullPolicy: {{ .Values.image.pullPolicy }}
5858
command:
5959
- compute-domain-kubelet-plugin
6060
resources:
61-
{{- toYaml .Values.kubeletPlugin.containers.computeDomain.resources | nindent 10 }}
61+
{{- toYaml .Values.kubeletPlugin.containers.computeDomains.resources | nindent 10 }}
62+
env:
63+
- name: CDI_ROOT
64+
value: /var/run/cdi
65+
- name: NODE_NAME
66+
valueFrom:
67+
fieldRef:
68+
fieldPath: spec.nodeName
69+
- name: NAMESPACE
70+
valueFrom:
71+
fieldRef:
72+
fieldPath: metadata.namespace
73+
volumeMounts:
74+
- name: plugins-registry
75+
mountPath: /var/lib/kubelet/plugins_registry
76+
- name: plugins
77+
mountPath: /var/lib/kubelet/plugins
78+
mountPropagation: Bidirectional
79+
- name: cdi
80+
mountPath: /var/run/cdi
81+
# We always mount the driver root at /driver-root in the container.
82+
- name: driver-root
83+
mountPath: /driver-root
84+
readOnly: true
85+
{{- end }}
86+
{{- if .Values.resources.gpus.enabled }}
87+
- name: gpus
88+
securityContext:
89+
{{- toYaml .Values.kubeletPlugin.containers.gpus.securityContext | nindent 10 }}
90+
image: {{ include "nvidia-dra-driver-gpu.fullimage" . }}
91+
imagePullPolicy: {{ .Values.image.pullPolicy }}
92+
command:
93+
- gpu-kubelet-plugin
94+
resources:
95+
{{- toYaml .Values.kubeletPlugin.containers.gpus.resources | nindent 10 }}
6296
env:
6397
- name: CDI_ROOT
6498
value: /var/run/cdi

deployments/helm/nvidia-dra-driver-gpu/templates/validation.yaml

+2-1
Original file line number | Diff line number | Diff line change
@@ -30,10 +30,11 @@
3030
{{- fail $error }}
3131
{{- end }}
3232

33-
{{- if .Values.resources.gpus.enabled }}
33+
{{- if and .Values.resources.gpus.enabled (not .Values.gpuResourcesEnabledOverride) }}
3434
{{- $error := "" }}
3535
{{- $error = printf "%s\nThe default value of 'resources.gpus.enabled=true' is not yet supported." $error }}
3636
{{- $error = printf "%s\nIt is set to true by default to future proof it as the default once support for it becomes available." $error }}
3737
{{- $error = printf "%s\nUntil then, please explicitly set 'resources.gpus.enabled=false' when installing this chart." $error }}
38+
{{- $error = printf "%s\nIf you truly want to force 'resources.gpus.enabled=true' to apply, you can set 'gpuResourcesEnabledOverride=true'." $error }}
3839
{{- fail $error }}
3940
{{- end }}

deployments/helm/nvidia-dra-driver-gpu/values.yaml

+6-1
Original file line number | Diff line number | Diff line change
@@ -31,6 +31,7 @@ nameOverride: ""
3131
fullnameOverride: ""
3232
namespaceOverride: ""
3333
selectorLabelsOverride: {}
34+
gpuResourcesEnabledOverride: false
3435

3536
allowDefaultNamespace: false
3637

@@ -89,7 +90,11 @@ kubeletPlugin:
8990
init:
9091
securityContext: {}
9192
resources: {}
92-
computeDomain:
93+
computeDomains:
94+
securityContext:
95+
privileged: true
96+
resources: {}
97+
gpus:
9398
securityContext:
9499
privileged: true
95100
resources: {}

0 commit comments

Comments
 (0)