Skip to content

Commit d1fad7e

Browse files
authored
Merge pull request #225 from klueska/add-multi-node-crd
Add ComputeDomain for running multi-node workloads
2 parents 5a39c79 + 474f968 commit d1fad7e

File tree

1,357 files changed

+8166
-569476
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

1,357 files changed

+8166
-569476
lines changed

.common-ci.yml

+4-4
Original file line numberDiff line numberDiff line change
@@ -111,8 +111,8 @@ trigger-pipeline:
111111
.scan-base:
112112
stage: scan
113113
variables:
114-
IMAGE: "${CI_REGISTRY_IMAGE}/k8s-dra-driver:${CI_COMMIT_SHORT_SHA}-${DIST}"
115-
IMAGE_ARCHIVE: "k8s-dra-driver-${CI_COMMIT_SHORT_SHA}-${DIST}-${PLATFORM_ARCH}.tar"
114+
IMAGE: "${CI_REGISTRY_IMAGE}/k8s-dra-driver-gpu:${CI_COMMIT_SHORT_SHA}-${DIST}"
115+
IMAGE_ARCHIVE: "k8s-dra-driver-gpu-${CI_COMMIT_SHORT_SHA}-${DIST}-${PLATFORM_ARCH}.tar"
116116
before_script:
117117
- docker login -u "${CI_REGISTRY_USER}" -p "${CI_REGISTRY_PASSWORD}" "${CI_REGISTRY}"
118118
- docker pull --platform="${PLATFORM}" "${IMAGE}"
@@ -160,7 +160,7 @@ scan-ubi9-arm64:
160160
stage: release
161161
variables:
162162
# Define the source image for the release
163-
IMAGE_NAME: "${CI_REGISTRY_IMAGE}/k8s-dra-driver"
163+
IMAGE_NAME: "${CI_REGISTRY_IMAGE}/k8s-dra-driver-gpu"
164164
VERSION: "${CI_COMMIT_SHORT_SHA}"
165165
# OUT_IMAGE_VERSION is overridden for external releases
166166
OUT_IMAGE_VERSION: "${CI_COMMIT_SHORT_SHA}"
@@ -195,7 +195,7 @@ scan-ubi9-arm64:
195195
OUT_REGISTRY_USER: "${CI_REGISTRY_USER}"
196196
OUT_REGISTRY_TOKEN: "${CI_REGISTRY_PASSWORD}"
197197
OUT_REGISTRY: "${CI_REGISTRY}"
198-
OUT_IMAGE_NAME: "${CI_REGISTRY_IMAGE}/staging/k8s-dra-driver"
198+
OUT_IMAGE_NAME: "${CI_REGISTRY_IMAGE}/staging/k8s-dra-driver-gpu"
199199

200200
# Define an external release step that pushes an image to an external repository.
201201
# This includes a development image off main.

.github/workflows/image.yaml

+1-1
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,7 @@ jobs:
5656
password: ${{ secrets.GITHUB_TOKEN }}
5757
- name: Build image
5858
env:
59-
IMAGE_NAME: ghcr.io/${LOWERCASE_REPO_OWNER}/k8s-dra-driver
59+
IMAGE_NAME: ghcr.io/${LOWERCASE_REPO_OWNER}/k8s-dra-driver-gpu
6060
VERSION: ${COMMIT_SHORT_SHA}
6161
run: |
6262
echo "${VERSION}"

.gitignore

+3-2
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
11
.cache/
22
.bash_history
3-
/nvidia-dra-controller
4-
/nvidia-dra-plugin
3+
/compute-domain-controller
4+
/compute-domain-kubelet-plugin
5+
/gpu-kubelet-plugin
56
.idea
67
[._]*.sw[a-p]
78
coverage.out

.gitlab-ci.yml

+1-1
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,7 @@ unit-tests:
5858
.image-build:
5959
stage: image-build
6060
variables:
61-
IMAGE_NAME: "${CI_REGISTRY_IMAGE}/k8s-dra-driver"
61+
IMAGE_NAME: "${CI_REGISTRY_IMAGE}/k8s-dra-driver-gpu"
6262
VERSION: "${CI_COMMIT_SHORT_SHA}"
6363
PUSH_ON_BUILD: "true"
6464
before_script:

.golangci.yaml

+1-1
Original file line numberDiff line numberDiff line change
@@ -17,4 +17,4 @@ run:
1717

1818
linters-settings:
1919
goimports:
20-
local-prefixes: "github.com/NVIDIA/k8s-dra-driver"
20+
local-prefixes: "github.com/NVIDIA/k8s-dra-driver-gpu"

.nvidia-ci.yml

+3-3
Original file line numberDiff line numberDiff line change
@@ -33,19 +33,19 @@ variables:
3333
# On the multi-arch builder we don't need the qemu setup.
3434
SKIP_QEMU_SETUP: "1"
3535
# Define the public staging registry
36-
STAGING_REGISTRY: registry.gitlab.com/nvidia/cloud-native/k8s-dra-driver/staging
36+
STAGING_REGISTRY: registry.gitlab.com/nvidia/cloud-native/k8s-dra-driver-gpu/staging
3737
STAGING_VERSION: ${CI_COMMIT_SHORT_SHA}
3838

3939
.image-pull:
4040
stage: image-build
4141
variables:
4242
IN_REGISTRY: "${STAGING_REGISTRY}"
43-
IN_IMAGE_NAME: k8s-dra-driver
43+
IN_IMAGE_NAME: k8s-dra-driver-gpu
4444
IN_VERSION: "${STAGING_VERSION}"
4545
OUT_REGISTRY_USER: "${CI_REGISTRY_USER}"
4646
OUT_REGISTRY_TOKEN: "${CI_REGISTRY_PASSWORD}"
4747
OUT_REGISTRY: "${CI_REGISTRY}"
48-
OUT_IMAGE_NAME: "${CI_REGISTRY_IMAGE}/k8s-dra-driver"
48+
OUT_IMAGE_NAME: "${CI_REGISTRY_IMAGE}/k8s-dra-driver-gpu"
4949
PUSH_MULTIPLE_TAGS: "false"
5050
# We delay the job start to allow the public pipeline to generate the required images.
5151
when: delayed

Makefile

+67-3
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,7 @@ goimports:
7777
find . -name \*.go \
7878
-not -name "zz_generated.deepcopy.go" \
7979
-not -path "./vendor/*" \
80-
-not -path "./pkg/nvidia.com/resource/clientset/versioned/*" \
80+
-not -path "./$(PKG_BASE)/clientset/versioned/*" \
8181
-exec goimports -local $(MODULE) -w {} \;
8282

8383
golangci-lint:
@@ -99,7 +99,19 @@ coverage: test
9999
cat $(COVERAGE_FILE) | grep -v "_mock.go" > $(COVERAGE_FILE).no-mocks
100100
go tool cover -func=$(COVERAGE_FILE).no-mocks
101101

102-
generate: generate-deepcopy fmt
102+
generate: generate-crds generate-informers fmt
103+
104+
generate-crds: generate-deepcopy .remove-crds
105+
for dir in $(CLIENT_SOURCES); do \
106+
controller-gen crd:crdVersions=v1 \
107+
paths=$(CURDIR)/$${dir} \
108+
output:crd:dir=$(CURDIR)/deployments/helm/tmp_crds; \
109+
done
110+
mkdir -p $(CURDIR)/deployments/helm/$(HELM_DRIVER_NAME)/crds
111+
cp -R $(CURDIR)/deployments/helm/tmp_crds/* \
112+
$(CURDIR)/deployments/helm/$(HELM_DRIVER_NAME)/crds
113+
rm -rf $(CURDIR)/deployments/helm/tmp_crds
114+
103115

104116
generate-deepcopy: .remove-deepcopy
105117
for dir in $(DEEPCOPY_SOURCES); do \
@@ -109,14 +121,66 @@ generate-deepcopy: .remove-deepcopy
109121
output:object:dir=$(CURDIR)/$${dir}; \
110122
done
111123

124+
generate-informers: .remove-informers generate-listers
125+
informer-gen \
126+
--go-header-file=$(CURDIR)/hack/boilerplate.go.txt \
127+
--output-package "$(MODULE)/$(PKG_BASE)/informers" \
128+
--input-dirs "$(shell for api in $(CLIENT_APIS); do echo -n "$(MODULE)/$(API_BASE)/$$api,"; done | sed 's/,$$//')" \
129+
--output-base "$(CURDIR)/pkg/tmp_informers" \
130+
--versioned-clientset-package "$(MODULE)/$(PKG_BASE)/clientset/versioned" \
131+
--listers-package "$(MODULE)/$(PKG_BASE)/listers"
132+
mkdir -p $(CURDIR)/$(PKG_BASE)
133+
mv $(CURDIR)/pkg/tmp_informers/$(MODULE)/$(PKG_BASE)/informers \
134+
$(CURDIR)/$(PKG_BASE)/informers
135+
rm -rf $(CURDIR)/pkg/tmp_informers
136+
137+
generate-listers: .remove-listers generate-clientset
138+
lister-gen \
139+
--go-header-file=$(CURDIR)/hack/boilerplate.go.txt \
140+
--output-package "$(MODULE)/$(PKG_BASE)/listers" \
141+
--input-dirs "$(shell for api in $(CLIENT_APIS); do echo -n "$(MODULE)/$(API_BASE)/$$api,"; done | sed 's/,$$//')" \
142+
--output-base "$(CURDIR)/pkg/tmp_listers"
143+
mkdir -p $(CURDIR)/$(PKG_BASE)
144+
mv $(CURDIR)/pkg/tmp_listers/$(MODULE)/$(PKG_BASE)/listers \
145+
$(CURDIR)/$(PKG_BASE)/listers
146+
rm -rf $(CURDIR)/pkg/tmp_listers
147+
148+
generate-clientset: .remove-clientset
149+
client-gen \
150+
--go-header-file=$(CURDIR)/hack/boilerplate.go.txt \
151+
--clientset-name "versioned" \
152+
--build-tag "ignore_autogenerated" \
153+
--output-package "$(MODULE)/$(PKG_BASE)/clientset" \
154+
--input-base "$(MODULE)/$(API_BASE)" \
155+
--output-base "$(CURDIR)/pkg/tmp_clientset" \
156+
--input "$(shell echo $(CLIENT_APIS) | tr ' ' ',')" \
157+
--plural-exceptions "$(shell echo $(PLURAL_EXCEPTIONS) | tr ' ' ',')"
158+
mkdir -p $(CURDIR)/$(PKG_BASE)
159+
mv $(CURDIR)/pkg/tmp_clientset/$(MODULE)/$(PKG_BASE)/clientset \
160+
$(CURDIR)/$(PKG_BASE)/clientset
161+
rm -rf $(CURDIR)/pkg/tmp_clientset
162+
163+
.remove-crds:
164+
rm -rf $(CURDIR)/deployments/helm/$(HELM_DRIVER_NAME)/crds
165+
112166
.remove-deepcopy:
113167
for dir in $(DEEPCOPY_SOURCES); do \
114168
rm -f $(CURDIR)/$${dir}/zz_generated.deepcopy.go; \
115169
done
116170

171+
.remove-clientset:
172+
rm -rf $(CURDIR)/$(PKG_BASE)/clientset
173+
174+
.remove-listers:
175+
rm -rf $(CURDIR)/$(PKG_BASE)/listers
176+
177+
.remove-informers:
178+
rm -rf $(CURDIR)/$(PKG_BASE)/informers
179+
117180
# Generate an image for containerized builds
118181
# Note: This image is local only
119-
.PHONY: .build-image
182+
.PHONY: .build-image build-image
183+
build-image: .build-image
120184
.build-image:
121185
make -f deployments/devel/Makefile .build-image
122186

README.md

+6-7
Original file line numberDiff line numberDiff line change
@@ -46,10 +46,10 @@ subdirectory, so take a moment to browse through the various files and see
4646
what's available:
4747

4848
```console
49-
git clone https://github.com/NVIDIA/k8s-dra-driver.git
49+
git clone https://github.com/NVIDIA/k8s-dra-driver-gpu.git
5050
```
5151
```console
52-
cd k8s-dra-driver
52+
cd k8s-dra-driver-gpu
5353
```
5454

5555
### Setting up the infrastructure
@@ -76,9 +76,8 @@ This should show one pod running in the `nvidia` namespace:
7676
kubectl get pods -n nvidia
7777
```
7878
```
79-
NAME READY STATUS RESTARTS AGE
80-
nvidia-dra-driver-k8s-dra-driver-controller-844fcb94b-ktbkc 1/1 Running 0 69s
81-
nvidia-dra-driver-k8s-dra-driver-kubelet-plugin-5vfp9 1/1 Running 0 69s
79+
NAME READY STATUS RESTARTS AGE
80+
k8s-dra-driver-gpu-kubelet-plugin-5vfp9 1/1 Running 0 69s
8281
```
8382

8483
### Run the examples by following the steps in the demo script
@@ -144,10 +143,10 @@ This may include the following content from the original scripts:
144143
```
145144
set -e
146145
147-
export VERSION=v0.1.0
146+
export VERSION=v25.2.0
148147
149148
REGISTRY=nvcr.io/nvidia/cloud-native
150-
IMAGE=k8s-dra-driver
149+
IMAGE=k8s-dra-driver-gpu
151150
PLATFORM=ubi9
152151
153152
sudo true

api/nvidia.com/resource/gpu/v1alpha1/imexchannelconfig.go

-49
This file was deleted.

api/nvidia.com/resource/gpu/v1alpha1/api.go api/nvidia.com/resource/v1beta1/api.go

+11-7
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
* limitations under the License.
1515
*/
1616

17-
package v1alpha1
17+
package v1beta1
1818

1919
import (
2020
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
@@ -24,12 +24,14 @@ import (
2424
)
2525

2626
const (
27-
GroupName = "gpu.nvidia.com"
28-
Version = "v1alpha1"
27+
GroupName = "resource.nvidia.com"
28+
Version = "v1beta1"
2929

30-
GpuConfigKind = "GpuConfig"
31-
MigDeviceConfigKind = "MigDeviceConfig"
32-
ImexChannelConfigKind = "ImexChannelConfig"
30+
GpuConfigKind = "GpuConfig"
31+
MigDeviceConfigKind = "MigDeviceConfig"
32+
ComputeDomainChannelConfigKind = "ComputeDomainChannelConfig"
33+
ComputeDomainDaemonConfigKind = "ComputeDomainDaemonConfig"
34+
ComputeDomainKind = "ComputeDomain"
3335
)
3436

3537
// Interface defines the set of common APIs for all configs
@@ -55,7 +57,9 @@ func init() {
5557
scheme.AddKnownTypes(schemeGroupVersion,
5658
&GpuConfig{},
5759
&MigDeviceConfig{},
58-
&ImexChannelConfig{},
60+
&ComputeDomainChannelConfig{},
61+
&ComputeDomainDaemonConfig{},
62+
&ComputeDomain{},
5963
)
6064
metav1.AddToGroupVersion(scheme, schemeGroupVersion)
6165

Original file line numberDiff line numberDiff line change
@@ -0,0 +1,86 @@
1+
/*
2+
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
package v1beta1
18+
19+
import (
20+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
21+
)
22+
23+
const (
24+
ComputeDomainStatusReady = "Ready"
25+
ComputeDomainStatusNotReady = "NotReady"
26+
)
27+
28+
// +genclient
29+
// +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object
30+
// +k8s:openapi-gen=true
31+
// +kubebuilder:resource:scope=Namespaced
32+
// +kubebuilder:subresource:status
33+
34+
// ComputeDomain prepares a set of nodes to run a multi-node workload in.
35+
type ComputeDomain struct {
36+
metav1.TypeMeta `json:",inline"`
37+
metav1.ObjectMeta `json:"metadata,omitempty"`
38+
39+
Spec ComputeDomainSpec `json:"spec,omitempty"`
40+
Status ComputeDomainStatus `json:"status,omitempty"`
41+
}
42+
43+
// +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object
44+
45+
// ComputeDomainList provides a list of ComputeDomains.
46+
type ComputeDomainList struct {
47+
metav1.TypeMeta `json:",inline"`
48+
metav1.ListMeta `json:"metadata,omitempty"`
49+
50+
Items []ComputeDomain `json:"items"`
51+
}
52+
53+
// +kubebuilder:validation:XValidation:rule="self == oldSelf", message="A computeDomain.spec is immutable"
54+
55+
// ComputeDomainSpec provides the spec for a ComputeDomain.
56+
type ComputeDomainSpec struct {
57+
NumNodes int `json:"numNodes"`
58+
Channel *ComputeDomainChannelSpec `json:"channel"`
59+
}
60+
61+
// ComputeDomainChannelSpec provides the spec for a channel used to run a workload inside a ComputeDomain.
62+
type ComputeDomainChannelSpec struct {
63+
ResourceClaimTemplate ComputeDomainResourceClaimTemplate `json:"resourceClaimTemplate"`
64+
}
65+
66+
// ComputeDomainResourceClaimTemplate provides the details of the ResourceClaimTemplate to generate.
67+
type ComputeDomainResourceClaimTemplate struct {
68+
Name string `json:"name"`
69+
}
70+
71+
// ComputeDomainStatus provides the status for a ComputeDomain.
72+
type ComputeDomainStatus struct {
73+
// +kubebuilder:validation:Enum=Ready;NotReady
74+
// +kubebuilder:default=NotReady
75+
Status string `json:"status"`
76+
// +listType=map
77+
// +listMapKey=name
78+
Nodes []*ComputeDomainNode `json:"nodes,omitempty"`
79+
}
80+
81+
// ComputeDomainNode provides information about each node added to a ComputeDomain.
82+
type ComputeDomainNode struct {
83+
Name string `json:"name"`
84+
IPAddress string `json:"ipAddress"`
85+
CliqueID string `json:"cliqueID"`
86+
}

0 commit comments

Comments
 (0)