Skip to content

Commit 1fb2ae1

Browse files
committed
WIP
Signed-off-by: Kevin Klues <[email protected]>
1 parent 6014ce6 commit 1fb2ae1

File tree

1 file changed

+81
-11
lines changed

1 file changed

+81
-11
lines changed

cmd/nvidia-dra-controller/mnenv.go

+81-11
Original file line numberDiff line numberDiff line change
@@ -40,12 +40,13 @@ import (
4040
)
4141

4242
const (
43-
resourceClaimFinalizer = "gpu.nvidia.com/finalizer.multiNodeEnvironment"
44-
imexDeviceClass = "imex.nvidia.com"
43+
multiNodeEnvironmentFinalizer = "gpu.nvidia.com/finalizer.multiNodeEnvironment"
44+
imexDeviceClass = "imex.nvidia.com"
4545

4646
MultiNodeEnvironmentAddEvent = "onMultiNodeEnvironmentAddEvent"
4747
MultiNodeEnvironmentDeleteEvent = "onMultiNodeEnvironmentDeleteEvent"
4848
ResourceClaimAddEvent = "ResourceClaimAddEvent"
49+
DeviceClassAddEvent = "DeviceClassAddEvent"
4950
)
5051

5152
type WorkItem struct {
@@ -60,25 +61,31 @@ type MultiNodeEnvironmentManager struct {
6061

6162
multiNodeEnvironmentLister nvlisters.MultiNodeEnvironmentLister
6263
resourceClaimLister resourcelisters.ResourceClaimLister
64+
deviceClassLister resourcelisters.DeviceClassLister
6365
}
6466

6567
// StartMultiNodeEnvironmentManager starts a MultiNodeEnvironmentManager.
6668
func StartMultiNodeEnvironmentManager(ctx context.Context, config *Config) (*MultiNodeEnvironmentManager, error) {
6769
queue := workqueue.NewRateLimitingQueue(workqueue.DefaultControllerRateLimiter())
6870

69-
mneInformerFactory := nvinformers.NewSharedInformerFactory(config.clientsets.Nvidia, 30*time.Second)
70-
mneInformer := mneInformerFactory.Gpu().V1alpha1().MultiNodeEnvironments().Informer()
71+
nvInformerFactory := nvinformers.NewSharedInformerFactory(config.clientsets.Nvidia, 30*time.Second)
72+
coreInformerFactory := informers.NewSharedInformerFactory(config.clientsets.Core, 30*time.Second)
73+
74+
mneInformer := nvInformerFactory.Gpu().V1alpha1().MultiNodeEnvironments().Informer()
7175
mneLister := nvlisters.NewMultiNodeEnvironmentLister(mneInformer.GetIndexer())
7276

73-
rcInformerFactory := informers.NewSharedInformerFactory(config.clientsets.Core, 30*time.Second)
74-
rcInformer := rcInformerFactory.Resource().V1beta1().ResourceClaims().Informer()
77+
rcInformer := coreInformerFactory.Resource().V1beta1().ResourceClaims().Informer()
7578
rcLister := resourcelisters.NewResourceClaimLister(rcInformer.GetIndexer())
7679

80+
dcInformer := coreInformerFactory.Resource().V1beta1().DeviceClasses().Informer()
81+
dcLister := resourcelisters.NewDeviceClassLister(dcInformer.GetIndexer())
82+
7783
m := &MultiNodeEnvironmentManager{
7884
clientsets: config.clientsets,
7985
queue: queue,
8086
multiNodeEnvironmentLister: mneLister,
8187
resourceClaimLister: rcLister,
88+
deviceClassLister: dcLister,
8289
}
8390

8491
mneInformer.AddEventHandler(cache.ResourceEventHandlerFuncs{
@@ -90,21 +97,25 @@ func StartMultiNodeEnvironmentManager(ctx context.Context, config *Config) (*Mul
9097
AddFunc: func(obj any) { m.enqueue(obj, ResourceClaimAddEvent) },
9198
})
9299

100+
dcInformer.AddEventHandler(cache.ResourceEventHandlerFuncs{
101+
AddFunc: func(obj any) { m.enqueue(obj, DeviceClassAddEvent) },
102+
})
103+
93104
m.waitGroup.Add(3)
94105
go func() {
95106
defer m.waitGroup.Done()
96-
rcInformerFactory.Start(ctx.Done())
107+
nvInformerFactory.Start(ctx.Done())
97108
}()
98109
go func() {
99110
defer m.waitGroup.Done()
100-
mneInformerFactory.Start(ctx.Done())
111+
coreInformerFactory.Start(ctx.Done())
101112
}()
102113
go func() {
103114
defer m.waitGroup.Done()
104115
m.run(ctx.Done())
105116
}()
106117

107-
if !cache.WaitForCacheSync(ctx.Done(), mneInformer.HasSynced, rcInformer.HasSynced) {
118+
if !cache.WaitForCacheSync(ctx.Done(), mneInformer.HasSynced, rcInformer.HasSynced, dcInformer.HasSynced) {
108119
klog.Warning("Cache sync failed; retrying in 5 seconds")
109120
time.Sleep(5 * time.Second)
110121
if !cache.WaitForCacheSync(ctx.Done(), mneInformer.HasSynced, rcInformer.HasSynced) {
@@ -183,6 +194,8 @@ func (m *MultiNodeEnvironmentManager) reconcile(workItem WorkItem) error {
183194
return m.onMultiNodeEnvironmentDelete(workItem.Object)
184195
case ResourceClaimAddEvent:
185196
return m.onResourceClaimAdd(workItem.Object)
197+
case DeviceClassAddEvent:
198+
return m.onDeviceClassAdd(workItem.Object)
186199
}
187200
return fmt.Errorf("unknown event type: %s", workItem.EventType)
188201
}
@@ -223,7 +236,7 @@ func (m *MultiNodeEnvironmentManager) onMultiNodeEnvironmentAdd(obj any) error {
223236
Name: mne.Spec.ResourceClaimName,
224237
Namespace: mne.Namespace,
225238
OwnerReferences: []metav1.OwnerReference{ownerReference},
226-
Finalizers: []string{resourceClaimFinalizer},
239+
Finalizers: []string{multiNodeEnvironmentFinalizer},
227240
},
228241
Spec: resourceapi.ResourceClaimSpec{
229242
Devices: resourceapi.DeviceClaim{
@@ -288,6 +301,37 @@ func (m *MultiNodeEnvironmentManager) onResourceClaimAdd(obj any) error {
288301
return nil
289302
}
290303

304+
func (m *MultiNodeEnvironmentManager) onDeviceClassAdd(obj interface{}) error {
305+
dc, ok := obj.(*resourceapi.DeviceClass)
306+
if !ok {
307+
return fmt.Errorf("failed to cast to DeviceClass")
308+
}
309+
310+
klog.Infof("Processing added DeviceClass: %s/%s", dc.Namespace, dc.Name)
311+
312+
if len(dc.OwnerReferences) != 1 {
313+
return nil
314+
}
315+
316+
if dc.OwnerReferences[0].Kind != nvapi.MultiNodeEnvironmentKind {
317+
return nil
318+
}
319+
320+
_, err := m.multiNodeEnvironmentLister.MultiNodeEnvironments(dc.Namespace).Get(dc.OwnerReferences[0].Name)
321+
if err == nil {
322+
return nil
323+
}
324+
if !errors.IsNotFound(err) {
325+
return fmt.Errorf("error retrieving DeviceClass's OwnerReference '%s': %w", dc.OwnerReferences[0].Name, err)
326+
}
327+
328+
if err := m.removeDeviceClassFinalizer(dc.Name); err != nil {
329+
return fmt.Errorf("error removing finalizer on DeviceClass '%s': %w", dc.Name, err)
330+
}
331+
332+
return nil
333+
}
334+
291335
func (m *MultiNodeEnvironmentManager) removeResourceClaimFinalizer(namespace, name string) error {
292336
rc, err := m.resourceClaimLister.ResourceClaims(namespace).Get(name)
293337
if err != nil && errors.IsNotFound(err) {
@@ -301,7 +345,7 @@ func (m *MultiNodeEnvironmentManager) removeResourceClaimFinalizer(namespace, na
301345

302346
newRC.Finalizers = []string{}
303347
for _, f := range rc.Finalizers {
304-
if f != resourceClaimFinalizer {
348+
if f != multiNodeEnvironmentFinalizer {
305349
newRC.Finalizers = append(newRC.Finalizers, f)
306350
}
307351
}
@@ -313,3 +357,29 @@ func (m *MultiNodeEnvironmentManager) removeResourceClaimFinalizer(namespace, na
313357

314358
return nil
315359
}
360+
361+
func (m *MultiNodeEnvironmentManager) removeDeviceClassFinalizer(name string) error {
362+
dc, err := m.deviceClassLister.Get(name)
363+
if err != nil && errors.IsNotFound(err) {
364+
return fmt.Errorf("DeviceClass not found")
365+
}
366+
if err != nil {
367+
return fmt.Errorf("error retrieving DeviceClass: %w", err)
368+
}
369+
370+
newDC := dc.DeepCopy()
371+
372+
newDC.Finalizers = []string{}
373+
for _, f := range dc.Finalizers {
374+
if f != multiNodeEnvironmentFinalizer {
375+
newDC.Finalizers = append(newDC.Finalizers, f)
376+
}
377+
}
378+
379+
_, err = m.clientsets.Core.ResourceV1beta1().DeviceClasses().Update(context.Background(), newDC, metav1.UpdateOptions{})
380+
if err != nil {
381+
return fmt.Errorf("failed to update DeviceClass: %w", err)
382+
}
383+
384+
return nil
385+
}

0 commit comments

Comments
 (0)