@@ -40,12 +40,13 @@ import (
40
40
)
41
41
42
42
const (
43
- resourceClaimFinalizer = "gpu.nvidia.com/finalizer.multiNodeEnvironment"
44
- imexDeviceClass = "imex.nvidia.com"
43
+ multiNodeEnvironmentFinalizer = "gpu.nvidia.com/finalizer.multiNodeEnvironment"
44
+ imexDeviceClass = "imex.nvidia.com"
45
45
46
46
MultiNodeEnvironmentAddEvent = "onMultiNodeEnvironmentAddEvent"
47
47
MultiNodeEnvironmentDeleteEvent = "onMultiNodeEnvironmentDeleteEvent"
48
48
ResourceClaimAddEvent = "ResourceClaimAddEvent"
49
+ DeviceClassAddEvent = "DeviceClassAddEvent"
49
50
)
50
51
51
52
type WorkItem struct {
@@ -60,25 +61,31 @@ type MultiNodeEnvironmentManager struct {
60
61
61
62
multiNodeEnvironmentLister nvlisters.MultiNodeEnvironmentLister
62
63
resourceClaimLister resourcelisters.ResourceClaimLister
64
+ deviceClassLister resourcelisters.DeviceClassLister
63
65
}
64
66
65
67
// StartManager starts a MultiNodeEnvironmentManager.
66
68
func StartMultiNodeEnvironmentManager (ctx context.Context , config * Config ) (* MultiNodeEnvironmentManager , error ) {
67
69
queue := workqueue .NewRateLimitingQueue (workqueue .DefaultControllerRateLimiter ())
68
70
69
- mneInformerFactory := nvinformers .NewSharedInformerFactory (config .clientsets .Nvidia , 30 * time .Second )
70
- mneInformer := mneInformerFactory .Gpu ().V1alpha1 ().MultiNodeEnvironments ().Informer ()
71
+ nvInformerFactory := nvinformers .NewSharedInformerFactory (config .clientsets .Nvidia , 30 * time .Second )
72
+ coreInformerFactory := informers .NewSharedInformerFactory (config .clientsets .Core , 30 * time .Second )
73
+
74
+ mneInformer := nvInformerFactory .Gpu ().V1alpha1 ().MultiNodeEnvironments ().Informer ()
71
75
mneLister := nvlisters .NewMultiNodeEnvironmentLister (mneInformer .GetIndexer ())
72
76
73
- rcInformerFactory := informers .NewSharedInformerFactory (config .clientsets .Core , 30 * time .Second )
74
- rcInformer := rcInformerFactory .Resource ().V1beta1 ().ResourceClaims ().Informer ()
77
+ rcInformer := coreInformerFactory .Resource ().V1beta1 ().ResourceClaims ().Informer ()
75
78
rcLister := resourcelisters .NewResourceClaimLister (rcInformer .GetIndexer ())
76
79
80
+ dcInformer := coreInformerFactory .Resource ().V1beta1 ().DeviceClasses ().Informer ()
81
+ dcLister := resourcelisters .NewDeviceClassLister (dcInformer .GetIndexer ())
82
+
77
83
m := & MultiNodeEnvironmentManager {
78
84
clientsets : config .clientsets ,
79
85
queue : queue ,
80
86
multiNodeEnvironmentLister : mneLister ,
81
87
resourceClaimLister : rcLister ,
88
+ deviceClassLister : dcLister ,
82
89
}
83
90
84
91
mneInformer .AddEventHandler (cache.ResourceEventHandlerFuncs {
@@ -90,21 +97,25 @@ func StartMultiNodeEnvironmentManager(ctx context.Context, config *Config) (*Mul
90
97
AddFunc : func (obj any ) { m .enqueue (obj , ResourceClaimAddEvent ) },
91
98
})
92
99
100
+ dcInformer .AddEventHandler (cache.ResourceEventHandlerFuncs {
101
+ AddFunc : func (obj any ) { m .enqueue (obj , DeviceClassAddEvent ) },
102
+ })
103
+
93
104
m .waitGroup .Add (3 )
94
105
go func () {
95
106
defer m .waitGroup .Done ()
96
- rcInformerFactory .Start (ctx .Done ())
107
+ nvInformerFactory .Start (ctx .Done ())
97
108
}()
98
109
go func () {
99
110
defer m .waitGroup .Done ()
100
- mneInformerFactory .Start (ctx .Done ())
111
+ coreInformerFactory .Start (ctx .Done ())
101
112
}()
102
113
go func () {
103
114
defer m .waitGroup .Done ()
104
115
m .run (ctx .Done ())
105
116
}()
106
117
107
- if ! cache .WaitForCacheSync (ctx .Done (), mneInformer .HasSynced , rcInformer .HasSynced ) {
118
+ if ! cache .WaitForCacheSync (ctx .Done (), mneInformer .HasSynced , rcInformer .HasSynced , dcInformer . HasSynced ) {
108
119
klog .Warning ("Cache sync failed; retrying in 5 seconds" )
109
120
time .Sleep (5 * time .Second )
110
121
if ! cache .WaitForCacheSync (ctx .Done (), mneInformer .HasSynced , rcInformer .HasSynced ) {
@@ -183,6 +194,8 @@ func (m *MultiNodeEnvironmentManager) reconcile(workItem WorkItem) error {
183
194
return m .onMultiNodeEnvironmentDelete (workItem .Object )
184
195
case ResourceClaimAddEvent :
185
196
return m .onResourceClaimAdd (workItem .Object )
197
+ case DeviceClassAddEvent :
198
+ return m .onDeviceClassAdd (workItem .Object )
186
199
}
187
200
return fmt .Errorf ("unknown event type: %s" , workItem .EventType )
188
201
}
@@ -223,7 +236,7 @@ func (m *MultiNodeEnvironmentManager) onMultiNodeEnvironmentAdd(obj any) error {
223
236
Name : mne .Spec .ResourceClaimName ,
224
237
Namespace : mne .Namespace ,
225
238
OwnerReferences : []metav1.OwnerReference {ownerReference },
226
- Finalizers : []string {resourceClaimFinalizer },
239
+ Finalizers : []string {multiNodeEnvironmentFinalizer },
227
240
},
228
241
Spec : resourceapi.ResourceClaimSpec {
229
242
Devices : resourceapi.DeviceClaim {
@@ -288,6 +301,37 @@ func (m *MultiNodeEnvironmentManager) onResourceClaimAdd(obj any) error {
288
301
return nil
289
302
}
290
303
304
+ func (m * MultiNodeEnvironmentManager ) onDeviceClassAdd (obj interface {}) error {
305
+ dc , ok := obj .(* resourceapi.DeviceClass )
306
+ if ! ok {
307
+ return fmt .Errorf ("failed to cast to DeviceClass" )
308
+ }
309
+
310
+ klog .Infof ("Processing added DeviceClass: %s/%s" , dc .Namespace , dc .Name )
311
+
312
+ if len (dc .OwnerReferences ) != 1 {
313
+ return nil
314
+ }
315
+
316
+ if dc .OwnerReferences [0 ].Kind != nvapi .MultiNodeEnvironmentKind {
317
+ return nil
318
+ }
319
+
320
+ _ , err := m .multiNodeEnvironmentLister .MultiNodeEnvironments (dc .Namespace ).Get (dc .OwnerReferences [0 ].Name )
321
+ if err == nil {
322
+ return nil
323
+ }
324
+ if ! errors .IsNotFound (err ) {
325
+ return fmt .Errorf ("error retrieving DeviceClass's OwnerReference '%s': %w" , dc .OwnerReferences [0 ].Name , err )
326
+ }
327
+
328
+ if err := m .removeDeviceClassFinalizer (dc .Name ); err != nil {
329
+ return fmt .Errorf ("error removing finalizer on DeviceClass '%s': %w" , dc .Name , err )
330
+ }
331
+
332
+ return nil
333
+ }
334
+
291
335
func (m * MultiNodeEnvironmentManager ) removeResourceClaimFinalizer (namespace , name string ) error {
292
336
rc , err := m .resourceClaimLister .ResourceClaims (namespace ).Get (name )
293
337
if err != nil && errors .IsNotFound (err ) {
@@ -301,7 +345,7 @@ func (m *MultiNodeEnvironmentManager) removeResourceClaimFinalizer(namespace, na
301
345
302
346
newRC .Finalizers = []string {}
303
347
for _ , f := range rc .Finalizers {
304
- if f != resourceClaimFinalizer {
348
+ if f != multiNodeEnvironmentFinalizer {
305
349
newRC .Finalizers = append (newRC .Finalizers , f )
306
350
}
307
351
}
@@ -313,3 +357,29 @@ func (m *MultiNodeEnvironmentManager) removeResourceClaimFinalizer(namespace, na
313
357
314
358
return nil
315
359
}
360
+
361
+ func (m * MultiNodeEnvironmentManager ) removeDeviceClassFinalizer (name string ) error {
362
+ dc , err := m .deviceClassLister .Get (name )
363
+ if err != nil && errors .IsNotFound (err ) {
364
+ return fmt .Errorf ("DeviceClass not found" )
365
+ }
366
+ if err != nil {
367
+ return fmt .Errorf ("error retrieving DeviceClass: %w" , err )
368
+ }
369
+
370
+ newDC := dc .DeepCopy ()
371
+
372
+ newDC .Finalizers = []string {}
373
+ for _ , f := range dc .Finalizers {
374
+ if f != multiNodeEnvironmentFinalizer {
375
+ newDC .Finalizers = append (newDC .Finalizers , f )
376
+ }
377
+ }
378
+
379
+ _ , err = m .clientsets .Core .ResourceV1beta1 ().DeviceClasses ().Update (context .Background (), newDC , metav1.UpdateOptions {})
380
+ if err != nil {
381
+ return fmt .Errorf ("failed to update DeviceClass: %w" , err )
382
+ }
383
+
384
+ return nil
385
+ }
0 commit comments