From d17ee6770ee91fa32b7bea1514fce6375b97e790 Mon Sep 17 00:00:00 2001 From: Kevin Klues Date: Fri, 24 Jan 2025 01:11:43 +0000 Subject: [PATCH] Move creation of IMEX channel pool to after the deployment is fully up. Signed-off-by: Kevin Klues --- cmd/nvidia-dra-imex-controller/deployment.go | 43 ++++++++++++++++++- .../deploymentpods.go | 43 +++---------------- 2 files changed, 49 insertions(+), 37 deletions(-) diff --git a/cmd/nvidia-dra-imex-controller/deployment.go b/cmd/nvidia-dra-imex-controller/deployment.go index 8994a3f2..ce5ccd9e 100644 --- a/cmd/nvidia-dra-imex-controller/deployment.go +++ b/cmd/nvidia-dra-imex-controller/deployment.go @@ -246,10 +246,15 @@ func (m *DeploymentManager) Delete(ctx context.Context, cdUID string) error { } key := d.Spec.Selector.MatchLabels[computeDomainLabelKey] + if err := m.removePodManager(key); err != nil { return fmt.Errorf("error removing Pod manager: %w", err) } + if err := m.imexChannelManager.DeletePool(key); err != nil { + return fmt.Errorf("error deleting IMEX channel pool: %w", err) + } + return nil } @@ -301,6 +306,42 @@ func (m *DeploymentManager) onAddOrUpdate(ctx context.Context, obj any) error { return fmt.Errorf("error adding Pod manager '%s/%s': %w", d.Namespace, d.Name, err) } + if d.Status.AvailableReplicas != *d.Spec.Replicas { + return nil + } + + if err := m.createOrUpdatePool(d, cd); err != nil { + return fmt.Errorf("error creating or updating pool: %w", err) + } + + return nil +} + +func (m *DeploymentManager) createOrUpdatePool(d *appsv1.Deployment, cd *nvapi.ComputeDomain) error { + var nodeNames []string + for _, node := range cd.Status.Nodes { + nodeNames = append(nodeNames, node.Name) + } + + nodeSelector := corev1.NodeSelector{ + NodeSelectorTerms: []corev1.NodeSelectorTerm{ + { + MatchExpressions: []corev1.NodeSelectorRequirement{ + { + Key: "kubernetes.io/hostname", + Operator: corev1.NodeSelectorOpIn, + Values: nodeNames, + }, + }, + }, + }, + } + + computeDomainLabel := d.Spec.Selector.MatchLabels[computeDomainLabelKey] + if err := m.imexChannelManager.CreateOrUpdatePool(computeDomainLabel, &nodeSelector); err != nil { + return fmt.Errorf("failed to create or update IMEX channel pool: %w", err) + } + return nil } @@ -311,7 +352,7 @@ func (m *DeploymentManager) addPodManager(ctx context.Context, labelSelector *me return nil } - podManager := NewDeploymentPodManager(m.config, m.imexChannelManager, labelSelector, numPods, m.getComputeDomain) + podManager := NewDeploymentPodManager(m.config, labelSelector, numPods, m.getComputeDomain) if err := podManager.Start(ctx); err != nil { return fmt.Errorf("error creating Pod manager: %w", err) diff --git a/cmd/nvidia-dra-imex-controller/deploymentpods.go b/cmd/nvidia-dra-imex-controller/deploymentpods.go index fb4a8605..341d776c 100644 --- a/cmd/nvidia-dra-imex-controller/deploymentpods.go +++ b/cmd/nvidia-dra-imex-controller/deploymentpods.go @@ -48,14 +48,10 @@ type DeploymentPodManager struct { getComputeDomain GetComputeDomainFunc computeDomainNodes []*nvapi.ComputeDomainNode - computeDomainLabel string numPods int - nodeSelector corev1.NodeSelector - - imexChannelManager *ImexChannelManager } -func NewDeploymentPodManager(config *ManagerConfig, imexChannelManager *ImexChannelManager, labelSelector *metav1.LabelSelector, numPods int, getComputeDomain GetComputeDomainFunc) *DeploymentPodManager { +func NewDeploymentPodManager(config *ManagerConfig, labelSelector *metav1.LabelSelector, numPods int, getComputeDomain GetComputeDomainFunc) *DeploymentPodManager { factory := informers.NewSharedInformerFactoryWithOptions( config.clientsets.Core, informerResyncPeriod, @@ -68,30 +64,13 @@ func NewDeploymentPodManager(config *ManagerConfig, imexChannelManager *ImexChan informer := factory.Core().V1().Pods().Informer() lister := factory.Core().V1().Pods().Lister() - nodeSelector := corev1.NodeSelector{ - NodeSelectorTerms: []corev1.NodeSelectorTerm{ - { - MatchExpressions: []corev1.NodeSelectorRequirement{ - { - Key: "kubernetes.io/hostname", - Operator: corev1.NodeSelectorOpIn, - Values: []string{}, - }, - }, - }, - }, - } - m := &DeploymentPodManager{ - config: config, - factory: factory, - informer: informer, - lister: lister, - getComputeDomain: getComputeDomain, - computeDomainLabel: labelSelector.MatchLabels[computeDomainLabelKey], - numPods: numPods, - nodeSelector: nodeSelector, - imexChannelManager: imexChannelManager, + config: config, + factory: factory, + informer: informer, + lister: lister, + getComputeDomain: getComputeDomain, + numPods: numPods, } return m @@ -135,9 +114,6 @@ func (m *DeploymentPodManager) Start(ctx context.Context) (rerr error) { } func (m *DeploymentPodManager) Stop() error { - if err := m.imexChannelManager.DeletePool(m.computeDomainLabel); err != nil { - return fmt.Errorf("error deleting IMEX channel pool: %w", err) - } m.cancelContext() m.waitGroup.Wait() return nil @@ -194,11 +170,6 @@ func (m *DeploymentPodManager) onPodAddOrUpdate(ctx context.Context, obj any) er return fmt.Errorf("error updating nodes in ComputeDomain status: %w", err) } - m.nodeSelector.NodeSelectorTerms[0].MatchExpressions[0].Values = nodeNames - if err := m.imexChannelManager.CreateOrUpdatePool(m.computeDomainLabel, &m.nodeSelector); err != nil { - return fmt.Errorf("failed to create or update IMEX channel pool: %w", err) - } - return nil }