diff --git a/api/nvidia.com/resource/v1beta1/computedomain.go b/api/nvidia.com/resource/v1beta1/computedomain.go index 90296376..3eb0f853 100644 --- a/api/nvidia.com/resource/v1beta1/computedomain.go +++ b/api/nvidia.com/resource/v1beta1/computedomain.go @@ -21,6 +21,11 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) +const ( + ComputeModeImmediate = "Immediate" + ComputeModeDelayed = "Delayed" +) + // +genclient // +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object // +k8s:openapi-gen=true @@ -46,13 +51,21 @@ type ComputeDomainList struct { Items []ComputeDomain `json:"items"` } -// +kubebuilder:validation:XValidation:rule="(has(self.resourceClaimName) ? !has(self.deviceClassName) : has(self.deviceClassName))",message="Exactly one of 'resourceClaimName' or 'deviceClassName' must be set." +// +kubebuilder:validation:XValidation:rule="has(self.deviceClassName) || size(self.resourceClaimNames) > 0",message="At least one name must be specified in 'resourceClaimNames' if 'deviceClassName' is not specified." +// +kubebuilder:validation:XValidation:rule="self.mode != 'Delayed'",message="'Delayed' mode is not yet supported." +// +kubebuilder:validation:XValidation:rule="self.mode == 'Immediate' || (self.mode == 'Delayed' && size(self.resourceClaimNames) == 1)",message="When 'mode' is 'Delayed', 'resourceClaimNames' must have exactly one entry." +// +kubebuilder:validation:XValidation:rule="self.mode == 'Immediate' || (self.mode == 'Delayed' && !has(self.nodeSelector))",message="When 'mode' is 'Delayed', 'NodeSelector' must not be set." +// +kubebuilder:validation:XValidation:rule="self.mode == 'Immediate' || (self.mode == 'Delayed' && !has(self.topologyAlignment))",message="When 'mode' is 'Delayed', 'TopologyAlignment' must not be set." 
+// +kubebuilder:validation:XValidation:rule="self.mode == 'Immediate' || (self.mode == 'Delayed' && !has(self.topologyAntiAlignment))",message="When 'mode' is 'Delayed', 'TopologyAntiAlignment' must not be set." // ComputeDomainSpec provides the spec for a ComputeDomain. type ComputeDomainSpec struct { + // +kubebuilder:validation:Enum=Immediate;Delayed + // +kubebuilder:default=Immediate + Mode string `json:"mode"` NumNodes int `json:"numNodes"` - ResourceClaimName string `json:"resourceClaimName,omitempty"` DeviceClassName string `json:"deviceClassName,omitempty"` + ResourceClaimNames []string `json:"resourceClaimNames,omitempty"` NodeSelector map[string]string `json:"nodeSelector,omitempty"` NodeAffinity *ComputeDomainNodeAffinity `json:"nodeAffinity,omitempty"` TopologyAlignment *ComputeDomainTopologyAlignment `json:"topologyAlignment,omitempty"` diff --git a/api/nvidia.com/resource/v1beta1/zz_generated.deepcopy.go b/api/nvidia.com/resource/v1beta1/zz_generated.deepcopy.go index 54f87e8b..696f1738 100644 --- a/api/nvidia.com/resource/v1beta1/zz_generated.deepcopy.go +++ b/api/nvidia.com/resource/v1beta1/zz_generated.deepcopy.go @@ -129,6 +129,11 @@ func (in *ComputeDomainNodeAffinity) DeepCopy() *ComputeDomainNodeAffinity { // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
func (in *ComputeDomainSpec) DeepCopyInto(out *ComputeDomainSpec) { *out = *in + if in.ResourceClaimNames != nil { + in, out := &in.ResourceClaimNames, &out.ResourceClaimNames + *out = make([]string, len(*in)) + copy(*out, *in) + } if in.NodeSelector != nil { in, out := &in.NodeSelector, &out.NodeSelector *out = make(map[string]string, len(*in)) diff --git a/cmd/nvidia-dra-imex-controller/computedomain.go b/cmd/nvidia-dra-imex-controller/computedomain.go index b72521be..7fc3f8ea 100644 --- a/cmd/nvidia-dra-imex-controller/computedomain.go +++ b/cmd/nvidia-dra-imex-controller/computedomain.go @@ -180,9 +180,9 @@ func (m *ComputeDomainManager) onComputeDomainAdd(ctx context.Context, obj any) return fmt.Errorf("error creating DeviceClass: %w", err) } - if cd.Spec.ResourceClaimName != "" { - if _, err := m.resourceClaimManager.Create(ctx, cd.Namespace, cd.Spec.ResourceClaimName, dc.Name, cd); err != nil { - return fmt.Errorf("error creating ResourceClaim '%s/%s': %w", cd.Namespace, cd.Spec.ResourceClaimName, err) + for _, name := range cd.Spec.ResourceClaimNames { + if _, err := m.resourceClaimManager.Create(ctx, cd.Namespace, name, dc.Name, cd); err != nil { + return fmt.Errorf("error creating ResourceClaim '%s/%s': %w", cd.Namespace, name, err) } } @@ -205,8 +205,9 @@ func (m *ComputeDomainManager) onComputeDomainDelete(ctx context.Context, obj an return fmt.Errorf("error deleting DeviceClass: %w", err) } + // Delete takes only the ComputeDomain UID, so call it once rather than once per claim name. if err := m.resourceClaimManager.Delete(ctx, string(cd.UID)); err != nil { - return fmt.Errorf("error deleting ResourceClaim '%s/%s': %w", cd.Namespace, cd.Spec.ResourceClaimName, err) + return fmt.Errorf("error deleting ResourceClaims for ComputeDomain '%s/%s': %w", cd.Namespace, cd.Name, err) } return nil diff --git a/deployments/helm/k8s-dra-gpu-driver/crds/resource.nvidia.com_computedomains.yaml 
b/deployments/helm/k8s-dra-gpu-driver/crds/resource.nvidia.com_computedomains.yaml index c9259bc2..d04a7c59 100644 --- a/deployments/helm/k8s-dra-gpu-driver/crds/resource.nvidia.com_computedomains.yaml +++ b/deployments/helm/k8s-dra-gpu-driver/crds/resource.nvidia.com_computedomains.yaml @@ -42,6 +42,12 @@ spec: properties: deviceClassName: type: string + mode: + default: Immediate + enum: + - Immediate + - Delayed + type: string nodeAffinity: properties: preferred: @@ -236,8 +242,10 @@ spec: type: object numNodes: type: integer - resourceClaimName: - type: string + resourceClaimNames: + items: + type: string + type: array topologyAlignment: properties: preferred: @@ -299,12 +307,26 @@ spec: - message: At least one of 'preferred' or 'required' must be set. rule: has(self.preferred) || has(self.required) required: + - mode - numNodes type: object x-kubernetes-validations: - - message: Exactly one of 'resourceClaimName' or 'deviceClassName' must + - message: At least one name must be specified in 'resourceClaimNames' + if 'deviceClassName' is not specified. + rule: has(self.deviceClassName) || size(self.resourceClaimNames) > 0 + - message: '''Delayed'' mode is not yet supported.' + rule: self.mode != 'Delayed' + - message: When 'mode' is 'Delayed', 'resourceClaimNames' must have exactly + one entry. + rule: self.mode == 'Immediate' || (self.mode == 'Delayed' && size(self.resourceClaimNames) + == 1) + - message: When 'mode' is 'Delayed', 'NodeSelector' must not be set. + rule: self.mode == 'Immediate' || (self.mode == 'Delayed' && !has(self.nodeSelector)) + - message: When 'mode' is 'Delayed', 'TopologyAlignment' must not be set. + rule: self.mode == 'Immediate' || (self.mode == 'Delayed' && !has(self.topologyAlignment)) + - message: When 'mode' is 'Delayed', 'TopologyAntiAlignment' must not be set. - rule: '(has(self.resourceClaimName) ? 
!has(self.deviceClassName) : has(self.deviceClassName))' + rule: self.mode == 'Immediate' || (self.mode == 'Delayed' && !has(self.topologyAntiAlignment)) status: description: ComputeDomainStatus provides the status for a ComputeDomain. properties: diff --git a/deployments/helm/k8s-dra-imex-driver/crds/resource.nvidia.com_computedomains.yaml b/deployments/helm/k8s-dra-imex-driver/crds/resource.nvidia.com_computedomains.yaml index c9259bc2..d04a7c59 100644 --- a/deployments/helm/k8s-dra-imex-driver/crds/resource.nvidia.com_computedomains.yaml +++ b/deployments/helm/k8s-dra-imex-driver/crds/resource.nvidia.com_computedomains.yaml @@ -42,6 +42,12 @@ spec: properties: deviceClassName: type: string + mode: + default: Immediate + enum: + - Immediate + - Delayed + type: string nodeAffinity: properties: preferred: @@ -236,8 +242,10 @@ spec: type: object numNodes: type: integer - resourceClaimName: - type: string + resourceClaimNames: + items: + type: string + type: array topologyAlignment: properties: preferred: @@ -299,12 +307,26 @@ spec: - message: At least one of 'preferred' or 'required' must be set. rule: has(self.preferred) || has(self.required) required: + - mode - numNodes type: object x-kubernetes-validations: - - message: Exactly one of 'resourceClaimName' or 'deviceClassName' must + - message: At least one name must be specified in 'resourceClaimNames' + if 'deviceClassName' is not specified. + rule: has(self.deviceClassName) || size(self.resourceClaimNames) > 0 + - message: '''Delayed'' mode is not yet supported.' + rule: self.mode != 'Delayed' + - message: When 'mode' is 'Delayed', 'resourceClaimNames' must have exactly + one entry. + rule: self.mode == 'Immediate' || (self.mode == 'Delayed' && size(self.resourceClaimNames) + == 1) + - message: When 'mode' is 'Delayed', 'NodeSelector' must not be set. 
+ rule: self.mode == 'Immediate' || (self.mode == 'Delayed' && !has(self.nodeSelector)) + - message: When 'mode' is 'Delayed', 'TopologyAlignment' must not be set. + rule: self.mode == 'Immediate' || (self.mode == 'Delayed' && !has(self.topologyAlignment)) + - message: When 'mode' is 'Delayed', 'TopologyAntiAlignment' must not be set. - rule: '(has(self.resourceClaimName) ? !has(self.deviceClassName) : has(self.deviceClassName))' + rule: self.mode == 'Immediate' || (self.mode == 'Delayed' && !has(self.topologyAntiAlignment)) status: description: ComputeDomainStatus provides the status for a ComputeDomain. properties: