diff --git a/api/nvidia.com/resource/gpu/nas/v1alpha1/api.go b/api/nvidia.com/resource/gpu/nas/v1alpha1/api.go index 703020ac..94e72469 100644 --- a/api/nvidia.com/resource/gpu/nas/v1alpha1/api.go +++ b/api/nvidia.com/resource/gpu/nas/v1alpha1/api.go @@ -24,10 +24,6 @@ const ( GroupName = "nas.gpu.resource.nvidia.com" Version = "v1alpha1" - GpuDeviceType = "gpu" - MigDeviceType = "mig" - UnknownDeviceType = "unknown" - NodeAllocationStateStatusReady = "Ready" NodeAllocationStateStatusNotReady = "NotReady" ) diff --git a/api/nvidia.com/resource/gpu/nas/v1alpha1/nas.go b/api/nvidia.com/resource/gpu/nas/v1alpha1/nas.go index 73aa0ab6..64b206a7 100644 --- a/api/nvidia.com/resource/gpu/nas/v1alpha1/nas.go +++ b/api/nvidia.com/resource/gpu/nas/v1alpha1/nas.go @@ -17,16 +17,12 @@ package v1alpha1 import ( + "github.com/NVIDIA/k8s-dra-driver/api/utils/sharing" + "github.com/NVIDIA/k8s-dra-driver/api/utils/types" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) -// ClaimInfo holds the identifying information about a claim. -type ClaimInfo struct { - Namespace string `json:"namespace"` - Name string `json:"name"` - UID string `json:"uid"` -} - // MigDevicePlacement represents the placement of a MIG device within a GPU. type MigDevicePlacement struct { Start int `json:"start"` @@ -43,6 +39,8 @@ type AllocatableGpu struct { Brand string `json:"brand"` Architecture string `json:"architecture"` CUDAComputeCapability string `json:"cudaComputeCapability"` + DriverVersion string `json:"driverVersion"` + CUDADriverVersion string `json:"cudaDriverVersion"` } // AllocatableMigDevice represents an allocatable MIG device (and its possible placements) on a given type of GPU. @@ -61,12 +59,12 @@ type AllocatableDevice struct { // Type returns the type of AllocatableDevice this represents. 
func (d AllocatableDevice) Type() string { if d.Gpu != nil { - return GpuDeviceType + return types.GpuDeviceType } if d.Mig != nil { - return MigDeviceType + return types.MigDeviceType } - return UnknownDeviceType + return types.UnknownDeviceType } // AllocatedGpu represents an allocated GPU. @@ -83,19 +81,19 @@ type AllocatedMigDevice struct { // AllocatedGpus represents a set of allocated GPUs. type AllocatedGpus struct { - Devices []AllocatedGpu `json:"devices"` - Sharing *GpuSharing `json:"sharing,omitempty"` + Devices []AllocatedGpu `json:"devices"` + Sharing *sharing.GpuSharing `json:"sharing,omitempty"` } // AllocatedMigDevices represents a set of allocated MIG devices. type AllocatedMigDevices struct { - Devices []AllocatedMigDevice `json:"devices"` - Sharing *MigDeviceSharing `json:"sharing,omitempty"` + Devices []AllocatedMigDevice `json:"devices"` + Sharing *sharing.MigDeviceSharing `json:"sharing,omitempty"` } // AllocatedDevices represents a set of allocated devices. type AllocatedDevices struct { - ClaimInfo *ClaimInfo `json:"claimInfo"` + ClaimInfo *types.ClaimInfo `json:"claimInfo"` Gpu *AllocatedGpus `json:"gpu,omitempty"` Mig *AllocatedMigDevices `json:"mig,omitempty"` } @@ -103,12 +101,12 @@ type AllocatedDevices struct { // Type returns the type of AllocatedDevices this represents. func (r AllocatedDevices) Type() string { if r.Gpu != nil { - return GpuDeviceType + return types.GpuDeviceType } if r.Mig != nil { - return MigDeviceType + return types.MigDeviceType } - return UnknownDeviceType + return types.UnknownDeviceType } // PreparedGpu represents a prepared GPU on a node. @@ -143,12 +141,12 @@ type PreparedDevices struct { // Type returns the type of PreparedDevices this represents. 
func (d PreparedDevices) Type() string { if d.Gpu != nil { - return GpuDeviceType + return types.GpuDeviceType } if d.Mig != nil { - return MigDeviceType + return types.MigDeviceType } - return UnknownDeviceType + return types.UnknownDeviceType } // NodeAllocationStateSpec is the spec for the NodeAllocationState CRD. diff --git a/api/nvidia.com/resource/gpu/nas/v1alpha1/zz_generated.deepcopy.go b/api/nvidia.com/resource/gpu/nas/v1alpha1/zz_generated.deepcopy.go index 7e28683d..ec4e9257 100644 --- a/api/nvidia.com/resource/gpu/nas/v1alpha1/zz_generated.deepcopy.go +++ b/api/nvidia.com/resource/gpu/nas/v1alpha1/zz_generated.deepcopy.go @@ -22,6 +22,8 @@ package v1alpha1 import ( + "github.com/NVIDIA/k8s-dra-driver/api/utils/sharing" + "github.com/NVIDIA/k8s-dra-driver/api/utils/types" "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" ) @@ -91,7 +93,7 @@ func (in *AllocatedDevices) DeepCopyInto(out *AllocatedDevices) { *out = *in if in.ClaimInfo != nil { in, out := &in.ClaimInfo, &out.ClaimInfo - *out = new(ClaimInfo) + *out = new(types.ClaimInfo) **out = **in } if in.Gpu != nil { @@ -141,7 +143,7 @@ func (in *AllocatedGpus) DeepCopyInto(out *AllocatedGpus) { } if in.Sharing != nil { in, out := &in.Sharing, &out.Sharing - *out = new(GpuSharing) + *out = new(sharing.GpuSharing) (*in).DeepCopyInto(*out) } } @@ -182,7 +184,7 @@ func (in *AllocatedMigDevices) DeepCopyInto(out *AllocatedMigDevices) { } if in.Sharing != nil { in, out := &in.Sharing, &out.Sharing - *out = new(MigDeviceSharing) + *out = new(sharing.MigDeviceSharing) (*in).DeepCopyInto(*out) } } @@ -197,46 +199,6 @@ func (in *AllocatedMigDevices) DeepCopy() *AllocatedMigDevices { return out } -// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. -func (in *ClaimInfo) DeepCopyInto(out *ClaimInfo) { - *out = *in -} - -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ClaimInfo. 
-func (in *ClaimInfo) DeepCopy() *ClaimInfo { - if in == nil { - return nil - } - out := new(ClaimInfo) - in.DeepCopyInto(out) - return out -} - -// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. -func (in *GpuSharing) DeepCopyInto(out *GpuSharing) { - *out = *in - if in.TimeSlicingConfig != nil { - in, out := &in.TimeSlicingConfig, &out.TimeSlicingConfig - *out = new(TimeSlicingConfig) - (*in).DeepCopyInto(*out) - } - if in.MpsConfig != nil { - in, out := &in.MpsConfig, &out.MpsConfig - *out = new(MpsConfig) - (*in).DeepCopyInto(*out) - } -} - -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new GpuSharing. -func (in *GpuSharing) DeepCopy() *GpuSharing { - if in == nil { - return nil - } - out := new(GpuSharing) - in.DeepCopyInto(out) - return out -} - // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *MigDevicePlacement) DeepCopyInto(out *MigDevicePlacement) { *out = *in @@ -252,79 +214,6 @@ func (in *MigDevicePlacement) DeepCopy() *MigDevicePlacement { return out } -// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. -func (in *MigDeviceSharing) DeepCopyInto(out *MigDeviceSharing) { - *out = *in - if in.MpsConfig != nil { - in, out := &in.MpsConfig, &out.MpsConfig - *out = new(MpsConfig) - (*in).DeepCopyInto(*out) - } -} - -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new MigDeviceSharing. -func (in *MigDeviceSharing) DeepCopy() *MigDeviceSharing { - if in == nil { - return nil - } - out := new(MigDeviceSharing) - in.DeepCopyInto(out) - return out -} - -// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
-func (in *MpsConfig) DeepCopyInto(out *MpsConfig) { - *out = *in - if in.DefaultActiveThreadPercentage != nil { - in, out := &in.DefaultActiveThreadPercentage, &out.DefaultActiveThreadPercentage - *out = new(int) - **out = **in - } - if in.DefaultPinnedDeviceMemoryLimit != nil { - in, out := &in.DefaultPinnedDeviceMemoryLimit, &out.DefaultPinnedDeviceMemoryLimit - x := (*in).DeepCopy() - *out = &x - } - if in.DefaultPerDevicePinnedMemoryLimit != nil { - in, out := &in.DefaultPerDevicePinnedMemoryLimit, &out.DefaultPerDevicePinnedMemoryLimit - *out = make(MpsPerDevicePinnedMemoryLimit, len(*in)) - for key, val := range *in { - (*out)[key] = val.DeepCopy() - } - } -} - -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new MpsConfig. -func (in *MpsConfig) DeepCopy() *MpsConfig { - if in == nil { - return nil - } - out := new(MpsConfig) - in.DeepCopyInto(out) - return out -} - -// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. -func (in MpsPerDevicePinnedMemoryLimit) DeepCopyInto(out *MpsPerDevicePinnedMemoryLimit) { - { - in := &in - *out = make(MpsPerDevicePinnedMemoryLimit, len(*in)) - for key, val := range *in { - (*out)[key] = val.DeepCopy() - } - } -} - -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new MpsPerDevicePinnedMemoryLimit. -func (in MpsPerDevicePinnedMemoryLimit) DeepCopy() MpsPerDevicePinnedMemoryLimit { - if in == nil { - return nil - } - out := new(MpsPerDevicePinnedMemoryLimit) - in.DeepCopyInto(out) - return *out -} - // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
func (in *NodeAllocationState) DeepCopyInto(out *NodeAllocationState) { *out = *in @@ -534,23 +423,3 @@ func (in *PreparedMigDevices) DeepCopy() *PreparedMigDevices { in.DeepCopyInto(out) return out } - -// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. -func (in *TimeSlicingConfig) DeepCopyInto(out *TimeSlicingConfig) { - *out = *in - if in.TimeSlice != nil { - in, out := &in.TimeSlice, &out.TimeSlice - *out = new(TimeSliceDuration) - **out = **in - } -} - -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TimeSlicingConfig. -func (in *TimeSlicingConfig) DeepCopy() *TimeSlicingConfig { - if in == nil { - return nil - } - out := new(TimeSlicingConfig) - in.DeepCopyInto(out) - return out -} diff --git a/api/nvidia.com/resource/gpu/v1alpha1/gpuclaim.go b/api/nvidia.com/resource/gpu/v1alpha1/gpuclaim.go index 82f962e2..cd7e3065 100644 --- a/api/nvidia.com/resource/gpu/v1alpha1/gpuclaim.go +++ b/api/nvidia.com/resource/gpu/v1alpha1/gpuclaim.go @@ -19,14 +19,14 @@ package v1alpha1 import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - nascrd "github.com/NVIDIA/k8s-dra-driver/api/nvidia.com/resource/gpu/nas/v1alpha1" + "github.com/NVIDIA/k8s-dra-driver/api/utils/sharing" ) // GpuClaimParametersSpec is the spec for the GpuClaimParameters CRD. 
type GpuClaimParametersSpec struct { - Count *int `json:"count,omitempty"` - Selector *GpuSelector `json:"selector,omitempty"` - Sharing *nascrd.GpuSharing `json:"sharing,omitempty"` + Count *int `json:"count,omitempty"` + Selector *GpuSelector `json:"selector,omitempty"` + Sharing *sharing.GpuSharing `json:"sharing,omitempty"` } // +genclient diff --git a/api/nvidia.com/resource/gpu/v1alpha1/gpuselector.go b/api/nvidia.com/resource/gpu/v1alpha1/gpuselector.go index 0f70a23d..bf705977 100644 --- a/api/nvidia.com/resource/gpu/v1alpha1/gpuselector.go +++ b/api/nvidia.com/resource/gpu/v1alpha1/gpuselector.go @@ -69,7 +69,7 @@ type GpuSelectorProperties struct { Architecture *selector.GlobProperty `json:"architecture,omitempty"` CUDAComputeCapability *selector.VersionComparator `json:"cudaComputeCapability,omitempty"` DriverVersion *selector.VersionComparator `json:"driverVersion,omitempty"` - CUDARuntimeVersion *selector.VersionComparator `json:"cudaRuntimeVersion,omitempty"` + CUDADriverVersion *selector.VersionComparator `json:"cudaDriverVersion,omitempty"` } // Matches evaluates a GpuSelector to see if it matches the boolean expression it represents diff --git a/api/nvidia.com/resource/gpu/v1alpha1/migclaim.go b/api/nvidia.com/resource/gpu/v1alpha1/migclaim.go index b529a587..02dd4de4 100644 --- a/api/nvidia.com/resource/gpu/v1alpha1/migclaim.go +++ b/api/nvidia.com/resource/gpu/v1alpha1/migclaim.go @@ -19,14 +19,14 @@ package v1alpha1 import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - nascrd "github.com/NVIDIA/k8s-dra-driver/api/nvidia.com/resource/gpu/nas/v1alpha1" + "github.com/NVIDIA/k8s-dra-driver/api/utils/sharing" ) // MigDeviceClaimParametersSpec is the spec for the MigDeviceClaimParameters CRD. 
type MigDeviceClaimParametersSpec struct { - Profile string `json:"profile,omitempty"` - Sharing *nascrd.MigDeviceSharing `json:"sharing,omitempty"` - GpuClaimParametersName string `json:"gpuClaimName,omitempty"` + Profile string `json:"profile,omitempty"` + Sharing *sharing.MigDeviceSharing `json:"sharing,omitempty"` + GpuClaimParametersName string `json:"gpuClaimName,omitempty"` } // +genclient diff --git a/api/nvidia.com/resource/gpu/v1alpha1/zz_generated.deepcopy.go b/api/nvidia.com/resource/gpu/v1alpha1/zz_generated.deepcopy.go index 75d3b27f..d90bf6ac 100644 --- a/api/nvidia.com/resource/gpu/v1alpha1/zz_generated.deepcopy.go +++ b/api/nvidia.com/resource/gpu/v1alpha1/zz_generated.deepcopy.go @@ -22,8 +22,8 @@ package v1alpha1 import ( - nasv1alpha1 "github.com/NVIDIA/k8s-dra-driver/api/nvidia.com/resource/gpu/nas/v1alpha1" "github.com/NVIDIA/k8s-dra-driver/api/utils/selector" + "github.com/NVIDIA/k8s-dra-driver/api/utils/sharing" "k8s.io/apimachinery/pkg/runtime" ) @@ -251,7 +251,7 @@ func (in *GpuClaimParametersSpec) DeepCopyInto(out *GpuClaimParametersSpec) { } if in.Sharing != nil { in, out := &in.Sharing, &out.Sharing - *out = new(nasv1alpha1.GpuSharing) + *out = new(sharing.GpuSharing) (*in).DeepCopyInto(*out) } } @@ -436,8 +436,8 @@ func (in *GpuSelectorProperties) DeepCopyInto(out *GpuSelectorProperties) { *out = new(selector.VersionComparator) **out = **in } - if in.CUDARuntimeVersion != nil { - in, out := &in.CUDARuntimeVersion, &out.CUDARuntimeVersion + if in.CUDADriverVersion != nil { + in, out := &in.CUDADriverVersion, &out.CUDADriverVersion *out = new(selector.VersionComparator) **out = **in } @@ -516,7 +516,7 @@ func (in *MigDeviceClaimParametersSpec) DeepCopyInto(out *MigDeviceClaimParamete *out = *in if in.Sharing != nil { in, out := &in.Sharing, &out.Sharing - *out = new(nasv1alpha1.MigDeviceSharing) + *out = new(sharing.MigDeviceSharing) (*in).DeepCopyInto(*out) } } diff --git a/api/utils/sharing/doc.go b/api/utils/sharing/doc.go new file 
mode 100644 index 00000000..239ba5f1 --- /dev/null +++ b/api/utils/sharing/doc.go @@ -0,0 +1,19 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// +k8s:deepcopy-gen=package + +package sharing diff --git a/api/nvidia.com/resource/gpu/nas/v1alpha1/sharing.go b/api/utils/sharing/sharing.go similarity index 99% rename from api/nvidia.com/resource/gpu/nas/v1alpha1/sharing.go rename to api/utils/sharing/sharing.go index e052f411..37fc2380 100644 --- a/api/nvidia.com/resource/gpu/nas/v1alpha1/sharing.go +++ b/api/utils/sharing/sharing.go @@ -14,7 +14,7 @@ * limitations under the License. */ -package v1alpha1 +package sharing import ( "errors" @@ -40,7 +40,7 @@ const ( // Sharing provides methods to check if a given sharing strategy is selected and grab its configuration. // +k8s:deepcopy-gen=false -type Sharing interface { +type Interface interface { IsTimeSlicing() bool IsMps() bool GetTimeSlicingConfig() (*TimeSlicingConfig, error) diff --git a/api/nvidia.com/resource/gpu/nas/v1alpha1/sharing_test.go b/api/utils/sharing/sharing_test.go similarity index 80% rename from api/nvidia.com/resource/gpu/nas/v1alpha1/sharing_test.go rename to api/utils/sharing/sharing_test.go index 5722f172..44b8dfc9 100644 --- a/api/nvidia.com/resource/gpu/nas/v1alpha1/sharing_test.go +++ b/api/utils/sharing/sharing_test.go @@ -14,7 +14,7 @@ # limitations under the License. 
**/ -package v1alpha1_test +package sharing_test import ( "testing" @@ -22,7 +22,7 @@ import ( "github.com/stretchr/testify/require" "k8s.io/apimachinery/pkg/api/resource" - "github.com/NVIDIA/k8s-dra-driver/api/nvidia.com/resource/gpu/nas/v1alpha1" + "github.com/NVIDIA/k8s-dra-driver/api/utils/sharing" ) func TestMpsPerDevicePinnedMemoryLimitNormalize(t *testing.T) { @@ -30,7 +30,7 @@ func TestMpsPerDevicePinnedMemoryLimitNormalize(t *testing.T) { description string uuids []string memoryLimit *resource.Quantity - perDeviceMemoryLimit v1alpha1.MpsPerDevicePinnedMemoryLimit + perDeviceMemoryLimit sharing.MpsPerDevicePinnedMemoryLimit expectedError error expectedLimits map[string]string }{ @@ -40,18 +40,18 @@ func TestMpsPerDevicePinnedMemoryLimitNormalize(t *testing.T) { }, { description: "no uuids, invalid device index", - perDeviceMemoryLimit: v1alpha1.MpsPerDevicePinnedMemoryLimit{ + perDeviceMemoryLimit: sharing.MpsPerDevicePinnedMemoryLimit{ "0": resource.MustParse("1Gi"), }, - expectedError: v1alpha1.ErrInvalidDeviceSelector, + expectedError: sharing.ErrInvalidDeviceSelector, }, { description: "no uuids, default is overridden", memoryLimit: ptr(resource.MustParse("2Gi")), - perDeviceMemoryLimit: v1alpha1.MpsPerDevicePinnedMemoryLimit{ + perDeviceMemoryLimit: sharing.MpsPerDevicePinnedMemoryLimit{ "0": resource.MustParse("1Gi"), }, - expectedError: v1alpha1.ErrInvalidDeviceSelector, + expectedError: sharing.ErrInvalidDeviceSelector, }, { description: "uuids, default is set", @@ -65,21 +65,21 @@ func TestMpsPerDevicePinnedMemoryLimitNormalize(t *testing.T) { description: "uuids, default is too low", uuids: []string{"UUID0"}, memoryLimit: ptr(resource.MustParse("1M")), - expectedError: v1alpha1.ErrInvalidLimit, + expectedError: sharing.ErrInvalidLimit, }, { description: "uuids, override is too low", uuids: []string{"UUID0"}, - perDeviceMemoryLimit: v1alpha1.MpsPerDevicePinnedMemoryLimit{ + perDeviceMemoryLimit: sharing.MpsPerDevicePinnedMemoryLimit{ "UUID0": 
resource.MustParse("1M"), }, - expectedError: v1alpha1.ErrInvalidLimit, + expectedError: sharing.ErrInvalidLimit, }, { description: "uuids, default is overridden", uuids: []string{"UUID0"}, memoryLimit: ptr(resource.MustParse("2Gi")), - perDeviceMemoryLimit: v1alpha1.MpsPerDevicePinnedMemoryLimit{ + perDeviceMemoryLimit: sharing.MpsPerDevicePinnedMemoryLimit{ "0": resource.MustParse("1Gi"), }, expectedLimits: map[string]string{ @@ -90,7 +90,7 @@ func TestMpsPerDevicePinnedMemoryLimitNormalize(t *testing.T) { description: "uuids, default is overridden by uuid", uuids: []string{"UUID0"}, memoryLimit: ptr(resource.MustParse("2Gi")), - perDeviceMemoryLimit: v1alpha1.MpsPerDevicePinnedMemoryLimit{ + perDeviceMemoryLimit: sharing.MpsPerDevicePinnedMemoryLimit{ "UUID0": resource.MustParse("1Gi"), }, expectedLimits: map[string]string{ @@ -101,19 +101,19 @@ func TestMpsPerDevicePinnedMemoryLimitNormalize(t *testing.T) { description: "uuids, default is overridden, invalid UUID", uuids: []string{"UUID0"}, memoryLimit: ptr(resource.MustParse("2Gi")), - perDeviceMemoryLimit: v1alpha1.MpsPerDevicePinnedMemoryLimit{ + perDeviceMemoryLimit: sharing.MpsPerDevicePinnedMemoryLimit{ "UUID1": resource.MustParse("1Gi"), }, - expectedError: v1alpha1.ErrInvalidDeviceSelector, + expectedError: sharing.ErrInvalidDeviceSelector, }, { description: "uuids, default is overridden, invalid index", uuids: []string{"UUID0"}, memoryLimit: ptr(resource.MustParse("2Gi")), - perDeviceMemoryLimit: v1alpha1.MpsPerDevicePinnedMemoryLimit{ + perDeviceMemoryLimit: sharing.MpsPerDevicePinnedMemoryLimit{ "1": resource.MustParse("1Gi"), }, - expectedError: v1alpha1.ErrInvalidDeviceSelector, + expectedError: sharing.ErrInvalidDeviceSelector, }, { description: "unit conversion Mi to M", diff --git a/api/utils/sharing/zz_generated.deepcopy.go b/api/utils/sharing/zz_generated.deepcopy.go new file mode 100644 index 00000000..5a813cd7 --- /dev/null +++ b/api/utils/sharing/zz_generated.deepcopy.go @@ -0,0 +1,139 @@ 
+//go:build !ignore_autogenerated + +/* + * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Code generated by controller-gen. DO NOT EDIT. + +package sharing + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *GpuSharing) DeepCopyInto(out *GpuSharing) { + *out = *in + if in.TimeSlicingConfig != nil { + in, out := &in.TimeSlicingConfig, &out.TimeSlicingConfig + *out = new(TimeSlicingConfig) + (*in).DeepCopyInto(*out) + } + if in.MpsConfig != nil { + in, out := &in.MpsConfig, &out.MpsConfig + *out = new(MpsConfig) + (*in).DeepCopyInto(*out) + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new GpuSharing. +func (in *GpuSharing) DeepCopy() *GpuSharing { + if in == nil { + return nil + } + out := new(GpuSharing) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *MigDeviceSharing) DeepCopyInto(out *MigDeviceSharing) { + *out = *in + if in.MpsConfig != nil { + in, out := &in.MpsConfig, &out.MpsConfig + *out = new(MpsConfig) + (*in).DeepCopyInto(*out) + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new MigDeviceSharing. 
+func (in *MigDeviceSharing) DeepCopy() *MigDeviceSharing { + if in == nil { + return nil + } + out := new(MigDeviceSharing) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *MpsConfig) DeepCopyInto(out *MpsConfig) { + *out = *in + if in.DefaultActiveThreadPercentage != nil { + in, out := &in.DefaultActiveThreadPercentage, &out.DefaultActiveThreadPercentage + *out = new(int) + **out = **in + } + if in.DefaultPinnedDeviceMemoryLimit != nil { + in, out := &in.DefaultPinnedDeviceMemoryLimit, &out.DefaultPinnedDeviceMemoryLimit + x := (*in).DeepCopy() + *out = &x + } + if in.DefaultPerDevicePinnedMemoryLimit != nil { + in, out := &in.DefaultPerDevicePinnedMemoryLimit, &out.DefaultPerDevicePinnedMemoryLimit + *out = make(MpsPerDevicePinnedMemoryLimit, len(*in)) + for key, val := range *in { + (*out)[key] = val.DeepCopy() + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new MpsConfig. +func (in *MpsConfig) DeepCopy() *MpsConfig { + if in == nil { + return nil + } + out := new(MpsConfig) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in MpsPerDevicePinnedMemoryLimit) DeepCopyInto(out *MpsPerDevicePinnedMemoryLimit) { + { + in := &in + *out = make(MpsPerDevicePinnedMemoryLimit, len(*in)) + for key, val := range *in { + (*out)[key] = val.DeepCopy() + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new MpsPerDevicePinnedMemoryLimit. +func (in MpsPerDevicePinnedMemoryLimit) DeepCopy() MpsPerDevicePinnedMemoryLimit { + if in == nil { + return nil + } + out := new(MpsPerDevicePinnedMemoryLimit) + in.DeepCopyInto(out) + return *out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
+func (in *TimeSlicingConfig) DeepCopyInto(out *TimeSlicingConfig) { + *out = *in + if in.TimeSlice != nil { + in, out := &in.TimeSlice, &out.TimeSlice + *out = new(TimeSliceDuration) + **out = **in + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TimeSlicingConfig. +func (in *TimeSlicingConfig) DeepCopy() *TimeSlicingConfig { + if in == nil { + return nil + } + out := new(TimeSlicingConfig) + in.DeepCopyInto(out) + return out +} diff --git a/api/utils/types/types.go b/api/utils/types/types.go new file mode 100644 index 00000000..a52bd94e --- /dev/null +++ b/api/utils/types/types.go @@ -0,0 +1,30 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package types + +const ( + GpuDeviceType = "gpu" + MigDeviceType = "mig" + UnknownDeviceType = "unknown" +) + +// ClaimInfo holds the identifying information about a claim. 
+type ClaimInfo struct { + Namespace string `json:"namespace"` + Name string `json:"name"` + UID string `json:"uid"` +} diff --git a/cmd/nvidia-dra-controller/allocations_test.go b/cmd/nvidia-dra-controller/allocations_test.go index e18a1991..d830547a 100644 --- a/cmd/nvidia-dra-controller/allocations_test.go +++ b/cmd/nvidia-dra-controller/allocations_test.go @@ -22,6 +22,7 @@ import ( "github.com/stretchr/testify/assert" nascrd "github.com/NVIDIA/k8s-dra-driver/api/nvidia.com/resource/gpu/nas/v1alpha1" + "github.com/NVIDIA/k8s-dra-driver/api/utils/types" ) func Test_PerNodeAllocatedClaims(t *testing.T) { @@ -34,8 +35,8 @@ func Test_PerNodeAllocatedClaims(t *testing.T) { assert.Equal(t, false, exists) // Test Set() - device1 := nascrd.AllocatedDevices{ClaimInfo: &nascrd.ClaimInfo{Namespace: "default", Name: "device1"}} - device2 := nascrd.AllocatedDevices{ClaimInfo: &nascrd.ClaimInfo{Namespace: "default", Name: "device2"}} + device1 := nascrd.AllocatedDevices{ClaimInfo: &types.ClaimInfo{Namespace: "default", Name: "device1"}} + device2 := nascrd.AllocatedDevices{ClaimInfo: &types.ClaimInfo{Namespace: "default", Name: "device2"}} allocationClaims.Set("fake-claim", "fake-node", device1) allocationClaims.Set("fake-claim", "fake-node", device2) diff --git a/cmd/nvidia-dra-controller/driver.go b/cmd/nvidia-dra-controller/driver.go index 507eacff..19d2e921 100644 --- a/cmd/nvidia-dra-controller/driver.go +++ b/cmd/nvidia-dra-controller/driver.go @@ -28,6 +28,7 @@ import ( nascrd "github.com/NVIDIA/k8s-dra-driver/api/nvidia.com/resource/gpu/nas/v1alpha1" nasclient "github.com/NVIDIA/k8s-dra-driver/api/nvidia.com/resource/gpu/nas/v1alpha1/client" gpucrd "github.com/NVIDIA/k8s-dra-driver/api/nvidia.com/resource/gpu/v1alpha1" + "github.com/NVIDIA/k8s-dra-driver/api/utils/types" nvclientset "github.com/NVIDIA/k8s-dra-driver/pkg/nvidia.com/resource/clientset/versioned" ) @@ -166,7 +167,7 @@ func (d driver) allocate(ctx context.Context, claim *resourcev1.ResourceClaim, c } 
allocated := crd.Spec.AllocatedClaims[string(claim.UID)] - allocated.ClaimInfo = &nascrd.ClaimInfo{ + allocated.ClaimInfo = &types.ClaimInfo{ Namespace: claim.Namespace, Name: claim.Name, UID: string(claim.UID), @@ -214,9 +215,9 @@ func (d driver) Deallocate(ctx context.Context, claim *resourcev1.ResourceClaim) devices := crd.Spec.AllocatedClaims[string(claim.UID)] switch devices.Type() { - case nascrd.GpuDeviceType: + case types.GpuDeviceType: err = d.gpu.Deallocate(crd, claim) - case nascrd.MigDeviceType: + case types.MigDeviceType: err = d.mig.Deallocate(crd, claim) default: err = fmt.Errorf("unknown AllocatedDevices.Type(): %v", devices.Type()) diff --git a/cmd/nvidia-dra-controller/gpu.go b/cmd/nvidia-dra-controller/gpu.go index 1295b3f0..710a78e7 100644 --- a/cmd/nvidia-dra-controller/gpu.go +++ b/cmd/nvidia-dra-controller/gpu.go @@ -26,6 +26,7 @@ import ( nascrd "github.com/NVIDIA/k8s-dra-driver/api/nvidia.com/resource/gpu/nas/v1alpha1" gpucrd "github.com/NVIDIA/k8s-dra-driver/api/nvidia.com/resource/gpu/v1alpha1" + "github.com/NVIDIA/k8s-dra-driver/api/utils/types" ) type gpudriver struct { @@ -115,7 +116,7 @@ func (g *gpudriver) allocate(crd *nascrd.NodeAllocationState, pod *corev1.Pod, g available := make(map[string]*nascrd.AllocatableGpu) for _, device := range crd.Spec.AllocatableDevices { - if device.Type() != nascrd.GpuDeviceType { + if device.Type() != types.GpuDeviceType { continue } available[device.Gpu.UUID] = device.Gpu @@ -123,11 +124,11 @@ func (g *gpudriver) allocate(crd *nascrd.NodeAllocationState, pod *corev1.Pod, g for _, allocated := range crd.Spec.AllocatedClaims { switch allocated.Type() { - case nascrd.GpuDeviceType: + case types.GpuDeviceType: for _, device := range allocated.Gpu.Devices { delete(available, device.UUID) } - case nascrd.MigDeviceType: + case types.MigDeviceType: for _, device := range allocated.Mig.Devices { delete(available, device.ParentUUID) } @@ -195,6 +196,12 @@ func selectorMatchesGpu(selector *gpucrd.GpuSelector, 
gpu *nascrd.AllocatableGpu if p.CUDAComputeCapability != nil { return p.CUDAComputeCapability.Matches(gpu.CUDAComputeCapability) } + if p.DriverVersion != nil { + return p.DriverVersion.Matches(gpu.DriverVersion) + } + if p.CUDADriverVersion != nil { + return p.CUDADriverVersion.Matches(gpu.CUDADriverVersion) + } return false }) if matches && !checkedMigEnabled { diff --git a/cmd/nvidia-dra-controller/mig.go b/cmd/nvidia-dra-controller/mig.go index 9184682b..a5c2a96e 100644 --- a/cmd/nvidia-dra-controller/mig.go +++ b/cmd/nvidia-dra-controller/mig.go @@ -25,6 +25,7 @@ import ( nascrd "github.com/NVIDIA/k8s-dra-driver/api/nvidia.com/resource/gpu/nas/v1alpha1" gpucrd "github.com/NVIDIA/k8s-dra-driver/api/nvidia.com/resource/gpu/v1alpha1" + "github.com/NVIDIA/k8s-dra-driver/api/utils/types" ) type migdriver struct { @@ -124,7 +125,7 @@ func (m *migdriver) available(crd *nascrd.NodeAllocationState) MigDevicePlacemen placements := make(MigDevicePlacements) for _, device := range crd.Spec.AllocatableDevices { - if device.Type() != nascrd.GpuDeviceType { + if device.Type() != types.GpuDeviceType { continue } if !device.Gpu.MigEnabled { @@ -134,7 +135,7 @@ func (m *migdriver) available(crd *nascrd.NodeAllocationState) MigDevicePlacemen } for _, device := range crd.Spec.AllocatableDevices { - if device.Type() != nascrd.MigDeviceType { + if device.Type() != types.MigDeviceType { continue } var pps []MigDevicePlacement @@ -153,7 +154,7 @@ func (m *migdriver) available(crd *nascrd.NodeAllocationState) MigDevicePlacemen } for _, allocated := range crd.Spec.AllocatedClaims { - if allocated.Type() != nascrd.MigDeviceType { + if allocated.Type() != types.MigDeviceType { continue } for _, device := range allocated.Mig.Devices { @@ -273,7 +274,7 @@ func (m *migdriver) gpuClaimInfo(crd *nascrd.NodeAllocationState, cas []*control allocated := crd.Spec.AllocatedClaims[claimUID] - if allocated.Type() != nascrd.GpuDeviceType { + if allocated.Type() != types.GpuDeviceType { continue } for 
_, device := range allocated.Gpu.Devices { diff --git a/cmd/nvidia-dra-plugin/cdi.go b/cmd/nvidia-dra-plugin/cdi.go index 997865fb..ccbc3d33 100644 --- a/cmd/nvidia-dra-plugin/cdi.go +++ b/cmd/nvidia-dra-plugin/cdi.go @@ -32,7 +32,7 @@ import ( cdiparser "tags.cncf.io/container-device-interface/pkg/parser" cdispec "tags.cncf.io/container-device-interface/specs-go" - nascrd "github.com/NVIDIA/k8s-dra-driver/api/nvidia.com/resource/gpu/nas/v1alpha1" + "github.com/NVIDIA/k8s-dra-driver/api/utils/types" ) const ( @@ -133,7 +133,7 @@ func (cdi *CDIHandler) CreateClaimSpecFile(claimUID string, devices *PreparedDev claimEdits := cdiapi.ContainerEdits{} switch devices.Type() { - case nascrd.GpuDeviceType: + case types.GpuDeviceType: for _, device := range devices.Gpu.Devices { nvmlDevice, ret := cdi.nvml.DeviceGetHandleByUUID(device.uuid) if ret != nvml.SUCCESS { @@ -149,7 +149,7 @@ func (cdi *CDIHandler) CreateClaimSpecFile(claimUID string, devices *PreparedDev } claimEdits.Append(gpuEdits) } - case nascrd.MigDeviceType: + case types.MigDeviceType: for _, device := range devices.Mig.Devices { nvmlParentDevice, ret := cdi.nvml.DeviceGetHandleByUUID(device.parent.uuid) if ret != nvml.SUCCESS { diff --git a/cmd/nvidia-dra-plugin/device_state.go b/cmd/nvidia-dra-plugin/device_state.go index 8e8ea2eb..ffe4c78f 100644 --- a/cmd/nvidia-dra-plugin/device_state.go +++ b/cmd/nvidia-dra-plugin/device_state.go @@ -25,6 +25,8 @@ import ( "k8s.io/klog/v2" nascrd "github.com/NVIDIA/k8s-dra-driver/api/nvidia.com/resource/gpu/nas/v1alpha1" + "github.com/NVIDIA/k8s-dra-driver/api/utils/sharing" + "github.com/NVIDIA/k8s-dra-driver/api/utils/types" ) type AllocatableDevices map[string]*AllocatableDeviceInfo @@ -40,6 +42,8 @@ type GpuInfo struct { brand string architecture string cudaComputeCapability string + driverVersion string + cudaDriverVersion string } type MigDeviceInfo struct { @@ -66,12 +70,12 @@ type PreparedDevices struct { func (d PreparedDevices) Type() string { if d.Gpu != nil 
{ - return nascrd.GpuDeviceType + return types.GpuDeviceType } if d.Mig != nil { - return nascrd.MigDeviceType + return types.MigDeviceType } - return nascrd.UnknownDeviceType + return types.UnknownDeviceType } func (d PreparedDevices) Len() int { @@ -87,11 +91,11 @@ func (d PreparedDevices) Len() int { func (d *PreparedDevices) UUIDs() []string { var deviceStrings []string switch d.Type() { - case nascrd.GpuDeviceType: + case types.GpuDeviceType: for _, device := range d.Gpu.Devices { deviceStrings = append(deviceStrings, device.uuid) } - case nascrd.MigDeviceType: + case types.MigDeviceType: for _, device := range d.Mig.Devices { deviceStrings = append(deviceStrings, device.uuid) } @@ -189,7 +193,7 @@ func (s *DeviceState) Prepare(ctx context.Context, claimUID string, allocated na var err error switch allocated.Type() { - case nascrd.GpuDeviceType: + case types.GpuDeviceType: prepared.Gpu, err = s.prepareGpus(claimUID, allocated.Gpu) if err != nil { return nil, fmt.Errorf("GPU allocation failed: %w", err) @@ -198,7 +202,7 @@ func (s *DeviceState) Prepare(ctx context.Context, claimUID string, allocated na if err != nil { return nil, fmt.Errorf("error setting up sharing: %w", err) } - case nascrd.MigDeviceType: + case types.MigDeviceType: prepared.Mig, err = s.prepareMigDevices(claimUID, allocated.Mig) if err != nil { return nil, fmt.Errorf("MIG device allocation failed: %w", err) @@ -235,12 +239,12 @@ func (s *DeviceState) Unprepare(ctx context.Context, claimUID string) error { } switch s.prepared[claimUID].Type() { - case nascrd.GpuDeviceType: + case types.GpuDeviceType: err := s.unprepareGpus(claimUID, s.prepared[claimUID]) if err != nil { return fmt.Errorf("unprepare failed: %w", err) } - case nascrd.MigDeviceType: + case types.MigDeviceType: err := s.unprepareMigDevices(claimUID, s.prepared[claimUID]) if err != nil { return fmt.Errorf("unprepare failed: %w", err) @@ -335,7 +339,7 @@ func (s *DeviceState) unprepareMigDevices(claimUID string, devices 
*PreparedDevi return nil } -func (s *DeviceState) setupSharing(ctx context.Context, sharing nascrd.Sharing, claim *nascrd.ClaimInfo, devices *PreparedDevices) error { +func (s *DeviceState) setupSharing(ctx context.Context, sharing sharing.Interface, claim *types.ClaimInfo, devices *PreparedDevices) error { if sharing.IsTimeSlicing() { config, err := sharing.GetTimeSlicingConfig() if err != nil { @@ -381,6 +385,8 @@ func (s *DeviceState) syncAllocatableDevicesToCRDSpec(spec *nascrd.NodeAllocatio Brand: device.brand, Architecture: device.architecture, CUDAComputeCapability: device.cudaComputeCapability, + DriverVersion: device.driverVersion, + CUDADriverVersion: device.cudaDriverVersion, }, } @@ -453,7 +459,7 @@ func (s *DeviceState) syncPreparedDevicesFromCRDSpec(ctx context.Context, spec * allocated := spec.AllocatedClaims[claim] prepared[claim] = &PreparedDevices{} switch devices.Type() { - case nascrd.GpuDeviceType: + case types.GpuDeviceType: prepared[claim].Gpu = &PreparedGpus{} for _, d := range devices.Gpu.Devices { prepared[claim].Gpu.Devices = append(prepared[claim].Gpu.Devices, gpus[d.UUID].GpuInfo) @@ -462,7 +468,7 @@ func (s *DeviceState) syncPreparedDevicesFromCRDSpec(ctx context.Context, spec * if err != nil { return fmt.Errorf("error setting up sharing: %w", err) } - case nascrd.MigDeviceType: + case types.MigDeviceType: prepared[claim].Mig = &PreparedMigDevices{} for _, d := range devices.Mig.Devices { migInfo := migs[d.ParentUUID][d.UUID] @@ -507,7 +513,7 @@ func (s *DeviceState) syncPreparedDevicesToCRDSpec(spec *nascrd.NodeAllocationSt for claim, devices := range s.prepared { var prepared nascrd.PreparedDevices switch devices.Type() { - case nascrd.GpuDeviceType: + case types.GpuDeviceType: prepared.Gpu = &nascrd.PreparedGpus{} for _, device := range devices.Gpu.Devices { outdevice := nascrd.PreparedGpu{ @@ -515,7 +521,7 @@ func (s *DeviceState) syncPreparedDevicesToCRDSpec(spec *nascrd.NodeAllocationSt } prepared.Gpu.Devices = 
append(prepared.Gpu.Devices, outdevice) } - case nascrd.MigDeviceType: + case types.MigDeviceType: prepared.Mig = &nascrd.PreparedMigDevices{} for _, device := range devices.Mig.Devices { placement := nascrd.MigDevicePlacement{ diff --git a/cmd/nvidia-dra-plugin/nvlib.go b/cmd/nvidia-dra-plugin/nvlib.go index 4ff5f138..1db2b9bc 100644 --- a/cmd/nvidia-dra-plugin/nvlib.go +++ b/cmd/nvidia-dra-plugin/nvlib.go @@ -156,6 +156,14 @@ func (l deviceLib) getGpuInfo(index int, device nvdev.Device) (*GpuInfo, error) if err != nil { return nil, fmt.Errorf("error getting CUDA compute capability for device %d: %w", index, err) } + driverVersion, ret := l.nvmllib.SystemGetDriverVersion() + if ret != nvml.SUCCESS { + return nil, fmt.Errorf("error getting driver version: %v", ret) + } + cudaDriverVersion, ret := l.nvmllib.SystemGetCudaDriverVersion() + if ret != nvml.SUCCESS { + return nil, fmt.Errorf("error getting CUDA driver version: %v", ret) + } gpuInfo := &GpuInfo{ minor: minor, @@ -167,6 +175,8 @@ func (l deviceLib) getGpuInfo(index int, device nvdev.Device) (*GpuInfo, error) brand: brand, architecture: architecture, cudaComputeCapability: cudaComputeCapability, + driverVersion: driverVersion, + cudaDriverVersion: fmt.Sprintf("%v.%v", cudaDriverVersion/1000, (cudaDriverVersion%1000)/10), } return gpuInfo, nil diff --git a/cmd/nvidia-dra-plugin/sharing.go b/cmd/nvidia-dra-plugin/sharing.go index 9f177373..fba60fa6 100644 --- a/cmd/nvidia-dra-plugin/sharing.go +++ b/cmd/nvidia-dra-plugin/sharing.go @@ -41,7 +41,8 @@ import ( cdiapi "tags.cncf.io/container-device-interface/pkg/cdi" cdispec "tags.cncf.io/container-device-interface/specs-go" - nascrd "github.com/NVIDIA/k8s-dra-driver/api/nvidia.com/resource/gpu/nas/v1alpha1" + "github.com/NVIDIA/k8s-dra-driver/api/utils/sharing" + "github.com/NVIDIA/k8s-dra-driver/api/utils/types" ) const ( @@ -71,9 +72,9 @@ type MpsControlDaemon struct { pipeDir string shmDir string logDir string - claim *nascrd.ClaimInfo + claim
*types.ClaimInfo devices *PreparedDevices - config *nascrd.MpsConfig + config *sharing.MpsConfig manager *MpsManager } @@ -96,12 +97,12 @@ func NewTimeSlicingManager(deviceLib *deviceLib) *TimeSlicingManager { } } -func (t *TimeSlicingManager) SetTimeSlice(devices *PreparedDevices, config *nascrd.TimeSlicingConfig) error { +func (t *TimeSlicingManager) SetTimeSlice(devices *PreparedDevices, config *sharing.TimeSlicingConfig) error { if devices.Mig != nil { return fmt.Errorf("setting a TimeSlice duration on MIG devices is unsupported") } - timeSlice := nascrd.DefaultTimeSlice + timeSlice := sharing.DefaultTimeSlice if config != nil && config.TimeSlice != nil { timeSlice = *config.TimeSlice } @@ -129,7 +130,7 @@ func NewMpsManager(config *Config, deviceLib *deviceLib, controlFilesRoot, hostD } } -func (m *MpsManager) NewMpsControlDaemon(claim *nascrd.ClaimInfo, devices *PreparedDevices, config *nascrd.MpsConfig) *MpsControlDaemon { +func (m *MpsManager) NewMpsControlDaemon(claim *types.ClaimInfo, devices *PreparedDevices, config *sharing.MpsConfig) *MpsControlDaemon { return &MpsControlDaemon{ nodeName: m.config.nascr.Name, namespace: m.config.nascr.Namespace, @@ -145,7 +146,7 @@ func (m *MpsManager) NewMpsControlDaemon(claim *nascrd.ClaimInfo, devices *Prepa } } -func (m *MpsManager) IsControlDaemonStarted(ctx context.Context, claim *nascrd.ClaimInfo) (bool, error) { +func (m *MpsManager) IsControlDaemonStarted(ctx context.Context, claim *types.ClaimInfo) (bool, error) { name := fmt.Sprintf(MpsControlDaemonNameFmt, claim.UID) _, err := m.config.clientsets.Core.AppsV1().Deployments(m.config.nascr.Namespace).Get(ctx, name, metav1.GetOptions{}) if errors.IsNotFound(err) { @@ -157,7 +158,7 @@ func (m *MpsManager) IsControlDaemonStarted(ctx context.Context, claim *nascrd.C return true, nil } -func (m *MpsManager) IsControlDaemonStopped(ctx context.Context, claim *nascrd.ClaimInfo) (bool, error) { +func (m *MpsManager) IsControlDaemonStopped(ctx context.Context, claim 
*types.ClaimInfo) (bool, error) { name := fmt.Sprintf(MpsControlDaemonNameFmt, claim.UID) _, err := m.config.clientsets.Core.AppsV1().Deployments(m.config.nascr.Namespace).Get(ctx, name, metav1.GetOptions{}) if errors.IsNotFound(err) { @@ -256,7 +257,7 @@ func (m *MpsControlDaemon) Start(ctx context.Context) error { return fmt.Errorf("error mounting %v as tmpfs: %w", m.shmDir, err) } - if m.devices.Type() == nascrd.GpuDeviceType { + if m.devices.Type() == types.GpuDeviceType { err = m.manager.nvdevlib.setComputeMode(m.devices.UUIDs(), "EXCLUSIVE_PROCESS") if err != nil { return fmt.Errorf("error setting compute mode: %w", err) diff --git a/common.mk b/common.mk index ac5b97da..83ae39b5 100644 --- a/common.mk +++ b/common.mk @@ -21,6 +21,7 @@ CLIENT_APIS := gpu/nas/v1alpha1 gpu/v1alpha1 CLIENT_SOURCES += $(patsubst %, $(API_BASE)/%, $(CLIENT_APIS)) DEEPCOPY_SOURCES = api/utils/selector +DEEPCOPY_SOURCES += api/utils/sharing DEEPCOPY_SOURCES += $(CLIENT_SOURCES) PLURAL_EXCEPTIONS = DeviceClassParameters:DeviceClassParameters diff --git a/deployments/helm/k8s-dra-driver/crds/gpu.resource.nvidia.com_gpuclaimparameters.yaml b/deployments/helm/k8s-dra-driver/crds/gpu.resource.nvidia.com_gpuclaimparameters.yaml index 66218ade..0c63cbc8 100644 --- a/deployments/helm/k8s-dra-driver/crds/gpu.resource.nvidia.com_gpuclaimparameters.yaml +++ b/deployments/helm/k8s-dra-driver/crds/gpu.resource.nvidia.com_gpuclaimparameters.yaml @@ -100,7 +100,7 @@ spec: value: type: string type: object - cudaRuntimeVersion: + cudaDriverVersion: description: VersionComparator compares a version SelectorCondition using a specific operator. properties: @@ -198,7 +198,7 @@ spec: value: type: string type: object - cudaRuntimeVersion: + cudaDriverVersion: description: VersionComparator compares a version SelectorCondition using a specific operator. 
properties: @@ -294,7 +294,7 @@ spec: value: type: string type: object - cudaRuntimeVersion: + cudaDriverVersion: description: VersionComparator compares a version SelectorCondition using a specific operator. properties: @@ -402,7 +402,7 @@ spec: value: type: string type: object - cudaRuntimeVersion: + cudaDriverVersion: description: VersionComparator compares a version SelectorCondition using a specific operator. properties: @@ -506,7 +506,7 @@ spec: value: type: string type: object - cudaRuntimeVersion: + cudaDriverVersion: description: VersionComparator compares a version SelectorCondition using a specific operator. properties: @@ -604,7 +604,7 @@ spec: value: type: string type: object - cudaRuntimeVersion: + cudaDriverVersion: description: VersionComparator compares a version SelectorCondition using a specific operator. properties: @@ -700,7 +700,7 @@ spec: value: type: string type: object - cudaRuntimeVersion: + cudaDriverVersion: description: VersionComparator compares a version SelectorCondition using a specific operator. properties: @@ -818,7 +818,7 @@ spec: value: type: string type: object - cudaRuntimeVersion: + cudaDriverVersion: description: VersionComparator compares a version SelectorCondition using a specific operator. properties: @@ -930,7 +930,7 @@ spec: value: type: string type: object - cudaRuntimeVersion: + cudaDriverVersion: description: VersionComparator compares a version SelectorCondition using a specific operator. properties: @@ -1028,7 +1028,7 @@ spec: value: type: string type: object - cudaRuntimeVersion: + cudaDriverVersion: description: VersionComparator compares a version SelectorCondition using a specific operator. properties: @@ -1124,7 +1124,7 @@ spec: value: type: string type: object - cudaRuntimeVersion: + cudaDriverVersion: description: VersionComparator compares a version SelectorCondition using a specific operator. 
properties: @@ -1232,7 +1232,7 @@ spec: value: type: string type: object - cudaRuntimeVersion: + cudaDriverVersion: description: VersionComparator compares a version SelectorCondition using a specific operator. properties: @@ -1336,7 +1336,7 @@ spec: value: type: string type: object - cudaRuntimeVersion: + cudaDriverVersion: description: VersionComparator compares a version SelectorCondition using a specific operator. properties: @@ -1434,7 +1434,7 @@ spec: value: type: string type: object - cudaRuntimeVersion: + cudaDriverVersion: description: VersionComparator compares a version SelectorCondition using a specific operator. properties: @@ -1530,7 +1530,7 @@ spec: value: type: string type: object - cudaRuntimeVersion: + cudaDriverVersion: description: VersionComparator compares a version SelectorCondition using a specific operator. properties: diff --git a/deployments/helm/k8s-dra-driver/crds/nas.gpu.resource.nvidia.com_nodeallocationstates.yaml b/deployments/helm/k8s-dra-driver/crds/nas.gpu.resource.nvidia.com_nodeallocationstates.yaml index 80a02cd5..867173f8 100644 --- a/deployments/helm/k8s-dra-driver/crds/nas.gpu.resource.nvidia.com_nodeallocationstates.yaml +++ b/deployments/helm/k8s-dra-driver/crds/nas.gpu.resource.nvidia.com_nodeallocationstates.yaml @@ -52,6 +52,10 @@ spec: type: string cudaComputeCapability: type: string + cudaDriverVersion: + type: string + driverVersion: + type: string index: type: integer memoryBytes: @@ -67,6 +71,8 @@ spec: - architecture - brand - cudaComputeCapability + - cudaDriverVersion + - driverVersion - index - memoryBytes - migEnabled