Skip to content

Commit

Permalink
GPU sharing on cuda compute capability >=7.5
Browse files Browse the repository at this point in the history
Signed-off-by: Swati Gupta <[email protected]>
  • Loading branch information
guptaNswati committed Jan 31, 2025
1 parent 54334a1 commit 86de1cb
Showing 1 changed file with 21 additions and 2 deletions.
23 changes: 21 additions & 2 deletions cmd/nvidia-dra-plugin/device_state.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ import (
"context"
"fmt"
"slices"
"strings"
"sync"

resourceapi "k8s.io/api/resource/v1beta1"
Expand All @@ -29,6 +30,8 @@ import (
"k8s.io/kubernetes/pkg/kubelet/checkpointmanager"
cdiapi "tags.cncf.io/container-device-interface/pkg/cdi"

"golang.org/x/mod/semver"

configapi "github.com/NVIDIA/k8s-dra-driver/api/nvidia.com/resource/gpu/v1alpha1"
)

Expand Down Expand Up @@ -390,6 +393,21 @@ func (s *DeviceState) applySharingConfig(ctx context.Context, config configapi.S
allocatableDevices[r.Device] = s.allocatable[r.Device]
}

// Allow only devices with CUDA compute capability >= 7.5, since time-slicing and MPS do not work on older architectures.
shareableAllocatableDevices := make(AllocatableDevices)
for device, deviceType := range allocatableDevices {
if deviceType.Gpu != nil {
cudaCCv := "v" + strings.TrimPrefix(deviceType.Gpu.cudaComputeCapability, "v")
gpuUUID := deviceType.Gpu.UUID
if semver.Compare(semver.Canonical(cudaCCv), semver.Canonical("v7.5")) >= 0 {
klog.Infof("GPU sharing is available on this device UUID=%v with CudaComputeCapability=%v", gpuUUID, cudaCCv)
shareableAllocatableDevices[device] = deviceType
} else {
return nil, fmt.Errorf("GPU sharing is not available on this device UUID=%v", gpuUUID)
}
}
}

// Declare a device group state object to populate.
var configState DeviceConfigState

Expand All @@ -400,7 +418,7 @@ func (s *DeviceState) applySharingConfig(ctx context.Context, config configapi.S
return nil, fmt.Errorf("error getting timeslice config for requests '%v' in claim '%v': %w", requests, claim.UID, err)
}
if tsc != nil {
err = s.tsManager.SetTimeSlice(allocatableDevices, tsc)
err = s.tsManager.SetTimeSlice(shareableAllocatableDevices, tsc)
if err != nil {
return nil, fmt.Errorf("error setting timeslice config for requests '%v' in claim '%v': %w", requests, claim.UID, err)
}
Expand All @@ -413,7 +431,8 @@ func (s *DeviceState) applySharingConfig(ctx context.Context, config configapi.S
if err != nil {
return nil, fmt.Errorf("error getting MPS configuration: %w", err)
}
mpsControlDaemon := s.mpsManager.NewMpsControlDaemon(string(claim.UID), allocatableDevices)

mpsControlDaemon := s.mpsManager.NewMpsControlDaemon(string(claim.UID), shareableAllocatableDevices)
if err := mpsControlDaemon.Start(ctx, mpsc); err != nil {
return nil, fmt.Errorf("error starting MPS control daemon: %w", err)
}
Expand Down

0 comments on commit 86de1cb

Please sign in to comment.