Skip to content

Create device nodes #976

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 8 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions cmd/nvidia-ctk-installer/main_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,9 @@ swarm-resource = ""
[nvidia-container-runtime.modes.csv]
mount-spec-path = "/etc/nvidia-container-runtime/host-files-for-container.d"

[nvidia-container-runtime.modes.jit-cdi]
load-kernel-modules = ["nvidia", "nvidia-uvm", "nvidia-modeset"]

[nvidia-container-runtime-hook]
path = "{{ .toolkitRoot }}/toolkit/nvidia-container-runtime-hook"
skip-mode-detection = true
Expand Down Expand Up @@ -202,6 +205,9 @@ swarm-resource = ""
[nvidia-container-runtime.modes.csv]
mount-spec-path = "/etc/nvidia-container-runtime/host-files-for-container.d"

[nvidia-container-runtime.modes.jit-cdi]
load-kernel-modules = ["nvidia", "nvidia-uvm", "nvidia-modeset"]

[nvidia-container-runtime-hook]
path = "{{ .toolkitRoot }}/toolkit/nvidia-container-runtime-hook"
skip-mode-detection = true
Expand Down Expand Up @@ -266,6 +272,9 @@ swarm-resource = ""
[nvidia-container-runtime.modes.csv]
mount-spec-path = "/etc/nvidia-container-runtime/host-files-for-container.d"

[nvidia-container-runtime.modes.jit-cdi]
load-kernel-modules = ["nvidia", "nvidia-uvm", "nvidia-modeset"]

[nvidia-container-runtime-hook]
path = "{{ .toolkitRoot }}/toolkit/nvidia-container-runtime-hook"
skip-mode-detection = true
Expand Down Expand Up @@ -327,6 +336,9 @@ swarm-resource = ""
[nvidia-container-runtime.modes.csv]
mount-spec-path = "/etc/nvidia-container-runtime/host-files-for-container.d"

[nvidia-container-runtime.modes.jit-cdi]
load-kernel-modules = ["nvidia", "nvidia-uvm", "nvidia-modeset"]

[nvidia-container-runtime-hook]
path = "{{ .toolkitRoot }}/toolkit/nvidia-container-runtime-hook"
skip-mode-detection = true
Expand Down Expand Up @@ -410,6 +422,9 @@ swarm-resource = ""
[nvidia-container-runtime.modes.csv]
mount-spec-path = "/etc/nvidia-container-runtime/host-files-for-container.d"

[nvidia-container-runtime.modes.jit-cdi]
load-kernel-modules = ["nvidia", "nvidia-uvm", "nvidia-modeset"]

[nvidia-container-runtime-hook]
path = "{{ .toolkitRoot }}/toolkit/nvidia-container-runtime-hook"
skip-mode-detection = true
Expand Down
18 changes: 3 additions & 15 deletions cmd/nvidia-ctk/system/create-dev-char-symlinks/all.go
Original file line number Diff line number Diff line change
Expand Up @@ -145,21 +145,9 @@ func (m allPossible) getGPUDeviceNodes(gpu int) []deviceNode {
// getNVCapDeviceNodes generates a list of cap device nodes for a given GPU.
func (m allPossible) getNVCapDeviceNodes(gpu int) []deviceNode {
var selectedCapMinors []nvcaps.MigMinor
for gi := 0; ; gi++ {
giCap := nvcaps.NewGPUInstanceCap(gpu, gi)
giMinor, exist := m.migCaps[giCap]
if !exist {
break
}
selectedCapMinors = append(selectedCapMinors, giMinor)
for ci := 0; ; ci++ {
ciCap := nvcaps.NewComputeInstanceCap(gpu, gi, ci)
ciMinor, exist := m.migCaps[ciCap]
if !exist {
break
}
selectedCapMinors = append(selectedCapMinors, ciMinor)
}

for _, capMinors := range m.migCaps.FilterForGPU(nvcaps.Index(gpu)) {
selectedCapMinors = append(selectedCapMinors, capMinors)
}

var deviceNodes []deviceNode
Expand Down
3 changes: 3 additions & 0 deletions internal/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,9 @@ func GetDefault() (*Config, error) {
AnnotationPrefixes: []string{cdi.AnnotationPrefix},
SpecDirs: cdi.DefaultSpecDirs,
},
JitCDI: jitCDIModeConfig{
LoadKernelModules: []string{"nvidia", "nvidia-uvm", "nvidia-modeset"},
},
},
},
NVIDIAContainerRuntimeHookConfig: RuntimeHookConfig{
Expand Down
21 changes: 21 additions & 0 deletions internal/config/config_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,9 @@ func TestGetConfig(t *testing.T) {
AnnotationPrefixes: []string{"cdi.k8s.io/"},
SpecDirs: []string{"/etc/cdi", "/var/run/cdi"},
},
JitCDI: jitCDIModeConfig{
LoadKernelModules: []string{"nvidia", "nvidia-uvm", "nvidia-modeset"},
},
},
},
NVIDIAContainerRuntimeHookConfig: RuntimeHookConfig{
Expand Down Expand Up @@ -102,6 +105,7 @@ func TestGetConfig(t *testing.T) {
"nvidia-container-runtime.modes.cdi.annotation-prefixes = [\"cdi.k8s.io/\", \"example.vendor.com/\",]",
"nvidia-container-runtime.modes.cdi.spec-dirs = [\"/except/etc/cdi\", \"/not/var/run/cdi\",]",
"nvidia-container-runtime.modes.csv.mount-spec-path = \"/not/etc/nvidia-container-runtime/host-files-for-container.d\"",
"nvidia-container-runtime.modes.jit-cdi.load-kernel-modules = [\"foo\"]",
"nvidia-container-runtime-hook.path = \"/foo/bar/nvidia-container-runtime-hook\"",
"nvidia-ctk.path = \"/foo/bar/nvidia-ctk\"",
},
Expand Down Expand Up @@ -134,6 +138,9 @@ func TestGetConfig(t *testing.T) {
"/not/var/run/cdi",
},
},
JitCDI: jitCDIModeConfig{
LoadKernelModules: []string{"foo"},
},
},
},
NVIDIAContainerRuntimeHookConfig: RuntimeHookConfig{
Expand Down Expand Up @@ -178,6 +185,9 @@ func TestGetConfig(t *testing.T) {
"/var/run/cdi",
},
},
JitCDI: jitCDIModeConfig{
LoadKernelModules: []string{"nvidia", "nvidia-uvm", "nvidia-modeset"},
},
},
},
NVIDIAContainerRuntimeHookConfig: RuntimeHookConfig{
Expand Down Expand Up @@ -213,6 +223,8 @@ func TestGetConfig(t *testing.T) {
"spec-dirs = [\"/except/etc/cdi\", \"/not/var/run/cdi\",]",
"[nvidia-container-runtime.modes.csv]",
"mount-spec-path = \"/not/etc/nvidia-container-runtime/host-files-for-container.d\"",
"[nvidia-container-runtime.modes.jit-cdi]",
"load-kernel-modules = [\"foo\"]",
"[nvidia-container-runtime-hook]",
"path = \"/foo/bar/nvidia-container-runtime-hook\"",
"[nvidia-ctk]",
Expand Down Expand Up @@ -247,6 +259,9 @@ func TestGetConfig(t *testing.T) {
"/not/var/run/cdi",
},
},
JitCDI: jitCDIModeConfig{
LoadKernelModules: []string{"foo"},
},
},
},
NVIDIAContainerRuntimeHookConfig: RuntimeHookConfig{
Expand Down Expand Up @@ -283,6 +298,9 @@ func TestGetConfig(t *testing.T) {
AnnotationPrefixes: []string{"cdi.k8s.io/"},
SpecDirs: []string{"/etc/cdi", "/var/run/cdi"},
},
JitCDI: jitCDIModeConfig{
LoadKernelModules: []string{"nvidia", "nvidia-uvm", "nvidia-modeset"},
},
},
},
NVIDIAContainerRuntimeHookConfig: RuntimeHookConfig{
Expand Down Expand Up @@ -322,6 +340,9 @@ func TestGetConfig(t *testing.T) {
AnnotationPrefixes: []string{"cdi.k8s.io/"},
SpecDirs: []string{"/etc/cdi", "/var/run/cdi"},
},
JitCDI: jitCDIModeConfig{
LoadKernelModules: []string{"nvidia", "nvidia-uvm", "nvidia-modeset"},
},
},
},
NVIDIAContainerRuntimeHookConfig: RuntimeHookConfig{
Expand Down
13 changes: 11 additions & 2 deletions internal/config/runtime.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,9 @@ type RuntimeConfig struct {

// modesConfig defines (optional) per-mode configs
type modesConfig struct {
CSV csvModeConfig `toml:"csv"`
CDI cdiModeConfig `toml:"cdi"`
CSV csvModeConfig `toml:"csv"`
CDI cdiModeConfig `toml:"cdi"`
JitCDI jitCDIModeConfig `toml:"jit-cdi"`
}

type cdiModeConfig struct {
Expand All @@ -45,3 +46,11 @@ type cdiModeConfig struct {
// csvModeConfig holds the settings that apply to the csv mode.
type csvModeConfig struct {
// MountSpecPath is read from the `mount-spec-path` TOML key.
// NOTE(review): presumably the directory containing the CSV mount
// specification files -- confirm against the code that consumes it.
MountSpecPath string `toml:"mount-spec-path"`
}

// jitCDIModeConfig holds the settings that apply to the jit-cdi mode.
type jitCDIModeConfig struct {
// LoadKernelModules lists the names of the kernel modules that should be
// loaded before generating a just-in-time CDI specification.
// Module names must start with `nvidia`. If no modules are specified, no
// kernel modules are loaded.
LoadKernelModules []string `toml:"load-kernel-modules"`
}
3 changes: 3 additions & 0 deletions internal/config/toml_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,9 @@ spec-dirs = ["/etc/cdi", "/var/run/cdi"]
[nvidia-container-runtime.modes.csv]
mount-spec-path = "/etc/nvidia-container-runtime/host-files-for-container.d"

[nvidia-container-runtime.modes.jit-cdi]
load-kernel-modules = ["nvidia", "nvidia-uvm", "nvidia-modeset"]

[nvidia-container-runtime-hook]
path = "nvidia-container-runtime-hook"
skip-mode-detection = false
Expand Down
2 changes: 1 addition & 1 deletion internal/info/proc/devices/builder.go
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ func New(opts ...Option) Devices {
type Option func(*builder)

// WithDeviceToMajor specifies an explicit device name to major number map.
func WithDeviceToMajor(deviceToMajor map[string]int) Option {
func WithDeviceToMajor(deviceToMajor map[string]uint32) Option {
return func(b *builder) {
b.asMap = make(devices)
for name, major := range deviceToMajor {
Expand Down
10 changes: 5 additions & 5 deletions internal/info/proc/devices/devices.go
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ const (
type Name string

// Major represents a device major as specified under /proc/devices
type Major int
type Major uint32

// Devices represents the set of devices under /proc/devices
//
Expand Down Expand Up @@ -130,8 +130,8 @@ func nvidiaDeviceFrom(reader io.Reader) (Devices, error) {
return nvidiaDevices, nil
}

func devicesFrom(reader io.Reader) map[string]int {
allDevices := make(map[string]int)
func devicesFrom(reader io.Reader) map[string]uint32 {
allDevices := make(map[string]uint32)
scanner := bufio.NewScanner(reader)
for scanner.Scan() {
device, major, err := processProcDeviceLine(scanner.Text())
Expand All @@ -143,11 +143,11 @@ func devicesFrom(reader io.Reader) map[string]int {
return allDevices
}

func processProcDeviceLine(line string) (string, int, error) {
func processProcDeviceLine(line string) (string, uint32, error) {
trimmed := strings.TrimSpace(line)

var name string
var major int
var major uint32

n, _ := fmt.Sscanf(trimmed, "%d %s", &major, &name)
if n == 2 {
Expand Down
4 changes: 2 additions & 2 deletions internal/info/proc/devices/devices_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ import (
)

func TestNvidiaDevices(t *testing.T) {
perDriverDeviceMaps := map[string]map[string]int{
perDriverDeviceMaps := map[string]map[string]uint32{
"pre550": {
"nvidia-frontend": 195,
"nvidia-nvlink": 234,
Expand Down Expand Up @@ -100,7 +100,7 @@ func TestProcessDeviceFileLine(t *testing.T) {
testCases := []struct {
line string
name string
major int
major uint32
err bool
}{
{"", "", 0, true},
Expand Down
20 changes: 20 additions & 0 deletions internal/lookup/root/root.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,12 +17,15 @@
package root

import (
"errors"
"fmt"
"os"
"path/filepath"
"strings"

"github.com/NVIDIA/nvidia-container-toolkit/internal/logger"
"github.com/NVIDIA/nvidia-container-toolkit/internal/lookup"
"github.com/NVIDIA/nvidia-container-toolkit/internal/system/nvmodules"
)

// Driver represents a filesystem in which a set of drivers or devices is defined.
Expand Down Expand Up @@ -125,3 +128,20 @@ func xdgDataDirs() []string {

return []string{"/usr/local/share", "/usr/share"}
}

// LoadKernelModules loads the specified kernel modules in the driver root.
// Every module is attempted even if loading an earlier one fails; the
// individual failures are wrapped with the module name and combined using
// errors.Join. The returned error is nil if no module failed to load, which
// includes the case where no module names are specified.
func (r *Driver) LoadKernelModules(moduleNames ...string) error {
	modules := nvmodules.New(
		nvmodules.WithLogger(r.logger),
		nvmodules.WithRoot(r.Root),
	)

	// Accumulate errors instead of returning early so that one failing module
	// does not prevent the remaining modules from being attempted.
	var errs error
	for _, moduleName := range moduleNames {
		if err := modules.Load(moduleName); err != nil {
			errs = errors.Join(errs, fmt.Errorf("failed to load kernel module %q: %w", moduleName, err))
		}
	}
	return errs
}
47 changes: 41 additions & 6 deletions internal/modifier/cdi.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,19 +22,23 @@ import (

"tags.cncf.io/container-device-interface/pkg/parser"

"github.com/NVIDIA/go-nvlib/pkg/nvlib/device"

"github.com/NVIDIA/nvidia-container-toolkit/internal/config"
"github.com/NVIDIA/nvidia-container-toolkit/internal/config/image"
"github.com/NVIDIA/nvidia-container-toolkit/internal/logger"
"github.com/NVIDIA/nvidia-container-toolkit/internal/lookup/root"
"github.com/NVIDIA/nvidia-container-toolkit/internal/modifier/cdi"
"github.com/NVIDIA/nvidia-container-toolkit/internal/oci"
"github.com/NVIDIA/nvidia-container-toolkit/internal/system/nvdevices"
"github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi"
"github.com/NVIDIA/nvidia-container-toolkit/pkg/nvcdi/spec"
)

// NewCDIModifier creates an OCI spec modifier that determines the modifications to make based on the
// CDI specifications available on the system. The NVIDIA_VISIBLE_DEVICES environment variable is
// used to select the devices to include.
func NewCDIModifier(logger logger.Interface, cfg *config.Config, ociSpec oci.Spec) (oci.SpecModifier, error) {
func NewCDIModifier(logger logger.Interface, cfg *config.Config, driver *root.Driver, ociSpec oci.Spec) (oci.SpecModifier, error) {
devices, err := getDevicesFromSpec(logger, ociSpec, cfg)
if err != nil {
return nil, fmt.Errorf("failed to get required devices from OCI specification: %v", err)
Expand All @@ -50,7 +54,7 @@ func NewCDIModifier(logger logger.Interface, cfg *config.Config, ociSpec oci.Spe
return nil, fmt.Errorf("requesting a CDI device with vendor 'runtime.nvidia.com' is not supported when requesting other CDI devices")
}
if len(automaticDevices) > 0 {
automaticModifier, err := newAutomaticCDISpecModifier(logger, cfg, automaticDevices)
automaticModifier, err := newAutomaticCDISpecModifier(logger, cfg, driver, automaticDevices)
if err == nil {
return automaticModifier, nil
}
Expand Down Expand Up @@ -163,9 +167,9 @@ func filterAutomaticDevices(devices []string) []string {
return automatic
}

func newAutomaticCDISpecModifier(logger logger.Interface, cfg *config.Config, devices []string) (oci.SpecModifier, error) {
func newAutomaticCDISpecModifier(logger logger.Interface, cfg *config.Config, driver *root.Driver, devices []string) (oci.SpecModifier, error) {
logger.Debugf("Generating in-memory CDI specs for devices %v", devices)
spec, err := generateAutomaticCDISpec(logger, cfg, devices)
spec, err := generateAutomaticCDISpec(logger, cfg, driver, devices)
if err != nil {
return nil, fmt.Errorf("failed to generate CDI spec: %w", err)
}
Expand All @@ -180,7 +184,7 @@ func newAutomaticCDISpecModifier(logger logger.Interface, cfg *config.Config, de
return cdiModifier, nil
}

func generateAutomaticCDISpec(logger logger.Interface, cfg *config.Config, devices []string) (spec.Interface, error) {
func generateAutomaticCDISpec(logger logger.Interface, cfg *config.Config, driver *root.Driver, devices []string) (spec.Interface, error) {
cdilib, err := nvcdi.New(
nvcdi.WithLogger(logger),
nvcdi.WithNVIDIACDIHookPath(cfg.NVIDIACTKConfig.Path),
Expand All @@ -192,12 +196,19 @@ func generateAutomaticCDISpec(logger logger.Interface, cfg *config.Config, devic
return nil, fmt.Errorf("failed to construct CDI library: %w", err)
}

identifiers := []string{}
// TODO: Consider moving this into the nvcdi API.
if err := driver.LoadKernelModules(cfg.NVIDIAContainerRuntimeConfig.Modes.JitCDI.LoadKernelModules...); err != nil {
logger.Warningf("Ignoring error(s) loading kernel modules: %v", err)
}

var identifiers []string
for _, device := range devices {
_, _, id := parser.ParseDevice(device)
identifiers = append(identifiers, id)
}

tryCreateDeviceNodes(logger, driver, identifiers...)

deviceSpecs, err := cdilib.GetDeviceSpecsByID(identifiers...)
if err != nil {
return nil, fmt.Errorf("failed to get CDI device specs: %w", err)
Expand All @@ -215,3 +226,27 @@ func generateAutomaticCDISpec(logger logger.Interface, cfg *config.Config, devic
spec.WithClass("gpu"),
)
}

// tryCreateDeviceNodes makes a best-effort attempt to create the NVIDIA
// control device nodes, the nvidia-caps control device nodes, and the device
// nodes for each of the specified identifiers, using the driver root as the
// dev root. Failures are logged as warnings and never returned to the caller;
// only a failure to construct the devices library stops the remaining steps.
func tryCreateDeviceNodes(logger logger.Interface, driver *root.Driver, identifiers ...string) {
	nodeCreator, err := nvdevices.New(
		nvdevices.WithLogger(logger),
		nvdevices.WithDevRoot(driver.Root),
	)
	if err != nil {
		// Without the library none of the nodes below can be created.
		logger.Warningf("Failed to create devices library: %v", err)
		return
	}

	err = nodeCreator.CreateNVIDIAControlDevices()
	if err != nil {
		logger.Warningf("Failed to create control devices: %v", err)
	}

	err = nodeCreator.CreateNVIDIACapsControlDeviceNodes()
	if err != nil {
		logger.Warningf("Failed to create nvidia-caps control devices: %v", err)
	}

	// Each identifier is handled independently so that one failure does not
	// prevent the nodes for the remaining identifiers from being attempted.
	for i := range identifiers {
		identifier := device.Identifier(identifiers[i])
		err := nodeCreator.CreateDeviceNodes(identifier)
		if err != nil {
			logger.Warningf("Error creating device nodes for %v: %v", identifier, err)
		}
	}
}
Loading