Skip to content

Commit

Permalink
Update logic to set environment for calls out to nvidia-smi
Browse files Browse the repository at this point in the history
Previously, we were setting the envvars needed for nvidia-smi in the plugin's
environment itself. This caused errors, however, when shelling out to other
binaries that didn't need these envvars set.

This change pushes the setting of the envvars into the actual exec call for
nvidia-smi instead of setting them in the plugin's environment itself.

Closes #106

Signed-off-by: Kevin Klues <[email protected]>
  • Loading branch information
klueska committed May 7, 2024
1 parent 917e1ce commit b338e30
Showing 1 changed file with 34 additions and 17 deletions.
51 changes: 34 additions & 17 deletions cmd/nvidia-dra-plugin/nvlib.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,9 @@ import (

type deviceLib struct {
nvdev.Interface
nvmllib nvml.Interface
nvidiaSMIPath string
nvmllib nvml.Interface
driverLibraryPath string
nvidiaSMIPath string
}

func newDeviceLib(driverRoot root) (*deviceLib, error) {
Expand All @@ -46,32 +47,40 @@ func newDeviceLib(driverRoot root) (*deviceLib, error) {
return nil, fmt.Errorf("failed to locate nvidia-smi: %w", err)
}

// In order for nvidia-smi to run, we need to set the PATH to the parent of
// the nvidia-smi executable and update LD_PRELOAD to include the path to
// libnvidia-ml.so.1
updatePathListEnvvar("LD_PRELOAD", driverLibraryPath)
updatePathListEnvvar("PATH", filepath.Dir(nvidiaSMIPath))

// We construct an NVML library specifying the path to libnvidia-ml.so.1
// explicitly so that we don't have to rely on the library path.
nvmllib := nvml.New(
nvml.WithLibraryPath(driverLibraryPath),
)
d := deviceLib{
Interface: nvdev.New(nvdev.WithNvml(nvmllib)),
nvmllib: nvmllib,
nvidiaSMIPath: nvidiaSMIPath,
Interface: nvdev.New(nvdev.WithNvml(nvmllib)),
nvmllib: nvmllib,
driverLibraryPath: driverLibraryPath,
nvidiaSMIPath: nvidiaSMIPath,
}
return &d, nil
}

// updatePathListEnvvar prepends a specified list of strings to a specified envvar.
func updatePathListEnvvar(envvar string, prepend ...string) {
// prependPathListEnvvar prepends a specified list of strings to a specified envvar and returns its value.
func prependPathListEnvvar(envvar string, prepend ...string) string {
if len(prepend) == 0 {
return
return os.Getenv(envvar)
}
current := filepath.SplitList(os.Getenv(envvar))
os.Setenv(envvar, strings.Join(append(prepend, current...), string(filepath.ListSeparator)))
return strings.Join(append(prepend, current...), string(filepath.ListSeparator))
}

// setOrOverrideEnvvar adds or updates an envar to the list of specified envvars and returns it.
func setOrOverrideEnvvar(envvars []string, key, value string) []string {
var updated []string
for _, envvar := range envvars {
pair := strings.SplitN(envvar, "=", 2)
if pair[0] == key {
continue
}
updated = append(updated, envvar)
}
return append(updated, fmt.Sprintf("%s=%s", key, value))
}

func (l deviceLib) Init() error {
Expand Down Expand Up @@ -481,10 +490,14 @@ func walkMigDevices(d nvml.Device, f func(i int, d nvml.Device) error) error {
func (l deviceLib) setTimeSlice(uuids []string, timeSlice int) error {
for _, uuid := range uuids {
cmd := exec.Command(
"nvidia-smi",
l.nvidiaSMIPath,
"compute-policy",
"-i", uuid,
"--set-timeslice", fmt.Sprintf("%d", timeSlice))

// In order for nvidia-smi to run, we need update LD_PRELOAD to include the path to libnvidia-ml.so.1.
cmd.Env := setOrOverrideEnvvar(os.Environ(), "LD_PRELOAD", prependPathListEnvvar("LD_PRELOAD", l.driverLibraryPath))

Check failure on line 499 in cmd/nvidia-dra-plugin/nvlib.go

View workflow job for this annotation

GitHub Actions / build

non-name cmd.Env on left side of :=

Check failure on line 499 in cmd/nvidia-dra-plugin/nvlib.go

View workflow job for this annotation

GitHub Actions / check

non-name cmd.Env on left side of :=

Check failure on line 499 in cmd/nvidia-dra-plugin/nvlib.go

View workflow job for this annotation

GitHub Actions / Unit test

non-name cmd.Env on left side of :=

output, err := cmd.CombinedOutput()
if err != nil {
klog.Errorf("\n%v", string(output))
Expand All @@ -497,9 +510,13 @@ func (l deviceLib) setTimeSlice(uuids []string, timeSlice int) error {
func (l deviceLib) setComputeMode(uuids []string, mode string) error {
for _, uuid := range uuids {
cmd := exec.Command(
"nvidia-smi",
l.nvidiaSMIPath,
"-i", uuid,
"-c", mode)

// In order for nvidia-smi to run, we need update LD_PRELOAD to include the path to libnvidia-ml.so.1.
cmd.Env := setOrOverrideEnvvar(os.Environ(), "LD_PRELOAD", prependPathListEnvvar("LD_PRELOAD", l.driverLibraryPath))

Check failure on line 518 in cmd/nvidia-dra-plugin/nvlib.go

View workflow job for this annotation

GitHub Actions / build

non-name cmd.Env on left side of :=

Check failure on line 518 in cmd/nvidia-dra-plugin/nvlib.go

View workflow job for this annotation

GitHub Actions / check

non-name cmd.Env on left side of := (typecheck)

Check failure on line 518 in cmd/nvidia-dra-plugin/nvlib.go

View workflow job for this annotation

GitHub Actions / Unit test

non-name cmd.Env on left side of :=

output, err := cmd.CombinedOutput()
if err != nil {
klog.Errorf("\n%v", string(output))
Expand Down

0 comments on commit b338e30

Please sign in to comment.