Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Ensure that libcuda.so is in the ldcache #947

Open
wants to merge 6 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions cmd/nvidia-cdi-hook/commands/commands.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ import (
"github.com/urfave/cli/v2"

"github.com/NVIDIA/nvidia-container-toolkit/cmd/nvidia-cdi-hook/chmod"
soname "github.com/NVIDIA/nvidia-container-toolkit/cmd/nvidia-cdi-hook/create-soname-symlinks"
symlinks "github.com/NVIDIA/nvidia-container-toolkit/cmd/nvidia-cdi-hook/create-symlinks"
"github.com/NVIDIA/nvidia-container-toolkit/cmd/nvidia-cdi-hook/cudacompat"
ldcache "github.com/NVIDIA/nvidia-container-toolkit/cmd/nvidia-cdi-hook/update-ldcache"
Expand All @@ -34,5 +35,6 @@ func New(logger logger.Interface) []*cli.Command {
symlinks.NewCommand(logger),
chmod.NewCommand(logger),
cudacompat.NewCommand(logger),
soname.NewCommand(logger),
}
}
131 changes: 131 additions & 0 deletions cmd/nvidia-cdi-hook/create-soname-symlinks/soname-symlinks.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,131 @@
/**
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
**/

package soname

import (
"errors"
"fmt"
"path/filepath"

"github.com/urfave/cli/v2"

"github.com/NVIDIA/nvidia-container-toolkit/cmd/nvidia-cdi-hook/utils"
"github.com/NVIDIA/nvidia-container-toolkit/internal/logger"
"github.com/NVIDIA/nvidia-container-toolkit/internal/oci"
)

type command struct {
logger logger.Interface
utils.SafeExecer
}

type options struct {
folders cli.StringSlice
ldconfigPath string
containerSpec string
}

// NewCommand constructs an create-soname-symlinks command with the specified logger
func NewCommand(logger logger.Interface) *cli.Command {
c := command{
logger: logger,
SafeExecer: utils.NewSafeExecer(logger),
}
return c.build()
}

// build the create-soname-symlinks command
func (m command) build() *cli.Command {
cfg := options{}

// Create the 'create-soname-symlinks' command
c := cli.Command{
Name: "create-soname-symlinks",
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@klueska I elected to add a new hook entirely instead of modifying the existing update-ldcache. This is in keeping with "purpose-built hooks" and also means that the hook name can be used to indicate the intent.

Usage: "Create soname symlinks for the specified folders using ldconfig -n -N",
Before: func(c *cli.Context) error {
return m.validateFlags(c, &cfg)
},
Action: func(c *cli.Context) error {
return m.run(c, &cfg)
},
}

c.Flags = []cli.Flag{
&cli.StringSliceFlag{
Name: "folder",
Usage: "Specify a folder to search for shared libraries for which soname symlinks need to be created",
Destination: &cfg.folders,
},
&cli.StringFlag{
Name: "ldconfig-path",
Usage: "Specify the path to the ldconfig program",
Destination: &cfg.ldconfigPath,
Value: "/sbin/ldconfig",
},
&cli.StringFlag{
Name: "container-spec",
Usage: "Specify the path to the OCI container spec. If empty or '-' the spec will be read from STDIN",
Destination: &cfg.containerSpec,
},
}

return &c
}

func (m command) validateFlags(c *cli.Context, cfg *options) error {
if cfg.ldconfigPath == "" {
return errors.New("ldconfig-path must be specified")
}
return nil
}

func (m command) run(c *cli.Context, cfg *options) error {
s, err := oci.LoadContainerState(cfg.containerSpec)
if err != nil {
return fmt.Errorf("failed to load container state: %v", err)
}

containerRoot, err := s.GetContainerRoot()
if err != nil {
return fmt.Errorf("failed to determined container root: %v", err)
}
if containerRoot == "" {
m.logger.Warningf("No container root detected")
return nil
}

dirs := cfg.folders.Value()
if len(dirs) == 0 {
return nil
}

ldconfigPath := utils.ResolveHostLDConfigPath(cfg.ldconfigPath)
args := []string{filepath.Base(ldconfigPath)}

args = append(args,
// Specify the containerRoot to use.
"-r", containerRoot,
// Specify -n to only process the specified folders.
"-n",
// Explicitly disable updating the LDCache.
"-N",
)
// Explicitly specific the directories to add.
args = append(args, dirs...)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Question -- does this create .so symlinks for all libraries present in the specified directories? Does this differ from the behavior of the legacy libnvidia-container implementation, which IIRC would only create the .so symlinks for a small list of libraries (like libcuda.so)?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is not the .so symlinks, but the SONAME symlinks i.e. libcuda.so.1 -> libcuda.so.RM_VERSION in the case of libcuda. The .so symlinks are created using the "standard" create-symlinks hook.


return m.Exec(ldconfigPath, args, nil)
}
76 changes: 0 additions & 76 deletions cmd/nvidia-cdi-hook/cudacompat/container-root.go

This file was deleted.

60 changes: 8 additions & 52 deletions cmd/nvidia-cdi-hook/cudacompat/cudacompat.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,13 +18,13 @@ package cudacompat

import (
"fmt"
"os"
"path/filepath"
"strconv"
"strings"

"github.com/urfave/cli/v2"

"github.com/NVIDIA/nvidia-container-toolkit/cmd/nvidia-cdi-hook/utils"
"github.com/NVIDIA/nvidia-container-toolkit/internal/logger"
"github.com/NVIDIA/nvidia-container-toolkit/internal/oci"
)
Expand Down Expand Up @@ -107,33 +107,34 @@ func (m command) run(_ *cli.Context, cfg *options) error {
if err != nil {
return fmt.Errorf("failed to determined container root: %w", err)
}
containerRoot := utils.ContainerRoot(containerRootDir)

containerForwardCompatDir, err := m.getContainerForwardCompatDir(containerRoot(containerRootDir), cfg.hostDriverVersion)
containerForwardCompatDir, err := m.getContainerForwardCompatDir(containerRoot, cfg.hostDriverVersion)
if err != nil {
return fmt.Errorf("failed to get container forward compat directory: %w", err)
}
if containerForwardCompatDir == "" {
return nil
}

return m.createLdsoconfdFile(containerRoot(containerRootDir), cudaCompatLdsoconfdFilenamePattern, containerForwardCompatDir)
return containerRoot.CreateLdsoconfdFile(cudaCompatLdsoconfdFilenamePattern, containerForwardCompatDir)
}

func (m command) getContainerForwardCompatDir(containerRoot containerRoot, hostDriverVersion string) (string, error) {
func (m command) getContainerForwardCompatDir(containerRoot utils.ContainerRoot, hostDriverVersion string) (string, error) {
if hostDriverVersion == "" {
m.logger.Debugf("Host driver version not specified")
return "", nil
}
if !containerRoot.hasPath(cudaCompatPath) {
if !containerRoot.HasPath(cudaCompatPath) {
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have picked a more or less arbitrary place in the code to put the following comment. Having more expressive variable names would make review for me easier. You can postpone looking at this comment until after merge, that's fine with me! In the spirit of learning what we do here however I'd appreciate an answer at some point :)

In my software engineering life I have always deeply cared about file system terminology and operations, and about making related code readable and self-expressive. For example,

  • I like to distinguish a "relative path" from an "absolute path" via variable name (if possible)
  • I like to distinguish a file object from a file path via variable name
  • I like to distinguish a path to a file from a path to a directory (expressing intent, of course these are technically the same)
  • I like to call things a "base name" for expressing intent a well (when there are no path separators, and this is supposed to be a "file name" or "directory name").

What is containerRoot in canonical unix file path terminology?

  • Is it guaranteed to point to a directory?
  • Is it an absolute path?
  • Is it always just /?

What are some properties about cudaCompatPath that we know/guarantee, that we could also express in the variable name or in the type?

  • Is it always a relative path (not starting with `/)?
  • Is it always a path to a directory?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for calling this out. I think we could do a pass through this and improve readability greatly. There is no rush on this particular PR (I've elected to hold the changes back from the v1.17.5 release) and as such, I think we can address these concerns here instead of as a follow-up.

The containerRootDir is the absolute path on the host filesystem to the root (/) of the container filesystem. It is specified in the OCI Runtime Specification as Root which informs the slightly ambiguous name. The containerRoot variable is the typed representation of this directory that allows us to attached helper functions such as HasPath / Resolve / Glob to it.

The cudaCompatPath is the absolute path to the directory containing the CUDA compat libraries in the container (if it exists). It defined as a constant with value /usr/local/cuda/compat. That is to say, it is always an absolute path to a directory in the container, and we're confirming that this exists, but have to calculate the path to this folder on the host filesystem to do this check.

m.logger.Debugf("No CUDA forward compatibility libraries directory in container")
return "", nil
}
if !containerRoot.hasPath("/etc/ld.so.cache") {
if !containerRoot.HasPath("/etc/ld.so.cache") {
m.logger.Debugf("The container does not have an LDCache")
return "", nil
}

libs, err := containerRoot.globFiles(filepath.Join(cudaCompatPath, "libcuda.so.*.*"))
libs, err := containerRoot.GlobFiles(filepath.Join(cudaCompatPath, "libcuda.so.*.*"))
if err != nil {
m.logger.Warningf("Failed to find CUDA compat library: %w", err)
return "", nil
Expand Down Expand Up @@ -169,51 +170,6 @@ func (m command) getContainerForwardCompatDir(containerRoot containerRoot, hostD
return resolvedCompatDir, nil
}

// createLdsoconfdFile creates a file at /etc/ld.so.conf.d/ in the specified root.
// The file is created at /etc/ld.so.conf.d/{{ .pattern }} using `CreateTemp` and
// contains the specified directories on each line.
func (m command) createLdsoconfdFile(in containerRoot, pattern string, dirs ...string) error {
if len(dirs) == 0 {
m.logger.Debugf("No directories to add to /etc/ld.so.conf")
return nil
}

ldsoconfdDir, err := in.resolve("/etc/ld.so.conf.d")
if err != nil {
return err
}
if err := os.MkdirAll(ldsoconfdDir, 0755); err != nil {
return fmt.Errorf("failed to create ld.so.conf.d: %w", err)
}

configFile, err := os.CreateTemp(ldsoconfdDir, pattern)
if err != nil {
return fmt.Errorf("failed to create config file: %w", err)
}
defer configFile.Close()

m.logger.Debugf("Adding directories %v to %v", dirs, configFile.Name())

added := make(map[string]bool)
for _, dir := range dirs {
if added[dir] {
continue
}
_, err = configFile.WriteString(fmt.Sprintf("%s\n", dir))
if err != nil {
return fmt.Errorf("failed to update config file: %w", err)
}
added[dir] = true
}

// The created file needs to be world readable for the cases where the container is run as a non-root user.
if err := configFile.Chmod(0644); err != nil {
return fmt.Errorf("failed to chmod config file: %w", err)
}

return nil
}

// extractMajorVersion parses a version string and returns the major version as an int.
func extractMajorVersion(version string) (int, error) {
majorString := strings.SplitN(version, ".", 2)[0]
Expand Down
48 changes: 3 additions & 45 deletions cmd/nvidia-cdi-hook/cudacompat/cudacompat_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@ import (

testlog "github.com/sirupsen/logrus/hooks/test"
"github.com/stretchr/testify/require"

"github.com/NVIDIA/nvidia-container-toolkit/cmd/nvidia-cdi-hook/utils"
)

func TestCompatLibs(t *testing.T) {
Expand Down Expand Up @@ -130,53 +132,9 @@ func TestCompatLibs(t *testing.T) {
c := command{
logger: logger,
}
containerForwardCompatDir, err := c.getContainerForwardCompatDir(containerRoot(containerRootDir), tc.hostDriverVersion)
containerForwardCompatDir, err := c.getContainerForwardCompatDir(utils.ContainerRoot(containerRootDir), tc.hostDriverVersion)
require.NoError(t, err)
require.EqualValues(t, tc.expectedContainerForwardCompatDir, containerForwardCompatDir)
})
}
}

func TestUpdateLdconfig(t *testing.T) {
logger, _ := testlog.NewNullLogger()
testCases := []struct {
description string
folders []string
expectedContents string
}{
{
description: "no folders; have no contents",
},
{
description: "single folder is added",
folders: []string{"/usr/local/cuda/compat"},
expectedContents: "/usr/local/cuda/compat\n",
},
}

for _, tc := range testCases {
t.Run(tc.description, func(t *testing.T) {
containerRootDir := t.TempDir()
c := command{
logger: logger,
}
err := c.createLdsoconfdFile(containerRoot(containerRootDir), cudaCompatLdsoconfdFilenamePattern, tc.folders...)
require.NoError(t, err)

matches, err := filepath.Glob(filepath.Join(containerRootDir, "/etc/ld.so.conf.d/00-compat-*.conf"))
require.NoError(t, err)

if tc.expectedContents == "" {
require.Empty(t, matches)
return
}

require.Len(t, matches, 1)
contents, err := os.ReadFile(matches[0])
require.NoError(t, err)

require.EqualValues(t, tc.expectedContents, string(contents))
})
}

}
Loading