diff --git a/README.md b/README.md
index e9e3cb2b4..16e0365d7 100644
--- a/README.md
+++ b/README.md
@@ -228,6 +228,80 @@ The response will contain the model's reply:
 }
 ```
 
+## NVIDIA NIM Support
+
+Docker Model Runner supports running NVIDIA NIM (NVIDIA Inference Microservices) containers directly, providing a simplified workflow for deploying NVIDIA's optimized inference containers.
+
+### Prerequisites
+
+- Docker with NVIDIA GPU support (nvidia-docker2 or Docker with the NVIDIA Container Runtime)
+- An NGC API key (required by some NIM models)
+- Docker login to the nvcr.io registry
+
+### Quick Start
+
+1. **Log in to the NVIDIA Container Registry:**
+
+```bash
+docker login nvcr.io
+Username: $oauthtoken
+Password: <your NGC API key>
+```
+
+2. **Set the NGC API key (if required by the model):**
+
+```bash
+export NGC_API_KEY=<your NGC API key>
+```
+
+3. **Run a NIM model:**
+
+```bash
+docker model run nvcr.io/nim/google/gemma-3-1b-it:latest
+```
+
+That's it! Docker Model Runner will:
+- Automatically detect that this is a NIM image
+- Pull the NIM container image
+- Configure it with GPU support, shared memory (16 GB), and NGC credentials
+- Start the container and wait for it to be ready
+- Provide an interactive chat interface
+
+### Features
+
+- **Automatic GPU Detection**: Configures NVIDIA GPU support automatically when available
+- **Persistent Caching**: Models are cached in `~/.cache/nim` (or `$LOCAL_NIM_CACHE` if set)
+- **Interactive Chat**: Supports both single-prompt and interactive chat modes
+- **Container Reuse**: Existing NIM containers are reused across runs
+
+### Example Usage
+
+**Single prompt:**
+```bash
+docker model run nvcr.io/nim/google/gemma-3-1b-it:latest "Explain quantum computing"
+```
+
+**Interactive chat:**
+```bash
+docker model run nvcr.io/nim/google/gemma-3-1b-it:latest
+> Tell me a joke
+...
+> /bye
+```
+
+### Configuration
+
+- **NGC_API_KEY**: Set this environment variable to authenticate with NVIDIA's services
+- **LOCAL_NIM_CACHE**: Override the default cache location (default: `~/.cache/nim`)
+
+### Technical Details
+
+NIM containers:
+- Run on port 8000 (localhost only)
+- Use 16 GB of shared memory by default
+- Mount `~/.cache/nim` for model caching
+- Support NVIDIA GPU acceleration when available
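+
+Because a running NIM exposes an OpenAI-compatible API on `127.0.0.1:8000`, you can also query it directly once it reports ready. A minimal example (the model name here is illustrative; list the names your NIM actually serves via `GET /v1/models`):
+
+```bash
+curl http://127.0.0.1:8000/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -d '{
+        "model": "google/gemma-3-1b-it",
+        "messages": [{"role": "user", "content": "Say hello in one sentence."}]
+      }'
+```
+
+To keep downloaded model weights on another disk, point `LOCAL_NIM_CACHE` at the desired directory before running (the path below is an example):
+
+```bash
+export LOCAL_NIM_CACHE=/mnt/storage/nim-cache
+docker model run nvcr.io/nim/google/gemma-3-1b-it:latest
+```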
+
 ## Metrics
 
 The Model Runner exposes [the metrics endpoint](https://github.com/ggml-org/llama.cpp/tree/master/tools/server#get-metrics-prometheus-compatible-metrics-exporter) of llama.cpp server at the `/metrics` endpoint. This allows you to monitor model performance, request statistics, and resource usage.
diff --git a/cmd/cli/commands/nim.go b/cmd/cli/commands/nim.go
new file mode 100644
index 000000000..2cb6b0a5d
--- /dev/null
+++ b/cmd/cli/commands/nim.go
@@ -0,0 +1,353 @@
+package commands
+
+import (
+    "bufio"
+    "bytes"
+    "context"
+    "encoding/json"
+    "fmt"
+    "io"
+    "net/http"
+    "os"
+    "strconv"
+    "strings"
+    "time"
+
+    "github.com/docker/docker/api/types/container"
+    "github.com/docker/docker/api/types/image"
+    "github.com/docker/docker/api/types/mount"
+    "github.com/docker/docker/client"
+    "github.com/docker/go-connections/nat"
+    gpupkg "github.com/docker/model-runner/cmd/cli/pkg/gpu"
+    "github.com/spf13/cobra"
+)
+
+const (
+    // nimPrefix is the registry prefix for NVIDIA NIM images
+    nimPrefix = "nvcr.io/nim/"
+    // nimContainerPrefix is the prefix for NIM container names
+    nimContainerPrefix = "docker-model-nim-"
+    // nimDefaultPort is the default port for NIM containers
+    nimDefaultPort = 8000
+    // nimDefaultShmSize is the default shared memory size for NIM containers (16 GiB)
+    nimDefaultShmSize = 16 * 1024 * 1024 * 1024
+)
+
+// isNIMImage checks if the given model reference is an NVIDIA NIM image
+func isNIMImage(model string) bool {
+    return strings.HasPrefix(model, nimPrefix)
+}
+
+// nimContainerName generates a container name for a NIM image
+func nimContainerName(model string) string {
+    // Extract the model name from the reference
+    // (e.g., nvcr.io/nim/google/gemma-3-1b-it:latest -> google-gemma-3-1b-it)
+    parts := strings.Split(strings.TrimPrefix(model, nimPrefix), "/")
+    name := strings.Join(parts, "-")
+    // Remove the tag if present
+    if idx := strings.Index(name, ":"); idx != -1 {
+        name = name[:idx]
+    }
+    // Replace any remaining special characters
+    name = strings.ReplaceAll(name, ":", "-")
+    name = strings.ReplaceAll(name, "/", "-")
+    return nimContainerPrefix + name
+}
+
+// pullNIMImage pulls the NIM Docker image
+func pullNIMImage(ctx context.Context, dockerClient *client.Client, model string, cmd *cobra.Command) error {
+    cmd.Printf("Pulling NIM image %s...\n", model)
+
+    reader, err := dockerClient.ImagePull(ctx, model, image.PullOptions{})
+    if err != nil {
+        return fmt.Errorf("failed to pull NIM image: %w", err)
+    }
+    defer reader.Close()
+
+    // Stream the raw pull progress to the user
+    if _, err := io.Copy(cmd.OutOrStdout(), reader); err != nil {
+        return fmt.Errorf("failed to stream pull progress: %w", err)
+    }
+
+    return nil
+}
+
+// findNIMContainer finds an existing NIM container for the given model
+func findNIMContainer(ctx context.Context, dockerClient *client.Client, model string) (string, error) {
+    containerName := nimContainerName(model)
+
+    containers, err := dockerClient.ContainerList(ctx, container.ListOptions{
+        All: true,
+    })
+    if err != nil {
+        return "", fmt.Errorf("failed to list containers: %w", err)
+    }
+
+    for _, c := range containers {
+        for _, name := range c.Names {
+            if strings.TrimPrefix(name, "/") == containerName {
+                return c.ID, nil
+            }
+        }
+    }
+
+    return "", nil
+}
+
+// createNIMContainer creates and starts a NIM container
+func createNIMContainer(ctx context.Context, dockerClient *client.Client, model string, cmd *cobra.Command) (string, error) {
+    containerName := nimContainerName(model)
+
+    // Get the NGC API key from the environment
+    ngcAPIKey := os.Getenv("NGC_API_KEY")
+    if ngcAPIKey == "" {
+        cmd.Println("Warning: NGC_API_KEY environment variable is not set. NIM may require authentication.")
+    }
+
+    // Check for GPU support
+    gpu, err := gpupkg.ProbeGPUSupport(ctx, dockerClient)
+    if err != nil {
+        cmd.Printf("Warning: Failed to probe GPU support: %v\n", err)
+        gpu = gpupkg.GPUSupportNone
+    }
+
+    // Determine the cache directory
+    cacheDir := os.Getenv("LOCAL_NIM_CACHE")
+    if cacheDir == "" {
+        homeDir, err := os.UserHomeDir()
+        if err != nil {
+            return "", fmt.Errorf("failed to get home directory: %w", err)
+        }
+        cacheDir = homeDir + "/.cache/nim"
+    }
+
+    // Create the cache directory if it doesn't exist
+    if err := os.MkdirAll(cacheDir, 0755); err != nil {
+        return "", fmt.Errorf("failed to create NIM cache directory: %w", err)
+    }
+
+    // Container configuration
+    env := []string{}
+    if ngcAPIKey != "" {
+        env = append(env, "NGC_API_KEY="+ngcAPIKey)
+    }
+
+    portStr := strconv.Itoa(nimDefaultPort)
+    config := &container.Config{
+        Image: model,
+        Env:   env,
+        ExposedPorts: nat.PortSet{
+            nat.Port(portStr + "/tcp"): struct{}{},
+        },
+    }
+
+    hostConfig := &container.HostConfig{
+        ShmSize: nimDefaultShmSize,
+        Mounts: []mount.Mount{
+            {
+                Type:   mount.TypeBind,
+                Source: cacheDir,
+                Target: "/opt/nim/.cache",
+            },
+        },
+        PortBindings: nat.PortMap{
+            nat.Port(portStr + "/tcp"): []nat.PortBinding{
+                {
+                    HostIP:   "127.0.0.1",
+                    HostPort: portStr,
+                },
+            },
+        },
+    }
+
+    // Add GPU support if available
+    if gpu == gpupkg.GPUSupportCUDA {
+        if ok, err := gpupkg.HasNVIDIARuntime(ctx, dockerClient); err == nil && ok {
+            hostConfig.Runtime = "nvidia"
+        }
+        hostConfig.DeviceRequests = []container.DeviceRequest{{
+            Count:        -1,
+            Capabilities: [][]string{{"gpu"}},
+        }}
+    }
+
+    // Create the container
+    resp, err := dockerClient.ContainerCreate(ctx, config, hostConfig, nil, nil, containerName)
+    if err != nil {
+        return "", fmt.Errorf("failed to create NIM container: %w", err)
+    }
+
+    // Start the container
+    if err := dockerClient.ContainerStart(ctx, resp.ID, container.StartOptions{}); err != nil {
+        return "", fmt.Errorf("failed to start NIM container: %w", err)
+    }
+
+    cmd.Printf("Started NIM container %s\n", containerName)
+    if gpu == gpupkg.GPUSupportCUDA {
+        cmd.Println("GPU support enabled")
+    } else {
+        cmd.Println("Warning: No GPU detected. NIM performance may be limited.")
+    }
+
+    return resp.ID, nil
+}
+
+// waitForNIMReady waits for the NIM container to be ready
+func waitForNIMReady(ctx context.Context, cmd *cobra.Command) error {
+    cmd.Println("Waiting for NIM to be ready (this may take several minutes)...")
+
+    client := &http.Client{
+        Timeout: 5 * time.Second,
+    }
+
+    maxRetries := 120 // 10 minutes at 5-second intervals
+    for i := 0; i < maxRetries; i++ {
+        resp, err := client.Get(fmt.Sprintf("http://127.0.0.1:%d/v1/models", nimDefaultPort))
+        if err == nil {
+            resp.Body.Close()
+            if resp.StatusCode == http.StatusOK {
+                cmd.Println("NIM is ready!")
+                return nil
+            }
+        }
+
+        if i%12 == 0 { // Print a status update every minute
+            elapsed := i * 5
+            cmd.Printf("Still waiting for NIM to initialize... (%d seconds elapsed)\n", elapsed)
+        }
+
+        select {
+        case <-ctx.Done():
+            return ctx.Err()
+        case <-time.After(5 * time.Second):
+            // Continue waiting
+        }
+    }
+
+    return fmt.Errorf("NIM failed to become ready within timeout. Check container logs with: docker logs $(docker ps -q --filter name=docker-model-nim-)")
+}
+
+// runNIMModel handles running an NVIDIA NIM image
+func runNIMModel(ctx context.Context, dockerClient *client.Client, model string, cmd *cobra.Command) error {
+    // Check whether a container for this model already exists
+    containerID, err := findNIMContainer(ctx, dockerClient, model)
+    if err != nil {
+        return err
+    }
+
+    if containerID != "" {
+        // The container exists; check whether it's running
+        inspect, err := dockerClient.ContainerInspect(ctx, containerID)
+        if err != nil {
+            return fmt.Errorf("failed to inspect NIM container: %w", err)
+        }
+
+        if !inspect.State.Running {
+            // The container exists but is not running, so start it
+            if err := dockerClient.ContainerStart(ctx, containerID, container.StartOptions{}); err != nil {
+                return fmt.Errorf("failed to start existing NIM container: %w", err)
+            }
+            cmd.Printf("Started existing NIM container %s\n", nimContainerName(model))
+        } else {
+            cmd.Printf("Using existing NIM container %s\n", nimContainerName(model))
+        }
+    } else {
+        // Pull the image
+        if err := pullNIMImage(ctx, dockerClient, model, cmd); err != nil {
+            return err
+        }
+
+        // Create and start the container
+        containerID, err = createNIMContainer(ctx, dockerClient, model, cmd)
+        if err != nil {
+            return err
+        }
+    }
+
+    // Wait for NIM to be ready
+    if err := waitForNIMReady(ctx, cmd); err != nil {
+        return err
+    }
+
+    return nil
+}
+
+// chatWithNIM sends a chat request to a running NIM container through its
+// OpenAI-compatible API on localhost:8000 and streams the reply to the terminal.
+func chatWithNIM(cmd *cobra.Command, model, prompt string) error {
+    client := &http.Client{
+        Timeout: 300 * time.Second,
+    }
+
+    // Build the request payload - use just the model base name without the
+    // registry prefix or tag
+    modelName := strings.TrimPrefix(model, nimPrefix)
+    if idx := strings.LastIndex(modelName, ":"); idx != -1 {
+        modelName = modelName[:idx]
+    }
+
+    payload := map[string]any{
+        "model": modelName,
+        "messages": []map[string]string{
+            {"role": "user", "content": prompt},
+        },
+        "stream": true,
+    }
+    reqBody, err := json.Marshal(payload)
+    if err != nil {
+        return fmt.Errorf("failed to encode request: %w", err)
+    }
+
+    req, err := http.NewRequestWithContext(cmd.Context(), http.MethodPost, fmt.Sprintf("http://127.0.0.1:%d/v1/chat/completions", nimDefaultPort), bytes.NewReader(reqBody))
+    if err != nil {
+        return fmt.Errorf("failed to create request: %w", err)
+    }
+
+    req.Header.Set("Content-Type", "application/json")
+
+    resp, err := client.Do(req)
+    if err != nil {
+        return fmt.Errorf("failed to send request to NIM: %w", err)
+    }
+    defer resp.Body.Close()
+
+    if resp.StatusCode != http.StatusOK {
+        body, _ := io.ReadAll(resp.Body)
+        return fmt.Errorf("NIM returned error status %d: %s", resp.StatusCode, string(body))
+    }
+
+    // Stream the response - each SSE event carries one JSON chunk
+    scanner := bufio.NewScanner(resp.Body)
+    for scanner.Scan() {
+        line := scanner.Text()
+
+        // SSE events start with "data: "
+        if !strings.HasPrefix(line, "data: ") {
+            continue
+        }
+        data := strings.TrimPrefix(line, "data: ")
+
+        // The stream is terminated by a literal [DONE] message
+        if data == "[DONE]" {
+            continue
+        }
+
+        // Decode the chunk and print any content deltas
+        var chunk struct {
+            Choices []struct {
+                Delta struct {
+                    Content string `json:"content"`
+                } `json:"delta"`
+            } `json:"choices"`
+        }
+        if err := json.Unmarshal([]byte(data), &chunk); err != nil {
+            // Skip malformed events rather than aborting the stream
+            continue
+        }
+        for _, choice := range chunk.Choices {
+            cmd.Print(choice.Delta.Content)
+        }
+    }
+
+    if err := scanner.Err(); err != nil {
+        return fmt.Errorf("error reading response: %w", err)
+    }
+
+    return nil
+}
diff --git a/cmd/cli/commands/nim_test.go b/cmd/cli/commands/nim_test.go
new file mode 100644
index 000000000..c89e88cf4
--- /dev/null
+++ b/cmd/cli/commands/nim_test.go
@@ -0,0 +1,86 @@
+package commands
+
+import (
+    "testing"
+)
+
+func TestIsNIMImage(t *testing.T) {
+    tests := []struct {
+        name     string
+        model    string
+        expected bool
+    }{
+        {
+            name:     "NIM image with full path",
+            model:    "nvcr.io/nim/google/gemma-3-1b-it:latest",
+            expected: true,
+        },
+        {
+            name:     "NIM image without tag",
+            model:    "nvcr.io/nim/meta/llama-3.1-8b-instruct",
+            expected: true,
+        },
+        {
+            name:     "Regular Docker Hub image",
+            model:    "docker.io/library/ubuntu:latest",
+            expected: false,
+        },
+        {
+            name:     "Regular image without registry",
+            model:    "ubuntu:latest",
+            expected: false,
+        },
+        {
+            name:     "HuggingFace model",
+            model:    "hf.co/TheBloke/Llama-2-7B-Chat-GGUF",
+            expected: false,
+        },
+        {
+            name:     "Local model path",
+            model:    "./models/llama-2-7b.gguf",
+            expected: false,
+        },
+    }
+
+    for _, tt := range tests {
+        t.Run(tt.name, func(t *testing.T) {
+            result := isNIMImage(tt.model)
+            if result != tt.expected {
+                t.Errorf("isNIMImage(%q) = %v, want %v", tt.model, result, tt.expected)
+            }
+        })
+    }
+}
+
+func TestNIMContainerName(t *testing.T) {
+    tests := []struct {
+        name     string
+        model    string
+        expected string
+    }{
+        {
+            name:     "NIM image with tag",
+            model:    "nvcr.io/nim/google/gemma-3-1b-it:latest",
+            expected: "docker-model-nim-google-gemma-3-1b-it",
+        },
+        {
+            name:     "NIM image without tag",
+            model:    "nvcr.io/nim/meta/llama-3.1-8b-instruct",
+            expected: "docker-model-nim-meta-llama-3.1-8b-instruct",
+        },
+        {
+            name:     "NIM image with version tag",
+            model:    "nvcr.io/nim/nvidia/nemo:24.01",
+            expected: "docker-model-nim-nvidia-nemo",
+        },
+    }
+
+    for _, tt := range tests {
+        t.Run(tt.name, func(t *testing.T) {
+            result := nimContainerName(tt.model)
+            if result != tt.expected {
+                t.Errorf("nimContainerName(%q) = %q, want %q", tt.model, result, tt.expected)
+            }
+        })
+    }
+}
diff --git a/cmd/cli/commands/run.go b/cmd/cli/commands/run.go
index a3e672f7e..06a193dae 100644
--- a/cmd/cli/commands/run.go
+++ b/cmd/cli/commands/run.go
@@ -363,6 +363,63 @@ func newRunCmd() *cobra.Command {
 			}
 		}
 
+		// Check if this is an NVIDIA NIM image
+		if isNIMImage(model) {
+			// NIM images are handled differently - they run as Docker
+			// containers, so create a Docker client
+			dockerCLI := getDockerCLI()
+			dockerClient, err := desktop.DockerClientForContext(dockerCLI, dockerCLI.CurrentContext())
+			if err != nil {
+				return fmt.Errorf("failed to create Docker client: %w", err)
+			}
+
+			// Run the NIM model
+			if err := runNIMModel(cmd.Context(), dockerClient, model, cmd); err != nil {
+				return fmt.Errorf("failed to run NIM model: %w", err)
+			}
+
+			// If no prompt was provided, enter interactive mode
+			if prompt == "" {
+				scanner := bufio.NewScanner(os.Stdin)
+				cmd.Println("Interactive chat mode started. Type '/bye' to exit.")
Type '/bye' to exit.") + + for { + userInput, err := readMultilineInput(cmd, scanner) + if err != nil { + if err.Error() == "EOF" { + cmd.Println("\nChat session ended.") + break + } + return fmt.Errorf("Error reading input: %v", err) + } + + if strings.ToLower(strings.TrimSpace(userInput)) == "/bye" { + cmd.Println("Chat session ended.") + break + } + + if strings.TrimSpace(userInput) == "" { + continue + } + + if err := chatWithNIM(cmd, model, userInput); err != nil { + cmd.PrintErr(fmt.Errorf("failed to chat with NIM: %w", err)) + continue + } + + cmd.Println() + } + return nil + } + + // Single prompt mode + if err := chatWithNIM(cmd, model, prompt); err != nil { + return fmt.Errorf("failed to chat with NIM: %w", err) + } + cmd.Println() + return nil + } + if _, err := ensureStandaloneRunnerAvailable(cmd.Context(), cmd); err != nil { return fmt.Errorf("unable to initialize standalone model runner: %w", err) }