diff --git a/commands/compose.go b/commands/compose.go
index 26a330c9..6045eb96 100644
--- a/commands/compose.go
+++ b/commands/compose.go
@@ -4,15 +4,16 @@ import (
     "encoding/json"
     "errors"
     "fmt"
-    "github.com/docker/model-cli/pkg/types"
-    "github.com/spf13/pflag"
     "slices"
     "strings"
 
+    "github.com/docker/model-cli/pkg/types"
+    "github.com/spf13/pflag"
+
     "github.com/docker/model-cli/desktop"
     "github.com/docker/model-runner/pkg/inference/backends/llamacpp"
-    "github.com/docker/model-runner/pkg/inference/scheduling"
     dmrm "github.com/docker/model-runner/pkg/inference/models"
+    "github.com/docker/model-runner/pkg/inference/scheduling"
     "github.com/spf13/cobra"
 )
 
@@ -155,7 +156,7 @@ func downloadModelsOnlyIfNotFound(desktopClient *desktop.Client, models []string
         }
         return false
     }) {
-        _, _, err = desktopClient.Pull(model, func(s string) {
+        _, _, err = desktopClient.Pull(model, false, func(s string) {
             _ = sendInfo(s)
         })
         if err != nil {
diff --git a/commands/pull.go b/commands/pull.go
index 65affffc..a85f2024 100644
--- a/commands/pull.go
+++ b/commands/pull.go
@@ -11,13 +11,15 @@ import (
 )
 
 func newPullCmd() *cobra.Command {
+    var ignoreRuntimeMemoryCheck bool
+
     c := &cobra.Command{
         Use:   "pull MODEL",
         Short: "Pull a model from Docker Hub or HuggingFace to your local environment",
         Args: func(cmd *cobra.Command, args []string) error {
             if len(args) != 1 {
                 return fmt.Errorf(
-                    "'docker model run' requires 1 argument.\n\n" +
+                    "'docker model pull' requires 1 argument.\n\n" +
                         "Usage: docker model pull MODEL\n\n" +
                         "See 'docker model pull --help' for more information",
                 )
@@ -28,21 +30,24 @@ func newPullCmd() *cobra.Command {
             if _, err := ensureStandaloneRunnerAvailable(cmd.Context(), cmd); err != nil {
                 return fmt.Errorf("unable to initialize standalone model runner: %w", err)
             }
-            return pullModel(cmd, desktopClient, args[0])
+            return pullModel(cmd, desktopClient, args[0], ignoreRuntimeMemoryCheck)
         },
         ValidArgsFunction: completion.NoComplete,
     }
+
+    c.Flags().BoolVar(&ignoreRuntimeMemoryCheck, "ignore-runtime-memory-check", false, "Do not block pull if estimated runtime memory for model exceeds system resources.")
+
     return c
 }
 
-func pullModel(cmd *cobra.Command, desktopClient *desktop.Client, model string) error {
+func pullModel(cmd *cobra.Command, desktopClient *desktop.Client, model string, ignoreRuntimeMemoryCheck bool) error {
     var progress func(string)
     if isatty.IsTerminal(os.Stdout.Fd()) {
         progress = TUIProgress
     } else {
         progress = RawProgress
     }
 
-    response, progressShown, err := desktopClient.Pull(model, progress)
+    response, progressShown, err := desktopClient.Pull(model, ignoreRuntimeMemoryCheck, progress)
     // Add a newline before any output (success or error) if progress was shown.
     if progressShown {
diff --git a/commands/run.go b/commands/run.go
index 6b8c3c89..5837d630 100644
--- a/commands/run.go
+++ b/commands/run.go
@@ -80,6 +80,7 @@ func readMultilineInput(cmd *cobra.Command, scanner *bufio.Scanner) (string, err
 func newRunCmd() *cobra.Command {
     var debug bool
     var backend string
+    var ignoreRuntimeMemoryCheck bool
     const cmdArgs = "MODEL [PROMPT]"
 
     c := &cobra.Command{
@@ -124,7 +125,7 @@ func newRunCmd() *cobra.Command {
                     return handleNotRunningError(handleClientError(err, "Failed to inspect model"))
                 }
                 cmd.Println("Unable to find model '" + model + "' locally. Pulling from the server.")
-                if err := pullModel(cmd, desktopClient, model); err != nil {
+                if err := pullModel(cmd, desktopClient, model, ignoreRuntimeMemoryCheck); err != nil {
                     return err
                 }
             }
@@ -188,6 +189,7 @@ func newRunCmd() *cobra.Command {
     c.Flags().BoolVar(&debug, "debug", false, "Enable debug logging")
     c.Flags().StringVar(&backend, "backend", "", fmt.Sprintf("Specify the backend to use (%s)", ValidBackendsKeys()))
     c.Flags().MarkHidden("backend")
+    c.Flags().BoolVar(&ignoreRuntimeMemoryCheck, "ignore-runtime-memory-check", false, "Do not block pull if estimated runtime memory for model exceeds system resources.")
 
     return c
 }
diff --git a/desktop/desktop.go b/desktop/desktop.go
index 58a662f7..3fff3bfd 100644
--- a/desktop/desktop.go
+++ b/desktop/desktop.go
@@ -106,9 +106,9 @@ func (c *Client) Status() Status {
     }
 }
 
-func (c *Client) Pull(model string, progress func(string)) (string, bool, error) {
+func (c *Client) Pull(model string, ignoreRuntimeMemoryCheck bool, progress func(string)) (string, bool, error) {
     model = normalizeHuggingFaceModelName(model)
-    jsonData, err := json.Marshal(dmrm.ModelCreateRequest{From: model})
+    jsonData, err := json.Marshal(dmrm.ModelCreateRequest{From: model, IgnoreRuntimeMemoryCheck: ignoreRuntimeMemoryCheck})
     if err != nil {
         return "", false, fmt.Errorf("error marshaling request: %w", err)
     }
diff --git a/desktop/desktop_test.go b/desktop/desktop_test.go
index b470bd04..db0f68ef 100644
--- a/desktop/desktop_test.go
+++ b/desktop/desktop_test.go
@@ -36,7 +36,7 @@ func TestPullHuggingFaceModel(t *testing.T) {
         Body: io.NopCloser(bytes.NewBufferString(`{"type":"success","message":"Model pulled successfully"}`)),
     }, nil)
 
-    _, _, err := client.Pull(modelName, func(s string) {})
+    _, _, err := client.Pull(modelName, false, func(s string) {})
 
     assert.NoError(t, err)
 }
@@ -122,7 +122,7 @@ func TestNonHuggingFaceModel(t *testing.T) {
         Body: io.NopCloser(bytes.NewBufferString(`{"type":"success","message":"Model pulled successfully"}`)),
     }, nil)
 
-    _, _, err := client.Pull(modelName, func(s string) {})
+    _, _, err := client.Pull(modelName, false, func(s string) {})
 
     assert.NoError(t, err)
 }
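Note on the change above: `Client.Pull` now takes the skip flag explicitly. A minimal caller-side sketch, assuming only the signature shown in this diff (the helper itself is hypothetical and not part of the change):

```go
package main

import (
	"fmt"

	"github.com/docker/model-cli/desktop"
)

// pullSkippingMemoryCheck is a hypothetical helper illustrating the new call
// shape: the second argument asks the model runner to skip its runtime memory
// estimate, which is what `docker model pull --ignore-runtime-memory-check`
// does under the hood.
func pullSkippingMemoryCheck(client *desktop.Client, model string) error {
	response, progressShown, err := client.Pull(model, true, func(s string) {
		fmt.Print(s) // progress callback, invoked once per status line
	})
	if progressShown {
		fmt.Println() // separate progress output from the final message, as the CLI does
	}
	if err != nil {
		return fmt.Errorf("pulling %s: %w", model, err)
	}
	fmt.Println(response)
	return nil
}
```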
diff --git a/docs/reference/docker_model_pull.yaml b/docs/reference/docker_model_pull.yaml
index f0843b02..86ede6cb 100644
--- a/docs/reference/docker_model_pull.yaml
+++ b/docs/reference/docker_model_pull.yaml
@@ -5,6 +5,18 @@ long: |
 usage: docker model pull MODEL
 pname: docker model
 plink: docker_model.yaml
+options:
+    - option: ignore-runtime-memory-check
+      value_type: bool
+      default_value: "false"
+      description: |
+        Do not block pull if estimated runtime memory for model exceeds system resources.
+      deprecated: false
+      hidden: false
+      experimental: false
+      experimentalcli: false
+      kubernetes: false
+      swarm: false
 examples: |-
     ### Pulling a model from Docker Hub
diff --git a/docs/reference/docker_model_run.yaml b/docs/reference/docker_model_run.yaml
index 698c4dac..10ae59f4 100644
--- a/docs/reference/docker_model_run.yaml
+++ b/docs/reference/docker_model_run.yaml
@@ -29,6 +29,17 @@ options:
       experimentalcli: false
       kubernetes: false
       swarm: false
+    - option: ignore-runtime-memory-check
+      value_type: bool
+      default_value: "false"
+      description: |
+        Do not block pull if estimated runtime memory for model exceeds system resources.
+      deprecated: false
+      hidden: false
+      experimental: false
+      experimentalcli: false
+      kubernetes: false
+      swarm: false
 examples: |-
     ### One-time prompt
diff --git a/docs/reference/model_pull.md b/docs/reference/model_pull.md
index 246cc59d..a8f6a929 100644
--- a/docs/reference/model_pull.md
+++ b/docs/reference/model_pull.md
@@ -3,6 +3,12 @@
 
 Pull a model from Docker Hub or HuggingFace to your local environment
 
+### Options
+
+| Name                            | Type   | Default | Description                                                                        |
+|:--------------------------------|:-------|:--------|:-----------------------------------------------------------------------------------|
+| `--ignore-runtime-memory-check` | `bool` |         | Do not block pull if estimated runtime memory for model exceeds system resources.  |
+
diff --git a/docs/reference/model_run.md b/docs/reference/model_run.md
index 3010f26c..2880c031 100644
--- a/docs/reference/model_run.md
+++ b/docs/reference/model_run.md
@@ -5,9 +5,10 @@ Run a model and interact with it using a submitted prompt or chat mode
 
 ### Options
 
-| Name      | Type   | Default | Description          |
-|:----------|:-------|:--------|:---------------------|
-| `--debug` | `bool` |         | Enable debug logging |
+| Name                            | Type   | Default | Description                                                                        |
+|:--------------------------------|:-------|:--------|:-----------------------------------------------------------------------------------|
+| `--debug`                       | `bool` |         | Enable debug logging                                                               |
+| `--ignore-runtime-memory-check` | `bool` |         | Do not block pull if estimated runtime memory for model exceeds system resources.  |
diff --git a/go.mod b/go.mod
index a0b3ef69..279de962 100644
--- a/go.mod
+++ b/go.mod
@@ -11,8 +11,8 @@ require (
     github.com/docker/docker v28.2.2+incompatible
     github.com/docker/go-connections v0.5.0
     github.com/docker/go-units v0.5.0
-    github.com/docker/model-distribution v0.0.0-20250724114133-a11d745e582c
-    github.com/docker/model-runner v0.0.0-20250724122432-ecfa5e7e6807
+    github.com/docker/model-distribution v0.0.0-20250813080006-2a983516ebb8
+    github.com/docker/model-runner v0.0.0-20250822151118-d8ed37445584
     github.com/fatih/color v1.15.0
     github.com/google/go-containerregistry v0.20.6
     github.com/mattn/go-isatty v0.0.20
diff --git a/go.sum b/go.sum
index fcc17464..5a76301c 100644
--- a/go.sum
+++ b/go.sum
@@ -78,10 +78,10 @@ github.com/docker/go-metrics v0.0.1/go.mod h1:cG1hvH2utMXtqgqqYE9plW6lDxS3/5ayHz
 github.com/docker/go-units v0.5.0 h1:69rxXcBk27SvSaaxTtLh/8llcHD8vYHT7WSdRZ/jvr4=
 github.com/docker/go-units v0.5.0/go.mod h1:fgPhTUdO+D/Jk86RDLlptpiXQzgHJF7gydDDbaIK4Dk=
 github.com/docker/libtrust v0.0.0-20160708172513-aabc10ec26b7/go.mod h1:cyGadeNEkKy96OOhEzfZl+yxihPEzKnqJwvfuSUqbZE=
-github.com/docker/model-distribution v0.0.0-20250724114133-a11d745e582c h1:w9MekYamXmWLe9ZWXWgNXJ7BLDDemXwB8WcF7wzHF5Q=
-github.com/docker/model-distribution v0.0.0-20250724114133-a11d745e582c/go.mod h1:dThpO9JoG5Px3i+rTluAeZcqLGw8C0qepuEL4gL2o/c=
-github.com/docker/model-runner v0.0.0-20250724122432-ecfa5e7e6807 h1:02vImD8wqUDv6VJ2cBLbqzbjn17IMYEi4ileCEjXMQ8=
-github.com/docker/model-runner v0.0.0-20250724122432-ecfa5e7e6807/go.mod h1:rCzRjRXJ42E8JVIA69E9hErJVV5mnUpWdJ2POsktfRs=
+github.com/docker/model-distribution v0.0.0-20250813080006-2a983516ebb8 h1:agH5zeO6tf8lHgMcBZxqCFKPuXHM/cA53gdsn895eMI=
+github.com/docker/model-distribution v0.0.0-20250813080006-2a983516ebb8/go.mod h1:dThpO9JoG5Px3i+rTluAeZcqLGw8C0qepuEL4gL2o/c=
+github.com/docker/model-runner v0.0.0-20250822151118-d8ed37445584 h1:8YAzh9lihwcFGyHTK9pTFqdM7IwYwb0R/YkrNxmQ2do=
+github.com/docker/model-runner v0.0.0-20250822151118-d8ed37445584/go.mod h1:0IAh5ekLg8ipcPAF+Rdav1wbt9xF+zQPoRC1bblk/ik=
 github.com/dvsekhvalnov/jose2go v0.0.0-20170216131308-f21a8cedbbae/go.mod h1:7BvyPhdbLxMXIYTFPLsyJRFMsKmOZnQmzh6Gb+uquuM=
 github.com/elastic/go-sysinfo v1.15.3 h1:W+RnmhKFkqPTCRoFq2VCTmsT4p/fwpo+3gKNQsn1XU0=
 github.com/elastic/go-sysinfo v1.15.3/go.mod h1:K/cNrqYTDrSoMh2oDkYEMS2+a72GRxMvNP+GC+vRIlo=
diff --git a/vendor/github.com/docker/model-distribution/distribution/client.go b/vendor/github.com/docker/model-distribution/distribution/client.go
index 7fba7fa1..5c008c41 100644
--- a/vendor/github.com/docker/model-distribution/distribution/client.go
+++ b/vendor/github.com/docker/model-distribution/distribution/client.go
@@ -287,6 +287,17 @@ func (c *Client) GetModel(reference string) (types.Model, error) {
     return model, nil
 }
 
+// IsModelInStore checks if a model with the given reference is in the local store
+func (c *Client) IsModelInStore(reference string) (bool, error) {
+    c.log.Infoln("Checking model by reference:", reference)
+    if _, err := c.store.Read(reference); errors.Is(err, ErrModelNotFound) {
+        return false, nil
+    } else if err != nil {
+        return false, err
+    }
+    return true, nil
+}
+
 type DeleteModelAction struct {
     Untagged *string `json:"Untagged,omitempty"`
     Deleted  *string `json:"Deleted,omitempty"`
diff --git a/vendor/github.com/docker/model-distribution/registry/client.go b/vendor/github.com/docker/model-distribution/registry/client.go
index b0a1fd2a..49a063cb 100644
--- a/vendor/github.com/docker/model-distribution/registry/client.go
+++ b/vendor/github.com/docker/model-distribution/registry/client.go
@@ -11,7 +11,9 @@ import (
     "github.com/docker/model-distribution/types"
     "github.com/google/go-containerregistry/pkg/authn"
     "github.com/google/go-containerregistry/pkg/name"
+    v1 "github.com/google/go-containerregistry/pkg/v1"
     "github.com/google/go-containerregistry/pkg/v1/remote"
+    "github.com/google/go-containerregistry/pkg/v1/remote/transport"
 )
 
 const (
@@ -106,9 +108,53 @@ func (c *Client) Model(ctx context.Context, reference string) (types.ModelArtifa
         }
         return nil, NewRegistryError(reference, "UNKNOWN", err.Error(), err)
     }
+
     return &artifact{remoteImg}, nil
 }
 
+func (c *Client) BlobURL(reference string, digest v1.Hash) (string, error) {
+    // Parse the reference
+    ref, err := name.ParseReference(reference)
+    if err != nil {
+        return "", NewReferenceError(reference, err)
+    }
+
+    return fmt.Sprintf("%s://%s/v2/%s/blobs/%s",
+        ref.Context().Registry.Scheme(),
+        ref.Context().Registry.RegistryStr(),
+        ref.String(),
+        digest.String()), nil
+}
+
+func (c *Client) BearerToken(ctx context.Context, reference string) (string, error) {
+    // Parse the reference
+    ref, err := name.ParseReference(reference)
+    if err != nil {
+        return "", NewReferenceError(reference, err)
+    }
+
+    var auth authn.Authenticator
+    if c.auth != nil {
+        auth = c.auth
+    } else {
+        auth, err = c.keychain.Resolve(ref.Context())
+        if err != nil {
+            return "", fmt.Errorf("resolving credentials: %w", err)
+        }
+    }
+
+    pr, err := transport.Ping(ctx, ref.Context().Registry, c.transport)
+    if err != nil {
+        return "", fmt.Errorf("pinging registry: %w", err)
+    }
+
+    tok, err := transport.Exchange(ctx, ref.Context().Registry, auth, c.transport, []string{ref.Scope(transport.PullScope)}, pr)
+    if err != nil {
+        return "", fmt.Errorf("getting registry token: %w", err)
+    }
+    return tok.Token, nil
+}
+
 type Target struct {
     reference name.Reference
     transport http.RoundTripper
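Note on the registry helpers above: `BlobURL` addresses a layer by digest and `BearerToken` obtains a pull-scoped token for it, and the two are meant to be combined. A sketch of using them together for a plain authenticated request, assuming an already-constructed `*registry.Client` (the helper and the HEAD probe are illustrative, not part of the change):

```go
package main

import (
	"context"
	"fmt"
	"net/http"

	"github.com/docker/model-distribution/registry"
	v1 "github.com/google/go-containerregistry/pkg/v1"
)

// headBlob is a hypothetical helper: resolve the blob URL and a pull-scoped
// bearer token via the registry client, then issue a HEAD request to learn
// the blob's size without downloading it. The remote GGUF parsing added to
// the llama.cpp backend later in this diff relies on the same URL+token pair.
func headBlob(ctx context.Context, c *registry.Client, ref string, digest v1.Hash) (int64, error) {
	url, err := c.BlobURL(ref, digest)
	if err != nil {
		return 0, fmt.Errorf("building blob URL: %w", err)
	}
	tok, err := c.BearerToken(ctx, ref)
	if err != nil {
		return 0, fmt.Errorf("getting bearer token: %w", err)
	}
	req, err := http.NewRequestWithContext(ctx, http.MethodHead, url, nil)
	if err != nil {
		return 0, err
	}
	req.Header.Set("Authorization", "Bearer "+tok)
	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		return 0, err
	}
	defer resp.Body.Close()
	if resp.StatusCode != http.StatusOK {
		return 0, fmt.Errorf("unexpected status %s for %s", resp.Status, url)
	}
	return resp.ContentLength, nil
}
```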
diff --git a/vendor/github.com/docker/model-runner/pkg/inference/backend.go b/vendor/github.com/docker/model-runner/pkg/inference/backend.go
index 944ec126..26bd3fdf 100644
--- a/vendor/github.com/docker/model-runner/pkg/inference/backend.go
+++ b/vendor/github.com/docker/model-runner/pkg/inference/backend.go
@@ -17,6 +17,14 @@ const (
     BackendModeEmbedding
 )
 
+type ErrGGUFParse struct {
+    Err error
+}
+
+func (e *ErrGGUFParse) Error() string {
+    return "failed to parse GGUF: " + e.Err.Error()
+}
+
 // String implements Stringer.String for BackendMode.
 func (m BackendMode) String() string {
     switch m {
@@ -83,5 +91,5 @@ type Backend interface {
     GetDiskUsage() (int64, error)
     // GetRequiredMemoryForModel returns the required working memory for a given
     // model.
-    GetRequiredMemoryForModel(model string, config *BackendConfiguration) (*RequiredMemory, error)
+    GetRequiredMemoryForModel(ctx context.Context, model string, config *BackendConfiguration) (*RequiredMemory, error)
 }
diff --git a/vendor/github.com/docker/model-runner/pkg/inference/backends/llamacpp/llamacpp.go b/vendor/github.com/docker/model-runner/pkg/inference/backends/llamacpp/llamacpp.go
index 09de11f5..a5ae2a73 100644
--- a/vendor/github.com/docker/model-runner/pkg/inference/backends/llamacpp/llamacpp.go
+++ b/vendor/github.com/docker/model-runner/pkg/inference/backends/llamacpp/llamacpp.go
@@ -15,8 +15,10 @@ import (
     "runtime"
     "strings"
 
+    v1 "github.com/google/go-containerregistry/pkg/v1"
     parser "github.com/gpustack/gguf-parser-go"
 
+    "github.com/docker/model-distribution/types"
     "github.com/docker/model-runner/pkg/diskusage"
     "github.com/docker/model-runner/pkg/inference"
     "github.com/docker/model-runner/pkg/inference/config"
@@ -223,22 +225,23 @@ func (l *llamaCpp) GetDiskUsage() (int64, error) {
     return size, nil
 }
 
-func (l *llamaCpp) GetRequiredMemoryForModel(model string, config *inference.BackendConfiguration) (*inference.RequiredMemory, error) {
-    mdl, err := l.modelManager.GetModel(model)
-    if err != nil {
-        return nil, fmt.Errorf("getting model(%s): %w", model, err)
-    }
-    mdlPath, err := mdl.GGUFPath()
-    if err != nil {
-        return nil, fmt.Errorf("getting gguf path for model(%s): %w", model, err)
-    }
-    mdlGguf, err := parser.ParseGGUFFile(mdlPath)
+func (l *llamaCpp) GetRequiredMemoryForModel(ctx context.Context, model string, config *inference.BackendConfiguration) (*inference.RequiredMemory, error) {
+    var mdlGguf *parser.GGUFFile
+    var mdlConfig types.Config
+    inStore, err := l.modelManager.IsModelInStore(model)
     if err != nil {
-        return nil, fmt.Errorf("parsing gguf(%s): %w", mdlPath, err)
+        return nil, fmt.Errorf("checking if model is in local store: %w", err)
     }
-    mdlConfig, err := mdl.Config()
-    if err != nil {
-        return nil, fmt.Errorf("accessing model(%s) config: %w", model, err)
+    if inStore {
+        mdlGguf, mdlConfig, err = l.parseLocalModel(model)
+        if err != nil {
+            return nil, &inference.ErrGGUFParse{Err: err}
+        }
+    } else {
+        mdlGguf, mdlConfig, err = l.parseRemoteModel(ctx, model)
+        if err != nil {
+            return nil, &inference.ErrGGUFParse{Err: err}
+        }
     }
 
     contextSize := GetContextSize(&mdlConfig, config)
@@ -276,6 +279,71 @@ func (l *llamaCpp) GetRequiredMemoryForModel(model string, config *inference.Bac
     }, nil
 }
 
+func (l *llamaCpp) parseLocalModel(model string) (*parser.GGUFFile, types.Config, error) {
+    mdl, err := l.modelManager.GetModel(model)
+    if err != nil {
+        return nil, types.Config{}, fmt.Errorf("getting model(%s): %w", model, err)
+    }
+    mdlPath, err := mdl.GGUFPath()
+    if err != nil {
+        return nil, types.Config{}, fmt.Errorf("getting gguf path for model(%s): %w", model, err)
+    }
+    mdlGguf, err := parser.ParseGGUFFile(mdlPath)
+    if err != nil {
+        return nil, types.Config{}, fmt.Errorf("parsing gguf(%s): %w", mdlPath, err)
+    }
+    mdlConfig, err := mdl.Config()
+    if err != nil {
+        return nil, types.Config{}, fmt.Errorf("accessing model(%s) config: %w", model, err)
+    }
+    return mdlGguf, mdlConfig, nil
+}
+
+func (l *llamaCpp) parseRemoteModel(ctx context.Context, model string) (*parser.GGUFFile, types.Config, error) {
+    mdl, err := l.modelManager.GetRemoteModel(ctx, model)
+    if err != nil {
+        return nil, types.Config{}, fmt.Errorf("getting remote model(%s): %w", model, err)
+    }
+    layers, err := mdl.Layers()
+    if err != nil {
+        return nil, types.Config{}, fmt.Errorf("getting layers of model(%s): %w", model, err)
+    }
+    var ggufDigest v1.Hash
+    for _, layer := range layers {
+        mt, err := layer.MediaType()
+        if err != nil {
+            return nil, types.Config{}, fmt.Errorf("getting media type of model(%s) layer: %w", model, err)
+        }
+        if mt == types.MediaTypeGGUF {
+            ggufDigest, err = layer.Digest()
+            if err != nil {
+                return nil, types.Config{}, fmt.Errorf("getting digest of GGUF layer for model(%s): %w", model, err)
+            }
+            break
+        }
+    }
+    if ggufDigest.String() == "" {
+        return nil, types.Config{}, fmt.Errorf("model(%s) has no GGUF layer", model)
+    }
+    blobURL, err := l.modelManager.GetRemoteModelBlobURL(model, ggufDigest)
+    if err != nil {
+        return nil, types.Config{}, fmt.Errorf("getting GGUF blob URL for model(%s): %w", model, err)
+    }
+    tok, err := l.modelManager.BearerTokenForModel(ctx, model)
+    if err != nil {
+        return nil, types.Config{}, fmt.Errorf("getting bearer token for model(%s): %w", model, err)
+    }
+    mdlGguf, err := parser.ParseGGUFFileRemote(ctx, blobURL, parser.UseBearerAuth(tok))
+    if err != nil {
+        return nil, types.Config{}, fmt.Errorf("parsing GGUF for model(%s): %w", model, err)
+    }
+    config, err := mdl.Config()
+    if err != nil {
+        return nil, types.Config{}, fmt.Errorf("getting config for model(%s): %w", model, err)
+    }
+    return mdlGguf, config, nil
+}
+
 func (l *llamaCpp) checkGPUSupport(ctx context.Context) bool {
     binPath := l.vendoredServerStoragePath
     if l.updatedLlamaCpp {
diff --git a/vendor/github.com/docker/model-runner/pkg/inference/memory/estimator.go b/vendor/github.com/docker/model-runner/pkg/inference/memory/estimator.go
new file mode 100644
index 00000000..6d66e7f6
--- /dev/null
+++ b/vendor/github.com/docker/model-runner/pkg/inference/memory/estimator.go
@@ -0,0 +1,48 @@
+package memory
+
+import (
+    "context"
+    "errors"
+    "fmt"
+
+    "github.com/docker/model-runner/pkg/inference"
+)
+
+type MemoryEstimator interface {
+    SetDefaultBackend(MemoryEstimatorBackend)
+    GetRequiredMemoryForModel(context.Context, string, *inference.BackendConfiguration) (*inference.RequiredMemory, error)
+    HaveSufficientMemoryForModel(ctx context.Context, model string, config *inference.BackendConfiguration) (bool, error)
+}
+
+type MemoryEstimatorBackend interface {
+    GetRequiredMemoryForModel(context.Context, string, *inference.BackendConfiguration) (*inference.RequiredMemory, error)
+}
+
+type memoryEstimator struct {
+    systemMemoryInfo SystemMemoryInfo
+    defaultBackend   MemoryEstimatorBackend
+}
+
+func NewEstimator(systemMemoryInfo SystemMemoryInfo) MemoryEstimator {
+    return &memoryEstimator{systemMemoryInfo: systemMemoryInfo}
+}
+
+func (m *memoryEstimator) SetDefaultBackend(backend MemoryEstimatorBackend) {
+    m.defaultBackend = backend
+}
+
+func (m *memoryEstimator) GetRequiredMemoryForModel(ctx context.Context, model string, config *inference.BackendConfiguration) (*inference.RequiredMemory, error) {
+    if m.defaultBackend == nil {
+        return nil, errors.New("default backend not configured")
+    }
+
+    return m.defaultBackend.GetRequiredMemoryForModel(ctx, model, config)
+}
+
+func (m *memoryEstimator) HaveSufficientMemoryForModel(ctx context.Context, model string, config *inference.BackendConfiguration) (bool, error) {
+    req, err := m.GetRequiredMemoryForModel(ctx, model, config)
+    if err != nil {
+        return false, fmt.Errorf("estimating required memory for model: %w", err)
+    }
+    return m.systemMemoryInfo.HaveSufficientMemory(*req), nil
+}
diff --git a/vendor/github.com/docker/model-runner/pkg/inference/memory/system.go b/vendor/github.com/docker/model-runner/pkg/inference/memory/system.go
new file mode 100644
index 00000000..fb0a3842
--- /dev/null
+++ b/vendor/github.com/docker/model-runner/pkg/inference/memory/system.go
@@ -0,0 +1,55 @@
+package memory
+
+import (
+    "github.com/docker/model-runner/pkg/gpuinfo"
+    "github.com/docker/model-runner/pkg/inference"
+    "github.com/docker/model-runner/pkg/logging"
+    "github.com/elastic/go-sysinfo"
+)
+
+type SystemMemoryInfo interface {
+    HaveSufficientMemory(inference.RequiredMemory) bool
+    GetTotalMemory() inference.RequiredMemory
+}
+
+type systemMemoryInfo struct {
+    log         logging.Logger
+    totalMemory inference.RequiredMemory
+}
+
+func NewSystemMemoryInfo(log logging.Logger, gpuInfo *gpuinfo.GPUInfo) (SystemMemoryInfo, error) {
+    // Compute the amount of available memory.
+    // TODO(p1-0tr): improve error handling
+    vramSize, err := gpuInfo.GetVRAMSize()
+    if err != nil {
+        vramSize = 1
+        log.Warnf("Could not read VRAM size: %s", err)
+    } else {
+        log.Infof("Running on system with %d MB VRAM", vramSize/1024/1024)
+    }
+    ramSize := uint64(1)
+    hostInfo, err := sysinfo.Host()
+    if err != nil {
+        log.Warnf("Could not read host info: %s", err)
+    } else {
+        ram, err := hostInfo.Memory()
+        if err != nil {
+            log.Warnf("Could not read host RAM size: %s", err)
+        } else {
+            ramSize = ram.Total
+            log.Infof("Running on system with %d MB RAM", ramSize/1024/1024)
+        }
+    }
+    return &systemMemoryInfo{
+        log:         log,
+        totalMemory: inference.RequiredMemory{RAM: ramSize, VRAM: vramSize},
+    }, nil
+}
+
+func (s *systemMemoryInfo) HaveSufficientMemory(req inference.RequiredMemory) bool {
+    return req.RAM <= s.totalMemory.RAM && req.VRAM <= s.totalMemory.VRAM
+}
+
+func (s *systemMemoryInfo) GetTotalMemory() inference.RequiredMemory {
+    return s.totalMemory
+}
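Note on the two new files above: `SystemMemoryInfo` supplies the RAM/VRAM totals and `MemoryEstimator` asks a backend for a model's requirements, then compares the two. A wiring sketch under the assumption that the llama.cpp backend (which satisfies `MemoryEstimatorBackend` via its context-aware `GetRequiredMemoryForModel`) is passed in as the default backend; the helper names are hypothetical:

```go
package main

import (
	"context"
	"fmt"

	"github.com/docker/model-runner/pkg/gpuinfo"
	"github.com/docker/model-runner/pkg/inference/memory"
	"github.com/docker/model-runner/pkg/logging"
)

// newMemoryEstimator probes system RAM/VRAM, builds the estimator, and
// registers the backend that knows how to size models (e.g. llama.cpp).
func newMemoryEstimator(log logging.Logger, gpu *gpuinfo.GPUInfo, backend memory.MemoryEstimatorBackend) (memory.MemoryEstimator, error) {
	sysMem, err := memory.NewSystemMemoryInfo(log, gpu)
	if err != nil {
		return nil, fmt.Errorf("probing system memory: %w", err)
	}
	estimator := memory.NewEstimator(sysMem)
	estimator.SetDefaultBackend(backend)
	return estimator, nil
}

// fitsInMemory reports whether a model is expected to fit using the backend's
// default configuration (a nil config means "use defaults", as the model
// manager does when gating pulls).
func fitsInMemory(ctx context.Context, estimator memory.MemoryEstimator, model string) (bool, error) {
	return estimator.HaveSufficientMemoryForModel(ctx, model, nil)
}
```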
diff --git a/vendor/github.com/docker/model-runner/pkg/inference/models/api.go b/vendor/github.com/docker/model-runner/pkg/inference/models/api.go
index a21864dd..196d64ff 100644
--- a/vendor/github.com/docker/model-runner/pkg/inference/models/api.go
+++ b/vendor/github.com/docker/model-runner/pkg/inference/models/api.go
@@ -14,6 +14,9 @@
 type ModelCreateRequest struct {
     // From is the name of the model to pull.
     From string `json:"from"`
+    // IgnoreRuntimeMemoryCheck indicates whether the server should skip checking
+    // that it has sufficient memory to run the given model (assuming default configuration).
+    IgnoreRuntimeMemoryCheck bool `json:"ignore-runtime-memory-check,omitempty"`
 }
 
 // ToOpenAIList converts the model list to its OpenAI API representation. This function never
diff --git a/vendor/github.com/docker/model-runner/pkg/inference/models/manager.go b/vendor/github.com/docker/model-runner/pkg/inference/models/manager.go
index b0cf68bf..4f79dcf1 100644
--- a/vendor/github.com/docker/model-runner/pkg/inference/models/manager.go
+++ b/vendor/github.com/docker/model-runner/pkg/inference/models/manager.go
@@ -17,7 +17,9 @@ import (
     "github.com/docker/model-distribution/types"
     "github.com/docker/model-runner/pkg/diskusage"
     "github.com/docker/model-runner/pkg/inference"
+    "github.com/docker/model-runner/pkg/inference/memory"
     "github.com/docker/model-runner/pkg/logging"
+    v1 "github.com/google/go-containerregistry/pkg/v1"
     "github.com/sirupsen/logrus"
 )
 
@@ -42,6 +44,8 @@ type Manager struct {
     registryClient *registry.Client
     // lock is used to synchronize access to the models manager's router.
     lock sync.RWMutex
+    // memoryEstimator is used to calculate runtime memory requirements for models.
+    memoryEstimator memory.MemoryEstimator
 }
 
 type ClientConfig struct {
@@ -56,7 +60,7 @@ type ClientConfig struct {
 }
 
 // NewManager creates a new model's manager.
-func NewManager(log logging.Logger, c ClientConfig, allowedOrigins []string) *Manager {
+func NewManager(log logging.Logger, c ClientConfig, allowedOrigins []string, memoryEstimator memory.MemoryEstimator) *Manager {
     // Create the model distribution client.
     distributionClient, err := distribution.NewClient(
         distribution.WithStoreRootPath(c.StoreRootPath),
@@ -83,6 +87,7 @@ func NewManager(log logging.Logger, c ClientConfig, allowedOrigins []string) *Ma
         router:             http.NewServeMux(),
         distributionClient: distributionClient,
         registryClient:     registryClient,
+        memoryEstimator:    memoryEstimator,
     }
 
     // Register routes.
@@ -163,6 +168,20 @@ func (m *Manager) handleCreateModel(w http.ResponseWriter, r *http.Request) {
 
     // Pull the model. In the future, we may support additional operations here
     // besides pulling (such as model building).
+    if !request.IgnoreRuntimeMemoryCheck {
+        m.log.Infof("Will estimate memory required for %q", request.From)
+        proceed, err := m.memoryEstimator.HaveSufficientMemoryForModel(r.Context(), request.From, nil)
+        if err != nil {
+            m.log.Warnf("Failed to calculate memory required for model %q: %s", request.From, err)
+            // Prefer staying functional in case of unexpected estimation errors.
+            proceed = true
+        }
+        if !proceed {
+            m.log.Warnf("Runtime memory requirement for model %q exceeds total system memory", request.From)
+            http.Error(w, "Runtime memory requirement for model exceeds total system memory", http.StatusInsufficientStorage)
+            return
+        }
+    }
     if err := m.PullModel(request.From, r, w); err != nil {
         if errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) {
             m.log.Infof("Request canceled/timed out while pulling model %q", request.From)
@@ -562,6 +581,11 @@ func (m *Manager) ServeHTTP(w http.ResponseWriter, r *http.Request) {
     m.router.ServeHTTP(w, r)
 }
 
+// IsModelInStore checks if a given model is in the local store.
+func (m *Manager) IsModelInStore(ref string) (bool, error) {
+    return m.distributionClient.IsModelInStore(ref)
+}
+
 // GetModel returns a single model.
 func (m *Manager) GetModel(ref string) (types.Model, error) {
     model, err := m.distributionClient.GetModel(ref)
@@ -571,6 +595,33 @@ func (m *Manager) GetModel(ref string) (types.Model, error) {
     return model, err
 }
 
+// GetRemoteModel returns a single remote model.
+func (m *Manager) GetRemoteModel(ctx context.Context, ref string) (types.ModelArtifact, error) {
+    model, err := m.registryClient.Model(ctx, ref)
+    if err != nil {
+        return nil, fmt.Errorf("error while getting remote model: %w", err)
+    }
+    return model, nil
+}
+
+// GetRemoteModelBlobURL returns the URL of a given model blob.
+func (m *Manager) GetRemoteModelBlobURL(ref string, digest v1.Hash) (string, error) {
+    blobURL, err := m.registryClient.BlobURL(ref, digest)
+    if err != nil {
+        return "", fmt.Errorf("error while getting remote model blob URL: %w", err)
+    }
+    return blobURL, nil
+}
+
+// BearerTokenForModel returns the bearer token needed to pull a given model.
+func (m *Manager) BearerTokenForModel(ctx context.Context, ref string) (string, error) {
+    tok, err := m.registryClient.BearerToken(ctx, ref)
+    if err != nil {
+        return "", fmt.Errorf("error while getting bearer token for model: %w", err)
+    }
+    return tok, nil
+}
+
 // GetModelPath returns the path to a model's files.
 func (m *Manager) GetModelPath(ref string) (string, error) {
     model, err := m.GetModel(ref)
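Note on the manager changes above: `handleCreateModel` now gates the pull on the memory estimate unless the request opts out, answering 507 Insufficient Storage when the model would not fit. A small sketch of the request body it decodes; the struct and JSON tags come from `ModelCreateRequest`, and the model reference is only an example:

```go
package main

import (
	"encoding/json"
	"fmt"

	dmrm "github.com/docker/model-runner/pkg/inference/models"
)

func main() {
	// The same payload desktop.Client.Pull builds when the CLI is invoked with
	// --ignore-runtime-memory-check: the runner will skip its memory estimate.
	body, err := json.Marshal(dmrm.ModelCreateRequest{
		From:                     "ai/smollm2", // example model reference
		IgnoreRuntimeMemoryCheck: true,
	})
	if err != nil {
		panic(err)
	}
	fmt.Println(string(body)) // {"from":"ai/smollm2","ignore-runtime-memory-check":true}
}
```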
diff --git a/vendor/github.com/docker/model-runner/pkg/inference/scheduling/loader.go b/vendor/github.com/docker/model-runner/pkg/inference/scheduling/loader.go
index ec7e1f5c..69166e06 100644
--- a/vendor/github.com/docker/model-runner/pkg/inference/scheduling/loader.go
+++ b/vendor/github.com/docker/model-runner/pkg/inference/scheduling/loader.go
@@ -10,12 +10,11 @@ import (
     "time"
 
     "github.com/docker/model-runner/pkg/environment"
-    "github.com/docker/model-runner/pkg/gpuinfo"
     "github.com/docker/model-runner/pkg/inference"
+    "github.com/docker/model-runner/pkg/inference/memory"
     "github.com/docker/model-runner/pkg/inference/models"
     "github.com/docker/model-runner/pkg/logging"
     "github.com/docker/model-runner/pkg/metrics"
-    "github.com/elastic/go-sysinfo"
 )
 
 const (
@@ -113,7 +112,7 @@ func newLoader(
     backends map[string]inference.Backend,
     modelManager *models.Manager,
     openAIRecorder *metrics.OpenAIRecorder,
-    gpuInfo *gpuinfo.GPUInfo,
+    sysMemInfo memory.SystemMemoryInfo,
 ) *loader {
     // Compute the number of runner slots to allocate. Because of RAM and VRAM
     // limitations, it's unlikely that we'll ever be able to fully populate
@@ -135,32 +134,7 @@ func newLoader(
     }
 
     // Compute the amount of available memory.
-    // TODO(p1-0tr): improve error handling
-    vramSize, err := gpuInfo.GetVRAMSize()
-    if err != nil {
-        vramSize = 1
-        log.Warnf("Could not read VRAM size: %s", err)
-    } else {
-        log.Infof("Running on system with %dMB VRAM", vramSize/1024/1024)
-    }
-    ramSize := uint64(1)
-    hostInfo, err := sysinfo.Host()
-    if err != nil {
-        log.Warnf("Could not read host info: %s", err)
-    } else {
-        ram, err := hostInfo.Memory()
-        if err != nil {
-            log.Warnf("Could not read host RAM size: %s", err)
-        } else {
-            ramSize = ram.Total
-            log.Infof("Running on system with %dMB RAM", ramSize/1024/1024)
-        }
-    }
-
-    totalMemory := inference.RequiredMemory{
-        RAM:  ramSize,
-        VRAM: vramSize,
-    }
+    totalMemory := sysMemInfo.GetTotalMemory()
 
     // Create the loader.
     l := &loader{
@@ -420,11 +394,21 @@ func (l *loader) load(ctx context.Context, backendName, modelID, modelRef string
     if rc, ok := l.runnerConfigs[runnerKey{backendName, modelID, mode}]; ok {
         runnerConfig = &rc
     }
-    memory, err := backend.GetRequiredMemoryForModel(modelID, runnerConfig)
-    if err != nil {
+    memory, err := backend.GetRequiredMemoryForModel(ctx, modelID, runnerConfig)
+    var parseErr *inference.ErrGGUFParse
+    if errors.As(err, &parseErr) {
+        // TODO(p1-0tr): For now override memory checks in case model can't be parsed
+        // e.g. model is too new for gguf-parser-go to know. We should provide a cleaner
+        // way to bypass these checks.
+        l.log.Warnf("Could not parse model(%s), memory checks will be ignored for it. Error: %s", modelID, parseErr)
+        memory = &inference.RequiredMemory{
+            RAM:  0,
+            VRAM: 0,
+        }
+    } else if err != nil {
         return nil, err
     }
-    l.log.Infof("Loading %s, which will require %dMB RAM and %dMB VRAM", modelID, memory.RAM/1024/1024, memory.VRAM/1024/1024)
+    l.log.Infof("Loading %s, which will require %d MB RAM and %d MB VRAM on a system with %d MB RAM and %d MB VRAM", modelID, memory.RAM/1024/1024, memory.VRAM/1024/1024, l.totalMemory.RAM/1024/1024, l.totalMemory.VRAM/1024/1024)
     if l.totalMemory.RAM == 1 {
         l.log.Warnf("RAM size unknown. Assume model will fit, but only one.")
         memory.RAM = 1
diff --git a/vendor/github.com/docker/model-runner/pkg/inference/scheduling/scheduler.go b/vendor/github.com/docker/model-runner/pkg/inference/scheduling/scheduler.go
index 3f716e9a..b99db11a 100644
--- a/vendor/github.com/docker/model-runner/pkg/inference/scheduling/scheduler.go
+++ b/vendor/github.com/docker/model-runner/pkg/inference/scheduling/scheduler.go
@@ -13,8 +13,8 @@ import (
     "time"
 
     "github.com/docker/model-distribution/distribution"
-    "github.com/docker/model-runner/pkg/gpuinfo"
     "github.com/docker/model-runner/pkg/inference"
+    "github.com/docker/model-runner/pkg/inference/memory"
     "github.com/docker/model-runner/pkg/inference/models"
     "github.com/docker/model-runner/pkg/logging"
     "github.com/docker/model-runner/pkg/metrics"
@@ -56,7 +56,7 @@ func NewScheduler(
     httpClient *http.Client,
     allowedOrigins []string,
     tracker *metrics.Tracker,
-    gpuInfo *gpuinfo.GPUInfo,
+    sysMemInfo memory.SystemMemoryInfo,
 ) *Scheduler {
     openAIRecorder := metrics.NewOpenAIRecorder(log.WithField("component", "openai-recorder"), modelManager)
 
@@ -67,7 +67,7 @@ func NewScheduler(
         defaultBackend: defaultBackend,
         modelManager:   modelManager,
         installer:      newInstaller(log, backends, httpClient),
-        loader:         newLoader(log, backends, modelManager, openAIRecorder, gpuInfo),
+        loader:         newLoader(log, backends, modelManager, openAIRecorder, sysMemInfo),
         router:         http.NewServeMux(),
         tracker:        tracker,
         openAIRecorder: openAIRecorder,
diff --git a/vendor/github.com/docker/model-runner/pkg/metrics/openai_recorder.go b/vendor/github.com/docker/model-runner/pkg/metrics/openai_recorder.go
index 93a73349..d3d11946 100644
--- a/vendor/github.com/docker/model-runner/pkg/metrics/openai_recorder.go
+++ b/vendor/github.com/docker/model-runner/pkg/metrics/openai_recorder.go
@@ -46,7 +46,8 @@ type RequestResponsePair struct {
     Method     string    `json:"method"`
     URL        string    `json:"url"`
     Request    string    `json:"request"`
-    Response   string    `json:"response"`
+    Response   string    `json:"response,omitempty"`
+    Error      string    `json:"error,omitempty"`
     Timestamp  time.Time `json:"timestamp"`
     StatusCode int       `json:"status_code"`
     UserAgent  string    `json:"user_agent,omitempty"`
@@ -168,8 +169,15 @@ func (r *OpenAIRecorder) RecordResponse(id, model string, rw http.ResponseWriter
     if modelData, exists := r.records[modelID]; exists {
         for _, record := range modelData.Records {
             if record.ID == id {
-                record.Response = response
                 record.StatusCode = statusCode
+                // Populate either Response or Error field based on status code
+                if statusCode >= 400 {
+                    record.Error = response
+                    record.Response = "" // Ensure Response is empty for errors
+                } else {
+                    record.Response = response
+                    record.Error = "" // Ensure Error is empty for successful responses
+                }
                 return
             }
         }
diff --git a/vendor/modules.txt b/vendor/modules.txt
index 784cde42..e92f13c6 100644
--- a/vendor/modules.txt
+++ b/vendor/modules.txt
@@ -144,7 +144,7 @@ github.com/docker/go-connections/tlsconfig
 # github.com/docker/go-units v0.5.0
 ## explicit
 github.com/docker/go-units
-# github.com/docker/model-distribution v0.0.0-20250724114133-a11d745e582c
+# github.com/docker/model-distribution v0.0.0-20250813080006-2a983516ebb8
 ## explicit; go 1.23.0
 github.com/docker/model-distribution/builder
 github.com/docker/model-distribution/distribution
@@ -156,7 +156,7 @@ github.com/docker/model-distribution/internal/store
 github.com/docker/model-distribution/registry
 github.com/docker/model-distribution/tarball
 github.com/docker/model-distribution/types
-# github.com/docker/model-runner v0.0.0-20250724122432-ecfa5e7e6807
+# github.com/docker/model-runner v0.0.0-20250822151118-d8ed37445584
 ## explicit; go 1.23.7
 github.com/docker/model-runner/pkg/diskusage
 github.com/docker/model-runner/pkg/environment
@@ -164,6 +164,7 @@ github.com/docker/model-runner/pkg/gpuinfo
 github.com/docker/model-runner/pkg/inference
 github.com/docker/model-runner/pkg/inference/backends/llamacpp
 github.com/docker/model-runner/pkg/inference/config
+github.com/docker/model-runner/pkg/inference/memory
 github.com/docker/model-runner/pkg/inference/models
 github.com/docker/model-runner/pkg/inference/scheduling
 github.com/docker/model-runner/pkg/internal/archive