Merged
Commits (24)
2b06217
build: add new vLLM CUDA image
doringeman Oct 13, 2025
aa89eca
fix(Dockerfile): add build-essential package
ilopezluna Oct 14, 2025
4c5bc73
fix(Dockerfile): add python3-venv and python3-dev packages
ilopezluna Oct 14, 2025
e27c7d0
fix(Dockerfile): use the final target for CPU and CUDA builds
doringeman Oct 14, 2025
5311dfe
feat(vllm): implement vLLM backend
ilopezluna Oct 14, 2025
b6d3432
fix(client): remove unsupported model format checks for Safetensors
ilopezluna Oct 14, 2025
c2af7ce
perf: optimize Docker build by restructuring stage dependencies
doringeman Oct 14, 2025
c870ab0
feat(cli): allow vLLM backend
doringeman Oct 14, 2025
b554107
fix(vllm): use model bundle directory
doringeman Oct 14, 2025
609abf6
refactor(backend): add modelRef parameter to Backend.Run interface
doringeman Oct 14, 2025
717c407
fix(client): remove unsupported model format checks for Safetensors o…
doringeman Oct 14, 2025
0920524
fix(vllm): check vllm.New error
doringeman Oct 16, 2025
b94bd9f
vllm: Linux support only
doringeman Oct 16, 2025
2be8126
vllm: add version detection from build-time capture
doringeman Oct 16, 2025
0bb2f02
feat(scheduler): automatically identify models for vLLM
doringeman Oct 21, 2025
bf8d9e7
feat(vllm): enhance argument handling for vLLM backend configuration
ilopezluna Oct 21, 2025
c771e94
fix(vllm): update model path handling to use directory for safetensors
ilopezluna Oct 21, 2025
1a6f20e
fix(vllm): add model ID as served-model-name
doringeman Oct 21, 2025
0a9893b
feat(vllm): add argument sanitization for safe logging
ilopezluna Oct 21, 2025
a8a6dc0
update actions version to be pinned to a full-length commit SHA
ilopezluna Oct 21, 2025
6ca1a5a
fix(vllm): improve logging for backend errors and warn about unsuppor…
ilopezluna Oct 22, 2025
20272f2
fix(vllm): sanitize model reference in backend error logging
ilopezluna Oct 22, 2025
b0b6ccc
fix(vllm): sanitize model reference in warning log for unsupported sa…
ilopezluna Oct 22, 2025
13e2b4f
fix(vllm): validate safetensors path before serving and update test c…
ilopezluna Oct 22, 2025
29 changes: 29 additions & 0 deletions .github/workflows/release.yml
@@ -19,6 +19,11 @@
required: false
type: string
default: "latest"
vllmVersion:
description: 'vLLM version'
required: false
type: string
default: "0.11.0"

jobs:
test:
@@ -59,6 +64,12 @@
echo "docker/model-runner:latest-cuda" >> "$GITHUB_OUTPUT"
fi
echo 'EOF' >> "$GITHUB_OUTPUT"
echo "vllm-cuda<<EOF" >> "$GITHUB_OUTPUT"
echo "docker/model-runner:${{ inputs.releaseTag }}-vllm-cuda" >> "$GITHUB_OUTPUT"
if [ "${{ inputs.pushLatest }}" == "true" ]; then
echo "docker/model-runner:latest-vllm-cuda" >> "$GITHUB_OUTPUT"
fi
echo 'EOF' >> "$GITHUB_OUTPUT"
Comment on lines +67 to +72

Copilot AI Oct 14, 2025

The tag generation logic for vllm-cuda duplicates the pattern used for the cuda tags above. Consider extracting this into a reusable function or template to reduce code duplication and improve maintainability.

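
A rough sketch of what that consolidation could look like inside the same run step (the output key names, in particular "cpu" for the plain image, are assumptions and would need to match what the step currently emits):

# Hypothetical loop over (output key, tag suffix) pairs instead of one copy-pasted block per variant.
for entry in "cpu:" "cuda:-cuda" "vllm-cuda:-vllm-cuda"; do
  key="${entry%%:*}"
  suffix="${entry#*:}"
  echo "${key}<<EOF" >> "$GITHUB_OUTPUT"
  echo "docker/model-runner:${{ inputs.releaseTag }}${suffix}" >> "$GITHUB_OUTPUT"
  if [ "${{ inputs.pushLatest }}" == "true" ]; then
    echo "docker/model-runner:latest${suffix}" >> "$GITHUB_OUTPUT"
  fi
  echo 'EOF' >> "$GITHUB_OUTPUT"
done
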
- name: Log in to DockerHub
uses: docker/login-action@v3
@@ -78,6 +89,7 @@
uses: docker/build-push-action@v5
with:
file: Dockerfile
target: final-llamacpp
platforms: linux/amd64, linux/arm64
build-args: |
"LLAMA_SERVER_VERSION=${{ inputs.llamaServerVersion }}"
@@ -90,6 +102,7 @@
uses: docker/build-push-action@v5
with:
file: Dockerfile
target: final-llamacpp
platforms: linux/amd64, linux/arm64
build-args: |
"LLAMA_SERVER_VERSION=${{ inputs.llamaServerVersion }}"
@@ -99,3 +112,19 @@
sbom: true
provenance: mode=max
tags: ${{ steps.tags.outputs.cuda }}

- name: Build vLLM CUDA image
uses: docker/build-push-action@v5
with:
file: Dockerfile
target: final-vllm
platforms: linux/amd64
build-args: |
"LLAMA_SERVER_VERSION=${{ inputs.llamaServerVersion }}"
"LLAMA_SERVER_VARIANT=cuda"
"BASE_IMAGE=nvidia/cuda:12.9.0-runtime-ubuntu24.04"
"VLLM_VERSION=${{ inputs.vllmVersion }}"
push: true
sbom: true
provenance: mode=max
tags: ${{ steps.tags.outputs.vllm-cuda }}
33 changes: 29 additions & 4 deletions Dockerfile
@@ -35,7 +35,7 @@ RUN --mount=type=cache,target=/go/pkg/mod \
FROM docker/docker-model-backend-llamacpp:${LLAMA_SERVER_VERSION}-${LLAMA_SERVER_VARIANT} AS llama-server

# --- Final image ---
FROM docker.io/${BASE_IMAGE} AS final
FROM docker.io/${BASE_IMAGE} AS llamacpp

ARG LLAMA_SERVER_VARIANT

@@ -55,9 +55,6 @@ RUN mkdir -p /var/run/model-runner /app/bin /models && \
chown -R modelrunner:modelrunner /var/run/model-runner /app /models && \
chmod -R 755 /models

# Copy the built binary from builder
COPY --from=builder /app/model-runner /app/model-runner

# Copy the llama.cpp binary from the llama-server stage
ARG LLAMA_BINARY_PATH
COPY --from=llama-server ${LLAMA_BINARY_PATH}/ /app/.
@@ -77,3 +74,31 @@ ENV LD_LIBRARY_PATH=/app/lib
LABEL com.docker.desktop.service="model-runner"

ENTRYPOINT ["/app/model-runner"]

# --- vLLM variant ---
FROM llamacpp AS vllm

ARG VLLM_VERSION

USER root

RUN apt update && apt install -y python3 python3-venv python3-dev curl ca-certificates build-essential && rm -rf /var/lib/apt/lists/*

RUN mkdir -p /opt/vllm-env && chown -R modelrunner:modelrunner /opt/vllm-env

USER modelrunner

# Install uv and vLLM as modelrunner user
RUN curl -LsSf https://astral.sh/uv/install.sh | sh \
&& ~/.local/bin/uv venv --python /usr/bin/python3 /opt/vllm-env \
Contributor

@ericcurtin ericcurtin Oct 21, 2025

If we change this to copy from the vllm/vllm-openai:v0.11.0 container, we get DGX Spark support (I know I suggested this less hacky approach, apologies; I didn't realize the container has aarch64 builds and this approach doesn't appear to).

This could be a follow-on PR too.

Contributor

@ericcurtin ericcurtin Oct 21, 2025

Or, even better, install the wheels from here:

https://wheels.vllm.ai/b8b302cde434df8c9289a2b465406b47ebab1c2d/vllm/

That commit SHA is the 0.11.0 one.

They tipped me off in vLLM stack that they build CUDA x86_64 and aarch64 wheels for every commit, so this is the same thing, but with an aarch64 version as well.

It would be better than the hacky container copy (which is prone to errors: missing files, OS mismatches, library version mismatches).

Contributor

@ericcurtin ericcurtin Oct 21, 2025

This is a way to get that programmatically:

$ git rev-list -n 1 v0.11.0
b8b302cde434df8c9289a2b465406b47ebab1c2d
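
Putting the last two suggestions together, the install line might look roughly like the sketch below. This assumes the per-commit index at wheels.vllm.ai can be used as a pip extra index and that the SHA is resolved from a vLLM checkout (or simply hard-coded); neither is verified here.

# Hypothetical: resolve the commit behind the release tag (requires a vLLM git checkout),
# then install from the per-commit wheel index instead of PyPI.
VLLM_COMMIT="$(git rev-list -n 1 "v${VLLM_VERSION}")"
~/.local/bin/uv pip install --python /opt/vllm-env/bin/python \
  --extra-index-url "https://wheels.vllm.ai/${VLLM_COMMIT}/" \
  "vllm==${VLLM_VERSION}"
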

&& ~/.local/bin/uv pip install --python /opt/vllm-env/bin/python "vllm==${VLLM_VERSION}"

RUN /opt/vllm-env/bin/python -c "import vllm; print(vllm.__version__)" > /opt/vllm-env/version

FROM llamacpp AS final-llamacpp
# Copy the built binary from builder
COPY --from=builder /app/model-runner /app/model-runner

FROM vllm AS final-vllm
# Copy the built binary from builder
COPY --from=builder /app/model-runner /app/model-runner
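
For local testing, a buildx invocation along the following lines should produce the vLLM variant; the tag name is illustrative and the argument values simply mirror the release workflow above.

# Sketch only: LLAMA_SERVER_VERSION must be set to an existing
# docker/docker-model-backend-llamacpp tag.
docker buildx build \
  --target final-vllm \
  --platform linux/amd64 \
  --build-arg LLAMA_SERVER_VERSION="${LLAMA_SERVER_VERSION:?set to a llama-server release}" \
  --build-arg LLAMA_SERVER_VARIANT=cuda \
  --build-arg BASE_IMAGE=nvidia/cuda:12.9.0-runtime-ubuntu24.04 \
  --build-arg VLLM_VERSION=0.11.0 \
  -f Dockerfile -t model-runner:vllm-cuda .
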
1 change: 1 addition & 0 deletions cmd/cli/commands/backend.go
@@ -13,6 +13,7 @@ import (
var ValidBackends = map[string]bool{
"llama.cpp": true,
"openai": true,
"vllm": true,
}

// validateBackend checks if the provided backend is valid
2 changes: 1 addition & 1 deletion cmd/cli/docs/reference/docker_model_list.yaml
@@ -8,7 +8,7 @@ plink: docker_model.yaml
options:
- option: backend
value_type: string
description: Specify the backend to use (llama.cpp, openai)
description: Specify the backend to use (llama.cpp, openai, vllm)
deprecated: false
hidden: true
experimental: false
2 changes: 1 addition & 1 deletion cmd/cli/docs/reference/docker_model_run.yaml
@@ -12,7 +12,7 @@ plink: docker_model.yaml
options:
- option: backend
value_type: string
description: Specify the backend to use (llama.cpp, openai)
description: Specify the backend to use (llama.cpp, openai, vllm)
deprecated: false
hidden: true
experimental: false
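
Assuming the hidden --backend flag is plumbed through as these reference docs describe, selecting the new backend from the CLI would look roughly like this (the model reference is a placeholder; vLLM expects a safetensors model and is Linux-only per the commits above):

# Hypothetical invocation; the flag is marked hidden/experimental in the reference docs.
docker model run --backend vllm myorg/my-safetensors-model "Hello"
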
13 changes: 12 additions & 1 deletion main.go
@@ -14,6 +14,7 @@ import (
"github.com/docker/model-runner/pkg/gpuinfo"
"github.com/docker/model-runner/pkg/inference"
"github.com/docker/model-runner/pkg/inference/backends/llamacpp"
"github.com/docker/model-runner/pkg/inference/backends/vllm"
"github.com/docker/model-runner/pkg/inference/config"
"github.com/docker/model-runner/pkg/inference/memory"
"github.com/docker/model-runner/pkg/inference/models"
@@ -119,9 +120,19 @@ func main() {

memEstimator.SetDefaultBackend(llamaCppBackend)

vllmBackend, err := vllm.New(
log,
modelManager,
log.WithFields(logrus.Fields{"component": "vllm"}),
nil,
)
if err != nil {
log.Fatalf("unable to initialize %s backend: %v", vllm.Name, err)
}

scheduler := scheduling.NewScheduler(
log,
map[string]inference.Backend{llamacpp.Name: llamaCppBackend},
map[string]inference.Backend{llamacpp.Name: llamaCppBackend, vllm.Name: vllmBackend},
llamaCppBackend,
modelManager,
http.DefaultClient,
13 changes: 11 additions & 2 deletions pkg/distribution/distribution/client.go
@@ -6,15 +6,17 @@ import (
"fmt"
"io"
"net/http"
"slices"

"github.com/docker/model-runner/pkg/distribution/internal/utils"
"github.com/sirupsen/logrus"

"github.com/docker/model-runner/pkg/distribution/internal/progress"
"github.com/docker/model-runner/pkg/distribution/internal/store"
"github.com/docker/model-runner/pkg/distribution/internal/utils"
"github.com/docker/model-runner/pkg/distribution/registry"
"github.com/docker/model-runner/pkg/distribution/tarball"
"github.com/docker/model-runner/pkg/distribution/types"
"github.com/docker/model-runner/pkg/inference/platform"
)

// Client provides model distribution functionality
@@ -408,6 +410,13 @@ func (c *Client) GetBundle(ref string) (types.ModelBundle, error) {
return c.store.BundleForModel(ref)
}

func GetSupportedFormats() []types.Format {
if platform.SupportsVLLM() {
return []types.Format{types.FormatGGUF, types.FormatSafetensors}
}
return []types.Format{types.FormatGGUF}
}

func checkCompat(image types.ModelArtifact) error {
manifest, err := image.Manifest()
if err != nil {
@@ -423,7 +432,7 @@ func checkCompat(image types.ModelArtifact) error {
return fmt.Errorf("reading model config: %w", err)
}

if config.Format == types.FormatSafetensors {
if !slices.Contains(GetSupportedFormats(), config.Format) {
return ErrUnsupportedFormat
}

17 changes: 13 additions & 4 deletions pkg/distribution/distribution/client_test.go
@@ -26,6 +26,7 @@ import (
"github.com/docker/model-runner/pkg/distribution/internal/progress"
"github.com/docker/model-runner/pkg/distribution/internal/safetensors"
mdregistry "github.com/docker/model-runner/pkg/distribution/registry"
"github.com/docker/model-runner/pkg/inference/platform"
)

var (
@@ -418,7 +419,7 @@ func TestClientPullModel(t *testing.T) {
}
})

t.Run("pull safetensors model returns error", func(t *testing.T) {
t.Run("pull safetensors model returns error on unsupported platforms", func(t *testing.T) {
// Create temp directory for the safetensors file
tempDir, err := os.MkdirTemp("", "safetensors-test-*")
if err != nil {
Expand Down Expand Up @@ -461,10 +462,18 @@ func TestClientPullModel(t *testing.T) {
t.Fatalf("Failed to create test client: %v", err)
}

// Try to pull the safetensors model - should fail with ErrUnsupportedFormat
// Try to pull the safetensors model
err = testClient.PullModel(context.Background(), tag, nil)
if !errors.Is(err, ErrUnsupportedFormat) {
t.Fatalf("Expected ErrUnsupportedFormat, got: %v", err)
if platform.SupportsVLLM() {
// On Linux, safetensors should be supported
if err != nil {
t.Fatalf("Expected no error on Linux, got: %v", err)
}
} else {
// On non-Linux, should fail with ErrUnsupportedFormat
if !errors.Is(err, ErrUnsupportedFormat) {
t.Fatalf("Expected ErrUnsupportedFormat on non-Linux platforms, got: %v", err)
}
}
})

2 changes: 1 addition & 1 deletion pkg/inference/backend.go
@@ -84,7 +84,7 @@ type Backend interface {
// to be loaded. Backends should not load multiple models at once and should
// instead load only the specified model. Backends should still respond to
// OpenAI API requests for other models with a 421 error code.
Run(ctx context.Context, socket, model string, mode BackendMode, config *BackendConfiguration) error
Run(ctx context.Context, socket, model string, modelRef string, mode BackendMode, config *BackendConfiguration) error
// Status returns a description of the backend's state.
Status() string
// GetDiskUsage returns the disk usage of the backend.
2 changes: 1 addition & 1 deletion pkg/inference/backends/llamacpp/llamacpp.go
@@ -133,7 +133,7 @@ func (l *llamaCpp) Install(ctx context.Context, httpClient *http.Client) error {
}

// Run implements inference.Backend.Run.
func (l *llamaCpp) Run(ctx context.Context, socket, model string, mode inference.BackendMode, config *inference.BackendConfiguration) error {
func (l *llamaCpp) Run(ctx context.Context, socket, model string, _ string, mode inference.BackendMode, config *inference.BackendConfiguration) error {
bundle, err := l.modelManager.GetBundle(model)
if err != nil {
return fmt.Errorf("failed to get model: %w", err)
2 changes: 1 addition & 1 deletion pkg/inference/backends/mlx/mlx.go
@@ -49,7 +49,7 @@ func (m *mlx) Install(ctx context.Context, httpClient *http.Client) error {
}

// Run implements inference.Backend.Run.
func (m *mlx) Run(ctx context.Context, socket, model string, mode inference.BackendMode, config *inference.BackendConfiguration) error {
func (m *mlx) Run(ctx context.Context, socket, model string, modelRef string, mode inference.BackendMode, config *inference.BackendConfiguration) error {
// TODO: Implement.
m.log.Warn("MLX backend is not yet supported")
return errors.New("not implemented")