Merged
Commits (24)
2b06217
build: add new vLLM CUDA image
doringeman Oct 13, 2025
aa89eca
fix(Dockerfile): add build-essential package
ilopezluna Oct 14, 2025
4c5bc73
fix(Dockerfile): add python3-venv and python3-dev packages
ilopezluna Oct 14, 2025
e27c7d0
fix(Dockerfile): use the final target for CPU and CUDA builds
doringeman Oct 14, 2025
5311dfe
feat(vllm): implement vLLM backend
ilopezluna Oct 14, 2025
b6d3432
fix(client): remove unsupported model format checks for Safetensors
ilopezluna Oct 14, 2025
c2af7ce
perf: optimize Docker build by restructuring stage dependencies
doringeman Oct 14, 2025
c870ab0
feat(cli): allow vLLM backend
doringeman Oct 14, 2025
b554107
fix(vllm): use model bundle directory
doringeman Oct 14, 2025
609abf6
refactor(backend): add modelRef parameter to Backend.Run interface
doringeman Oct 14, 2025
717c407
fix(client): remove unsupported model format checks for Safetensors o…
doringeman Oct 14, 2025
0920524
fix(vllm): check vllm.New error
doringeman Oct 16, 2025
b94bd9f
vllm: Linux support only
doringeman Oct 16, 2025
2be8126
vllm: add version detection from build-time capture
doringeman Oct 16, 2025
0bb2f02
feat(scheduler): automatically identify models for vLLM
doringeman Oct 21, 2025
bf8d9e7
feat(vllm): enhance argument handling for vLLM backend configuration
ilopezluna Oct 21, 2025
c771e94
fix(vllm): update model path handling to use directory for safetensors
ilopezluna Oct 21, 2025
1a6f20e
fix(vllm): add model ID as served-model-name
doringeman Oct 21, 2025
0a9893b
feat(vllm): add argument sanitization for safe logging
ilopezluna Oct 21, 2025
a8a6dc0
update actions version to be pinned to a full-length commit SHA
ilopezluna Oct 21, 2025
6ca1a5a
fix(vllm): improve logging for backend errors and warn about unsuppor…
ilopezluna Oct 22, 2025
20272f2
fix(vllm): sanitize model reference in backend error logging
ilopezluna Oct 22, 2025
b0b6ccc
fix(vllm): sanitize model reference in warning log for unsupported sa…
ilopezluna Oct 22, 2025
13e2b4f
fix(vllm): validate safetensors path before serving and update test c…
ilopezluna Oct 22, 2025
29 changes: 29 additions & 0 deletions .github/workflows/release.yml
@@ -19,6 +19,11 @@
required: false
type: string
default: "latest"
vllmVersion:
description: 'vLLM version'
required: false
type: string
default: "0.11.0"

jobs:
test:
@@ -59,6 +64,12 @@
echo "docker/model-runner:latest-cuda" >> "$GITHUB_OUTPUT"
fi
echo 'EOF' >> "$GITHUB_OUTPUT"
echo "vllm-cuda<<EOF" >> "$GITHUB_OUTPUT"
echo "docker/model-runner:${{ inputs.releaseTag }}-vllm-cuda" >> "$GITHUB_OUTPUT"
if [ "${{ inputs.pushLatest }}" == "true" ]; then
echo "docker/model-runner:latest-vllm-cuda" >> "$GITHUB_OUTPUT"
fi
echo 'EOF' >> "$GITHUB_OUTPUT"
Comment on lines +67 to +72

Copilot AI Oct 14, 2025

The tag generation logic for vllm-cuda duplicates the pattern used for the cuda tags above. Consider extracting this into a reusable function or template to reduce code duplication and improve maintainability.

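
A rough sketch of what that consolidation could look like inside the same run step (the output key names, in particular "cpu" for the plain image, are assumptions and would need to match what the step currently emits):

# Hypothetical loop over (output key, tag suffix) pairs instead of one copy-pasted block per variant.
for entry in "cpu:" "cuda:-cuda" "vllm-cuda:-vllm-cuda"; do
  key="${entry%%:*}"
  suffix="${entry#*:}"
  echo "${key}<<EOF" >> "$GITHUB_OUTPUT"
  echo "docker/model-runner:${{ inputs.releaseTag }}${suffix}" >> "$GITHUB_OUTPUT"
  if [ "${{ inputs.pushLatest }}" == "true" ]; then
    echo "docker/model-runner:latest${suffix}" >> "$GITHUB_OUTPUT"
  fi
  echo 'EOF' >> "$GITHUB_OUTPUT"
done
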
- name: Log in to DockerHub
uses: docker/login-action@v3
@@ -78,6 +89,7 @@
uses: docker/build-push-action@v5
with:
file: Dockerfile
target: final-llamacpp
platforms: linux/amd64, linux/arm64
build-args: |
"LLAMA_SERVER_VERSION=${{ inputs.llamaServerVersion }}"
@@ -90,6 +102,7 @@
uses: docker/build-push-action@v5
with:
file: Dockerfile
target: final-llamacpp
platforms: linux/amd64, linux/arm64
build-args: |
"LLAMA_SERVER_VERSION=${{ inputs.llamaServerVersion }}"
@@ -99,3 +112,19 @@
sbom: true
provenance: mode=max
tags: ${{ steps.tags.outputs.cuda }}

- name: Build vLLM CUDA image
uses: docker/build-push-action@v5
with:
file: Dockerfile
target: final-vllm
platforms: linux/amd64
build-args: |
"LLAMA_SERVER_VERSION=${{ inputs.llamaServerVersion }}"
"LLAMA_SERVER_VARIANT=cuda"
"BASE_IMAGE=nvidia/cuda:12.9.0-runtime-ubuntu24.04"
"VLLM_VERSION=${{ inputs.vllmVersion }}"
push: true
sbom: true
provenance: mode=max
tags: ${{ steps.tags.outputs.vllm-cuda }}
33 changes: 29 additions & 4 deletions Dockerfile
@@ -35,7 +35,7 @@ RUN --mount=type=cache,target=/go/pkg/mod \
FROM docker/docker-model-backend-llamacpp:${LLAMA_SERVER_VERSION}-${LLAMA_SERVER_VARIANT} AS llama-server

# --- Final image ---
FROM docker.io/${BASE_IMAGE} AS final
FROM docker.io/${BASE_IMAGE} AS llamacpp

ARG LLAMA_SERVER_VARIANT

@@ -55,9 +55,6 @@ RUN mkdir -p /var/run/model-runner /app/bin /models && \
chown -R modelrunner:modelrunner /var/run/model-runner /app /models && \
chmod -R 755 /models

# Copy the built binary from builder
COPY --from=builder /app/model-runner /app/model-runner

# Copy the llama.cpp binary from the llama-server stage
ARG LLAMA_BINARY_PATH
COPY --from=llama-server ${LLAMA_BINARY_PATH}/ /app/.
@@ -77,3 +74,31 @@ ENV LD_LIBRARY_PATH=/app/lib
LABEL com.docker.desktop.service="model-runner"

ENTRYPOINT ["/app/model-runner"]

# --- vLLM variant ---
FROM llamacpp AS vllm

ARG VLLM_VERSION

USER root

RUN apt update && apt install -y python3 python3-venv python3-dev curl ca-certificates build-essential && rm -rf /var/lib/apt/lists/*

RUN mkdir -p /opt/vllm-env && chown -R modelrunner:modelrunner /opt/vllm-env

USER modelrunner

# Install uv and vLLM as modelrunner user
RUN curl -LsSf https://astral.sh/uv/install.sh | sh \
&& ~/.local/bin/uv venv --python /usr/bin/python3 /opt/vllm-env \
Contributor

@ericcurtin ericcurtin Oct 21, 2025

If we change this to copy from the vllm/vllm-openai:v0.11.0 container, we get DGX Spark support (I know I suggested this less hacky approach, apologies; I didn't realize the container has aarch64 builds and this approach doesn't appear to).

This could be a follow-on PR too.

Contributor

@ericcurtin ericcurtin Oct 21, 2025

Or, even better, install the wheels from here:

https://wheels.vllm.ai/b8b302cde434df8c9289a2b465406b47ebab1c2d/vllm/

That commit SHA is the 0.11.0 one.

They tipped me off in vLLM stack that they build CUDA x86_64 and aarch64 wheels for every commit, so this is the same thing, but with an aarch64 version as well.

It would be better than the hacky container copy (which is prone to errors: missing files, OS mismatches, library version mismatches).

Contributor

@ericcurtin ericcurtin Oct 21, 2025

This is a way to get that programmatically:

$ git rev-list -n 1 v0.11.0
b8b302cde434df8c9289a2b465406b47ebab1c2d
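
Putting the last two suggestions together, the install line might look roughly like the sketch below. This assumes the per-commit index at wheels.vllm.ai can be used as a pip extra index and that the SHA is resolved from a vLLM checkout (or simply hard-coded); neither is verified here.

# Hypothetical: resolve the commit behind the release tag (requires a vLLM git checkout),
# then install from the per-commit wheel index instead of PyPI.
VLLM_COMMIT="$(git rev-list -n 1 "v${VLLM_VERSION}")"
~/.local/bin/uv pip install --python /opt/vllm-env/bin/python \
  --extra-index-url "https://wheels.vllm.ai/${VLLM_COMMIT}/" \
  "vllm==${VLLM_VERSION}"
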

&& ~/.local/bin/uv pip install --python /opt/vllm-env/bin/python "vllm==${VLLM_VERSION}"

RUN /opt/vllm-env/bin/python -c "import vllm; print(vllm.__version__)" > /opt/vllm-env/version

FROM llamacpp AS final-llamacpp
# Copy the built binary from builder
COPY --from=builder /app/model-runner /app/model-runner

FROM vllm AS final-vllm
# Copy the built binary from builder
COPY --from=builder /app/model-runner /app/model-runner
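
For local testing, a buildx invocation along the following lines should produce the vLLM variant; the tag name is illustrative and the argument values simply mirror the release workflow above.

# Sketch only: LLAMA_SERVER_VERSION must be set to an existing
# docker/docker-model-backend-llamacpp tag.
docker buildx build \
  --target final-vllm \
  --platform linux/amd64 \
  --build-arg LLAMA_SERVER_VERSION="${LLAMA_SERVER_VERSION:?set to a llama-server release}" \
  --build-arg LLAMA_SERVER_VARIANT=cuda \
  --build-arg BASE_IMAGE=nvidia/cuda:12.9.0-runtime-ubuntu24.04 \
  --build-arg VLLM_VERSION=0.11.0 \
  -f Dockerfile -t model-runner:vllm-cuda .
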
1 change: 1 addition & 0 deletions cmd/cli/commands/backend.go
@@ -13,6 +13,7 @@ import (
var ValidBackends = map[string]bool{
"llama.cpp": true,
"openai": true,
"vllm": true,
}

// validateBackend checks if the provided backend is valid
2 changes: 1 addition & 1 deletion cmd/cli/docs/reference/docker_model_list.yaml
@@ -8,7 +8,7 @@ plink: docker_model.yaml
options:
- option: backend
value_type: string
description: Specify the backend to use (llama.cpp, openai)
description: Specify the backend to use (llama.cpp, openai, vllm)
deprecated: false
hidden: true
experimental: false
2 changes: 1 addition & 1 deletion cmd/cli/docs/reference/docker_model_run.yaml
@@ -12,7 +12,7 @@ plink: docker_model.yaml
options:
- option: backend
value_type: string
description: Specify the backend to use (llama.cpp, openai)
description: Specify the backend to use (llama.cpp, openai, vllm)
deprecated: false
hidden: true
experimental: false
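
Assuming the hidden --backend flag is plumbed through as these reference docs describe, selecting the new backend from the CLI would look roughly like this (the model reference is a placeholder; vLLM expects a safetensors model and is Linux-only per the commits above):

# Hypothetical invocation; the flag is marked hidden/experimental in the reference docs.
docker model run --backend vllm myorg/my-safetensors-model "Hello"
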
13 changes: 12 additions & 1 deletion main.go
@@ -14,6 +14,7 @@ import (
"github.com/docker/model-runner/pkg/gpuinfo"
"github.com/docker/model-runner/pkg/inference"
"github.com/docker/model-runner/pkg/inference/backends/llamacpp"
"github.com/docker/model-runner/pkg/inference/backends/vllm"
"github.com/docker/model-runner/pkg/inference/config"
"github.com/docker/model-runner/pkg/inference/memory"
"github.com/docker/model-runner/pkg/inference/models"
@@ -119,9 +120,19 @@ func main() {

memEstimator.SetDefaultBackend(llamaCppBackend)

vllmBackend, err := vllm.New(
log,
modelManager,
log.WithFields(logrus.Fields{"component": "vllm"}),
nil,
)
if err != nil {
log.Fatalf("unable to initialize %s backend: %v", vllm.Name, err)
}

scheduler := scheduling.NewScheduler(
log,
map[string]inference.Backend{llamacpp.Name: llamaCppBackend},
map[string]inference.Backend{llamacpp.Name: llamaCppBackend, vllm.Name: vllmBackend},
llamaCppBackend,
modelManager,
http.DefaultClient,
13 changes: 11 additions & 2 deletions pkg/distribution/distribution/client.go
@@ -6,15 +6,17 @@ import (
"fmt"
"io"
"net/http"
"slices"

"github.com/docker/model-runner/pkg/distribution/internal/utils"
"github.com/sirupsen/logrus"

"github.com/docker/model-runner/pkg/distribution/internal/progress"
"github.com/docker/model-runner/pkg/distribution/internal/store"
"github.com/docker/model-runner/pkg/distribution/internal/utils"
"github.com/docker/model-runner/pkg/distribution/registry"
"github.com/docker/model-runner/pkg/distribution/tarball"
"github.com/docker/model-runner/pkg/distribution/types"
"github.com/docker/model-runner/pkg/inference/platform"
)

// Client provides model distribution functionality
@@ -408,6 +410,13 @@ func (c *Client) GetBundle(ref string) (types.ModelBundle, error) {
return c.store.BundleForModel(ref)
}

func GetSupportedFormats() []types.Format {
if platform.SupportsVLLM() {
return []types.Format{types.FormatGGUF, types.FormatSafetensors}
}
return []types.Format{types.FormatGGUF}
}

func checkCompat(image types.ModelArtifact) error {
manifest, err := image.Manifest()
if err != nil {
@@ -423,7 +432,7 @@ func checkCompat(image types.ModelArtifact) error {
return fmt.Errorf("reading model config: %w", err)
}

if config.Format == types.FormatSafetensors {
if !slices.Contains(GetSupportedFormats(), config.Format) {
return ErrUnsupportedFormat
}

17 changes: 13 additions & 4 deletions pkg/distribution/distribution/client_test.go
@@ -26,6 +26,7 @@ import (
"github.com/docker/model-runner/pkg/distribution/internal/progress"
"github.com/docker/model-runner/pkg/distribution/internal/safetensors"
mdregistry "github.com/docker/model-runner/pkg/distribution/registry"
"github.com/docker/model-runner/pkg/inference/platform"
)

var (
@@ -418,7 +419,7 @@ func TestClientPullModel(t *testing.T) {
}
})

t.Run("pull safetensors model returns error", func(t *testing.T) {
t.Run("pull safetensors model returns error on unsupported platforms", func(t *testing.T) {
// Create temp directory for the safetensors file
tempDir, err := os.MkdirTemp("", "safetensors-test-*")
if err != nil {
Expand Down Expand Up @@ -461,10 +462,18 @@ func TestClientPullModel(t *testing.T) {
t.Fatalf("Failed to create test client: %v", err)
}

// Try to pull the safetensors model - should fail with ErrUnsupportedFormat
// Try to pull the safetensors model
err = testClient.PullModel(context.Background(), tag, nil)
if !errors.Is(err, ErrUnsupportedFormat) {
t.Fatalf("Expected ErrUnsupportedFormat, got: %v", err)
if platform.SupportsVLLM() {
// On Linux, safetensors should be supported
if err != nil {
t.Fatalf("Expected no error on Linux, got: %v", err)
}
} else {
// On non-Linux, should fail with ErrUnsupportedFormat
if !errors.Is(err, ErrUnsupportedFormat) {
t.Fatalf("Expected ErrUnsupportedFormat on non-Linux platforms, got: %v", err)
}
}
})

2 changes: 1 addition & 1 deletion pkg/inference/backend.go
@@ -84,7 +84,7 @@ type Backend interface {
// to be loaded. Backends should not load multiple models at once and should
// instead load only the specified model. Backends should still respond to
// OpenAI API requests for other models with a 421 error code.
Run(ctx context.Context, socket, model string, mode BackendMode, config *BackendConfiguration) error
Run(ctx context.Context, socket, model string, modelRef string, mode BackendMode, config *BackendConfiguration) error
// Status returns a description of the backend's state.
Status() string
// GetDiskUsage returns the disk usage of the backend.
2 changes: 1 addition & 1 deletion pkg/inference/backends/llamacpp/llamacpp.go
@@ -133,7 +133,7 @@ func (l *llamaCpp) Install(ctx context.Context, httpClient *http.Client) error {
}

// Run implements inference.Backend.Run.
func (l *llamaCpp) Run(ctx context.Context, socket, model string, mode inference.BackendMode, config *inference.BackendConfiguration) error {
func (l *llamaCpp) Run(ctx context.Context, socket, model string, _ string, mode inference.BackendMode, config *inference.BackendConfiguration) error {
bundle, err := l.modelManager.GetBundle(model)
if err != nil {
return fmt.Errorf("failed to get model: %w", err)
2 changes: 1 addition & 1 deletion pkg/inference/backends/mlx/mlx.go
@@ -49,7 +49,7 @@ func (m *mlx) Install(ctx context.Context, httpClient *http.Client) error {
}

// Run implements inference.Backend.Run.
func (m *mlx) Run(ctx context.Context, socket, model string, mode inference.BackendMode, config *inference.BackendConfiguration) error {
func (m *mlx) Run(ctx context.Context, socket, model string, modelRef string, mode inference.BackendMode, config *inference.BackendConfiguration) error {
// TODO: Implement.
m.log.Warn("MLX backend is not yet supported")
return errors.New("not implemented")