From 4e188f6faf3b7ebbc17c8e80da1def8c00c73b88 Mon Sep 17 00:00:00 2001 From: Stephen Shao Date: Tue, 14 Apr 2026 13:34:33 -0500 Subject: [PATCH 1/7] feat(vllm): add Gemma 4 models, image, and ROCm serving recipes - Register pyt_vllm_gemma-4-26b-a4b-it and pyt_vllm_gemma-4-31b-it in models.json (gemma4 Docker stack). - Add docker/pyt_vllm_gemma4.ubuntu.amd.Dockerfile from vllm/vllm-openai-rocm:gemma4 with transformers 5.5.0. - Extend scripts/vllm/configs/default.yaml with Gemma 4 serving blocks (TRITON_ATTN, gfx942 float16; 26B MoE disables AITER fused MoE). - Quote JSON-like extra_args in run_vllm.py (shlex) for --limit-mm-per-prompt with existing --flag YAML keys. - Document Gemma 4 in benchmark/vllm/README.md. --- benchmark/vllm/README.md | 26 +++++++++++- docker/pyt_vllm_gemma4.ubuntu.amd.Dockerfile | 42 ++++++++++++++++++++ models.json | 38 ++++++++++++++++++ scripts/vllm/configs/default.yaml | 41 +++++++++++++++++++ scripts/vllm/run_vllm.py | 12 +++++- 5 files changed, 157 insertions(+), 2 deletions(-) create mode 100644 docker/pyt_vllm_gemma4.ubuntu.amd.Dockerfile diff --git a/benchmark/vllm/README.md b/benchmark/vllm/README.md index 522a104..b4e9782 100644 --- a/benchmark/vllm/README.md +++ b/benchmark/vllm/README.md @@ -61,6 +61,12 @@ The following command pulls the Docker image from Docker Hub. docker pull vllm/vllm-openai-rocm:v0.17.1 ``` +For Gemma 4, use the Gemma4-tagged image (also referenced by [`docker/pyt_vllm_gemma4.ubuntu.amd.Dockerfile`](../../docker/pyt_vllm_gemma4.ubuntu.amd.Dockerfile)): + +```sh +docker pull vllm/vllm-openai-rocm:gemma4 +``` + ### MAD-integrated benchmarking Clone the ROCm Model Automation and Dashboarding (MAD) repository to a local directory and install the required packages on the host machine. @@ -86,7 +92,17 @@ users can also directly run the vLLm benchmark scripts and change the benchmarki #### Available models >[!NOTE] ->The MXFP4 models are only supported on the gfx950 architecture i.e. MI350X/MI355X accelerators. 
+>The MXFP4 models are only supported on the gfx950 architecture i.e. MI350X/MI355X accelerators. + +>[!NOTE] +>Gemma 4 models (`pyt_vllm_gemma-4-*`) are built from `vllm/vllm-openai-rocm:gemma4` (see [`docker/pyt_vllm_gemma4.ubuntu.amd.Dockerfile`](../../docker/pyt_vllm_gemma4.ubuntu.amd.Dockerfile)). Accept Google’s Gemma license on Hugging Face and set `MAD_SECRETS_HFTOKEN` for gated weight downloads. + +Serving recipes for Gemma 4 live in [`scripts/vllm/configs/default.yaml`](../../scripts/vllm/configs/default.yaml). Both Gemma 4 entries use **tensor parallel size 1**, **`TRITON_ATTN`**, **`float16` on gfx942** (via `arch_overrides`), **`--max-model-len` 32768**, text-only multimodal limits (`--limit-mm-per-prompt`), and **`VLLM_ROCM_USE_AITER=1`** where supported. + +| Model | Notes | +| ----- | ----- | +| **google/gemma-4-31B-it** | Dense instruct. Full serving sweep: **`max_concurrency` 1, 8, 32, 128** (four cold starts). | +| **google/gemma-4-26B-A4B-it** | Sparse MoE (“A4B”). **AITER fused MoE is disabled** via **`VLLM_ROCM_USE_AITER_MOE=0`** so MoE runs on the **Triton** path. **Concurrency sweep is narrowed to 1 and 8** only for typical MAD Docker memory limits. 
| | MAD model name | Model repo | | -------------------------------------- | -------------------------------------- | @@ -112,6 +128,8 @@ users can also directly run the vLLm benchmark scripts and change the benchmarki | pyt_vllm_mixtral-8x22b | [mistralai/Mixtral-8x22B-Instruct-v0.1](https://hugggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1) | | pyt_vllm_mixtral-8x22b_fp8 | [amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV](https://hugggingface.co/amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV) | | pyt_vllm_phi-4 | [microsoft/phi-4](https://huggingface.co/microsoft/phi-4) | +| pyt_vllm_gemma-4-26b-a4b-it | [google/gemma-4-26B-A4B-it](https://huggingface.co/google/gemma-4-26B-A4B-it) | +| pyt_vllm_gemma-4-31b-it | [google/gemma-4-31B-it](https://huggingface.co/google/gemma-4-31B-it) | | pyt_vllm_qwen3-8b | [Qwen/Qwen3-8B](https://huggingface.co/Qwen/Qwen3-8B) | | pyt_vllm_qwen3-32b | [Qwen/Qwen3-32B](https://huggingface.co/Qwen/Qwen3-32B) | | pyt_vllm_qwen3-30b-a3b | [Qwen/Qwen3-30B-A3B-Thinking-2507](https://huggingface.co/Qwen/Qwen3-30B-A3B-Thinking-2507) | @@ -132,6 +150,8 @@ docker pull vllm/vllm-openai-rocm:v0.17.1 docker run -it --device=/dev/kfd --device=/dev/dri --group-add video --shm-size 16G --security-opt seccomp=unconfined --security-opt apparmor=unconfined --cap-add=SYS_PTRACE -v $(pwd):/workspace --env VLLM_ROCM_USE_AITER=1 --env HUGGINGFACE_HUB_CACHE=/workspace --name test vllm/vllm-openai-rocm:v0.17.1 ``` +For Gemma 4 standalone runs, substitute `vllm/vllm-openai-rocm:gemma4` for the image tag in the `docker run` line above. For **`google/gemma-4-26B-A4B-it`** only, also set **`VLLM_ROCM_USE_AITER_MOE=0`** (same as the MAD `default.yaml` recipe) so MoE does not use AITER’s fused path. + >[!NOTE] >We enable [AITER](https://github.com/ROCm/aiter) during `docker run` via `--env VLLM_ROCM_USE_AITER=1` for best performance >on MI3xx (i.e. gfx942 and gfx950) platforms. If you're using this docker image on other AMD GPUs e.g. 
MI2xx or Radeon, @@ -345,6 +365,10 @@ owners and are only mentioned for informative purposes.    ---------- This release note summarizes notable changes since the previous docker release. +MAD `pyt_vllm_gemma-4-*` configs (see [`default.yaml`](../../scripts/vllm/configs/default.yaml)): +- **gemma-4-26B-A4B-it:** set `VLLM_ROCM_USE_AITER_MOE=0` (Triton MoE); narrowed default `max_concurrency` to **1 8** to avoid OOM on repeated server restarts. +- **gemma-4-31B-it:** unchanged full sweep **1 8 32 128**; no `VLLM_ROCM_USE_AITER_MOE` override. + v0.17.1 release: - Includes documentation and patches for upstream releases. Please track https://github.com/vllm-project/vllm/releases for all future release notes. diff --git a/docker/pyt_vllm_gemma4.ubuntu.amd.Dockerfile b/docker/pyt_vllm_gemma4.ubuntu.amd.Dockerfile new file mode 100644 index 0000000..c5876ad --- /dev/null +++ b/docker/pyt_vllm_gemma4.ubuntu.amd.Dockerfile @@ -0,0 +1,42 @@ +# CONTEXT {'gpu_vendor': 'AMD', 'guest_os': 'UBUNTU'} +############################################################################### +# +# MIT License +# +# Copyright (c) Advanced Micro Devices, Inc. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# +################################################################################# +# Gemma 4 requires a vLLM build with Gemma4 support; see vLLM recipes (Google/Gemma4.md). +ARG BASE_DOCKER=vllm/vllm-openai-rocm:gemma4 +FROM $BASE_DOCKER + +USER root +ENV WORKSPACE_DIR=/workspace +RUN mkdir -p $WORKSPACE_DIR +WORKDIR $WORKSPACE_DIR + +RUN pip3 install --no-cache-dir "transformers==5.5.0" + +# record configuration for posterity +RUN pip3 list + +# Specify entrypoint to override upstream +ENTRYPOINT [""] diff --git a/models.json b/models.json index b24ac6c..e764c52 100644 --- a/models.json +++ b/models.json @@ -487,6 +487,44 @@ "args": "--model_repo Qwen/Qwen3-8B --config configs/extended.yaml" }, + { + "name": "pyt_vllm_gemma-4-26b-a4b-it", + "data": "huggingface", + "dockerfile": "docker/pyt_vllm_gemma4", + "scripts": "scripts/vllm/run.sh", + "n_gpus": "-1", + "owner": "mad.support@amd.com", + "training_precision": "", + "multiple_results": "perf_gemma-4-26B-A4B-it.csv", + "tags": [ + "pyt", + "vllm", + "vllm_extended", + "inference" + ], + "timeout": -1, + "args": + "--model_repo google/gemma-4-26B-A4B-it --config configs/default.yaml" + }, + { + "name": "pyt_vllm_gemma-4-31b-it", + "data": "huggingface", + "dockerfile": "docker/pyt_vllm_gemma4", + "scripts": "scripts/vllm/run.sh", + "n_gpus": "-1", + "owner": "mad.support@amd.com", + "training_precision": "", + "multiple_results": "perf_gemma-4-31B-it.csv", + "tags": [ + "pyt", + "vllm", + "vllm_extended", + "inference" + ], + "timeout": -1, + "args": + "--model_repo google/gemma-4-31B-it --config configs/default.yaml" + }, { "name": "pyt_vllm_qwen3-32b", "data": "huggingface", diff --git a/scripts/vllm/configs/default.yaml 
b/scripts/vllm/configs/default.yaml index 038480d..ce09938 100644 --- a/scripts/vllm/configs/default.yaml +++ b/scripts/vllm/configs/default.yaml @@ -92,6 +92,47 @@ VLLM_ROCM_USE_AITER: 1 extra_args: --attention-backend: ROCM_ATTN + arch_overrides: + gfx942: + dtype: float16 + +## Gemma 4: vLLM recipe recommends 1x MI300-class GPU (BF16); tp 1 for text-only bench +## Use TRITON_ATTN (Gemma4 default); 26B-A4B MoE: VLLM_ROCM_USE_AITER_MOE=0; narrow concurrency for 26B to avoid OOM +- benchmark: serving + model: google/gemma-4-26B-A4B-it + tp: 1 + inp: 1024 + out: 1024 + dtype: auto + max_concurrency: 1 8 + env: + VLLM_ROCM_USE_AITER: 1 + VLLM_ROCM_USE_AITER_MOE: 0 + extra_args: + --attention-backend: TRITON_ATTN + --max-model-len: 32768 + --gpu-memory-utilization: 0.90 + --limit-mm-per-prompt: '{"image":0,"audio":0}' + --async-scheduling: True + arch_overrides: + gfx942: + dtype: float16 + +- benchmark: serving + model: google/gemma-4-31B-it + tp: 1 + inp: 1024 + out: 1024 + dtype: auto + max_concurrency: 1 8 32 128 + env: + VLLM_ROCM_USE_AITER: 1 + extra_args: + --attention-backend: TRITON_ATTN + --max-model-len: 32768 + --gpu-memory-utilization: 0.90 + --limit-mm-per-prompt: '{"image":0,"audio":0}' + --async-scheduling: True arch_overrides: gfx942: dtype: float16 \ No newline at end of file diff --git a/scripts/vllm/run_vllm.py b/scripts/vllm/run_vllm.py index b5b5db0..3d20c08 100644 --- a/scripts/vllm/run_vllm.py +++ b/scripts/vllm/run_vllm.py @@ -34,6 +34,7 @@ import signal import argparse import itertools +import shlex import subprocess from typing import List, Dict @@ -490,7 +491,16 @@ def main(): if isinstance(v, bool): extra_args_str += f" {k}" else: - extra_args_str += f" {k} {v}" + s = str(v) + st = s.strip() + if ( + k == "--limit-mm-per-prompt" + or (st[:1] in "{[") + or any(ch.isspace() for ch in s) + ): + extra_args_str += f" {k} {shlex.quote(s)}" + else: + extra_args_str += f" {k} {v}" config["env"] = env_vars_str config["extra_args"] = extra_args_str 
From b4de892ac504ddc061c32451e365b0dab645e937 Mon Sep 17 00:00:00 2001 From: Stephen Shao Date: Thu, 30 Apr 2026 21:17:36 +0000 Subject: [PATCH 2/7] chore(vllm): bump base image to vllm-openai-rocm v0.20.0, pin transformers>=5.5.0 Co-Authored-By: Claude Sonnet 4 --- docker/pyt_vllm.ubuntu.amd.Dockerfile | 3 +- docker/pyt_vllm_gemma4.ubuntu.amd.Dockerfile | 42 -------------------- 2 files changed, 2 insertions(+), 43 deletions(-) delete mode 100644 docker/pyt_vllm_gemma4.ubuntu.amd.Dockerfile diff --git a/docker/pyt_vllm.ubuntu.amd.Dockerfile b/docker/pyt_vllm.ubuntu.amd.Dockerfile index eb59155..0d6b00c 100644 --- a/docker/pyt_vllm.ubuntu.amd.Dockerfile +++ b/docker/pyt_vllm.ubuntu.amd.Dockerfile @@ -24,10 +24,11 @@ # SOFTWARE. # ################################################################################# -ARG BASE_DOCKER=vllm/vllm-openai-rocm:v0.17.1 +ARG BASE_DOCKER=vllm/vllm-openai-rocm:v0.20.0 FROM $BASE_DOCKER USER root +RUN pip3 install --no-cache-dir "transformers>=5.5.0" ENV WORKSPACE_DIR=/workspace RUN mkdir -p $WORKSPACE_DIR WORKDIR $WORKSPACE_DIR diff --git a/docker/pyt_vllm_gemma4.ubuntu.amd.Dockerfile b/docker/pyt_vllm_gemma4.ubuntu.amd.Dockerfile deleted file mode 100644 index c5876ad..0000000 --- a/docker/pyt_vllm_gemma4.ubuntu.amd.Dockerfile +++ /dev/null @@ -1,42 +0,0 @@ -# CONTEXT {'gpu_vendor': 'AMD', 'guest_os': 'UBUNTU'} -############################################################################### -# -# MIT License -# -# Copyright (c) Advanced Micro Devices, Inc. 
-# -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in all -# copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. -# -################################################################################# -# Gemma 4 requires a vLLM build with Gemma4 support; see vLLM recipes (Google/Gemma4.md). 
-ARG BASE_DOCKER=vllm/vllm-openai-rocm:gemma4 -FROM $BASE_DOCKER - -USER root -ENV WORKSPACE_DIR=/workspace -RUN mkdir -p $WORKSPACE_DIR -WORKDIR $WORKSPACE_DIR - -RUN pip3 install --no-cache-dir "transformers==5.5.0" - -# record configuration for posterity -RUN pip3 list - -# Specify entrypoint to override upstream -ENTRYPOINT [""] From 77f0161681420e3bab10b5568fd117e2542051a5 Mon Sep 17 00:00:00 2001 From: Stephen Shao Date: Thu, 30 Apr 2026 21:17:39 +0000 Subject: [PATCH 3/7] feat(vllm): consolidate Gemma 4 models into standard pyt_vllm stack Co-Authored-By: Claude Sonnet 4 --- models.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/models.json b/models.json index e764c52..6b1c99a 100644 --- a/models.json +++ b/models.json @@ -490,7 +490,7 @@ { "name": "pyt_vllm_gemma-4-26b-a4b-it", "data": "huggingface", - "dockerfile": "docker/pyt_vllm_gemma4", + "dockerfile": "docker/pyt_vllm", "scripts": "scripts/vllm/run.sh", "n_gpus": "-1", "owner": "mad.support@amd.com", @@ -509,7 +509,7 @@ { "name": "pyt_vllm_gemma-4-31b-it", "data": "huggingface", - "dockerfile": "docker/pyt_vllm_gemma4", + "dockerfile": "docker/pyt_vllm", "scripts": "scripts/vllm/run.sh", "n_gpus": "-1", "owner": "mad.support@amd.com", From 76bc2767f8cc9f6655cbf987950b7841aa70704e Mon Sep 17 00:00:00 2001 From: Stephen Shao Date: Thu, 30 Apr 2026 21:17:41 +0000 Subject: [PATCH 4/7] =?UTF-8?q?docs(vllm):=20update=20README=20=E2=80=94?= =?UTF-8?q?=20Gemma=204=20now=20uses=20standard=20pyt=5Fvllm=20image?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-Authored-By: Claude Sonnet 4 --- benchmark/vllm/README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/benchmark/vllm/README.md b/benchmark/vllm/README.md index b4e9782..79046e2 100644 --- a/benchmark/vllm/README.md +++ b/benchmark/vllm/README.md @@ -61,10 +61,10 @@ The following command pulls the Docker image from Docker Hub. 
docker pull vllm/vllm-openai-rocm:v0.17.1 ``` -For Gemma 4, use the Gemma4-tagged image (also referenced by [`docker/pyt_vllm_gemma4.ubuntu.amd.Dockerfile`](../../docker/pyt_vllm_gemma4.ubuntu.amd.Dockerfile)): +Gemma 4 models are served from the standard vLLM image (vLLM ≥0.20.0 ships Transformers v5 with native Gemma 4 support): ```sh -docker pull vllm/vllm-openai-rocm:gemma4 +docker pull vllm/vllm-openai-rocm:v0.20.0 ``` ### MAD-integrated benchmarking @@ -95,7 +95,7 @@ users can also directly run the vLLm benchmark scripts and change the benchmarki >The MXFP4 models are only supported on the gfx950 architecture i.e. MI350X/MI355X accelerators. >[!NOTE] ->Gemma 4 models (`pyt_vllm_gemma-4-*`) are built from `vllm/vllm-openai-rocm:gemma4` (see [`docker/pyt_vllm_gemma4.ubuntu.amd.Dockerfile`](../../docker/pyt_vllm_gemma4.ubuntu.amd.Dockerfile)). Accept Google’s Gemma license on Hugging Face and set `MAD_SECRETS_HFTOKEN` for gated weight downloads. +>Gemma 4 models (`pyt_vllm_gemma-4-*`) use the standard `docker/pyt_vllm` stack (vLLM ≥0.20.0 / Transformers ≥5.5.0). Accept Google’s Gemma license on Hugging Face and set `MAD_SECRETS_HFTOKEN` for gated weight downloads. Serving recipes for Gemma 4 live in [`scripts/vllm/configs/default.yaml`](../../scripts/vllm/configs/default.yaml). Both Gemma 4 entries use **tensor parallel size 1**, **`TRITON_ATTN`**, **`float16` on gfx942** (via `arch_overrides`), **`--max-model-len` 32768**, text-only multimodal limits (`--limit-mm-per-prompt`), and **`VLLM_ROCM_USE_AITER=1`** where supported. 
@@ -150,7 +150,7 @@ docker pull vllm/vllm-openai-rocm:v0.17.1 docker run -it --device=/dev/kfd --device=/dev/dri --group-add video --shm-size 16G --security-opt seccomp=unconfined --security-opt apparmor=unconfined --cap-add=SYS_PTRACE -v $(pwd):/workspace --env VLLM_ROCM_USE_AITER=1 --env HUGGINGFACE_HUB_CACHE=/workspace --name test vllm/vllm-openai-rocm:v0.17.1 ``` -For Gemma 4 standalone runs, substitute `vllm/vllm-openai-rocm:gemma4` for the image tag in the `docker run` line above. For **`google/gemma-4-26B-A4B-it`** only, also set **`VLLM_ROCM_USE_AITER_MOE=0`** (same as the MAD `default.yaml` recipe) so MoE does not use AITER’s fused path. +For Gemma 4 standalone runs use `vllm/vllm-openai-rocm:v0.20.0` (same image as other vLLM models). For **`google/gemma-4-26B-A4B-it`** only, also set **`VLLM_ROCM_USE_AITER_MOE=0`** (same as the MAD `default.yaml` recipe) so MoE does not use AITER’s fused path. >[!NOTE] >We enable [AITER](https://github.com/ROCm/aiter) during `docker run` via `--env VLLM_ROCM_USE_AITER=1` for best performance From b455096d696fdcf39e23980327b995682ca38963 Mon Sep 17 00:00:00 2001 From: Stephen Shao Date: Thu, 30 Apr 2026 21:17:45 +0000 Subject: [PATCH 5/7] fix(vllm): apply shlex.quote to all non-bool extra_args values (was partial) Co-Authored-By: Claude Sonnet 4 --- scripts/vllm/run_vllm.py | 11 +-- tests/vllm/__init__.py | 0 tests/vllm/test_run_vllm_extra_args.py | 95 ++++++++++++++++++++++++++ 3 files changed, 96 insertions(+), 10 deletions(-) create mode 100644 tests/vllm/__init__.py create mode 100644 tests/vllm/test_run_vllm_extra_args.py diff --git a/scripts/vllm/run_vllm.py b/scripts/vllm/run_vllm.py index 3d20c08..8103478 100644 --- a/scripts/vllm/run_vllm.py +++ b/scripts/vllm/run_vllm.py @@ -491,16 +491,7 @@ def main(): if isinstance(v, bool): extra_args_str += f" {k}" else: - s = str(v) - st = s.strip() - if ( - k == "--limit-mm-per-prompt" - or (st[:1] in "{[") - or any(ch.isspace() for ch in s) - ): - extra_args_str += f" {k} 
{shlex.quote(s)}" - else: - extra_args_str += f" {k} {v}" + extra_args_str += f" {k} {shlex.quote(str(v))}" config["env"] = env_vars_str config["extra_args"] = extra_args_str diff --git a/tests/vllm/__init__.py b/tests/vllm/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/vllm/test_run_vllm_extra_args.py b/tests/vllm/test_run_vllm_extra_args.py new file mode 100644 index 0000000..62fa180 --- /dev/null +++ b/tests/vllm/test_run_vllm_extra_args.py @@ -0,0 +1,95 @@ +"""Tests for extra_args quoting in run_vllm.py.""" +import sys +import os +import shlex + +import pytest + + +def build_extra_args_str_old(extra_args: dict) -> str: + """Replicates the OLD selective-quoting logic from run_vllm.py (pre-fix).""" + extra_args_str = "" + for k, v in extra_args.items(): + if isinstance(v, bool): + extra_args_str += f" {k}" + else: + s = str(v) + st = s.strip() + if ( + k == "--limit-mm-per-prompt" + or (st[:1] in "{[") + or any(ch.isspace() for ch in s) + ): + extra_args_str += f" {k} {shlex.quote(s)}" + else: + extra_args_str += f" {k} {s}" + return extra_args_str.strip() + + +def build_extra_args_str_new(extra_args: dict) -> str: + """Replicates the NEW universal-quoting logic (post-fix).""" + extra_args_str = "" + for k, v in extra_args.items(): + if isinstance(v, bool): + extra_args_str += f" {k}" + else: + extra_args_str += f" {k} {shlex.quote(str(v))}" + return extra_args_str.strip() + + +# --- Tests that FAIL with the old logic, PASS with the new --- + +def test_shell_metachar_no_space_is_quoted_by_new(): + """Values with shell metacharacters but no spaces are NOT quoted by old code. + + The old code only quotes when there's whitespace, a JSON-like prefix, or the + --limit-mm-per-prompt key. A value like 'foo;bar' (no space) slips through + unquoted, allowing shell injection. The new code always quotes. 
+ """ + args = {"--some-arg": "foo;bar"} + old = build_extra_args_str_old(args) + new = build_extra_args_str_new(args) + # Old code: no whitespace -> not quoted, semicolon is a live shell metachar + assert old == "--some-arg foo;bar", f"unexpected old output: {old!r}" + # New code: shlex.quote wraps the value in single quotes + assert new == "--some-arg 'foo;bar'", f"unexpected new output: {new!r}" + assert old != new + + +def test_plain_string_with_metachar_is_unquoted_by_old(): + """Old code leaves plain strings with $ unquoted (variable expansion risk).""" + args = {"--trust-remote-code": "yes$HOME"} + old = build_extra_args_str_old(args) + new = build_extra_args_str_new(args) + # Old code: no whitespace, no JSON prefix -> raw string passed to shell + assert old == "--trust-remote-code yes$HOME", f"unexpected old output: {old!r}" + # New code: always quoted + assert new == "--trust-remote-code 'yes$HOME'", f"unexpected new output: {new!r}" + + +# --- Tests that PASS with BOTH old and new logic --- + +def test_json_value_is_quoted(): + args = {"--limit-mm-per-prompt": '{"image":0,"audio":0}'} + result = build_extra_args_str_new(args) + assert result == """--limit-mm-per-prompt '{"image":0,"audio":0}'""" + + +def test_bool_flag_has_no_value(): + args = {"--async-scheduling": True} + result = build_extra_args_str_new(args) + assert result == "--async-scheduling" + + +def test_string_with_space_is_quoted(): + args = {"--served-model-name": "my model"} + result = build_extra_args_str_new(args) + assert result == "--served-model-name 'my model'" + + +def test_plain_safe_scalar_passthrough(): + """shlex.quote does not add quotes to safe alphanumeric values.""" + args = {"--max-model-len": 32768} + result = build_extra_args_str_new(args) + # shlex.quote('32768') == '32768' (no shell quoting needed for pure digits) + assert result == "--max-model-len 32768" From 11d6bd8cd84c38317f9fdb1e698b26bc869f1c7b Mon Sep 17 00:00:00 2001 From: Stephen Shao Date: Fri, 1 May 2026 
02:41:57 +0000 Subject: [PATCH 6/7] fix(vllm): address PR review feedback and enable full concurrency - Remove redundant pip install transformers (v0.20.0 ships with v5) - Delete test_run_vllm_extra_args.py (duplicated inline logic) - Remove --async-scheduling from Gemma 4 configs (on by default) - Enable concurrency 32/128 for gemma-4-26B-A4B-it - Update README to reflect v0.20.0 as the standard base image Co-Authored-By: Claude Opus 4 --- benchmark/vllm/README.md | 22 +++--- docker/pyt_vllm.ubuntu.amd.Dockerfile | 1 - scripts/vllm/configs/default.yaml | 11 +-- tests/vllm/test_run_vllm_extra_args.py | 95 -------------------------- 4 files changed, 14 insertions(+), 115 deletions(-) delete mode 100644 tests/vllm/test_run_vllm_extra_args.py diff --git a/benchmark/vllm/README.md b/benchmark/vllm/README.md index 79046e2..c735530 100644 --- a/benchmark/vllm/README.md +++ b/benchmark/vllm/README.md @@ -14,7 +14,7 @@ This Docker image packages vLLM with PyTorch for AMD Instinct™ MI300X, MI325X, accelerators. It includes: - ✅ ROCm™ 7.0.0 -- ✅ vLLM 0.17.1 +- ✅ vLLM 0.20.0 - ✅ PyTorch 2.9.0 (2.9.0a0+git1c57644) - ✅ hipBLASLt 1.0 @@ -57,12 +57,6 @@ To override the benchmark configs, specify a certain benchmark to use, or add yo The following command pulls the Docker image from Docker Hub. -```sh -docker pull vllm/vllm-openai-rocm:v0.17.1 -``` - -Gemma 4 models are served from the standard vLLM image (vLLM ≥0.20.0 ships Transformers v5 with native Gemma 4 support): - ```sh docker pull vllm/vllm-openai-rocm:v0.20.0 ``` @@ -95,14 +89,14 @@ users can also directly run the vLLm benchmark scripts and change the benchmarki >The MXFP4 models are only supported on the gfx950 architecture i.e. MI350X/MI355X accelerators. >[!NOTE] ->Gemma 4 models (`pyt_vllm_gemma-4-*`) use the standard `docker/pyt_vllm` stack (vLLM ≥0.20.0 / Transformers ≥5.5.0). Accept Google’s Gemma license on Hugging Face and set `MAD_SECRETS_HFTOKEN` for gated weight downloads. 
+>Gemma 4 models (`pyt_vllm_gemma-4-*`) use the standard `docker/pyt_vllm` stack (vLLM 0.20.0, which bundles Transformers v5 with native Gemma 4 support). Accept Google’s Gemma license on Hugging Face and set `MAD_SECRETS_HFTOKEN` for gated weight downloads. -Serving recipes for Gemma 4 live in [`scripts/vllm/configs/default.yaml`](../../scripts/vllm/configs/default.yaml). Both Gemma 4 entries use **tensor parallel size 1**, **`TRITON_ATTN`**, **`float16` on gfx942** (via `arch_overrides`), **`--max-model-len` 32768**, text-only multimodal limits (`--limit-mm-per-prompt`), and **`VLLM_ROCM_USE_AITER=1`** where supported. +Serving recipes for Gemma 4 live in [`scripts/vllm/configs/default.yaml`](../../scripts/vllm/configs/default.yaml). Both Gemma 4 entries use **tensor parallel size 1**, **`TRITON_ATTN`**, **`float16` on gfx942** (via `arch_overrides`), **`--max-model-len` 32768**, text-only multimodal limits (`--limit-mm-per-prompt`), and **`VLLM_ROCM_USE_AITER=1`**. | Model | Notes | | ----- | ----- | | **google/gemma-4-31B-it** | Dense instruct. Full serving sweep: **`max_concurrency` 1, 8, 32, 128** (four cold starts). | -| **google/gemma-4-26B-A4B-it** | Sparse MoE (“A4B”). **AITER fused MoE is disabled** via **`VLLM_ROCM_USE_AITER_MOE=0`** so MoE runs on the **Triton** path. **Concurrency sweep is narrowed to 1 and 8** only for typical MAD Docker memory limits. | +| **google/gemma-4-26B-A4B-it** | Sparse MoE (“A4B”). **AITER fused MoE is disabled** via **`VLLM_ROCM_USE_AITER_MOE=0`** so MoE runs on the **Triton** path. Full concurrency sweep: **`max_concurrency` 1, 8, 32, 128**. | | MAD model name | Model repo | | -------------------------------------- | -------------------------------------- | @@ -145,12 +139,12 @@ Users also can run the benchmark tool after they launch a Docker container. 
For #### Docker launch ```sh -docker pull vllm/vllm-openai-rocm:v0.17.1 +docker pull vllm/vllm-openai-rocm:v0.20.0 -docker run -it --device=/dev/kfd --device=/dev/dri --group-add video --shm-size 16G --security-opt seccomp=unconfined --security-opt apparmor=unconfined --cap-add=SYS_PTRACE -v $(pwd):/workspace --env VLLM_ROCM_USE_AITER=1 --env HUGGINGFACE_HUB_CACHE=/workspace --name test vllm/vllm-openai-rocm:v0.17.1 +docker run -it --device=/dev/kfd --device=/dev/dri --group-add video --shm-size 16G --security-opt seccomp=unconfined --security-opt apparmor=unconfined --cap-add=SYS_PTRACE -v $(pwd):/workspace --env VLLM_ROCM_USE_AITER=1 --env HUGGINGFACE_HUB_CACHE=/workspace --name test vllm/vllm-openai-rocm:v0.20.0 ``` -For Gemma 4 standalone runs use `vllm/vllm-openai-rocm:v0.20.0` (same image as other vLLM models). For **`google/gemma-4-26B-A4B-it`** only, also set **`VLLM_ROCM_USE_AITER_MOE=0`** (same as the MAD `default.yaml` recipe) so MoE does not use AITER’s fused path. +For **`google/gemma-4-26B-A4B-it`** standalone runs, also set **`VLLM_ROCM_USE_AITER_MOE=0`** (same as the MAD `default.yaml` recipe) so MoE does not use AITER’s fused path. >[!NOTE] >We enable [AITER](https://github.com/ROCm/aiter) during `docker run` via `--env VLLM_ROCM_USE_AITER=1` for best performance @@ -366,7 +360,7 @@ owners and are only mentioned for informative purposes.    This release note summarizes notable changes since the previous docker release. MAD `pyt_vllm_gemma-4-*` configs (see [`default.yaml`](../../scripts/vllm/configs/default.yaml)): -- **gemma-4-26B-A4B-it:** set `VLLM_ROCM_USE_AITER_MOE=0` (Triton MoE); narrowed default `max_concurrency` to **1 8** to avoid OOM on repeated server restarts. +- **gemma-4-26B-A4B-it:** set `VLLM_ROCM_USE_AITER_MOE=0` (Triton MoE); full concurrency sweep **1 8 32 128**. - **gemma-4-31B-it:** unchanged full sweep **1 8 32 128**; no `VLLM_ROCM_USE_AITER_MOE` override. 
v0.17.1 release: diff --git a/docker/pyt_vllm.ubuntu.amd.Dockerfile b/docker/pyt_vllm.ubuntu.amd.Dockerfile index 0d6b00c..387a798 100644 --- a/docker/pyt_vllm.ubuntu.amd.Dockerfile +++ b/docker/pyt_vllm.ubuntu.amd.Dockerfile @@ -28,7 +28,6 @@ ARG BASE_DOCKER=vllm/vllm-openai-rocm:v0.20.0 FROM $BASE_DOCKER USER root -RUN pip3 install --no-cache-dir "transformers>=5.5.0" ENV WORKSPACE_DIR=/workspace RUN mkdir -p $WORKSPACE_DIR WORKDIR $WORKSPACE_DIR diff --git a/scripts/vllm/configs/default.yaml b/scripts/vllm/configs/default.yaml index ce09938..e556918 100644 --- a/scripts/vllm/configs/default.yaml +++ b/scripts/vllm/configs/default.yaml @@ -97,14 +97,17 @@ dtype: float16 ## Gemma 4: vLLM recipe recommends 1x MI300-class GPU (BF16); tp 1 for text-only bench -## Use TRITON_ATTN (Gemma4 default); 26B-A4B MoE: VLLM_ROCM_USE_AITER_MOE=0; narrow concurrency for 26B to avoid OOM +## Use TRITON_ATTN (Gemma4 default); ROCM_ATTN does not support head_dim 512 without extra work +## --limit-mm-per-prompt is JSON for vLLM (json.loads), not image=0,audio=0 +## 26B-A4B is MoE: disable AITER fused MoE only (unsupported CK GEMM for this shape); other AITER features unchanged +## Full concurrency sweep (1 8 32 128) - benchmark: serving model: google/gemma-4-26B-A4B-it tp: 1 inp: 1024 out: 1024 dtype: auto - max_concurrency: 1 8 + max_concurrency: 1 8 32 128 env: VLLM_ROCM_USE_AITER: 1 VLLM_ROCM_USE_AITER_MOE: 0 @@ -113,7 +116,6 @@ --max-model-len: 32768 --gpu-memory-utilization: 0.90 --limit-mm-per-prompt: '{"image":0,"audio":0}' - --async-scheduling: True arch_overrides: gfx942: dtype: float16 @@ -132,7 +134,6 @@ --max-model-len: 32768 --gpu-memory-utilization: 0.90 --limit-mm-per-prompt: '{"image":0,"audio":0}' - --async-scheduling: True arch_overrides: gfx942: - dtype: float16 \ No newline at end of file + dtype: float16 diff --git a/tests/vllm/test_run_vllm_extra_args.py b/tests/vllm/test_run_vllm_extra_args.py deleted file mode 100644 index 62fa180..0000000 --- 
a/tests/vllm/test_run_vllm_extra_args.py +++ /dev/null @@ -1,95 +0,0 @@ -"""Tests for extra_args quoting in run_vllm.py.""" -import sys -import os -import shlex - -import pytest - - -def build_extra_args_str_old(extra_args: dict) -> str: - """Replicates the OLD selective-quoting logic from run_vllm.py (pre-fix).""" - extra_args_str = "" - for k, v in extra_args.items(): - if isinstance(v, bool): - extra_args_str += f" {k}" - else: - s = str(v) - st = s.strip() - if ( - k == "--limit-mm-per-prompt" - or (st[:1] in "{[") - or any(ch.isspace() for ch in s) - ): - extra_args_str += f" {k} {shlex.quote(s)}" - else: - extra_args_str += f" {k} {s}" - return extra_args_str.strip() - - -def build_extra_args_str_new(extra_args: dict) -> str: - """Replicates the NEW universal-quoting logic (post-fix).""" - extra_args_str = "" - for k, v in extra_args.items(): - if isinstance(v, bool): - extra_args_str += f" {k}" - else: - extra_args_str += f" {k} {shlex.quote(str(v))}" - return extra_args_str.strip() - - -# --- Tests that FAIL with the old logic, PASS with the new --- - -def test_shell_metachar_no_space_is_quoted_by_new(): - """Values with shell metacharacters but no spaces are NOT quoted by old code. - - The old code only quotes when there's whitespace, a JSON-like prefix, or the - --limit-mm-per-prompt key. A value like 'foo;bar' (no space) slips through - unquoted, allowing shell injection. The new code always quotes. 
- """ - args = {"--some-arg": "foo;bar"} - old = build_extra_args_str_old(args) - new = build_extra_args_str_new(args) - # Old code: no whitespace -> not quoted, semicolon is a live shell metachar - assert old == "--some-arg foo;bar", f"unexpected old output: {old!r}" - # New code: shlex.quote wraps the value in single quotes - assert new == "--some-arg 'foo;bar'", f"unexpected new output: {new!r}" - assert old != new - - -def test_plain_string_with_metachar_is_unquoted_by_old(): - """Old code leaves plain strings with $ unquoted (variable expansion risk).""" - args = {"--trust-remote-code": "yes$HOME"} - old = build_extra_args_str_old(args) - new = build_extra_args_str_new(args) - # Old code: no whitespace, no JSON prefix -> raw string passed to shell - assert old == "--trust-remote-code yes$HOME", f"unexpected old output: {old!r}" - # New code: always quoted - assert new == "--trust-remote-code 'yes$HOME'", f"unexpected new output: {new!r}" - - -# --- Tests that PASS with BOTH old and new logic --- - -def test_json_value_is_quoted(): - args = {"--limit-mm-per-prompt": '{"image":0,"audio":0}'} - result = build_extra_args_str_new(args) - assert result == """--limit-mm-per-prompt '{"image":0,"audio":0}'""" - - -def test_bool_flag_has_no_value(): - args = {"--async-scheduling": True} - result = build_extra_args_str_new(args) - assert result == "--async-scheduling" - - -def test_string_with_space_is_quoted(): - args = {"--served-model-name": "my model"} - result = build_extra_args_str_new(args) - assert result == "--served-model-name 'my model'" - - -def test_plain_safe_scalar_passthrough(): - """shlex.quote does not add quotes to safe alphanumeric values.""" - args = {"--max-model-len": 32768} - result = build_extra_args_str_new(args) - # shlex.quote('32768') == '32768' (no shell quoting needed for pure digits) - assert result == "--max-model-len 32768" From cac3f030eb9182cc69642ccc3d6829efd2292fa7 Mon Sep 17 00:00:00 2001 From: Stephen Shao Date: Fri, 1 May 2026 
def build_extra_args_str(extra_args: Dict) -> str:
    """Render a mapping of CLI flags into a shell-safe argument string.

    Args:
        extra_args: Mapping of flag name (e.g. ``"--max-model-len"``) to its
            value, as loaded from the YAML config. May be empty or ``None``
            (an ``extra_args:`` key with nothing under it loads as ``None``).

    Returns:
        The flags joined by single spaces, or ``""`` when there is nothing
        to emit. Per value type:

        * ``True``  -> bare flag (``--enable-prefix-caching``)
        * ``False`` -> flag omitted entirely
        * ``None``  -> bare flag; a YAML key with no value loads as ``None``
          and must not be rendered as the literal string ``"None"``
        * anything else -> ``<flag> <value>`` with the value passed through
          ``shlex.quote`` so JSON, whitespace and shell metacharacters
          survive the shell unchanged
    """
    if not extra_args:
        return ""

    parts = []
    for flag, value in extra_args.items():
        flag = str(flag)  # join() below requires strings
        # bool must be tested before the generic branch: str(True) would
        # otherwise be emitted as a value.
        if isinstance(value, bool):
            if value:
                parts.append(flag)
        elif value is None:
            parts.append(flag)
        else:
            parts.append(f"{flag} {shlex.quote(str(value))}")
    return " ".join(parts)
"""Tests for ``build_extra_args_str`` in ``scripts/vllm/run_vllm.py``.

Quoting tests pin the exact rendered string and also round-trip it through
``shlex.split`` so that a value is proven to reach the process as a single,
unmodified argument (merely checking that a quote character is present
would also pass for incorrectly quoted output).
"""
import os
import shlex
import sys

# run_vllm.py is a script, not an installed package; make it importable.
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", "scripts", "vllm"))
from run_vllm import build_extra_args_str


def test_simple_string_value():
    result = build_extra_args_str({"--attention-backend": "TRITON_ATTN"})
    assert result == "--attention-backend TRITON_ATTN"


def test_json_value_is_quoted():
    value = '{"image":0,"audio":0}'
    result = build_extra_args_str({"--limit-mm-per-prompt": value})
    assert result == f"--limit-mm-per-prompt '{value}'"
    assert shlex.split(result) == ["--limit-mm-per-prompt", value]


def test_bool_true_emits_flag():
    result = build_extra_args_str({"--enable-prefix-caching": True})
    assert result == "--enable-prefix-caching"


def test_bool_false_skips_flag():
    result = build_extra_args_str({"--enable-prefix-caching": False})
    assert result == ""


def test_numeric_value():
    result = build_extra_args_str({"--max-model-len": 32768})
    assert result == "--max-model-len 32768"


def test_mixed_args():
    json_value = '{"image":0,"audio":0}'
    args = {
        "--attention-backend": "TRITON_ATTN",
        "--enable-prefix-caching": True,
        "--disable-log-stats": False,
        "--max-model-len": 32768,
        "--limit-mm-per-prompt": json_value,
    }
    result = build_extra_args_str(args)
    # Order follows dict insertion order; the False flag is dropped.
    assert shlex.split(result) == [
        "--attention-backend", "TRITON_ATTN",
        "--enable-prefix-caching",
        "--max-model-len", "32768",
        "--limit-mm-per-prompt", json_value,
    ]
    assert "--disable-log-stats" not in result


def test_empty_args():
    assert build_extra_args_str({}) == ""


def test_value_with_spaces_is_quoted():
    value = "path with spaces/template.jinja"
    result = build_extra_args_str({"--chat-template": value})
    assert result == "--chat-template 'path with spaces/template.jinja'"
    assert shlex.split(result) == ["--chat-template", value]


def test_shell_metacharacters_are_quoted():
    value = "value;rm -rf /"
    result = build_extra_args_str({"--some-arg": value})
    assert result == "--some-arg 'value;rm -rf /'"
    # The whole value must arrive as one argument, not as a second command.
    assert shlex.split(result) == ["--some-arg", value]