From 4e188f6faf3b7ebbc17c8e80da1def8c00c73b88 Mon Sep 17 00:00:00 2001
From: Stephen Shao <yu.shao@amd.com>
Date: Tue, 14 Apr 2026 13:34:33 -0500
Subject: [PATCH 1/7] feat(vllm): add Gemma 4 models, image, and ROCm serving
 recipes

- Register pyt_vllm_gemma-4-26b-a4b-it and pyt_vllm_gemma-4-31b-it in models.json (gemma4 Docker stack).
- Add docker/pyt_vllm_gemma4.ubuntu.amd.Dockerfile from vllm/vllm-openai-rocm:gemma4 with transformers 5.5.0.
- Extend scripts/vllm/configs/default.yaml with Gemma 4 serving blocks (TRITON_ATTN, gfx942 float16; 26B MoE disables AITER fused MoE).
- Shell-quote (shlex.quote) extra_args values in run_vllm.py that are JSON-like or contain whitespace, so the --limit-mm-per-prompt JSON reaches vLLM intact; the existing --flag YAML key style is unchanged.
- Document Gemma 4 in benchmark/vllm/README.md.
---
 benchmark/vllm/README.md                     | 26 +++++++++++-
 docker/pyt_vllm_gemma4.ubuntu.amd.Dockerfile | 42 ++++++++++++++++++++
 models.json                                  | 38 ++++++++++++++++++
 scripts/vllm/configs/default.yaml            | 41 +++++++++++++++++++
 scripts/vllm/run_vllm.py                     | 12 +++++-
 5 files changed, 157 insertions(+), 2 deletions(-)
 create mode 100644 docker/pyt_vllm_gemma4.ubuntu.amd.Dockerfile

diff --git a/benchmark/vllm/README.md b/benchmark/vllm/README.md
index 522a104..b4e9782 100644
--- a/benchmark/vllm/README.md
+++ b/benchmark/vllm/README.md
@@ -61,6 +61,12 @@ The following command pulls the Docker image from Docker Hub.
 docker pull vllm/vllm-openai-rocm:v0.17.1
 ```
 
+For Gemma 4, use the Gemma4-tagged image (also referenced by [`docker/pyt_vllm_gemma4.ubuntu.amd.Dockerfile`](../../docker/pyt_vllm_gemma4.ubuntu.amd.Dockerfile)):
+
+```sh
+docker pull vllm/vllm-openai-rocm:gemma4
+```
+
 ### MAD-integrated benchmarking
 
 Clone the ROCm Model Automation and Dashboarding (MAD) repository to a local directory and install the required packages on the host machine.
@@ -86,7 +92,17 @@ users can also directly run the vLLm benchmark scripts and change the benchmarki
 #### Available models
 
 >[!NOTE]
->The MXFP4 models are only supported on the gfx950 architecture i.e. MI350X/MI355X accelerators.	
+>The MXFP4 models are only supported on the gfx950 architecture i.e. MI350X/MI355X accelerators.
+
+>[!NOTE]
+>Gemma 4 models (`pyt_vllm_gemma-4-*`) are built from `vllm/vllm-openai-rocm:gemma4` (see [`docker/pyt_vllm_gemma4.ubuntu.amd.Dockerfile`](../../docker/pyt_vllm_gemma4.ubuntu.amd.Dockerfile)). Accept Google’s Gemma license on Hugging Face and set `MAD_SECRETS_HFTOKEN` for gated weight downloads.
+
+Serving recipes for Gemma 4 live in [`scripts/vllm/configs/default.yaml`](../../scripts/vllm/configs/default.yaml). Both Gemma 4 entries use **tensor parallel size 1**, **`TRITON_ATTN`**, **`float16` on gfx942** (via `arch_overrides`), **`--max-model-len` 32768**, text-only multimodal limits (`--limit-mm-per-prompt`), and **`VLLM_ROCM_USE_AITER=1`** where supported.
+
+| Model | Notes |
+| ----- | ----- |
+| **google/gemma-4-31B-it** | Dense instruct. Full serving sweep: **`max_concurrency` 1, 8, 32, 128** (four cold starts). |
+| **google/gemma-4-26B-A4B-it** | Sparse MoE (“A4B”). **AITER fused MoE is disabled** via **`VLLM_ROCM_USE_AITER_MOE=0`** so MoE runs on the **Triton** path. **Concurrency sweep is narrowed to 1 and 8** only for typical MAD Docker memory limits. |
 
 | MAD model name                         | Model repo                             |
 | -------------------------------------- | -------------------------------------- |
@@ -112,6 +128,8 @@ users can also directly run the vLLm benchmark scripts and change the benchmarki
 | pyt_vllm_mixtral-8x22b                 | [mistralai/Mixtral-8x22B-Instruct-v0.1](https://hugggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1) |
 | pyt_vllm_mixtral-8x22b_fp8             | [amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV](https://hugggingface.co/amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV) |
 | pyt_vllm_phi-4                         | [microsoft/phi-4](https://huggingface.co/microsoft/phi-4) |
+| pyt_vllm_gemma-4-26b-a4b-it            | [google/gemma-4-26B-A4B-it](https://huggingface.co/google/gemma-4-26B-A4B-it) |
+| pyt_vllm_gemma-4-31b-it                | [google/gemma-4-31B-it](https://huggingface.co/google/gemma-4-31B-it) |
 | pyt_vllm_qwen3-8b                      | [Qwen/Qwen3-8B](https://huggingface.co/Qwen/Qwen3-8B) |
 | pyt_vllm_qwen3-32b                     | [Qwen/Qwen3-32B](https://huggingface.co/Qwen/Qwen3-32B) |
 | pyt_vllm_qwen3-30b-a3b                 | [Qwen/Qwen3-30B-A3B-Thinking-2507](https://huggingface.co/Qwen/Qwen3-30B-A3B-Thinking-2507) |
@@ -132,6 +150,8 @@ docker pull vllm/vllm-openai-rocm:v0.17.1
 docker run -it --device=/dev/kfd --device=/dev/dri --group-add video --shm-size 16G --security-opt seccomp=unconfined --security-opt apparmor=unconfined --cap-add=SYS_PTRACE -v $(pwd):/workspace --env VLLM_ROCM_USE_AITER=1 --env HUGGINGFACE_HUB_CACHE=/workspace --name test vllm/vllm-openai-rocm:v0.17.1
 ```
 
+For Gemma 4 standalone runs, substitute `vllm/vllm-openai-rocm:gemma4` for the image tag in the `docker run` line above. For **`google/gemma-4-26B-A4B-it`** only, also set **`VLLM_ROCM_USE_AITER_MOE=0`** (same as the MAD `default.yaml` recipe) so MoE does not use AITER’s fused path.
+
 >[!NOTE]
 >We enable [AITER](https://github.com/ROCm/aiter) during `docker run` via `--env VLLM_ROCM_USE_AITER=1` for best performance
 >on MI3xx (i.e. gfx942 and gfx950) platforms. If you're using this docker image on other AMD GPUs e.g. MI2xx or Radeon,
@@ -345,6 +365,10 @@ owners and are only mentioned for informative purposes.   
 ----------
 This release note summarizes notable changes since the previous docker release.
 
+MAD `pyt_vllm_gemma-4-*` configs (see [`default.yaml`](../../scripts/vllm/configs/default.yaml)):
+- **gemma-4-26B-A4B-it:** set `VLLM_ROCM_USE_AITER_MOE=0` (Triton MoE); narrowed default `max_concurrency` to **1 8** to avoid OOM on repeated server restarts.
+- **gemma-4-31B-it:** unchanged full sweep **1 8 32 128**; no `VLLM_ROCM_USE_AITER_MOE` override.
+
 v0.17.1 release:
 - Includes documentation and patches for upstream releases. Please track https://github.com/vllm-project/vllm/releases
   for all future release notes.
diff --git a/docker/pyt_vllm_gemma4.ubuntu.amd.Dockerfile b/docker/pyt_vllm_gemma4.ubuntu.amd.Dockerfile
new file mode 100644
index 0000000..c5876ad
--- /dev/null
+++ b/docker/pyt_vllm_gemma4.ubuntu.amd.Dockerfile
@@ -0,0 +1,42 @@
+# CONTEXT {'gpu_vendor': 'AMD', 'guest_os': 'UBUNTU'}
+###############################################################################
+#
+# MIT License
+#
+# Copyright (c) Advanced Micro Devices, Inc.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+#################################################################################
+# Gemma 4 requires a vLLM build with Gemma4 support; see vLLM recipes (Google/Gemma4.md).
+ARG BASE_DOCKER=vllm/vllm-openai-rocm:gemma4
+FROM $BASE_DOCKER
+
+USER root
+ENV WORKSPACE_DIR=/workspace
+RUN mkdir -p $WORKSPACE_DIR
+WORKDIR $WORKSPACE_DIR
+
+RUN pip3 install --no-cache-dir "transformers==5.5.0"
+
+# record configuration for posterity
+RUN pip3 list
+
+# Specify entrypoint to override upstream
+ENTRYPOINT [""]
diff --git a/models.json b/models.json
index b24ac6c..e764c52 100644
--- a/models.json
+++ b/models.json
@@ -487,6 +487,44 @@
     "args":
     "--model_repo Qwen/Qwen3-8B --config configs/extended.yaml"
   },
+  {
+    "name": "pyt_vllm_gemma-4-26b-a4b-it",
+    "data": "huggingface",
+    "dockerfile": "docker/pyt_vllm_gemma4",
+    "scripts": "scripts/vllm/run.sh",
+    "n_gpus": "-1",
+    "owner": "mad.support@amd.com",
+    "training_precision": "",
+    "multiple_results": "perf_gemma-4-26B-A4B-it.csv",
+    "tags": [
+      "pyt",
+      "vllm",
+      "vllm_extended",
+      "inference"
+    ],
+    "timeout": -1,
+    "args":
+    "--model_repo google/gemma-4-26B-A4B-it --config configs/default.yaml"
+  },
+  {
+    "name": "pyt_vllm_gemma-4-31b-it",
+    "data": "huggingface",
+    "dockerfile": "docker/pyt_vllm_gemma4",
+    "scripts": "scripts/vllm/run.sh",
+    "n_gpus": "-1",
+    "owner": "mad.support@amd.com",
+    "training_precision": "",
+    "multiple_results": "perf_gemma-4-31B-it.csv",
+    "tags": [
+      "pyt",
+      "vllm",
+      "vllm_extended",
+      "inference"
+    ],
+    "timeout": -1,
+    "args":
+    "--model_repo google/gemma-4-31B-it --config configs/default.yaml"
+  },
   {
     "name": "pyt_vllm_qwen3-32b",
     "data": "huggingface",
diff --git a/scripts/vllm/configs/default.yaml b/scripts/vllm/configs/default.yaml
index 038480d..ce09938 100644
--- a/scripts/vllm/configs/default.yaml
+++ b/scripts/vllm/configs/default.yaml
@@ -92,6 +92,47 @@
     VLLM_ROCM_USE_AITER: 1
   extra_args:
     --attention-backend: ROCM_ATTN
+  arch_overrides:
+    gfx942:
+      dtype: float16
+
+## Gemma 4: vLLM recipe recommends 1x MI300-class GPU (BF16); tp 1 for text-only bench
+## Use TRITON_ATTN (Gemma4 default); 26B-A4B MoE: VLLM_ROCM_USE_AITER_MOE=0; narrow concurrency for 26B to avoid OOM
+- benchmark: serving
+  model: google/gemma-4-26B-A4B-it
+  tp: 1
+  inp: 1024
+  out: 1024
+  dtype: auto
+  max_concurrency: 1 8
+  env:
+    VLLM_ROCM_USE_AITER: 1
+    VLLM_ROCM_USE_AITER_MOE: 0
+  extra_args:
+    --attention-backend: TRITON_ATTN
+    --max-model-len: 32768
+    --gpu-memory-utilization: 0.90
+    --limit-mm-per-prompt: '{"image":0,"audio":0}'
+    --async-scheduling: True
+  arch_overrides:
+    gfx942:
+      dtype: float16
+
+- benchmark: serving
+  model: google/gemma-4-31B-it
+  tp: 1
+  inp: 1024
+  out: 1024
+  dtype: auto
+  max_concurrency: 1 8 32 128
+  env:
+    VLLM_ROCM_USE_AITER: 1
+  extra_args:
+    --attention-backend: TRITON_ATTN
+    --max-model-len: 32768
+    --gpu-memory-utilization: 0.90
+    --limit-mm-per-prompt: '{"image":0,"audio":0}'
+    --async-scheduling: True
   arch_overrides:
     gfx942:
       dtype: float16
\ No newline at end of file
diff --git a/scripts/vllm/run_vllm.py b/scripts/vllm/run_vllm.py
index b5b5db0..3d20c08 100644
--- a/scripts/vllm/run_vllm.py
+++ b/scripts/vllm/run_vllm.py
@@ -34,6 +34,7 @@
 import signal
 import argparse
 import itertools
+import shlex
 import subprocess
 from typing import List, Dict
 
@@ -490,7 +491,16 @@ def main():
                 if isinstance(v, bool):
                     extra_args_str += f" {k}"
                 else:
-                    extra_args_str += f" {k} {v}"
+                    s = str(v)
+                    st = s.strip()
+                    if (
+                        k == "--limit-mm-per-prompt"
+                        or (st[:1] in "{[")
+                        or any(ch.isspace() for ch in s)
+                    ):
+                        extra_args_str += f" {k} {shlex.quote(s)}"
+                    else:
+                        extra_args_str += f" {k} {v}"
             config["env"] = env_vars_str
             config["extra_args"] = extra_args_str
             

From b4de892ac504ddc061c32451e365b0dab645e937 Mon Sep 17 00:00:00 2001
From: Stephen Shao <yu.shao@amd.com>
Date: Thu, 30 Apr 2026 21:17:36 +0000
Subject: [PATCH 2/7] chore(vllm): bump base image to vllm-openai-rocm v0.20.0,
 require transformers>=5.5.0

Co-Authored-By: Claude Sonnet 4 <noreply@anthropic.com>
---
 docker/pyt_vllm.ubuntu.amd.Dockerfile        |  3 +-
 docker/pyt_vllm_gemma4.ubuntu.amd.Dockerfile | 42 --------------------
 2 files changed, 2 insertions(+), 43 deletions(-)
 delete mode 100644 docker/pyt_vllm_gemma4.ubuntu.amd.Dockerfile

diff --git a/docker/pyt_vllm.ubuntu.amd.Dockerfile b/docker/pyt_vllm.ubuntu.amd.Dockerfile
index eb59155..0d6b00c 100644
--- a/docker/pyt_vllm.ubuntu.amd.Dockerfile
+++ b/docker/pyt_vllm.ubuntu.amd.Dockerfile
@@ -24,10 +24,11 @@
 # SOFTWARE.
 #
 #################################################################################
-ARG BASE_DOCKER=vllm/vllm-openai-rocm:v0.17.1
+ARG BASE_DOCKER=vllm/vllm-openai-rocm:v0.20.0
 FROM $BASE_DOCKER
 
 USER root
+RUN pip3 install --no-cache-dir "transformers>=5.5.0"
 ENV WORKSPACE_DIR=/workspace
 RUN mkdir -p $WORKSPACE_DIR
 WORKDIR $WORKSPACE_DIR
diff --git a/docker/pyt_vllm_gemma4.ubuntu.amd.Dockerfile b/docker/pyt_vllm_gemma4.ubuntu.amd.Dockerfile
deleted file mode 100644
index c5876ad..0000000
--- a/docker/pyt_vllm_gemma4.ubuntu.amd.Dockerfile
+++ /dev/null
@@ -1,42 +0,0 @@
-# CONTEXT {'gpu_vendor': 'AMD', 'guest_os': 'UBUNTU'}
-###############################################################################
-#
-# MIT License
-#
-# Copyright (c) Advanced Micro Devices, Inc.
-#
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to deal
-# in the Software without restriction, including without limitation the rights
-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be included in all
-# copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-# SOFTWARE.
-#
-#################################################################################
-# Gemma 4 requires a vLLM build with Gemma4 support; see vLLM recipes (Google/Gemma4.md).
-ARG BASE_DOCKER=vllm/vllm-openai-rocm:gemma4
-FROM $BASE_DOCKER
-
-USER root
-ENV WORKSPACE_DIR=/workspace
-RUN mkdir -p $WORKSPACE_DIR
-WORKDIR $WORKSPACE_DIR
-
-RUN pip3 install --no-cache-dir "transformers==5.5.0"
-
-# record configuration for posterity
-RUN pip3 list
-
-# Specify entrypoint to override upstream
-ENTRYPOINT [""]

From 77f0161681420e3bab10b5568fd117e2542051a5 Mon Sep 17 00:00:00 2001
From: Stephen Shao <yu.shao@amd.com>
Date: Thu, 30 Apr 2026 21:17:39 +0000
Subject: [PATCH 3/7] feat(vllm): consolidate Gemma 4 models into standard
 pyt_vllm stack

Co-Authored-By: Claude Sonnet 4 <noreply@anthropic.com>
---
 models.json | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/models.json b/models.json
index e764c52..6b1c99a 100644
--- a/models.json
+++ b/models.json
@@ -490,7 +490,7 @@
   {
     "name": "pyt_vllm_gemma-4-26b-a4b-it",
     "data": "huggingface",
-    "dockerfile": "docker/pyt_vllm_gemma4",
+    "dockerfile": "docker/pyt_vllm",
     "scripts": "scripts/vllm/run.sh",
     "n_gpus": "-1",
     "owner": "mad.support@amd.com",
@@ -509,7 +509,7 @@
   {
     "name": "pyt_vllm_gemma-4-31b-it",
     "data": "huggingface",
-    "dockerfile": "docker/pyt_vllm_gemma4",
+    "dockerfile": "docker/pyt_vllm",
     "scripts": "scripts/vllm/run.sh",
     "n_gpus": "-1",
     "owner": "mad.support@amd.com",

From 76bc2767f8cc9f6655cbf987950b7841aa70704e Mon Sep 17 00:00:00 2001
From: Stephen Shao <yu.shao@amd.com>
Date: Thu, 30 Apr 2026 21:17:41 +0000
Subject: [PATCH 4/7] =?UTF-8?q?docs(vllm):=20update=20README=20=E2=80=94?=
 =?UTF-8?q?=20Gemma=204=20now=20uses=20standard=20pyt=5Fvllm=20image?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-Authored-By: Claude Sonnet 4 <noreply@anthropic.com>
---
 benchmark/vllm/README.md | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/benchmark/vllm/README.md b/benchmark/vllm/README.md
index b4e9782..79046e2 100644
--- a/benchmark/vllm/README.md
+++ b/benchmark/vllm/README.md
@@ -61,10 +61,10 @@ The following command pulls the Docker image from Docker Hub.
 docker pull vllm/vllm-openai-rocm:v0.17.1
 ```
 
-For Gemma 4, use the Gemma4-tagged image (also referenced by [`docker/pyt_vllm_gemma4.ubuntu.amd.Dockerfile`](../../docker/pyt_vllm_gemma4.ubuntu.amd.Dockerfile)):
+Gemma 4 models are served from the standard vLLM image (vLLM ≥0.20.0 ships Transformers v5 with native Gemma 4 support):
 
 ```sh
-docker pull vllm/vllm-openai-rocm:gemma4
+docker pull vllm/vllm-openai-rocm:v0.20.0
 ```
 
 ### MAD-integrated benchmarking
@@ -95,7 +95,7 @@ users can also directly run the vLLm benchmark scripts and change the benchmarki
 >The MXFP4 models are only supported on the gfx950 architecture i.e. MI350X/MI355X accelerators.
 
 >[!NOTE]
->Gemma 4 models (`pyt_vllm_gemma-4-*`) are built from `vllm/vllm-openai-rocm:gemma4` (see [`docker/pyt_vllm_gemma4.ubuntu.amd.Dockerfile`](../../docker/pyt_vllm_gemma4.ubuntu.amd.Dockerfile)). Accept Google’s Gemma license on Hugging Face and set `MAD_SECRETS_HFTOKEN` for gated weight downloads.
+>Gemma 4 models (`pyt_vllm_gemma-4-*`) use the standard `docker/pyt_vllm` stack (vLLM ≥0.20.0 / Transformers ≥5.5.0). Accept Google’s Gemma license on Hugging Face and set `MAD_SECRETS_HFTOKEN` for gated weight downloads.
 
 Serving recipes for Gemma 4 live in [`scripts/vllm/configs/default.yaml`](../../scripts/vllm/configs/default.yaml). Both Gemma 4 entries use **tensor parallel size 1**, **`TRITON_ATTN`**, **`float16` on gfx942** (via `arch_overrides`), **`--max-model-len` 32768**, text-only multimodal limits (`--limit-mm-per-prompt`), and **`VLLM_ROCM_USE_AITER=1`** where supported.
 
@@ -150,7 +150,7 @@ docker pull vllm/vllm-openai-rocm:v0.17.1
 docker run -it --device=/dev/kfd --device=/dev/dri --group-add video --shm-size 16G --security-opt seccomp=unconfined --security-opt apparmor=unconfined --cap-add=SYS_PTRACE -v $(pwd):/workspace --env VLLM_ROCM_USE_AITER=1 --env HUGGINGFACE_HUB_CACHE=/workspace --name test vllm/vllm-openai-rocm:v0.17.1
 ```
 
-For Gemma 4 standalone runs, substitute `vllm/vllm-openai-rocm:gemma4` for the image tag in the `docker run` line above. For **`google/gemma-4-26B-A4B-it`** only, also set **`VLLM_ROCM_USE_AITER_MOE=0`** (same as the MAD `default.yaml` recipe) so MoE does not use AITER’s fused path.
+For Gemma 4 standalone runs use `vllm/vllm-openai-rocm:v0.20.0` (same image as other vLLM models). For **`google/gemma-4-26B-A4B-it`** only, also set **`VLLM_ROCM_USE_AITER_MOE=0`** (same as the MAD `default.yaml` recipe) so MoE does not use AITER’s fused path.
 
 >[!NOTE]
 >We enable [AITER](https://github.com/ROCm/aiter) during `docker run` via `--env VLLM_ROCM_USE_AITER=1` for best performance

From b455096d696fdcf39e23980327b995682ca38963 Mon Sep 17 00:00:00 2001
From: Stephen Shao <yu.shao@amd.com>
Date: Thu, 30 Apr 2026 21:17:45 +0000
Subject: [PATCH 5/7] fix(vllm): apply shlex.quote to all non-bool extra_args
 values (was partial)

Co-Authored-By: Claude Sonnet 4 <noreply@anthropic.com>
---
 scripts/vllm/run_vllm.py               | 11 +--
 tests/vllm/__init__.py                 |  0
 tests/vllm/test_run_vllm_extra_args.py | 95 ++++++++++++++++++++++++++
 3 files changed, 96 insertions(+), 10 deletions(-)
 create mode 100644 tests/vllm/__init__.py
 create mode 100644 tests/vllm/test_run_vllm_extra_args.py

diff --git a/scripts/vllm/run_vllm.py b/scripts/vllm/run_vllm.py
index 3d20c08..8103478 100644
--- a/scripts/vllm/run_vllm.py
+++ b/scripts/vllm/run_vllm.py
@@ -491,16 +491,7 @@ def main():
                 if isinstance(v, bool):
                     extra_args_str += f" {k}"
                 else:
-                    s = str(v)
-                    st = s.strip()
-                    if (
-                        k == "--limit-mm-per-prompt"
-                        or (st[:1] in "{[")
-                        or any(ch.isspace() for ch in s)
-                    ):
-                        extra_args_str += f" {k} {shlex.quote(s)}"
-                    else:
-                        extra_args_str += f" {k} {v}"
+                    extra_args_str += f" {k} {shlex.quote(str(v))}"
             config["env"] = env_vars_str
             config["extra_args"] = extra_args_str
             
diff --git a/tests/vllm/__init__.py b/tests/vllm/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/tests/vllm/test_run_vllm_extra_args.py b/tests/vllm/test_run_vllm_extra_args.py
new file mode 100644
index 0000000..62fa180
--- /dev/null
+++ b/tests/vllm/test_run_vllm_extra_args.py
@@ -0,0 +1,95 @@
+"""Tests for extra_args quoting in run_vllm.py."""
+import sys
+import os
+import shlex
+
+import pytest
+
+
+def build_extra_args_str_old(extra_args: dict) -> str:
+    """Replicates the OLD selective-quoting logic from run_vllm.py (pre-fix)."""
+    extra_args_str = ""
+    for k, v in extra_args.items():
+        if isinstance(v, bool):
+            extra_args_str += f" {k}"
+        else:
+            s = str(v)
+            st = s.strip()
+            if (
+                k == "--limit-mm-per-prompt"
+                or (st[:1] in "{[")
+                or any(ch.isspace() for ch in s)
+            ):
+                extra_args_str += f" {k} {shlex.quote(s)}"
+            else:
+                extra_args_str += f" {k} {s}"
+    return extra_args_str.strip()
+
+
+def build_extra_args_str_new(extra_args: dict) -> str:
+    """Replicates the NEW universal-quoting logic (post-fix)."""
+    extra_args_str = ""
+    for k, v in extra_args.items():
+        if isinstance(v, bool):
+            extra_args_str += f" {k}"
+        else:
+            extra_args_str += f" {k} {shlex.quote(str(v))}"
+    return extra_args_str.strip()
+
+
+# --- Tests that FAIL with the old logic, PASS with the new ---
+
+def test_shell_metachar_no_space_is_quoted_by_new():
+    """Values with shell metacharacters but no spaces are NOT quoted by old code.
+
+    The old code only quotes when there's whitespace, a JSON-like prefix, or the
+    --limit-mm-per-prompt key. A value like 'foo;bar' (no space) slips through
+    unquoted, allowing shell injection. The new code always quotes.
+    """
+    args = {"--some-arg": "foo;bar"}
+    old = build_extra_args_str_old(args)
+    new = build_extra_args_str_new(args)
+    # Old code: no whitespace -> not quoted, semicolon is a live shell metachar
+    assert old == "--some-arg foo;bar", f"unexpected old output: {old!r}"
+    # New code: shlex.quote wraps the value in single quotes
+    assert new == "--some-arg 'foo;bar'", f"unexpected new output: {new!r}"
+    assert old != new
+
+
+def test_plain_string_with_metachar_is_unquoted_by_old():
+    """Old code leaves plain strings with $ unquoted (variable expansion risk)."""
+    args = {"--trust-remote-code": "yes$HOME"}
+    old = build_extra_args_str_old(args)
+    new = build_extra_args_str_new(args)
+    # Old code: no whitespace, no JSON prefix -> raw string passed to shell
+    assert old == "--trust-remote-code yes$HOME", f"unexpected old output: {old!r}"
+    # New code: always quoted
+    assert new == "--trust-remote-code 'yes$HOME'", f"unexpected new output: {new!r}"
+
+
+# --- Tests that PASS with BOTH old and new logic ---
+
+def test_json_value_is_quoted():
+    args = {"--limit-mm-per-prompt": '{"image":0,"audio":0}'}
+    result = build_extra_args_str_new(args)
+    assert result == """--limit-mm-per-prompt '{"image":0,"audio":0}'"""
+
+
+def test_bool_flag_has_no_value():
+    args = {"--async-scheduling": True}
+    result = build_extra_args_str_new(args)
+    assert result == "--async-scheduling"
+
+
+def test_string_with_space_is_quoted():
+    args = {"--served-model-name": "my model"}
+    result = build_extra_args_str_new(args)
+    assert result == "--served-model-name 'my model'"
+
+
+def test_plain_safe_scalar_passthrough():
+    """shlex.quote does not add quotes to safe alphanumeric values."""
+    args = {"--max-model-len": 32768}
+    result = build_extra_args_str_new(args)
+    # shlex.quote('32768') == '32768' (no shell quoting needed for pure digits)
+    assert result == "--max-model-len 32768"

From 11d6bd8cd84c38317f9fdb1e698b26bc869f1c7b Mon Sep 17 00:00:00 2001
From: Stephen Shao <yu.shao@amd.com>
Date: Fri, 1 May 2026 02:41:57 +0000
Subject: [PATCH 6/7] fix(vllm): address PR review feedback and enable full
 concurrency

- Remove redundant pip install of transformers (the vLLM v0.20.0 image already ships Transformers v5)
- Delete test_run_vllm_extra_args.py (it tested copies of the quoting logic instead of run_vllm.py itself)
- Remove --async-scheduling from Gemma 4 configs (on by default)
- Enable concurrency 32/128 for gemma-4-26B-A4B-it
- Update README to reflect v0.20.0 as the standard base image

Co-Authored-By: Claude Opus 4 <noreply@anthropic.com>
---
 benchmark/vllm/README.md               | 22 +++---
 docker/pyt_vllm.ubuntu.amd.Dockerfile  |  1 -
 scripts/vllm/configs/default.yaml      | 11 +--
 tests/vllm/test_run_vllm_extra_args.py | 95 --------------------------
 4 files changed, 14 insertions(+), 115 deletions(-)
 delete mode 100644 tests/vllm/test_run_vllm_extra_args.py

diff --git a/benchmark/vllm/README.md b/benchmark/vllm/README.md
index 79046e2..c735530 100644
--- a/benchmark/vllm/README.md
+++ b/benchmark/vllm/README.md
@@ -14,7 +14,7 @@ This Docker image packages vLLM with PyTorch for AMD Instinct™ MI300X, MI325X,
 accelerators. It includes:
 
 -   ✅ ROCm™ 7.0.0
--   ✅ vLLM 0.17.1
+-   ✅ vLLM 0.20.0
 -   ✅ PyTorch 2.9.0 (2.9.0a0+git1c57644)
 -   ✅ hipBLASLt 1.0
 
@@ -57,12 +57,6 @@ To override the benchmark configs, specify a certain benchmark to use, or add yo
 
 The following command pulls the Docker image from Docker Hub.
 
-```sh
-docker pull vllm/vllm-openai-rocm:v0.17.1
-```
-
-Gemma 4 models are served from the standard vLLM image (vLLM ≥0.20.0 ships Transformers v5 with native Gemma 4 support):
-
 ```sh
 docker pull vllm/vllm-openai-rocm:v0.20.0
 ```
@@ -95,14 +89,14 @@ users can also directly run the vLLm benchmark scripts and change the benchmarki
 >The MXFP4 models are only supported on the gfx950 architecture i.e. MI350X/MI355X accelerators.
 
 >[!NOTE]
->Gemma 4 models (`pyt_vllm_gemma-4-*`) use the standard `docker/pyt_vllm` stack (vLLM ≥0.20.0 / Transformers ≥5.5.0). Accept Google’s Gemma license on Hugging Face and set `MAD_SECRETS_HFTOKEN` for gated weight downloads.
+>Gemma 4 models (`pyt_vllm_gemma-4-*`) use the standard `docker/pyt_vllm` stack (vLLM 0.20.0, which bundles Transformers v5 with native Gemma 4 support). Accept Google’s Gemma license on Hugging Face and set `MAD_SECRETS_HFTOKEN` for gated weight downloads.
 
-Serving recipes for Gemma 4 live in [`scripts/vllm/configs/default.yaml`](../../scripts/vllm/configs/default.yaml). Both Gemma 4 entries use **tensor parallel size 1**, **`TRITON_ATTN`**, **`float16` on gfx942** (via `arch_overrides`), **`--max-model-len` 32768**, text-only multimodal limits (`--limit-mm-per-prompt`), and **`VLLM_ROCM_USE_AITER=1`** where supported.
+Serving recipes for Gemma 4 live in [`scripts/vllm/configs/default.yaml`](../../scripts/vllm/configs/default.yaml). Both Gemma 4 entries use **tensor parallel size 1**, **`TRITON_ATTN`**, **`float16` on gfx942** (via `arch_overrides`), **`--max-model-len` 32768**, text-only multimodal limits (`--limit-mm-per-prompt`), and **`VLLM_ROCM_USE_AITER=1`**.
 
 | Model | Notes |
 | ----- | ----- |
 | **google/gemma-4-31B-it** | Dense instruct. Full serving sweep: **`max_concurrency` 1, 8, 32, 128** (four cold starts). |
-| **google/gemma-4-26B-A4B-it** | Sparse MoE (“A4B”). **AITER fused MoE is disabled** via **`VLLM_ROCM_USE_AITER_MOE=0`** so MoE runs on the **Triton** path. **Concurrency sweep is narrowed to 1 and 8** only for typical MAD Docker memory limits. |
+| **google/gemma-4-26B-A4B-it** | Sparse MoE (“A4B”). **AITER fused MoE is disabled** via **`VLLM_ROCM_USE_AITER_MOE=0`** so MoE runs on the **Triton** path. Full concurrency sweep: **`max_concurrency` 1, 8, 32, 128**. |
 
 | MAD model name                         | Model repo                             |
 | -------------------------------------- | -------------------------------------- |
@@ -145,12 +139,12 @@ Users also can run the benchmark tool after they launch a Docker container. For
 
 #### Docker launch
 ```sh
-docker pull vllm/vllm-openai-rocm:v0.17.1
+docker pull vllm/vllm-openai-rocm:v0.20.0
 
-docker run -it --device=/dev/kfd --device=/dev/dri --group-add video --shm-size 16G --security-opt seccomp=unconfined --security-opt apparmor=unconfined --cap-add=SYS_PTRACE -v $(pwd):/workspace --env VLLM_ROCM_USE_AITER=1 --env HUGGINGFACE_HUB_CACHE=/workspace --name test vllm/vllm-openai-rocm:v0.17.1
+docker run -it --device=/dev/kfd --device=/dev/dri --group-add video --shm-size 16G --security-opt seccomp=unconfined --security-opt apparmor=unconfined --cap-add=SYS_PTRACE -v $(pwd):/workspace --env VLLM_ROCM_USE_AITER=1 --env HUGGINGFACE_HUB_CACHE=/workspace --name test vllm/vllm-openai-rocm:v0.20.0
 ```
 
-For Gemma 4 standalone runs use `vllm/vllm-openai-rocm:v0.20.0` (same image as other vLLM models). For **`google/gemma-4-26B-A4B-it`** only, also set **`VLLM_ROCM_USE_AITER_MOE=0`** (same as the MAD `default.yaml` recipe) so MoE does not use AITER’s fused path.
+For **`google/gemma-4-26B-A4B-it`** standalone runs, also set **`VLLM_ROCM_USE_AITER_MOE=0`** (same as the MAD `default.yaml` recipe) so MoE does not use AITER’s fused path.
 
 >[!NOTE]
 >We enable [AITER](https://github.com/ROCm/aiter) during `docker run` via `--env VLLM_ROCM_USE_AITER=1` for best performance
@@ -366,7 +360,7 @@ owners and are only mentioned for informative purposes.   
 This release note summarizes notable changes since the previous docker release.
 
 MAD `pyt_vllm_gemma-4-*` configs (see [`default.yaml`](../../scripts/vllm/configs/default.yaml)):
-- **gemma-4-26B-A4B-it:** set `VLLM_ROCM_USE_AITER_MOE=0` (Triton MoE); narrowed default `max_concurrency` to **1 8** to avoid OOM on repeated server restarts.
+- **gemma-4-26B-A4B-it:** set `VLLM_ROCM_USE_AITER_MOE=0` (Triton MoE); full concurrency sweep **1 8 32 128**.
 - **gemma-4-31B-it:** unchanged full sweep **1 8 32 128**; no `VLLM_ROCM_USE_AITER_MOE` override.
 
 v0.17.1 release:
diff --git a/docker/pyt_vllm.ubuntu.amd.Dockerfile b/docker/pyt_vllm.ubuntu.amd.Dockerfile
index 0d6b00c..387a798 100644
--- a/docker/pyt_vllm.ubuntu.amd.Dockerfile
+++ b/docker/pyt_vllm.ubuntu.amd.Dockerfile
@@ -28,7 +28,6 @@ ARG BASE_DOCKER=vllm/vllm-openai-rocm:v0.20.0
 FROM $BASE_DOCKER
 
 USER root
-RUN pip3 install --no-cache-dir "transformers>=5.5.0"
 ENV WORKSPACE_DIR=/workspace
 RUN mkdir -p $WORKSPACE_DIR
 WORKDIR $WORKSPACE_DIR
diff --git a/scripts/vllm/configs/default.yaml b/scripts/vllm/configs/default.yaml
index ce09938..e556918 100644
--- a/scripts/vllm/configs/default.yaml
+++ b/scripts/vllm/configs/default.yaml
@@ -97,14 +97,17 @@
       dtype: float16
 
 ## Gemma 4: vLLM recipe recommends 1x MI300-class GPU (BF16); tp 1 for text-only bench
-## Use TRITON_ATTN (Gemma4 default); 26B-A4B MoE: VLLM_ROCM_USE_AITER_MOE=0; narrow concurrency for 26B to avoid OOM
+## Use TRITON_ATTN (Gemma4 default); ROCM_ATTN does not support head_dim 512 without extra work
+## --limit-mm-per-prompt must be a JSON string (vLLM parses it with json.loads), not the image=0,audio=0 form
+## 26B-A4B is MoE: disable AITER fused MoE only (unsupported CK GEMM for this shape); other AITER features unchanged
+## Full concurrency sweep (1 8 32 128)
 - benchmark: serving
   model: google/gemma-4-26B-A4B-it
   tp: 1
   inp: 1024
   out: 1024
   dtype: auto
-  max_concurrency: 1 8
+  max_concurrency: 1 8 32 128
   env:
     VLLM_ROCM_USE_AITER: 1
     VLLM_ROCM_USE_AITER_MOE: 0
@@ -113,7 +116,6 @@
     --max-model-len: 32768
     --gpu-memory-utilization: 0.90
     --limit-mm-per-prompt: '{"image":0,"audio":0}'
-    --async-scheduling: True
   arch_overrides:
     gfx942:
       dtype: float16
@@ -132,7 +134,6 @@
     --max-model-len: 32768
     --gpu-memory-utilization: 0.90
     --limit-mm-per-prompt: '{"image":0,"audio":0}'
-    --async-scheduling: True
   arch_overrides:
     gfx942:
-      dtype: float16
\ No newline at end of file
+      dtype: float16
diff --git a/tests/vllm/test_run_vllm_extra_args.py b/tests/vllm/test_run_vllm_extra_args.py
deleted file mode 100644
index 62fa180..0000000
--- a/tests/vllm/test_run_vllm_extra_args.py
+++ /dev/null
@@ -1,95 +0,0 @@
-"""Tests for extra_args quoting in run_vllm.py."""
-import sys
-import os
-import shlex
-
-import pytest
-
-
-def build_extra_args_str_old(extra_args: dict) -> str:
-    """Replicates the OLD selective-quoting logic from run_vllm.py (pre-fix)."""
-    extra_args_str = ""
-    for k, v in extra_args.items():
-        if isinstance(v, bool):
-            extra_args_str += f" {k}"
-        else:
-            s = str(v)
-            st = s.strip()
-            if (
-                k == "--limit-mm-per-prompt"
-                or (st[:1] in "{[")
-                or any(ch.isspace() for ch in s)
-            ):
-                extra_args_str += f" {k} {shlex.quote(s)}"
-            else:
-                extra_args_str += f" {k} {s}"
-    return extra_args_str.strip()
-
-
-def build_extra_args_str_new(extra_args: dict) -> str:
-    """Replicates the NEW universal-quoting logic (post-fix)."""
-    extra_args_str = ""
-    for k, v in extra_args.items():
-        if isinstance(v, bool):
-            extra_args_str += f" {k}"
-        else:
-            extra_args_str += f" {k} {shlex.quote(str(v))}"
-    return extra_args_str.strip()
-
-
-# --- Tests that FAIL with the old logic, PASS with the new ---
-
-def test_shell_metachar_no_space_is_quoted_by_new():
-    """Values with shell metacharacters but no spaces are NOT quoted by old code.
-
-    The old code only quotes when there's whitespace, a JSON-like prefix, or the
-    --limit-mm-per-prompt key. A value like 'foo;bar' (no space) slips through
-    unquoted, allowing shell injection. The new code always quotes.
-    """
-    args = {"--some-arg": "foo;bar"}
-    old = build_extra_args_str_old(args)
-    new = build_extra_args_str_new(args)
-    # Old code: no whitespace -> not quoted, semicolon is a live shell metachar
-    assert old == "--some-arg foo;bar", f"unexpected old output: {old!r}"
-    # New code: shlex.quote wraps the value in single quotes
-    assert new == "--some-arg 'foo;bar'", f"unexpected new output: {new!r}"
-    assert old != new
-
-
-def test_plain_string_with_metachar_is_unquoted_by_old():
-    """Old code leaves plain strings with $ unquoted (variable expansion risk)."""
-    args = {"--trust-remote-code": "yes$HOME"}
-    old = build_extra_args_str_old(args)
-    new = build_extra_args_str_new(args)
-    # Old code: no whitespace, no JSON prefix -> raw string passed to shell
-    assert old == "--trust-remote-code yes$HOME", f"unexpected old output: {old!r}"
-    # New code: always quoted
-    assert new == "--trust-remote-code 'yes$HOME'", f"unexpected new output: {new!r}"
-
-
-# --- Tests that PASS with BOTH old and new logic ---
-
-def test_json_value_is_quoted():
-    args = {"--limit-mm-per-prompt": '{"image":0,"audio":0}'}
-    result = build_extra_args_str_new(args)
-    assert result == """--limit-mm-per-prompt '{"image":0,"audio":0}'"""
-
-
-def test_bool_flag_has_no_value():
-    args = {"--async-scheduling": True}
-    result = build_extra_args_str_new(args)
-    assert result == "--async-scheduling"
-
-
-def test_string_with_space_is_quoted():
-    args = {"--served-model-name": "my model"}
-    result = build_extra_args_str_new(args)
-    assert result == "--served-model-name 'my model'"
-
-
-def test_plain_safe_scalar_passthrough():
-    """shlex.quote does not add quotes to safe alphanumeric values."""
-    args = {"--max-model-len": 32768}
-    result = build_extra_args_str_new(args)
-    # shlex.quote('32768') == '32768' (no shell quoting needed for pure digits)
-    assert result == "--max-model-len 32768"

From cac3f030eb9182cc69642ccc3d6829efd2292fa7 Mon Sep 17 00:00:00 2001
From: Stephen Shao <yu.shao@amd.com>
Date: Fri, 1 May 2026 03:00:35 +0000
Subject: [PATCH 7/7] =?UTF-8?q?fix(vllm):=20address=20Copilot=20review=20?=
 =?UTF-8?q?=E2=80=94=20ENTRYPOINT,=20bool=20flags,=20testable=20helper?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Fix ENTRYPOINT [""] → ENTRYPOINT [] to properly clear upstream entrypoint
- Skip bool False flags instead of emitting them on the command line
- Extract build_extra_args_str() as importable module-level function
- Rewrite tests to import and exercise the real production code path

Co-Authored-By: Claude Opus 4 <noreply@anthropic.com>
---
 docker/pyt_vllm.ubuntu.amd.Dockerfile  |  2 +-
 scripts/vllm/run_vllm.py               | 20 ++++++---
 tests/vllm/test_run_vllm_extra_args.py | 61 ++++++++++++++++++++++++++
 3 files changed, 76 insertions(+), 7 deletions(-)
 create mode 100644 tests/vllm/test_run_vllm_extra_args.py

diff --git a/docker/pyt_vllm.ubuntu.amd.Dockerfile b/docker/pyt_vllm.ubuntu.amd.Dockerfile
index 387a798..4707632 100644
--- a/docker/pyt_vllm.ubuntu.amd.Dockerfile
+++ b/docker/pyt_vllm.ubuntu.amd.Dockerfile
@@ -36,4 +36,4 @@ WORKDIR $WORKSPACE_DIR
 RUN pip3 list
 
 # Specify entrypoint to override upstream
-ENTRYPOINT [""]
+ENTRYPOINT []
diff --git a/scripts/vllm/run_vllm.py b/scripts/vllm/run_vllm.py
index 8103478..fd96e0c 100644
--- a/scripts/vllm/run_vllm.py
+++ b/scripts/vllm/run_vllm.py
@@ -39,6 +39,19 @@
 from typing import List, Dict
 
 SUPPORTED_LIST_ARGS = ['model', 'tp', 'inp', 'out', 'bs', 'num_prompts', 'max_concurrency']
+
+
+def build_extra_args_str(extra_args: Dict) -> str:
+    """Join extra_args into a shell-safe CLI string; True -> bare flag, False -> omitted."""
+    tokens = []
+    for flag, value in extra_args.items():
+        if not isinstance(value, bool):
+            tokens.append(f"{flag} {shlex.quote(str(value))}")
+        elif value:
+            tokens.append(flag)
+    return " ".join(tokens)
+
+
 CSV_HEADER = [
     "model",
     "benchmark",
@@ -486,12 +499,7 @@ def main():
             env_vars = config.get("env", {})
             extra_args = config.get("extra_args", {})
             env_vars_str = " ".join(f"{k}={v}" for k, v in env_vars.items())
-            extra_args_str = ""
-            for k, v in extra_args.items():
-                if isinstance(v, bool):
-                    extra_args_str += f" {k}"
-                else:
-                    extra_args_str += f" {k} {shlex.quote(str(v))}"
+            extra_args_str = build_extra_args_str(extra_args)
             config["env"] = env_vars_str
             config["extra_args"] = extra_args_str
             
diff --git a/tests/vllm/test_run_vllm_extra_args.py b/tests/vllm/test_run_vllm_extra_args.py
new file mode 100644
index 0000000..7bcb1c3
--- /dev/null
+++ b/tests/vllm/test_run_vllm_extra_args.py
@@ -0,0 +1,61 @@
+import sys
+import os
+
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", "scripts", "vllm"))
+from run_vllm import build_extra_args_str
+
+
+def test_simple_string_value():
+    result = build_extra_args_str({"--attention-backend": "TRITON_ATTN"})
+    assert result == "--attention-backend TRITON_ATTN"  # shell-safe text is emitted without quotes
+
+
+def test_json_value_is_quoted():
+    """A JSON value is wrapped in single quotes so the shell passes it through unchanged."""
+    result = build_extra_args_str({"--limit-mm-per-prompt": '{"image":0,"audio":0}'})
+    assert result == """--limit-mm-per-prompt '{"image":0,"audio":0}'"""
+
+
+def test_bool_true_emits_flag():
+    result = build_extra_args_str({"--enable-prefix-caching": True})
+    assert result == "--enable-prefix-caching"  # True -> bare flag with no value
+
+
+def test_bool_false_skips_flag():
+    result = build_extra_args_str({"--enable-prefix-caching": False})
+    assert result == ""  # False -> the flag is left off the command line entirely
+
+
+def test_numeric_value():
+    result = build_extra_args_str({"--max-model-len": 32768})
+    assert result == "--max-model-len 32768"  # non-bool values go through str(); digits need no quoting
+
+
+def test_mixed_args():
+    """Mixed value types keep dict order; False flags are dropped and JSON is single-quoted."""
+    args = {
+        "--attention-backend": "TRITON_ATTN",
+        "--enable-prefix-caching": True,
+        "--disable-log-stats": False,
+        "--max-model-len": 32768,
+        "--limit-mm-per-prompt": '{"image":0,"audio":0}',
+    }
+    expected = (
+        "--attention-backend TRITON_ATTN --enable-prefix-caching "
+        """--max-model-len 32768 --limit-mm-per-prompt '{"image":0,"audio":0}'"""
+    )
+    assert build_extra_args_str(args) == expected
+
+
+def test_empty_args():
+    assert build_extra_args_str({}) == ""  # no extra_args -> empty string
+
+
+def test_value_with_spaces_is_quoted():
+    result = build_extra_args_str({"--chat-template": "path with spaces/template.jinja"})
+    assert result == "--chat-template 'path with spaces/template.jinja'"  # stays one shell word
+
+
+def test_shell_metacharacters_are_quoted():
+    result = build_extra_args_str({"--some-arg": "value;rm -rf /"})
+    assert result == "--some-arg 'value;rm -rf /'"  # ';' is inert inside the single quotes