From 73537aeb33037cabfd931f3ae44d0d257efcffc6 Mon Sep 17 00:00:00 2001
From: xmhubj <xmhubj@gmail.com>
Date: Wed, 1 Apr 2026 08:51:52 +0000
Subject: [PATCH 1/5] bump vllm to v0.18.0 and pin FlagGems to v5.0.0

---
 .github/configs/ascend.yml      |  2 +-
 .github/configs/cuda.yml        |  2 +-
 .github/workflows/ci.yml        |  4 ++
 docker/ascend/Dockerfile        | 21 +++------
 docker/ascend/Dockerfile.v0.1.0 | 79 +++++++++++++++++++++++++++++++++
 docker/build.sh                 |  4 +-
 docker/cuda/Dockerfile          | 15 +++----
 7 files changed, 101 insertions(+), 26 deletions(-)
 create mode 100644 docker/ascend/Dockerfile.v0.1.0

diff --git a/.github/configs/ascend.yml b/.github/configs/ascend.yml
index fd1e9fe..5a3cc7f 100644
--- a/.github/configs/ascend.yml
+++ b/.github/configs/ascend.yml
@@ -4,7 +4,7 @@
 platform: ascend
 
 # Docker image for this hardware
-ci_image: harbor.baai.ac.cn/flagscale/vllm-plugin-fl:v0.1.0-ascend-ci
+ci_image: harbor.baai.ac.cn/flagscale/vllm-plugin-fl:v0.2.0-ascend-ci
 
 # Runner labels for this hardware
 runner_labels:
diff --git a/.github/configs/cuda.yml b/.github/configs/cuda.yml
index 5040ce9..34d0ea5 100644
--- a/.github/configs/cuda.yml
+++ b/.github/configs/cuda.yml
@@ -4,7 +4,7 @@
 platform: cuda
 
 # Docker image for this hardware
-ci_image: harbor.baai.ac.cn/flagscale/vllm-plugin-fl:v0.1.0-cuda-ci
+ci_image: harbor.baai.ac.cn/flagscale/vllm-plugin-fl:v0.2.0-cuda-ci
 
 # Runner labels for this hardware
 runner_labels:
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 153ac43..2d996e7 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -9,6 +9,8 @@ on:
     paths-ignore:
       - "**.md"
       - "docs/**"
+      - "examples/**"
+      - "docker/**"
       - "LICENSE"
       - ".github/ISSUE_TEMPLATE/**"
       - ".github/PULL_REQUEST_TEMPLATE.md"
@@ -17,6 +19,8 @@ on:
     paths-ignore:
       - "**.md"
       - "docs/**"
+      - "examples/**"
+      - "docker/**"
       - "LICENSE"
       - ".github/ISSUE_TEMPLATE/**"
       - ".github/PULL_REQUEST_TEMPLATE.md"
diff --git a/docker/ascend/Dockerfile b/docker/ascend/Dockerfile
index 9d10abc..3084b54 100644
--- a/docker/ascend/Dockerfile
+++ b/docker/ascend/Dockerfile
@@ -1,20 +1,12 @@
-ARG VLLM_VERSION=0.13.0
+ARG VLLM_VERSION=0.18.0
 
 # ---------- base stage ----------
-FROM quay.io/ascend/vllm-ascend:v${VLLM_VERSION}rc1-a3 AS base
+FROM quay.io/ascend/vllm-ascend:nightly-releases-v${VLLM_VERSION} AS base
 
 RUN pip install --upgrade pip setuptools
 
-# CANN Toolkit environment variables (mirrors set_env.sh baked in at build time)
-ENV ASCEND_TOOLKIT_HOME=/usr/local/Ascend/ascend-toolkit/latest
-ENV LD_LIBRARY_PATH="${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/lib64/plugin/opskernel:${ASCEND_TOOLKIT_HOME}/lib64/plugin/nnengine:${ASCEND_TOOLKIT_HOME}/opp/built-in/op_impl/ai_core/tbe/op_tiling/lib/linux/aarch64:${LD_LIBRARY_PATH}" \
-    PYTHONPATH="${ASCEND_TOOLKIT_HOME}/python/site-packages:${ASCEND_TOOLKIT_HOME}/opp/built-in/op_impl/ai_core/tbe:${PYTHONPATH}" \
-    PATH="${ASCEND_TOOLKIT_HOME}/bin:${ASCEND_TOOLKIT_HOME}/compiler/ccec_compiler/bin:${ASCEND_TOOLKIT_HOME}/tools/ccec_compiler/bin:${PATH}"
-
-# Set ATB environment variables
-ENV ATB_HOME_PATH=/usr/local/Ascend/nnal/atb/latest/atb/cxx_abi_1
-ENV LD_LIBRARY_PATH="${ATB_HOME_PATH}/lib:${ATB_HOME_PATH}/examples:${ATB_HOME_PATH}/tests/atbopstest:${LD_LIBRARY_PATH}" \
-    PATH="${ATB_HOME_PATH}/bin:${PATH}"
+# Add BiShengIR compiler to PATH (fall back to the default toolkit root if the base image does not export ASCEND_TOOLKIT_HOME)
+ENV PATH="${ASCEND_TOOLKIT_HOME:-/usr/local/Ascend/ascend-toolkit/latest}/tools/bishengir/bin:${PATH}"
 
 # ---------- dev stage ----------
 FROM base AS dev
@@ -49,8 +41,9 @@ RUN pip install \
         cmake
 
 # Install FlagGems (NPU backend)
+ARG FLAGGEMS_VERSION=v5.0.0
 RUN pip install -U scikit-build-core==0.11 pybind11 \
-    && git clone https://github.com/flagos-ai/FlagGems /workspace/FlagGems \
+    && git clone --branch ${FLAGGEMS_VERSION} --depth 1 https://github.com/flagos-ai/FlagGems /workspace/FlagGems \
     && pip install --no-build-isolation \
         --config-settings=cmake.define.FLAGGEMS_BACKEND=NPU \
         /workspace/FlagGems
@@ -71,7 +64,7 @@ FROM base AS release
 
 ARG INDEX_URL
 ARG EXTRA_INDEX_URL
-ARG VLLM_VERSION=0.13.0
+ARG VLLM_VERSION=0.18.0
 
 # Install vLLM
 # Todo
diff --git a/docker/ascend/Dockerfile.v0.1.0 b/docker/ascend/Dockerfile.v0.1.0
new file mode 100644
index 0000000..9d10abc
--- /dev/null
+++ b/docker/ascend/Dockerfile.v0.1.0
@@ -0,0 +1,79 @@
+ARG VLLM_VERSION=0.13.0
+
+# ---------- base stage ----------
+FROM quay.io/ascend/vllm-ascend:v${VLLM_VERSION}rc1-a3 AS base
+
+RUN pip install --upgrade pip setuptools
+
+# CANN Toolkit environment variables (mirrors set_env.sh baked in at build time)
+ENV ASCEND_TOOLKIT_HOME=/usr/local/Ascend/ascend-toolkit/latest
+ENV LD_LIBRARY_PATH="${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/lib64/plugin/opskernel:${ASCEND_TOOLKIT_HOME}/lib64/plugin/nnengine:${ASCEND_TOOLKIT_HOME}/opp/built-in/op_impl/ai_core/tbe/op_tiling/lib/linux/aarch64:${LD_LIBRARY_PATH}" \
+    PYTHONPATH="${ASCEND_TOOLKIT_HOME}/python/site-packages:${ASCEND_TOOLKIT_HOME}/opp/built-in/op_impl/ai_core/tbe:${PYTHONPATH}" \
+    PATH="${ASCEND_TOOLKIT_HOME}/bin:${ASCEND_TOOLKIT_HOME}/compiler/ccec_compiler/bin:${ASCEND_TOOLKIT_HOME}/tools/ccec_compiler/bin:${PATH}"
+
+# Set ATB environment variables
+ENV ATB_HOME_PATH=/usr/local/Ascend/nnal/atb/latest/atb/cxx_abi_1
+ENV LD_LIBRARY_PATH="${ATB_HOME_PATH}/lib:${ATB_HOME_PATH}/examples:${ATB_HOME_PATH}/tests/atbopstest:${LD_LIBRARY_PATH}" \
+    PATH="${ATB_HOME_PATH}/bin:${PATH}"
+
+# ---------- dev stage ----------
+FROM base AS dev
+
+# Install dev tools
+RUN pip install \
+        pytest \
+        pytest-cov \
+        pytest-json-report \
+        ruff \
+        pre-commit \
+        ninja \
+        cmake
+
+# ---------- ci stage ----------
+FROM base AS ci
+
+# Install dev/test tools
+RUN pip install --upgrade pip
+RUN pip install \
+        pytest \
+        pytest-cov \
+        pytest-timeout \
+        pytest-json-report \
+        numpy \
+        requests \
+        decorator \
+        "modelscope>=1.18.1" \
+        ruff \
+        pre-commit \
+        ninja \
+        cmake
+
+# Install FlagGems (NPU backend)
+RUN pip install -U scikit-build-core==0.11 pybind11 \
+    && git clone https://github.com/flagos-ai/FlagGems /workspace/FlagGems \
+    && pip install --no-build-isolation \
+        --config-settings=cmake.define.FLAGGEMS_BACKEND=NPU \
+        /workspace/FlagGems
+
+# Install FlagTree
+RUN pip install flagtree==0.4.0+ascend3.2 \
+        --index-url=https://resource.flagos.net/repository/flagos-pypi-hosted/simple \
+        --trusted-host=resource.flagos.net
+
+# Set environment variables for vLLM and Triton
+ENV VLLM_PLUGINS=fl
+ENV TRITON_ALL_BLOCKS_PARALLEL=1
+
+WORKDIR /workspace
+
+# ---------- release stage ----------
+FROM base AS release
+
+ARG INDEX_URL
+ARG EXTRA_INDEX_URL
+ARG VLLM_VERSION=0.13.0
+
+# Install vLLM
+# Todo
+
+WORKDIR /workspace
diff --git a/docker/build.sh b/docker/build.sh
index f79c05c..ad92b88 100755
--- a/docker/build.sh
+++ b/docker/build.sh
@@ -14,12 +14,12 @@ PYTHON_VERSION="${PYTHON_VERSION:-3.12}"
 UV_VERSION="${UV_VERSION:-0.7.12}"
 CUDA_VERSION="${CUDA_VERSION:-12.8.1}"
 UBUNTU_VERSION="${UBUNTU_VERSION:-22.04}"
-VLLM_VERSION="${VLLM_VERSION:-0.13.0}"
+VLLM_VERSION="${VLLM_VERSION:-0.18.0}"
 
 # ---- Build options ----
 PLATFORM="${PLATFORM:-cuda}"
 TARGET="dev"
-IMAGE_NAME="localhost:5000/vllm-plugin-fl"
+IMAGE_NAME="harbor.baai.ac.cn/flagscale/vllm-plugin-fl"
 IMAGE_TAG=""
 INDEX_URL="${INDEX_URL:-}"
 EXTRA_INDEX_URL="${EXTRA_INDEX_URL:-}"
diff --git a/docker/cuda/Dockerfile b/docker/cuda/Dockerfile
index 8cea7c9..177d696 100644
--- a/docker/cuda/Dockerfile
+++ b/docker/cuda/Dockerfile
@@ -6,7 +6,7 @@ FROM nvcr.io/nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION} AS base
 
 ARG PYTHON_VERSION=3.12
 ARG UV_VERSION=0.7.12
-ARG VLLM_VERSION=0.13.0
+ARG VLLM_VERSION=0.18.0
 
 ENV DEBIAN_FRONTEND=noninteractive
 
@@ -47,7 +47,7 @@ FROM base AS dev
 
 ARG INDEX_URL
 ARG EXTRA_INDEX_URL
-ARG VLLM_VERSION=0.13.0
+ARG VLLM_VERSION=0.18.0
 
 # Install vLLM
 RUN uv pip install --system \
@@ -70,7 +70,7 @@ FROM base AS ci
 
 ARG INDEX_URL
 ARG EXTRA_INDEX_URL
-ARG VLLM_VERSION=0.13.0
+ARG VLLM_VERSION=0.18.0
 
 # Install vLLM
 RUN uv pip install --system \
@@ -92,18 +92,17 @@ RUN uv pip install --system \
         pre-commit \
         ninja \
         cmake
-
+ARG FLAGGEMS_VERSION=v5.0.0
 ARG FLAGCX_VERSION=v0.9.0
 
 # Install FlagGems
 RUN uv pip install --system scikit-build-core==0.11 pybind11 \
-    && git clone https://github.com/flagos-ai/FlagGems /workspace/FlagGems \
+    && git clone --branch ${FLAGGEMS_VERSION} --depth 1 https://github.com/flagos-ai/FlagGems /workspace/FlagGems \
     && uv pip install --system --no-build-isolation /workspace/FlagGems
 
 # Install FlagCX (NVIDIA)
-RUN git clone https://github.com/flagos-ai/FlagCX.git /workspace/FlagCX \
+RUN git clone --branch ${FLAGCX_VERSION} --depth 1  https://github.com/flagos-ai/FlagCX.git /workspace/FlagCX \
     && cd /workspace/FlagCX \
-    && git checkout ${FLAGCX_VERSION} \
     && git submodule update --init --recursive \
     && make USE_NVIDIA=1 \
     && cd plugin/torch \
@@ -125,7 +124,7 @@ FROM base AS release
 
 ARG INDEX_URL
 ARG EXTRA_INDEX_URL
-ARG VLLM_VERSION=0.13.0
+ARG VLLM_VERSION=0.18.0
 
 # Install vLLM
 RUN uv pip install --system \

From ef5484d1b2d9c0ce708a4425d02a826491973e95 Mon Sep 17 00:00:00 2001
From: xmhubj <xmhubj@gmail.com>
Date: Thu, 2 Apr 2026 02:44:06 +0000
Subject: [PATCH 2/5] fix dubious ownership error

---
 .github/scripts/ascend/setup.sh | 2 ++
 .github/scripts/cuda/setup.sh   | 2 ++
 pyproject.toml                  | 2 +-
 3 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/.github/scripts/ascend/setup.sh b/.github/scripts/ascend/setup.sh
index dc1119c..dc9b6ff 100644
--- a/.github/scripts/ascend/setup.sh
+++ b/.github/scripts/ascend/setup.sh
@@ -3,5 +3,7 @@
 # Setup script for Ascend NPU CI environment.
 set -euo pipefail
 
+git config --global --add safe.directory "$(pwd)"
+
 pip install --upgrade pip "setuptools>=77.0.3"
 pip install --no-build-isolation -e ".[test]"
diff --git a/.github/scripts/cuda/setup.sh b/.github/scripts/cuda/setup.sh
index f22faa3..86a4f34 100755
--- a/.github/scripts/cuda/setup.sh
+++ b/.github/scripts/cuda/setup.sh
@@ -3,5 +3,7 @@
 # Setup script for CUDA CI environment.
 set -euo pipefail
 
+git config --global --add safe.directory "$(pwd)"
+
 uv pip install --system --upgrade pip
 uv pip install --system --no-build-isolation -e ".[test]"
diff --git a/pyproject.toml b/pyproject.toml
index f9766d7..e09f006 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -52,7 +52,7 @@ test = [
     "requests",
     "openai",
     "decorator",
-    "vllm[audio]==0.13.0",
+    "vllm[audio]==0.18.0",
     "modelscope>=1.18.1",
 ]
 

From bafcb227a3edc6c3c66b84cdd287731801157e84 Mon Sep 17 00:00:00 2001
From: xmhubj <xmhubj@gmail.com>
Date: Thu, 2 Apr 2026 04:42:44 +0000
Subject: [PATCH 3/5] add privileged to container options to fix nvidia-smi not
 found

---
 .github/configs/cuda.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/configs/cuda.yml b/.github/configs/cuda.yml
index 34d0ea5..78d8e3b 100644
--- a/.github/configs/cuda.yml
+++ b/.github/configs/cuda.yml
@@ -20,6 +20,7 @@ container_volumes:
 
 # Container options (hardware-specific settings)
 container_options: >-
+  --privileged
   --gpus all
   --shm-size=500g
   --hostname vllm-plugin-fl

From 3c6909c64f44c44953f7ebf8386a0ba1d02175c1 Mon Sep 17 00:00:00 2001
From: xmhubj <xmhubj@gmail.com>
Date: Fri, 3 Apr 2026 06:43:11 +0000
Subject: [PATCH 4/5] use vllm-ascend 0.18.0rc1-a3 as base

---
 docker/ascend/Dockerfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docker/ascend/Dockerfile b/docker/ascend/Dockerfile
index 3084b54..17ec33f 100644
--- a/docker/ascend/Dockerfile
+++ b/docker/ascend/Dockerfile
@@ -1,7 +1,7 @@
 ARG VLLM_VERSION=0.18.0
 
 # ---------- base stage ----------
-FROM quay.io/ascend/vllm-ascend:nightly-releases-v${VLLM_VERSION} AS base
+FROM quay.io/ascend/vllm-ascend:v${VLLM_VERSION}rc1-a3 AS base
 
 RUN pip install --upgrade pip setuptools
 

From c9a185b5bab1083d02bc0dedb6c74bb41fb75f52 Mon Sep 17 00:00:00 2001
From: xmhubj <xmhubj@gmail.com>
Date: Fri, 3 Apr 2026 11:14:24 +0000
Subject: [PATCH 5/5] add wait_for_gpu_mem before starting e2e tests

---
 tests/models/qwen3/next_tp8.yaml |   6 +-
 tests/run.py                     |  21 +++-
 tests/utils/cleanup.py           | 161 +++++++++++++++++++++++++++++++
 3 files changed, 184 insertions(+), 4 deletions(-)

diff --git a/tests/models/qwen3/next_tp8.yaml b/tests/models/qwen3/next_tp8.yaml
index 0352ab8..6c11d02 100644
--- a/tests/models/qwen3/next_tp8.yaml
+++ b/tests/models/qwen3/next_tp8.yaml
@@ -3,10 +3,10 @@
 llm:
   model: "/data/models/Qwen/Qwen3-Next-80B-A3B-Instruct"
   tensor_parallel_size: 8
-  max_model_len: 16384
-  max_num_batched_tokens: 16384
+  max_model_len: 8192
+  max_num_batched_tokens: 8192
   max_num_seqs: 512
-  gpu_memory_utilization: 0.7
+  gpu_memory_utilization: 0.8
   enforce_eager: true
   trust_remote_code: true
 
diff --git a/tests/run.py b/tests/run.py
index 1e4c202..8128769 100644
--- a/tests/run.py
+++ b/tests/run.py
@@ -47,7 +47,8 @@
 _REPO_ROOT = Path(__file__).resolve().parent.parent
 sys.path.insert(0, str(_REPO_ROOT))
 
-from tests.utils.cleanup import device_cleanup
+from tests.utils.cleanup import device_cleanup, wait_for_memory
+from tests.utils.model_config import ModelConfig
 from tests.utils.platform_config import PlatformConfig
 from tests.utils.report import TestReport, TestResult
 
@@ -351,6 +352,24 @@ def _run_single(self, tc: TestCase) -> TestResult:
                 message="dry-run",
             )
 
+        # Wait for sufficient device memory before e2e tests
+        if tc.task in ("inference", "serving") and tc.model and tc.case:
+            gpu_util = ModelConfig.load(tc.model, tc.case).engine.get(
+                "gpu_memory_utilization", 0.9
+            )
+            ok, info = wait_for_memory(self.config.platform, gpu_util)
+            if not ok:
+                print("[run] FAILED: timed out waiting for device memory")
+                return TestResult(
+                    name=tc.name,
+                    passed=False,
+                    duration=0.0,
+                    message=f"OOM: timed out waiting for device memory\n{info}",
+                    task=tc.task,
+                    model=tc.model,
+                    case=tc.case,
+                )
+
         # Merge extra env vars (e.g. FL_TEST_MODEL/FL_TEST_CASE for inference)
         env = None
         if tc.extra_env:
diff --git a/tests/utils/cleanup.py b/tests/utils/cleanup.py
index 6bbe0b2..ae01a51 100644
--- a/tests/utils/cleanup.py
+++ b/tests/utils/cleanup.py
@@ -22,6 +22,7 @@
 import signal
 import subprocess
 import time
+from collections.abc import Callable
 
 
 def device_cleanup(platform: str, wait: float = 3.0) -> None:
@@ -37,6 +38,10 @@ def device_cleanup(platform: str, wait: float = 3.0) -> None:
     """
     _kill_stale_processes()
 
+    # Clear framework cache to reclaim memory held by PyTorch allocator
+    cache_fn = _PLATFORM_CACHE_CLEAR.get(platform, _cache_clear_noop)
+    cache_fn()
+
     if wait > 0:
         time.sleep(wait)
 
@@ -134,3 +139,159 @@ def _cleanup_noop() -> None:
     "cuda": _cleanup_cuda,
     "ascend": _cleanup_ascend,
 }
+
+
+# ---------------------------------------------------------------------------
+# Memory info (platform-specific)
+# ---------------------------------------------------------------------------
+
+
+def _mem_info_cuda() -> list[tuple[int, int]]:
+    """Collect (free_bytes, total_bytes) for every CUDA device torch reports.
+
+    Entries are ordered by device index.
+    """
+    import torch
+
+    device_ids = range(torch.cuda.device_count())
+    return [tuple(torch.cuda.mem_get_info(idx)) for idx in device_ids]
+
+
+def _mem_info_ascend() -> list[tuple[int, int]]:
+    """Return [(free_bytes, total_bytes), ...] for each Ascend NPU.
+
+    Returns [] when torch / torch_npu cannot be imported or lacks the NPU API.
+    """
+    try:
+        import torch
+        import torch_npu  # noqa: F401
+
+        count = torch.npu.device_count()
+        return [tuple(torch.npu.mem_get_info(i)) for i in range(count)]
+    except (ImportError, AttributeError):
+        # Same best-effort contract as _cache_clear_ascend: no NPU stack, no data.
+        return []
+
+
+def _mem_info_noop() -> list[tuple[int, int]]:
+    return []  # default for platforms without a registered probe: no devices
+
+
+_PLATFORM_MEMORY_INFO: dict[str, Callable[[], list[tuple[int, int]]]] = {
+    "cuda": _mem_info_cuda,
+    "ascend": _mem_info_ascend,
+}
+
+
+# ---------------------------------------------------------------------------
+# Cache clear (platform-specific)
+# ---------------------------------------------------------------------------
+
+
+def _cache_clear_cuda() -> None:
+    """Clear PyTorch CUDA cache."""
+    import torch  # deferred import (function scope, not module top)
+
+    torch.cuda.empty_cache()
+    torch.cuda.ipc_collect()
+
+
+def _cache_clear_ascend() -> None:
+    """Clear PyTorch NPU cache; a no-op if torch/torch_npu is missing or lacks it."""
+    try:
+        import torch
+        import torch_npu  # noqa: F401
+
+        torch.npu.empty_cache()
+    except (ImportError, AttributeError):
+        pass
+
+
+def _cache_clear_noop() -> None:
+    pass  # default for platforms without a registered cache-clear hook
+
+
+_PLATFORM_CACHE_CLEAR: dict[str, Callable[[], None]] = {
+    "cuda": _cache_clear_cuda,
+    "ascend": _cache_clear_ascend,
+}
+
+
+# ---------------------------------------------------------------------------
+# Public memory API
+# ---------------------------------------------------------------------------
+
+
+def get_device_memory(platform: str) -> list[tuple[float, float]]:
+    """Report per-device (free, total) memory for *platform*, converted to MiB."""
+    mib, probe = 1024 * 1024, _PLATFORM_MEMORY_INFO.get(platform, _mem_info_noop)
+    return [(free_b / mib, total_b / mib) for free_b, total_b in probe()]
+
+
+def wait_for_memory(
+    platform: str,
+    gpu_memory_utilization: float = 0.9,
+    timeout: int = 1800,
+    interval: int = 30,
+) -> tuple[bool, str]:
+    """Wait until devices have enough free memory for the given utilization.
+
+    Args:
+        platform: Platform name (e.g. ``"cuda"``, ``"ascend"``).
+        gpu_memory_utilization: Fraction of total memory the model needs.
+        timeout: Max seconds to wait (default 30 min); the last sleep is clamped to it.
+        interval: Seconds between polls.
+
+    Returns:
+        ``(True, info)`` if memory is available, ``(False, info)`` on timeout.
+    """
+    mem_fn = _PLATFORM_MEMORY_INFO.get(platform, _mem_info_noop)
+    cache_fn = _PLATFORM_CACHE_CLEAR.get(platform, _cache_clear_noop)
+
+    deadline = time.monotonic() + timeout  # monotonic: immune to wall-clock jumps
+    attempt = 0
+
+    while True:
+        attempt += 1
+        # Kill stale vllm processes from previous e2e tests
+        _kill_stale_processes()
+        # Clear framework cache
+        cache_fn()
+        # Brief pause for resources to be released
+        time.sleep(1)
+
+        mem_info = mem_fn()
+        if not mem_info:
+            return (True, "no devices detected, skipping memory check")
+
+        # Check each device
+        all_ok = True
+        lines = []
+        for i, (free, total) in enumerate(mem_info):
+            required = total * gpu_memory_utilization
+            free_mb = free / (1024 * 1024)
+            total_mb = total / (1024 * 1024)
+            required_mb = required / (1024 * 1024)
+            ok = free >= required
+            status = "OK" if ok else "WAIT"
+            lines.append(
+                f"  Device {i}: {free_mb:.0f}/{total_mb:.0f} MiB free, "
+                f"need {required_mb:.0f} MiB ({gpu_memory_utilization:.0%}) [{status}]"
+            )
+            all_ok = all_ok and ok
+
+        info = "\n".join(lines)
+        print(
+            f"[memory] Attempt {attempt} (util={gpu_memory_utilization:.0%}):\n{info}"
+        )
+
+        if all_ok:
+            return (True, info)
+
+        remaining = deadline - time.monotonic()
+        if remaining <= 0:
+            return (False, info)
+
+        pause = min(interval, remaining)  # never sleep past the deadline
+        print(f"[memory] Waiting {pause:g}s for memory to free up...")
+        time.sleep(pause)