From 73537aeb33037cabfd931f3ae44d0d257efcffc6 Mon Sep 17 00:00:00 2001 From: xmhubj Date: Wed, 1 Apr 2026 08:51:52 +0000 Subject: [PATCH 1/5] bump vllm to v0.18.0 and pin FlagGems to v5.0.0 --- .github/configs/ascend.yml | 2 +- .github/configs/cuda.yml | 2 +- .github/workflows/ci.yml | 4 ++ docker/ascend/Dockerfile | 21 +++------ docker/ascend/Dockerfile.v0.1.0 | 79 +++++++++++++++++++++++++++++++++ docker/build.sh | 4 +- docker/cuda/Dockerfile | 15 +++---- 7 files changed, 101 insertions(+), 26 deletions(-) create mode 100644 docker/ascend/Dockerfile.v0.1.0 diff --git a/.github/configs/ascend.yml b/.github/configs/ascend.yml index fd1e9fe..5a3cc7f 100644 --- a/.github/configs/ascend.yml +++ b/.github/configs/ascend.yml @@ -4,7 +4,7 @@ platform: ascend # Docker image for this hardware -ci_image: harbor.baai.ac.cn/flagscale/vllm-plugin-fl:v0.1.0-ascend-ci +ci_image: harbor.baai.ac.cn/flagscale/vllm-plugin-fl:v0.2.0-ascend-ci # Runner labels for this hardware runner_labels: diff --git a/.github/configs/cuda.yml b/.github/configs/cuda.yml index 5040ce9..34d0ea5 100644 --- a/.github/configs/cuda.yml +++ b/.github/configs/cuda.yml @@ -4,7 +4,7 @@ platform: cuda # Docker image for this hardware -ci_image: harbor.baai.ac.cn/flagscale/vllm-plugin-fl:v0.1.0-cuda-ci +ci_image: harbor.baai.ac.cn/flagscale/vllm-plugin-fl:v0.2.0-cuda-ci # Runner labels for this hardware runner_labels: diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 153ac43..2d996e7 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -9,6 +9,8 @@ on: paths-ignore: - "**.md" - "docs/**" + - "examples/**" + - "docker/**" - "LICENSE" - ".github/ISSUE_TEMPLATE/**" - ".github/PULL_REQUEST_TEMPLATE.md" @@ -17,6 +19,8 @@ on: paths-ignore: - "**.md" - "docs/**" + - "examples/**" + - "docker/**" - "LICENSE" - ".github/ISSUE_TEMPLATE/**" - ".github/PULL_REQUEST_TEMPLATE.md" diff --git a/docker/ascend/Dockerfile b/docker/ascend/Dockerfile index 9d10abc..3084b54 100644 --- a/docker/ascend/Dockerfile +++ b/docker/ascend/Dockerfile @@ -1,20 +1,12 @@ -ARG VLLM_VERSION=0.13.0 +ARG VLLM_VERSION=0.18.0 # ---------- base stage ---------- -FROM quay.io/ascend/vllm-ascend:v${VLLM_VERSION}rc1-a3 AS base +FROM quay.io/ascend/vllm-ascend:nightly-releases-v${VLLM_VERSION} AS base RUN pip install --upgrade pip setuptools -# CANN Toolkit environment variables (mirrors set_env.sh baked in at build time) -ENV ASCEND_TOOLKIT_HOME=/usr/local/Ascend/ascend-toolkit/latest -ENV LD_LIBRARY_PATH="${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/lib64/plugin/opskernel:${ASCEND_TOOLKIT_HOME}/lib64/plugin/nnengine:${ASCEND_TOOLKIT_HOME}/opp/built-in/op_impl/ai_core/tbe/op_tiling/lib/linux/aarch64:${LD_LIBRARY_PATH}" \ - PYTHONPATH="${ASCEND_TOOLKIT_HOME}/python/site-packages:${ASCEND_TOOLKIT_HOME}/opp/built-in/op_impl/ai_core/tbe:${PYTHONPATH}" \ - PATH="${ASCEND_TOOLKIT_HOME}/bin:${ASCEND_TOOLKIT_HOME}/compiler/ccec_compiler/bin:${ASCEND_TOOLKIT_HOME}/tools/ccec_compiler/bin:${PATH}" - -# Set ATB environment variables -ENV ATB_HOME_PATH=/usr/local/Ascend/nnal/atb/latest/atb/cxx_abi_1 -ENV LD_LIBRARY_PATH="${ATB_HOME_PATH}/lib:${ATB_HOME_PATH}/examples:${ATB_HOME_PATH}/tests/atbopstest:${LD_LIBRARY_PATH}" \ - PATH="${ATB_HOME_PATH}/bin:${PATH}" +# Add BiShengIR compiler to PATH +ENV PATH="${ASCEND_TOOLKIT_HOME}/tools/bishengir/bin:${PATH}" # ---------- dev stage ---------- FROM base AS dev @@ -49,8 +41,9 @@ RUN pip install \ cmake # Install FlagGems (NPU backend) +ARG FLAGGEMS_VERSION=v5.0.0 RUN pip install -U scikit-build-core==0.11 pybind11 \ - && git clone https://github.com/flagos-ai/FlagGems /workspace/FlagGems \ + && git clone --branch ${FLAGGEMS_VERSION} --depth 1 https://github.com/flagos-ai/FlagGems /workspace/FlagGems \ && pip install --no-build-isolation \ --config-settings=cmake.define.FLAGGEMS_BACKEND=NPU \ /workspace/FlagGems @@ -71,7 +64,7 @@ FROM base AS release ARG INDEX_URL ARG EXTRA_INDEX_URL -ARG VLLM_VERSION=0.13.0 +ARG VLLM_VERSION=0.18.0 # Install vLLM # Todo diff --git a/docker/ascend/Dockerfile.v0.1.0 b/docker/ascend/Dockerfile.v0.1.0 new file mode 100644 index 0000000..9d10abc --- /dev/null +++ b/docker/ascend/Dockerfile.v0.1.0 @@ -0,0 +1,79 @@ +ARG VLLM_VERSION=0.13.0 + +# ---------- base stage ---------- +FROM quay.io/ascend/vllm-ascend:v${VLLM_VERSION}rc1-a3 AS base + +RUN pip install --upgrade pip setuptools + +# CANN Toolkit environment variables (mirrors set_env.sh baked in at build time) +ENV ASCEND_TOOLKIT_HOME=/usr/local/Ascend/ascend-toolkit/latest +ENV LD_LIBRARY_PATH="${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/lib64/plugin/opskernel:${ASCEND_TOOLKIT_HOME}/lib64/plugin/nnengine:${ASCEND_TOOLKIT_HOME}/opp/built-in/op_impl/ai_core/tbe/op_tiling/lib/linux/aarch64:${LD_LIBRARY_PATH}" \ + PYTHONPATH="${ASCEND_TOOLKIT_HOME}/python/site-packages:${ASCEND_TOOLKIT_HOME}/opp/built-in/op_impl/ai_core/tbe:${PYTHONPATH}" \ + PATH="${ASCEND_TOOLKIT_HOME}/bin:${ASCEND_TOOLKIT_HOME}/compiler/ccec_compiler/bin:${ASCEND_TOOLKIT_HOME}/tools/ccec_compiler/bin:${PATH}" + +# Set ATB environment variables +ENV ATB_HOME_PATH=/usr/local/Ascend/nnal/atb/latest/atb/cxx_abi_1 +ENV LD_LIBRARY_PATH="${ATB_HOME_PATH}/lib:${ATB_HOME_PATH}/examples:${ATB_HOME_PATH}/tests/atbopstest:${LD_LIBRARY_PATH}" \ + PATH="${ATB_HOME_PATH}/bin:${PATH}" + +# ---------- dev stage ---------- +FROM base AS dev + +# Install dev tools +RUN pip install \ + pytest \ + pytest-cov \ + pytest-json-report \ + ruff \ + pre-commit \ + ninja \ + cmake + +# ---------- ci stage ---------- +FROM base AS ci + +# Install dev/test tools +RUN pip install --upgrade pip +RUN pip install \ + pytest \ + pytest-cov \ + pytest-timeout \ + pytest-json-report \ + numpy \ + requests \ + decorator \ + "modelscope>=1.18.1" \ + ruff \ + pre-commit \ + ninja \ + cmake + +# Install FlagGems (NPU backend) +RUN pip install -U scikit-build-core==0.11 pybind11 \ + && git clone https://github.com/flagos-ai/FlagGems /workspace/FlagGems \ + && pip install --no-build-isolation \ + --config-settings=cmake.define.FLAGGEMS_BACKEND=NPU \ + /workspace/FlagGems + +# Install FlagTree +RUN pip install flagtree==0.4.0+ascend3.2 \ + --index-url=https://resource.flagos.net/repository/flagos-pypi-hosted/simple \ + --trusted-host=resource.flagos.net + +# Set environment variables for vLLM and Triton +ENV VLLM_PLUGINS=fl +ENV TRITON_ALL_BLOCKS_PARALLEL=1 + +WORKDIR /workspace + +# ---------- release stage ---------- +FROM base AS release + +ARG INDEX_URL +ARG EXTRA_INDEX_URL +ARG VLLM_VERSION=0.13.0 + +# Install vLLM +# Todo + +WORKDIR /workspace diff --git a/docker/build.sh b/docker/build.sh index f79c05c..ad92b88 100755 --- a/docker/build.sh +++ b/docker/build.sh @@ -14,12 +14,12 @@ PYTHON_VERSION="${PYTHON_VERSION:-3.12}" UV_VERSION="${UV_VERSION:-0.7.12}" CUDA_VERSION="${CUDA_VERSION:-12.8.1}" UBUNTU_VERSION="${UBUNTU_VERSION:-22.04}" -VLLM_VERSION="${VLLM_VERSION:-0.13.0}" +VLLM_VERSION="${VLLM_VERSION:-0.18.0}" # ---- Build options ---- PLATFORM="${PLATFORM:-cuda}" TARGET="dev" -IMAGE_NAME="localhost:5000/vllm-plugin-fl" +IMAGE_NAME="harbor.baai.ac.cn/flagscale/vllm-plugin-fl" IMAGE_TAG="" INDEX_URL="${INDEX_URL:-}" EXTRA_INDEX_URL="${EXTRA_INDEX_URL:-}" diff --git a/docker/cuda/Dockerfile b/docker/cuda/Dockerfile index 8cea7c9..177d696 100644 --- a/docker/cuda/Dockerfile +++ b/docker/cuda/Dockerfile @@ -6,7 +6,7 @@ FROM nvcr.io/nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION} AS base ARG PYTHON_VERSION=3.12 ARG UV_VERSION=0.7.12 -ARG VLLM_VERSION=0.13.0 +ARG VLLM_VERSION=0.18.0 ENV DEBIAN_FRONTEND=noninteractive @@ -47,7 +47,7 @@ FROM base AS dev ARG INDEX_URL ARG EXTRA_INDEX_URL -ARG VLLM_VERSION=0.13.0 +ARG VLLM_VERSION=0.18.0 # Install vLLM RUN uv pip install --system \ @@ -70,7 +70,7 @@ FROM base AS ci ARG INDEX_URL ARG EXTRA_INDEX_URL -ARG VLLM_VERSION=0.13.0 +ARG VLLM_VERSION=0.18.0 # Install vLLM RUN uv pip install --system \ @@ -92,18 +92,17 @@ RUN uv pip install --system \ pre-commit \ ninja \ cmake - +ARG FLAGGEMS_VERSION=v5.0.0 ARG FLAGCX_VERSION=v0.9.0 # Install FlagGems RUN uv pip install --system scikit-build-core==0.11 pybind11 \ - && git clone https://github.com/flagos-ai/FlagGems /workspace/FlagGems \ + && git clone --branch ${FLAGGEMS_VERSION} --depth 1 https://github.com/flagos-ai/FlagGems /workspace/FlagGems \ && uv pip install --system --no-build-isolation /workspace/FlagGems # Install FlagCX (NVIDIA) -RUN git clone https://github.com/flagos-ai/FlagCX.git /workspace/FlagCX \ +RUN git clone --branch ${FLAGCX_VERSION} --depth 1 https://github.com/flagos-ai/FlagCX.git /workspace/FlagCX \ && cd /workspace/FlagCX \ - && git checkout ${FLAGCX_VERSION} \ && git submodule update --init --recursive \ && make USE_NVIDIA=1 \ && cd plugin/torch \ @@ -125,7 +124,7 @@ FROM base AS release ARG INDEX_URL ARG EXTRA_INDEX_URL -ARG VLLM_VERSION=0.13.0 +ARG VLLM_VERSION=0.18.0 # Install vLLM RUN uv pip install --system \ From ef5484d1b2d9c0ce708a4425d02a826491973e95 Mon Sep 17 00:00:00 2001 From: xmhubj Date: Thu, 2 Apr 2026 02:44:06 +0000 Subject: [PATCH 2/5] fix dubious ownership error --- .github/scripts/ascend/setup.sh | 2 ++ .github/scripts/cuda/setup.sh | 2 ++ pyproject.toml | 2 +- 3 files changed, 5 insertions(+), 1 deletion(-) diff --git a/.github/scripts/ascend/setup.sh b/.github/scripts/ascend/setup.sh index dc1119c..dc9b6ff 100644 --- a/.github/scripts/ascend/setup.sh +++ b/.github/scripts/ascend/setup.sh @@ -3,5 +3,7 @@ # Setup script for Ascend NPU CI environment. set -euo pipefail +git config --global --add safe.directory "$(pwd)" + pip install --upgrade pip "setuptools>=77.0.3" pip install --no-build-isolation -e ".[test]" diff --git a/.github/scripts/cuda/setup.sh b/.github/scripts/cuda/setup.sh index f22faa3..86a4f34 100755 --- a/.github/scripts/cuda/setup.sh +++ b/.github/scripts/cuda/setup.sh @@ -3,5 +3,7 @@ # Setup script for CUDA CI environment. set -euo pipefail +git config --global --add safe.directory "$(pwd)" + uv pip install --system --upgrade pip uv pip install --system --no-build-isolation -e ".[test]" diff --git a/pyproject.toml b/pyproject.toml index f9766d7..e09f006 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -52,7 +52,7 @@ test = [ "requests", "openai", "decorator", - "vllm[audio]==0.13.0", + "vllm[audio]==0.18.0", "modelscope>=1.18.1", ] From bafcb227a3edc6c3c66b84cdd287731801157e84 Mon Sep 17 00:00:00 2001 From: xmhubj Date: Thu, 2 Apr 2026 04:42:44 +0000 Subject: [PATCH 3/5] add privileged to container options to fix nvidia-smi not found --- .github/configs/cuda.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/configs/cuda.yml b/.github/configs/cuda.yml index 34d0ea5..78d8e3b 100644 --- a/.github/configs/cuda.yml +++ b/.github/configs/cuda.yml @@ -20,6 +20,7 @@ container_volumes: # Container options (hardware-specific settings) container_options: >- + --privileged --gpus all --shm-size=500g --hostname vllm-plugin-fl From 3c6909c64f44c44953f7ebf8386a0ba1d02175c1 Mon Sep 17 00:00:00 2001 From: xmhubj Date: Fri, 3 Apr 2026 06:43:11 +0000 Subject: [PATCH 4/5] use vllm-ascend 0.18.0rc1-a3 as base --- docker/ascend/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/ascend/Dockerfile b/docker/ascend/Dockerfile index 3084b54..17ec33f 100644 --- a/docker/ascend/Dockerfile +++ b/docker/ascend/Dockerfile @@ -1,7 +1,7 @@ ARG VLLM_VERSION=0.18.0 # ---------- base stage ---------- -FROM quay.io/ascend/vllm-ascend:nightly-releases-v${VLLM_VERSION} AS base +FROM quay.io/ascend/vllm-ascend:v${VLLM_VERSION}rc1-a3 AS base RUN pip install --upgrade pip setuptools From c9a185b5bab1083d02bc0dedb6c74bb41fb75f52 Mon Sep 17 00:00:00 2001 From: xmhubj Date: Fri, 3 Apr 2026 11:14:24 +0000 Subject: [PATCH 5/5] add wait_for_gpu_mem before starting e2e tests --- tests/models/qwen3/next_tp8.yaml | 6 +- tests/run.py | 21 +++- tests/utils/cleanup.py | 161 +++++++++++++++++++++++++++++++ 3 files changed, 184 insertions(+), 4 deletions(-) diff --git a/tests/models/qwen3/next_tp8.yaml b/tests/models/qwen3/next_tp8.yaml index 0352ab8..6c11d02 100644 --- a/tests/models/qwen3/next_tp8.yaml +++ b/tests/models/qwen3/next_tp8.yaml @@ -3,10 +3,10 @@ llm: model: "/data/models/Qwen/Qwen3-Next-80B-A3B-Instruct" tensor_parallel_size: 8 - max_model_len: 16384 - max_num_batched_tokens: 16384 + max_model_len: 8192 + max_num_batched_tokens: 8192 max_num_seqs: 512 - gpu_memory_utilization: 0.7 + gpu_memory_utilization: 0.8 enforce_eager: true trust_remote_code: true diff --git a/tests/run.py b/tests/run.py index 1e4c202..8128769 100644 --- a/tests/run.py +++ b/tests/run.py @@ -47,7 +47,8 @@ _REPO_ROOT = Path(__file__).resolve().parent.parent sys.path.insert(0, str(_REPO_ROOT)) -from tests.utils.cleanup import device_cleanup +from tests.utils.cleanup import device_cleanup, wait_for_memory +from tests.utils.model_config import ModelConfig from tests.utils.platform_config import PlatformConfig from tests.utils.report import TestReport, TestResult @@ -351,6 +352,24 @@ def _run_single(self, tc: TestCase) -> TestResult: message="dry-run", ) + # Wait for sufficient device memory before e2e tests + if tc.task in ("inference", "serving") and tc.model and tc.case: + gpu_util = ModelConfig.load(tc.model, tc.case).engine.get( + "gpu_memory_utilization", 0.9 + ) + ok, info = wait_for_memory(self.config.platform, gpu_util) + if not ok: + print("[run] FAILED: timed out waiting for device memory") + return TestResult( + name=tc.name, + passed=False, + duration=0.0, + message=f"OOM: timed out waiting for device memory\n{info}", + task=tc.task, + model=tc.model, + case=tc.case, + ) + # Merge extra env vars (e.g. FL_TEST_MODEL/FL_TEST_CASE for inference) env = None if tc.extra_env: diff --git a/tests/utils/cleanup.py b/tests/utils/cleanup.py index 6bbe0b2..ae01a51 100644 --- a/tests/utils/cleanup.py +++ b/tests/utils/cleanup.py @@ -22,6 +22,7 @@ import signal import subprocess import time +from collections.abc import Callable def device_cleanup(platform: str, wait: float = 3.0) -> None: @@ -37,6 +38,10 @@ def device_cleanup(platform: str, wait: float = 3.0) -> None: """ _kill_stale_processes() + # Clear framework cache to reclaim memory held by PyTorch allocator + cache_fn = _PLATFORM_CACHE_CLEAR.get(platform, _cache_clear_noop) + cache_fn() + if wait > 0: time.sleep(wait) @@ -134,3 +139,159 @@ def _cleanup_noop() -> None: "cuda": _cleanup_cuda, "ascend": _cleanup_ascend, } + + +# --------------------------------------------------------------------------- +# Memory info (platform-specific) +# --------------------------------------------------------------------------- + + +def _mem_info_cuda() -> list[tuple[int, int]]: + """Return [(free_bytes, total_bytes), ...] for each CUDA device.""" + import torch + + result = [] + for i in range(torch.cuda.device_count()): + free, total = torch.cuda.mem_get_info(i) + result.append((free, total)) + return result + + +def _mem_info_ascend() -> list[tuple[int, int]]: + """Return [(free_bytes, total_bytes), ...] for each Ascend NPU.""" + import torch + + try: + import torch_npu # noqa: F401 + + result = [] + for i in range(torch.npu.device_count()): + free, total = torch.npu.mem_get_info(i) + result.append((free, total)) + return result + except (ImportError, AttributeError): + return [] + + +def _mem_info_noop() -> list[tuple[int, int]]: + return [] + + +_PLATFORM_MEMORY_INFO: dict[str, Callable[[], list[tuple[int, int]]]] = { + "cuda": _mem_info_cuda, + "ascend": _mem_info_ascend, +} + + +# --------------------------------------------------------------------------- +# Cache clear (platform-specific) +# --------------------------------------------------------------------------- + + +def _cache_clear_cuda() -> None: + """Clear PyTorch CUDA cache.""" + import torch + + torch.cuda.empty_cache() + torch.cuda.ipc_collect() + + +def _cache_clear_ascend() -> None: + """Clear PyTorch NPU cache.""" + try: + import torch + import torch_npu # noqa: F401 + + torch.npu.empty_cache() + except (ImportError, AttributeError): + pass + + +def _cache_clear_noop() -> None: + pass + + +_PLATFORM_CACHE_CLEAR: dict[str, Callable[[], None]] = { + "cuda": _cache_clear_cuda, + "ascend": _cache_clear_ascend, +} + + +# --------------------------------------------------------------------------- +# Public memory API +# --------------------------------------------------------------------------- + + +def get_device_memory(platform: str) -> list[tuple[float, float]]: + """Return [(free_mb, total_mb), ...] for each device on the platform.""" + mem_fn = _PLATFORM_MEMORY_INFO.get(platform, _mem_info_noop) + return [(free / (1024 * 1024), total / (1024 * 1024)) for free, total in mem_fn()] + + +def wait_for_memory( + platform: str, + gpu_memory_utilization: float = 0.9, + timeout: int = 1800, + interval: int = 30, +) -> tuple[bool, str]: + """Wait until devices have enough free memory for the given utilization. + + Args: + platform: Platform name (e.g. ``"cuda"``, ``"ascend"``). + gpu_memory_utilization: Fraction of total memory the model needs. + timeout: Maximum seconds to wait (default 30 min). + interval: Seconds between polls. + + Returns: + ``(True, info)`` if memory is available, ``(False, info)`` on timeout. + """ + mem_fn = _PLATFORM_MEMORY_INFO.get(platform, _mem_info_noop) + cache_fn = _PLATFORM_CACHE_CLEAR.get(platform, _cache_clear_noop) + + deadline = time.time() + timeout + attempt = 0 + + while True: + attempt += 1 + + # Kill stale vllm processes from previous e2e tests + _kill_stale_processes() + # Clear framework cache + cache_fn() + # Brief pause for resources to be released + time.sleep(1) + + mem_info = mem_fn() + if not mem_info: + return (True, "no devices detected, skipping memory check") + + # Check each device + all_ok = True + lines = [] + for i, (free, total) in enumerate(mem_info): + required = total * gpu_memory_utilization + free_mb = free / (1024 * 1024) + total_mb = total / (1024 * 1024) + required_mb = required / (1024 * 1024) + ok = free >= required + status = "OK" if ok else "WAIT" + lines.append( + f" Device {i}: {free_mb:.0f}/{total_mb:.0f} MiB free, " + f"need {required_mb:.0f} MiB ({gpu_memory_utilization:.0%}) [{status}]" + ) + if not ok: + all_ok = False + + info = "\n".join(lines) + print( + f"[memory] Attempt {attempt} (util={gpu_memory_utilization:.0%}):\n{info}" + ) + + if all_ok: + return (True, info) + + if time.time() >= deadline: + return (False, info) + + print(f"[memory] Waiting {interval}s for memory to free up...") + time.sleep(interval)