diff --git a/.buildkite/image_build/image_build.sh b/.buildkite/image_build/image_build.sh index 10c03c3e1773..f93e86d9ec09 100755 --- a/.buildkite/image_build/image_build.sh +++ b/.buildkite/image_build/image_build.sh @@ -223,6 +223,13 @@ echo "CACHE_FROM_MAIN: ${CACHE_FROM_MAIN}" check_and_skip_if_image_exists +# The rust frontend lives in a git submodule under rust/. Buildkite's default +# checkout does not recurse submodules, and the Dockerfile only sees what's in +# the build context, so initialize the submodule here before invoking bake. +echo "--- :git: Initializing git submodules" +git submodule sync --recursive +git submodule update --init --recursive + echo "--- :docker: Setting up Docker buildx bake" echo "Target: ${TARGET}" echo "vLLM bake file: ${VLLM_BAKE_FILE_PATH}" diff --git a/.buildkite/image_build/image_build_cpu.sh b/.buildkite/image_build/image_build_cpu.sh index 035f070ab891..a8f76d5b2657 100755 --- a/.buildkite/image_build/image_build_cpu.sh +++ b/.buildkite/image_build/image_build_cpu.sh @@ -21,6 +21,12 @@ else exit 0 fi +# The rust frontend lives in a git submodule under rust/. Buildkite's default +# checkout does not recurse submodules, and the Dockerfile only sees what's in +# the build context, so initialize the submodule here before building. +git submodule sync --recursive +git submodule update --init --recursive + # build docker build --file docker/Dockerfile.cpu \ --build-arg max_jobs=16 \ diff --git a/.buildkite/image_build/image_build_cpu_arm64.sh b/.buildkite/image_build/image_build_cpu_arm64.sh index b561e2c2e463..9ee0b353aa8a 100755 --- a/.buildkite/image_build/image_build_cpu_arm64.sh +++ b/.buildkite/image_build/image_build_cpu_arm64.sh @@ -21,6 +21,12 @@ else exit 0 fi +# The rust frontend lives in a git submodule under rust/. Buildkite's default +# checkout does not recurse submodules, and the Dockerfile only sees what's in +# the build context, so initialize the submodule here before building. 
+git submodule sync --recursive +git submodule update --init --recursive + # build docker build --file docker/Dockerfile.cpu \ --build-arg max_jobs=16 \ diff --git a/.buildkite/image_build/image_build_hpu.sh b/.buildkite/image_build/image_build_hpu.sh index 60fa1789fa06..df900dc60342 100755 --- a/.buildkite/image_build/image_build_hpu.sh +++ b/.buildkite/image_build/image_build_hpu.sh @@ -11,7 +11,7 @@ REPO=$2 BUILDKITE_COMMIT=$3 # authenticate with AWS ECR -aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY" +aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY" || true # skip build if image already exists if [[ -z $(docker manifest inspect "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-hpu) ]]; then diff --git a/.buildkite/image_build/image_build_xpu.sh b/.buildkite/image_build/image_build_xpu.sh index c3734dce13ca..45417b7339be 100755 --- a/.buildkite/image_build/image_build_xpu.sh +++ b/.buildkite/image_build/image_build_xpu.sh @@ -11,8 +11,8 @@ REPO=$2 BUILDKITE_COMMIT=$3 # authenticate with AWS ECR -aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY" -aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin 936637512419.dkr.ecr.us-east-1.amazonaws.com +aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY" || true +aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin 936637512419.dkr.ecr.us-east-1.amazonaws.com || true # skip build if image already exists if ! docker manifest inspect "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-xpu &> /dev/null; then diff --git a/.buildkite/test_areas/rust_frontend.yaml b/.buildkite/test_areas/rust_frontend.yaml new file mode 100644 index 000000000000..4afafd856ed4 --- /dev/null +++ b/.buildkite/test_areas/rust_frontend.yaml @@ -0,0 +1,107 @@ +group: Rust Frontend +depends_on: + - image-build +steps: +- label: Rust Frontend OpenAI Coverage + timeout_in_minutes: 90 + device: h200_18gb + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - rust/ + - vllm/benchmarks/ + - vllm/entrypoints/openai/ + - vllm/entrypoints/serve/ + - vllm/v1/sample/ + - tests/utils.py + - tests/benchmarks/test_serve_cli.py + - tests/entrypoints/openai/chat_completion/test_chat_completion.py + # - tests/entrypoints/openai/chat_completion/test_chat_logit_bias_validation.py + # - tests/entrypoints/openai/chat_completion/test_chat_with_tool_reasoning.py + # - tests/entrypoints/openai/completion/test_prompt_validation.py + - tests/entrypoints/openai/completion/test_shutdown.py + # - tests/entrypoints/openai/test_return_token_ids.py + # - tests/entrypoints/openai/test_uds.py + - tests/v1/sample/test_logprobs_e2e.py + commands: + - export VLLM_USE_RUST_FRONTEND=1 + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -v -s benchmarks/test_serve_cli.py -k "not insecure and not (test_bench_serve and not test_bench_serve_chat)" + - pytest -v -s entrypoints/openai/chat_completion/test_chat_completion.py + # - pytest -v -s entrypoints/openai/chat_completion/test_chat_logit_bias_validation.py -k "not invalid" + # - pytest -v -s entrypoints/openai/chat_completion/test_chat_with_tool_reasoning.py + # - pytest -v -s entrypoints/openai/completion/test_prompt_validation.py -k "not prompt_embeds" + - pytest -v -s entrypoints/openai/completion/test_shutdown.py -k "not engine_failure 
and not test_abort_timeout_exits_quickly" + # - pytest -v -s entrypoints/openai/test_return_token_ids.py + # - pytest -v -s entrypoints/openai/test_uds.py + - pytest -v -s v1/sample/test_logprobs_e2e.py -k "test_prompt_logprobs_e2e_server" + +- label: Rust Frontend Serve/Admin Coverage + timeout_in_minutes: 60 + device: h200_18gb + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - rust/ + - vllm/entrypoints/openai/ + - vllm/entrypoints/serve/ + - vllm/v1/engine/ + - tests/utils.py + # - tests/entrypoints/rpc/test_collective_rpc.py + - tests/entrypoints/serve/disagg/test_serving_tokens.py + - tests/entrypoints/serve/instrumentator/test_basic.py + - tests/entrypoints/serve/instrumentator/test_metrics.py + # - tests/entrypoints/serve/instrumentator/test_sleep.py + commands: + - export VLLM_USE_RUST_FRONTEND=1 + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + # - pytest -v -s entrypoints/rpc/test_collective_rpc.py + - pytest -v -s entrypoints/serve/instrumentator/test_basic.py -k "not show_version and not server_load" + - pytest -v -s entrypoints/serve/disagg/test_serving_tokens.py -k "not stream and not lora and not test_generate_logprobs and not stop_string_workflow" + - pytest -v -s entrypoints/serve/instrumentator/test_metrics.py -k "text and not show and not run_batch and not test_metrics_counts and not test_metrics_exist" + # - pytest -v -s entrypoints/serve/instrumentator/test_sleep.py + +- label: Rust Frontend Core Correctness + timeout_in_minutes: 30 + device: h200_18gb + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - rust/ + - vllm/entrypoints/openai/ + - tests/utils.py + - tests/entrypoints/openai/correctness/test_lmeval.py + commands: + - export VLLM_USE_RUST_FRONTEND=1 + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine + +- label: Rust Frontend Tool Use + timeout_in_minutes: 60 + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - rust/ + - vllm/entrypoints/openai/ + - vllm/tool_parsers/ + - tests/utils.py + - tests/tool_use/ + commands: + - export VLLM_USE_RUST_FRONTEND=1 + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -v -s tool_use --ignore=tool_use/mistral --models llama3.2 toolACE -k "not test_response_format_with_tool_choice_required and not test_parallel_tool_calls_false and not test_tool_call_and_choice" + +- label: Rust Frontend Distributed + timeout_in_minutes: 30 + num_devices: 4 + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - rust/ + - vllm/distributed/ + - vllm/engine/ + - vllm/executor/ + - vllm/v1/engine/ + - vllm/v1/worker/ + - tests/utils.py + - tests/v1/distributed/test_internal_lb_dp.py + commands: + - export VLLM_USE_RUST_FRONTEND=1 + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - export NCCL_CUMEM_HOST_ENABLE=0 + - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py -k "not 4 and not server_info" diff --git a/.dockerignore b/.dockerignore index 3863656915d0..66447272e95a 100644 --- a/.dockerignore +++ b/.dockerignore @@ -2,6 +2,7 @@ /build dist vllm/*.so +vllm/vllm-rs # Byte-compiled / optimized / DLL files __pycache__/ @@ -31,3 +32,4 @@ share/python-wheels/ .installed.cfg *.egg MANIFEST +rust/target/ diff --git a/.gitignore b/.gitignore index e53d19b35340..2c4e135e58dc 100644 --- a/.gitignore +++ b/.gitignore @@ -26,6 +26,9 @@ __pycache__/ # C extensions *.so +# Rust binaries +vllm/vllm-rs + # Distribution / packaging .Python build/ diff --git a/.gitmodules 
b/.gitmodules new file mode 100644 index 000000000000..79d557ecd69e --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "rust"] + path = rust + url = https://github.com/Inferact/vllm-frontend-rs.git diff --git a/build_rust.sh b/build_rust.sh new file mode 100755 index 000000000000..fb4a589de4c0 --- /dev/null +++ b/build_rust.sh @@ -0,0 +1,43 @@ +#!/bin/bash +# Build the vllm-rs Rust frontend binary and install it into the vllm package. +# Usage: ./build_rust.sh [--debug] +# +# By default builds in release mode. Pass --debug for faster compile times +# during development. + +set -euo pipefail + +REPO_ROOT="$(cd "$(dirname "$0")" && pwd)" +RUST_DIR="$REPO_ROOT/rust" +TARGET_PATH="$REPO_ROOT/vllm/vllm-rs" + +# Read the required toolchain from rust-toolchain.toml. +TOOLCHAIN=$(grep '^channel' "$RUST_DIR/rust-toolchain.toml" | sed 's/.*= *"\(.*\)"/\1/') + +# Ensure rustup and the required toolchain are available. +if ! command -v rustup &>/dev/null; then + echo "rustup not found, installing..." + curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y --default-toolchain none + source "$HOME/.cargo/env" +fi + +if ! rustup run "$TOOLCHAIN" rustc --version &>/dev/null; then + echo "Installing Rust toolchain: $TOOLCHAIN" + rustup toolchain install "$TOOLCHAIN" +fi + +if [[ "${1:-}" == "--debug" ]]; then + PROFILE_ARGS=() + PROFILE_DIR="debug" +else + PROFILE_ARGS=(--release) + PROFILE_DIR="release" +fi + +cargo +"$TOOLCHAIN" build "${PROFILE_ARGS[@]}" \ + --manifest-path "$RUST_DIR/Cargo.toml" \ + --bin vllm-rs \ + --features native-tls-vendored + +cp "$RUST_DIR/target/$PROFILE_DIR/vllm-rs" "$TARGET_PATH" +echo "Installed vllm-rs to $TARGET_PATH" diff --git a/docker/Dockerfile b/docker/Dockerfile index fd0622e2416a..f2abc1db6bc2 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -119,6 +119,8 @@ RUN if [ "${BUILD_OS}" = "manylinux" ]; then \ curl \ sudo \ rdma-core-devel \ + protobuf-compiler \ + protobuf-devel \ && dnf clean all \ && rm -rf /var/cache/dnf; \ else \ @@ -131,6 +133,7 @@ RUN if [ "${BUILD_OS}" = "manylinux" ]; then \ sudo \ python3-pip \ libibverbs-dev \ + protobuf-compiler libprotobuf-dev \ # Upgrade to GCC 10 to avoid https://gcc.gnu.org/bugzilla/show_bug.cgi?id=92519 # as it was causing spam when compiling the CUTLASS kernels gcc-10 \ @@ -165,6 +168,11 @@ RUN curl -LsSf https://astral.sh/uv/install.sh | sh \ ENV PATH="/opt/venv/bin:/root/.local/bin:$PATH" ENV VIRTUAL_ENV="/opt/venv" +# Compiler and linker environment +ENV CC=/usr/bin/gcc-10 CXX=/usr/bin/g++-10 +ENV CARGO_TARGET_AARCH64_UNKNOWN_LINUX_GNU_LINKER=/usr/bin/gcc-10 +ENV CARGO_TARGET_X86_64_UNKNOWN_LINUX_GNU_LINKER=/usr/bin/gcc-10 + # Environment for uv ENV UV_HTTP_TIMEOUT=500 ENV UV_INDEX_STRATEGY="unsafe-best-match" @@ -416,12 +424,29 @@ RUN --mount=type=cache,target=/root/.cache/uv \ --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.'); \ fi +# Install Rust toolchain for building Rust extensions. +RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | \ + sh -s -- -y --profile minimal --default-toolchain none +ENV PATH="/root/.cargo/bin:${PATH}" + WORKDIR /workspace # Copy pre-built csrc wheel directly COPY --from=csrc-build /workspace/dist /precompiled-wheels COPY . . +# Fail loudly if the rust submodule was not initialized on the host before +# `docker build`. The rust frontend source is brought in via `COPY . .`, so an +# uninitialized submodule would otherwise produce a confusing cargo failure. +RUN if [ ! 
-f rust/Cargo.toml ]; then \ + echo "ERROR: rust/ submodule is not initialized."; \ + echo "Run 'git submodule update --init --recursive' on the host before building."; \ + exit 1; \ + fi + +# Require the Rust frontend to be successfully built into the final wheel. +ENV VLLM_REQUIRE_RUST_FRONTEND=1 + ARG GIT_REPO_CHECK=0 RUN --mount=type=bind,source=.git,target=.git \ if [ "$GIT_REPO_CHECK" != "0" ]; then bash tools/check_repo.sh ; fi @@ -442,6 +467,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \ RUN --mount=type=cache,target=/root/.cache/uv \ --mount=type=bind,source=.git,target=.git \ if [ "${vllm_target_device}" = "cuda" ]; then \ + export VLLM_USE_PRECOMPILED=1; \ export VLLM_PRECOMPILED_WHEEL_LOCATION=$(ls /precompiled-wheels/*.whl); \ fi && \ python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38 diff --git a/docker/Dockerfile.cpu b/docker/Dockerfile.cpu index d15ced8e0111..43f3da4cea98 100644 --- a/docker/Dockerfile.cpu +++ b/docker/Dockerfile.cpu @@ -33,10 +33,14 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \ apt-get update -y \ && apt-get install -y --no-install-recommends sudo ccache git curl wget ca-certificates \ gcc-12 g++-12 libtcmalloc-minimal4 libnuma-dev ffmpeg libsm6 libxext6 libgl1 jq lsof make xz-utils \ + protobuf-compiler libprotobuf-dev \ && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12 \ && curl -LsSf https://astral.sh/uv/install.sh | sh +# Compiler and linker environment ENV CC=/usr/bin/gcc-12 CXX=/usr/bin/g++-12 +ENV CARGO_TARGET_AARCH64_UNKNOWN_LINUX_GNU_LINKER=/usr/bin/gcc-12 +ENV CARGO_TARGET_X86_64_UNKNOWN_LINUX_GNU_LINKER=/usr/bin/gcc-12 ENV CCACHE_DIR=/root/.cache/ccache ENV CMAKE_CXX_COMPILER_LAUNCHER=ccache @@ -112,8 +116,25 @@ COPY requirements/build/cpu.txt requirements/build/cpu.txt RUN --mount=type=cache,target=/root/.cache/uv \ uv pip install -r requirements/build/cpu.txt +# Install Rust toolchain for building Rust extensions. +RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | \ + sh -s -- -y --profile minimal --default-toolchain none +ENV PATH="/root/.cargo/bin:${PATH}" + +# Require Rust frontend to be successfully built into the wheel +ENV VLLM_REQUIRE_RUST_FRONTEND=1 + COPY . . +# Fail loudly if the rust submodule was not initialized on the host before +# `docker build`. The rust frontend source is brought in via `COPY . .`, so an +# uninitialized submodule would otherwise produce a confusing cargo failure. +RUN if [ ! 
-f rust/Cargo.toml ]; then \ + echo "ERROR: rust/ submodule is not initialized."; \ + echo "Run 'git submodule update --init --recursive' on the host before building."; \ + exit 1; \ + fi + RUN if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi RUN --mount=type=cache,target=/root/.cache/uv \ diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm index 0ed12f11da94..07feff33e249 100644 --- a/docker/Dockerfile.rocm +++ b/docker/Dockerfile.rocm @@ -25,7 +25,8 @@ ENV PYTORCH_ROCM_ARCH=${ARG_PYTORCH_ROCM_ARCH:-${PYTORCH_ROCM_ARCH}} RUN apt-get update -q -y && apt-get install -q -y \ sqlite3 libsqlite3-dev libfmt-dev libmsgpack-dev libsuitesparse-dev \ apt-transport-https ca-certificates wget curl \ - libnuma-dev + libnuma-dev \ + protobuf-compiler libprotobuf-dev RUN python3 -m pip install --upgrade pip # Remove sccache only if not using sccache (it exists in base image from Dockerfile.rocm_base) ARG USE_SCCACHE @@ -48,6 +49,11 @@ ENV UV_INDEX_STRATEGY="unsafe-best-match" # Use copy mode to avoid hardlink failures with Docker cache mounts ENV UV_LINK_MODE=copy +# Install Rust toolchain for building Rust extensions. +RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | \ + sh -s -- -y --profile minimal --default-toolchain none +ENV PATH="/root/.cargo/bin:${PATH}" + # Install sccache if USE_SCCACHE is enabled (for release builds) ARG USE_SCCACHE ARG SCCACHE_DOWNLOAD_URL @@ -98,6 +104,7 @@ ONBUILD RUN git clone ${VLLM_REPO} \ && cd vllm \ && git fetch -v --prune -- origin ${VLLM_BRANCH} \ && git checkout FETCH_HEAD \ + && git submodule update --init --recursive \ && if [ ${VLLM_REPO} != "https://github.com/vllm-project/vllm.git" ] ; then \ git remote add upstream "https://github.com/vllm-project/vllm.git" \ && git fetch upstream ; fi @@ -106,6 +113,24 @@ FROM fetch_vllm_${REMOTE_VLLM} AS fetch_vllm # ----------------------- # vLLM build stages FROM fetch_vllm AS build_vllm + +# Fail loudly if the rust submodule was not initialized on the host before +# `docker build`. The rust frontend source is brought in via the fetch_vllm +# stage, so an uninitialized submodule would otherwise produce a confusing +# cargo failure. +RUN if [ ! -f vllm/rust/Cargo.toml ]; then \ + echo "ERROR: rust/ submodule is not initialized."; \ + echo "Run 'git submodule update --init --recursive' on the host before building."; \ + exit 1; \ + fi + +# Require the Rust frontend to be successfully built into the final wheel. +ENV VLLM_REQUIRE_RUST_FRONTEND=1 + +# Cap cargo parallelism to avoid exhausting the AMD CI host's open-file limit +# (rustc spawns enough concurrent processes to hit RLIMIT_NOFILE otherwise). +ENV CARGO_BUILD_JOBS=4 + # Build vLLM (setup.py auto-detects sccache in PATH) RUN cd vllm \ && python3 -m pip install -r requirements/rocm.txt \ @@ -272,6 +297,23 @@ FROM fetch_vllm AS build_vllm_wheel_release ARG COMMON_WORKDIR +# Fail loudly if the rust submodule was not initialized on the host before +# `docker build`. The rust frontend source is brought in via the fetch_vllm +# stage, so an uninitialized submodule would otherwise produce a confusing +# cargo failure. +RUN if [ ! -f vllm/rust/Cargo.toml ]; then \ + echo "ERROR: rust/ submodule is not initialized."; \ + echo "Run 'git submodule update --init --recursive' on the host before building."; \ + exit 1; \ + fi + +# Require the Rust frontend to be successfully built into the final wheel. 
+ENV VLLM_REQUIRE_RUST_FRONTEND=1
+
+# Cap cargo parallelism to avoid exhausting the AMD CI host's open-file limit
+# (rustc spawns enough concurrent processes to hit RLIMIT_NOFILE otherwise).
+ENV CARGO_BUILD_JOBS=4
+
 # Create /install directory for custom wheels
 RUN mkdir -p /install
 
diff --git a/docs/contributing/README.md b/docs/contributing/README.md
index a538a408e986..fd91c8978ac0 100644
--- a/docs/contributing/README.md
+++ b/docs/contributing/README.md
@@ -43,6 +43,13 @@ If you are only developing vLLM's Python code, install vLLM using:
 VLLM_USE_PRECOMPILED=1 uv pip install -e .
 ```
 
+To rebuild only the Rust frontend binary:
+
+```bash
+./build_rust.sh          # release build
+./build_rust.sh --debug  # faster build for development
+```
+
 If you are developing vLLM's Python and CUDA/C++ code, install Pytorch first:
 
 ```bash
diff --git a/docs/contributing/ci/nightly_builds.md b/docs/contributing/ci/nightly_builds.md
index a07b9c1c2fa4..8f3512db3d40 100644
--- a/docs/contributing/ci/nightly_builds.md
+++ b/docs/contributing/ci/nightly_builds.md
@@ -136,10 +136,10 @@ When installing vLLM with `VLLM_USE_PRECOMPILED=1`, the `setup.py` script:
 3. **Selects compatible wheel** based on:
    - Package name (`vllm`)
    - Platform tag (architecture match)
-4. **Downloads and extracts** precompiled binaries from the wheel:
-   - C++ extension modules (`.so` files)
-   - Flash Attention Python modules
-   - Triton kernel Python files
+4. **Downloads and extracts** precompiled artifacts from the wheel:
+   - Native extension modules (`.so` files)
+   - The `vllm-rs` Rust frontend binary
+   - Flash Attention Python modules and Triton/FlashMLA Python files
 5. **Patches package_data** to include extracted files in the installation
 
 !!! note "What is the base commit?"
diff --git a/docs/getting_started/installation/gpu.cuda.inc.md b/docs/getting_started/installation/gpu.cuda.inc.md
index 309dd671251e..ec333b3ee1bf 100644
--- a/docs/getting_started/installation/gpu.cuda.inc.md
+++ b/docs/getting_started/installation/gpu.cuda.inc.md
@@ -101,12 +101,22 @@ This command will do the following:
 1. Look for the current branch in your vLLM clone.
 1. Identify the corresponding base commit in the main branch.
 1. Download the pre-built wheel of the base commit.
-1. Use its compiled libraries in the installation.
+1. Use its compiled libraries and `vllm-rs` binary in the installation.
 
 !!! note
     1. If you change C++ or kernel code, you cannot use Python-only build; otherwise you will see an import error about library not found or undefined symbol.
     2. If you rebase your dev branch, it is recommended to uninstall vllm and re-run the above command to make sure your libraries are up to date.
 
+!!! tip "Rebuilding the Rust frontend"
+    If you need to recompile the `vllm-rs` Rust frontend binary, you can rebuild and install it without re-running the full pip install:
+
+    ```bash
+    ./build_rust.sh          # release build
+    ./build_rust.sh --debug  # faster build for development
+    ```
+
+    This will install the required Rust toolchain if needed, build the binary, and place it in `vllm/vllm-rs`.
+
 In case you see an error about wheel not found when running the above command, it might be because the commit you based on in the `main` branch was just merged and its precompiled wheel is not available yet. You can wait around an hour and retry, or set `VLLM_PRECOMPILED_WHEEL_COMMIT=nightly` to automatically select the most recent already-built commit on `main`.
```bash diff --git a/pyproject.toml b/pyproject.toml index b8d14463256d..568af0311753 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,6 +6,7 @@ requires = [ "packaging>=24.2", "setuptools>=77.0.3,<81.0.0", "setuptools-scm>=8.0", + "setuptools-rust>=1.9.0", "torch == 2.11.0", "wheel", "jinja2", diff --git a/requirements/build/cpu.txt b/requirements/build/cpu.txt index 16ada0572273..640432ddd8cc 100644 --- a/requirements/build/cpu.txt +++ b/requirements/build/cpu.txt @@ -4,6 +4,7 @@ ninja packaging>=24.2 setuptools==77.0.3 # this version can reuse CMake build dir setuptools-scm>=8 +setuptools-rust>=1.9.0 torch==2.11.0+cpu; platform_machine == "x86_64" or platform_machine == "s390x" or platform_machine == "aarch64" torch==2.11.0; platform_system == "Darwin" or platform_machine == "ppc64le" or platform_machine == "riscv64" wheel diff --git a/requirements/build/cuda.txt b/requirements/build/cuda.txt index 490b0bdbc530..70da484a4133 100644 --- a/requirements/build/cuda.txt +++ b/requirements/build/cuda.txt @@ -4,6 +4,7 @@ ninja packaging>=24.2 setuptools>=77.0.3,<81.0.0 setuptools-scm>=8 +setuptools-rust>=1.9.0 torch==2.11.0 wheel jinja2>=3.1.6 diff --git a/requirements/rocm.txt b/requirements/rocm.txt index 0b472b90c026..61fcbc07010c 100644 --- a/requirements/rocm.txt +++ b/requirements/rocm.txt @@ -15,6 +15,7 @@ tensorizer==2.10.1 packaging>=24.2 setuptools>=77.0.3,<80.0.0 setuptools-scm>=8 +setuptools-rust>=1.9.0 runai-model-streamer[s3,gcs,azure]==0.15.7 conch-triton-kernels==1.2.1 timm>=1.0.17 diff --git a/rust b/rust new file mode 160000 index 000000000000..eb2b54b1a0b3 --- /dev/null +++ b/rust @@ -0,0 +1 @@ +Subproject commit eb2b54b1a0b3ef2904a25538bc0211f058f7e95f diff --git a/rust-toolchain.toml b/rust-toolchain.toml new file mode 100644 index 000000000000..4933b3ba1707 --- /dev/null +++ b/rust-toolchain.toml @@ -0,0 +1,2 @@ +[toolchain] +channel = "1.95" diff --git a/setup.py b/setup.py index 7c226a72425f..9a5350fac73c 100644 --- a/setup.py +++ b/setup.py @@ -18,6 +18,8 @@ from packaging.version import Version, parse from setuptools import Extension, setup from setuptools.command.build_ext import build_ext +from setuptools_rust import Binding, RustExtension +from setuptools_rust.build import build_rust from setuptools_scm import get_version from torch.utils.cpp_extension import CUDA_HOME, ROCM_HOME @@ -33,11 +35,24 @@ def load_module_from_path(module_name, path): ROOT_DIR = Path(__file__).parent logger = logging.getLogger(__name__) +PRECOMPILED_RUST_FRONTEND_PATH = ROOT_DIR / "vllm" / "vllm-rs" + # cannot import envs directly because it depends on vllm, # which is not installed yet envs = load_module_from_path("envs", os.path.join(ROOT_DIR, "vllm", "envs.py")) VLLM_TARGET_DEVICE = envs.VLLM_TARGET_DEVICE +USE_PRECOMPILED_EXTENSIONS = envs.VLLM_USE_PRECOMPILED +# VLLM_USE_PRECOMPILED implies precompiled rust frontend too. 
+USE_PRECOMPILED_RUST_FRONTEND = ( + envs.VLLM_USE_PRECOMPILED or envs.VLLM_USE_PRECOMPILED_RUST +) + + +def should_require_rust_frontend() -> bool: + value = os.getenv("VLLM_REQUIRE_RUST_FRONTEND", "") + return value.lower() not in ("", "0", "false", "no") + if sys.platform.startswith("darwin") and VLLM_TARGET_DEVICE != "cpu": logger.warning("VLLM_TARGET_DEVICE automatically set to `cpu` due to macOS") @@ -405,6 +420,24 @@ def build_extensions(self) -> None: return +class precompiled_build_rust(build_rust): + """Skips local Rust builds when the precompiled wheel already ships vllm-rs.""" + + def run(self) -> None: + if PRECOMPILED_RUST_FRONTEND_PATH.exists(): + logger.info( + "Skipping local Rust build: using precompiled %s", + PRECOMPILED_RUST_FRONTEND_PATH, + ) + return + + logger.warning( + "Precompiled wheel did not provide %s; falling back to local Rust build.", + PRECOMPILED_RUST_FRONTEND_PATH, + ) + super().run() + + class precompiled_wheel_utils: """Extracts libraries and other files from an existing wheel.""" @@ -653,7 +686,11 @@ def determine_wheel_url() -> tuple[str, str | None]: @staticmethod def extract_precompiled_and_patch_package( - wheel_url_or_path: str, download_filename: str | None + wheel_url_or_path: str, + download_filename: str | None, + *, + extract_extensions: bool, + extract_rust_frontend: bool, ) -> dict: import tempfile import zipfile @@ -676,19 +713,25 @@ def extract_precompiled_and_patch_package( package_data_patch = {} with zipfile.ZipFile(wheel_path) as wheel: - files_to_copy = [ - "vllm/_C.abi3.so", - "vllm/_C_stable_libtorch.abi3.so", - "vllm/_moe_C.abi3.so", - "vllm/_flashmla_C.abi3.so", - "vllm/_flashmla_extension_C.abi3.so", - "vllm/_sparse_flashmla_C.abi3.so", - "vllm/vllm_flash_attn/_vllm_fa2_C.abi3.so", - "vllm/vllm_flash_attn/_vllm_fa3_C.abi3.so", - "vllm/cumem_allocator.abi3.so", - # ROCm-specific libraries - "vllm/_rocm_C.abi3.so", - ] + exact_members = set() + if extract_extensions: + exact_members.update( + { + "vllm/_C.abi3.so", + "vllm/_C_stable_libtorch.abi3.so", + "vllm/_moe_C.abi3.so", + "vllm/_flashmla_C.abi3.so", + "vllm/_flashmla_extension_C.abi3.so", + "vllm/_sparse_flashmla_C.abi3.so", + "vllm/vllm_flash_attn/_vllm_fa2_C.abi3.so", + "vllm/vllm_flash_attn/_vllm_fa3_C.abi3.so", + "vllm/cumem_allocator.abi3.so", + # ROCm-specific libraries + "vllm/_rocm_C.abi3.so", + } + ) + if extract_rust_frontend: + exact_members.add("vllm/vllm-rs") flash_attn_regex = re.compile( r"vllm/vllm_flash_attn/(?:[^/.][^/]*/)*(?!\.)[^/]*\.py" @@ -707,27 +750,23 @@ def extract_precompiled_and_patch_package( ) # DeepGEMM: extract all files (.py, .so, .cuh, .h, .hpp, etc.) 
deep_gemm_regex = re.compile(r"vllm/third_party/deep_gemm/.*") - file_members = list( - filter(lambda x: x.filename in files_to_copy, wheel.filelist) - ) - file_members += list( - filter( - lambda x: flash_attn_regex.match(x.filename) - and x.filename not in flash_attn_files_to_skip, - wheel.filelist, - ) - ) - file_members += list( - filter( - lambda x: triton_kernels_regex.match(x.filename), wheel.filelist - ) - ) - file_members += list( - filter(lambda x: flashmla_regex.match(x.filename), wheel.filelist) - ) - file_members += list( - filter(lambda x: deep_gemm_regex.match(x.filename), wheel.filelist) - ) + file_members = [] + for member in wheel.filelist: + if member.filename in exact_members: + file_members.append(member) + continue + + if not extract_extensions: + continue + + if ( + flash_attn_regex.match(member.filename) + and member.filename not in flash_attn_files_to_skip + or triton_kernels_regex.match(member.filename) + or flashmla_regex.match(member.filename) + or deep_gemm_regex.match(member.filename) + ): + file_members.append(member) for file in file_members: print(f"[extract] {file.filename}") @@ -738,6 +777,9 @@ def extract_precompiled_and_patch_package( open(target_path, "wb") as dst, ): shutil.copyfileobj(src, dst) + mode = file.external_attr >> 16 + if mode: + os.chmod(target_path, mode) pkg = os.path.dirname(file.filename).replace("/", ".") package_data_patch.setdefault(pkg, []).append( @@ -910,7 +952,7 @@ def get_vllm_version() -> str: if envs.VLLM_TARGET_DEVICE == "empty": version += f"{sep}empty" elif _is_cuda(): - if envs.VLLM_USE_PRECOMPILED and not envs.VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX: + if USE_PRECOMPILED_EXTENSIONS and not envs.VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX: version += f"{sep}precompiled" else: cuda_version = str(get_nvcc_cuda_version()) @@ -998,7 +1040,7 @@ def _read_requirements(filename: str) -> list[str]: if _is_cuda(): ext_modules.append(CMakeExtension(name="vllm.vllm_flash_attn._vllm_fa2_C")) - if envs.VLLM_USE_PRECOMPILED or ( + if USE_PRECOMPILED_EXTENSIONS or ( CUDA_HOME and get_nvcc_cuda_version() >= Version("12.3") ): # FA3 requires CUDA 12.3 or later @@ -1008,7 +1050,7 @@ def _read_requirements(filename: str) -> list[str]: ext_modules.append( CMakeExtension(name="vllm.vllm_flash_attn._vllm_fa4_cutedsl_C", optional=True) ) - if envs.VLLM_USE_PRECOMPILED or ( + if USE_PRECOMPILED_EXTENSIONS or ( CUDA_HOME and get_nvcc_cuda_version() >= Version("12.9") ): # FlashMLA requires CUDA 12.9 or later @@ -1059,11 +1101,14 @@ def _read_requirements(filename: str) -> list[str]: } -# If using precompiled, extract and patch package_data (in advance of setup) -if envs.VLLM_USE_PRECOMPILED: +# If using precompiled artifacts, extract and patch package_data in advance. 
+if USE_PRECOMPILED_RUST_FRONTEND:
     wheel_url, download_filename = precompiled_wheel_utils.determine_wheel_url()
     patch = precompiled_wheel_utils.extract_precompiled_and_patch_package(
-        wheel_url, download_filename
+        wheel_url,
+        download_filename,
+        extract_extensions=USE_PRECOMPILED_EXTENSIONS,
+        extract_rust_frontend=True,
     )
     for pkg, files in patch.items():
         package_data.setdefault(pkg, []).extend(files)
@@ -1076,14 +1121,32 @@ def _read_requirements(filename: str) -> list[str]:
 else:
     cmdclass = {
         "build_ext": precompiled_build_ext
-        if envs.VLLM_USE_PRECOMPILED
+        if USE_PRECOMPILED_EXTENSIONS
         else cmake_build_ext,
     }
+
+if USE_PRECOMPILED_RUST_FRONTEND:
+    cmdclass["build_rust"] = precompiled_build_rust
+
+# Rust frontend binary, built via setuptools-rust and installed into the
+# package directory alongside the Python modules.
+# TODO: we could use `RustBin` to install the binary directly into the `bin`
+# directory, but that requires extra work on using precompiled binaries.
+rust_extensions = [
+    RustExtension(
+        target="vllm.vllm-rs",
+        path="rust/src/cmd/Cargo.toml",
+        args=["--bin", "vllm-rs"],
+        features=["native-tls-vendored"],
+        binding=Binding.Exec,
+        optional=not should_require_rust_frontend(),
+    ),
+]
 
 setup(
     # static metadata should rather go in pyproject.toml
     version=get_vllm_version(),
     ext_modules=ext_modules,
+    rust_extensions=rust_extensions,
     install_requires=get_requirements(),
     extras_require={
         # AMD Zen CPU optimizations via zentorch
diff --git a/tests/entrypoints/openai/completion/test_shutdown.py b/tests/entrypoints/openai/completion/test_shutdown.py
index 966c9f869c44..04af1c32f918 100644
--- a/tests/entrypoints/openai/completion/test_shutdown.py
+++ b/tests/entrypoints/openai/completion/test_shutdown.py
@@ -300,7 +300,7 @@ async def test_abort_timeout_exits_quickly(wait_for_engine_idle: float):
     proc.send_signal(signal.SIGTERM)
 
     # abort timeout (0) should exit promptly
-    for _ in range(20):
+    for _ in range(40):
         if proc.poll() is not None:
             break
         time.sleep(0.1)
@@ -311,7 +311,7 @@
         pytest.fail("Process did not exit after SIGTERM with abort timeout")
 
     exit_time = time.time() - start_time
-    assert exit_time < 2.1, f"Default shutdown took too long: {exit_time:.1f}s"
+    assert exit_time < 4.1, f"Default shutdown took too long: {exit_time:.1f}s"
     assert proc.returncode in (0, -15, None), f"Unexpected: {proc.returncode}"
 
     await _assert_children_cleaned_up(child_pids)
diff --git a/tests/test_envs.py b/tests/test_envs.py
index 3f3add2ab764..40bbe87e8785 100644
--- a/tests/test_envs.py
+++ b/tests/test_envs.py
@@ -95,6 +95,20 @@ def test_is_envs_cache_enabled() -> None:
     assert not envs._is_envs_cache_enabled()
 
 
+def test_precompiled_install_flags_are_orthogonal() -> None:
+    with patch.dict(
+        os.environ,
+        {
+            "VLLM_USE_PRECOMPILED": "0",
+            "VLLM_PRECOMPILED_WHEEL_LOCATION": "",
+            "VLLM_USE_PRECOMPILED_RUST": "1",
+        },
+        clear=False,
+    ):
+        assert environment_variables["VLLM_USE_PRECOMPILED"]() is False
+        assert environment_variables["VLLM_USE_PRECOMPILED_RUST"]() is True
+
+
 class TestEnvWithChoices:
     """Test cases for env_with_choices function."""
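Taken together with the `setup.py` changes above, the intended semantics are: `VLLM_USE_PRECOMPILED` governs the native `.so` artifacts (and is also implied by `VLLM_PRECOMPILED_WHEEL_LOCATION`), while `VLLM_USE_PRECOMPILED_RUST` independently opts into the prebuilt `vllm-rs` binary. A minimal standalone sketch of that resolution logic, with the two entries from `vllm/envs.py` restated outside vLLM purely for illustration:

```python
import os
from unittest.mock import patch


def use_precompiled() -> bool:
    # Mirrors the VLLM_USE_PRECOMPILED entry in vllm/envs.py: the flag itself,
    # or an explicit wheel path, enables the precompiled native extensions.
    return (
        os.environ.get("VLLM_USE_PRECOMPILED", "").strip().lower() in ("1", "true")
        or bool(os.environ.get("VLLM_PRECOMPILED_WHEEL_LOCATION"))
    )


def use_precompiled_rust() -> bool:
    # Mirrors VLLM_USE_PRECOMPILED_RUST: a plain flag, no wheel-path implication.
    return (
        os.environ.get("VLLM_USE_PRECOMPILED_RUST", "").strip().lower()
        in ("1", "true")
    )


# Rust-only precompiled install: the Rust flag does not flip the native one.
with patch.dict(os.environ, {"VLLM_USE_PRECOMPILED_RUST": "1"}, clear=True):
    assert not use_precompiled() and use_precompiled_rust()

# An explicit wheel location still implies the native precompiled path on its own.
with patch.dict(os.environ, {"VLLM_PRECOMPILED_WHEEL_LOCATION": "x.whl"}, clear=True):
    assert use_precompiled() and not use_precompiled_rust()
```

In `setup.py`, `USE_PRECOMPILED_RUST_FRONTEND` is the OR of the two, so a full `VLLM_USE_PRECOMPILED=1` install extracts both the `.so` files and the `vllm-rs` binary, while `VLLM_USE_PRECOMPILED_RUST=1` alone extracts only the Rust frontend.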
diff --git a/vllm/entrypoints/cli/launch.py b/vllm/entrypoints/cli/launch.py
index 9871a27da381..0af9f32c3ee3 100644
--- a/vllm/entrypoints/cli/launch.py
+++ b/vllm/entrypoints/cli/launch.py
@@ -2,6 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import argparse
+import signal
 
 import uvloop
 
@@ -110,6 +111,14 @@ def cmd_init() -> list[CLISubcommand]:
 
 async def run_launch_fastapi(args: argparse.Namespace) -> None:
     """Run the online serving layer with FastAPI (no GPU inference)."""
+
+    # Interrupt initialization if SIGTERM arrives before uvicorn installs
+    # its own signal handlers. Once uvicorn is running it replaces this.
+    def _interrupt_init(*_) -> None:
+        raise KeyboardInterrupt("terminated")
+
+    signal.signal(signal.SIGTERM, _interrupt_init)
+
     # 1. Socket binding
     listen_address, sock = setup_server(args)
 
diff --git a/vllm/entrypoints/cli/serve.py b/vllm/entrypoints/cli/serve.py
index 8213184ad061..6d6cfa6cd764 100644
--- a/vllm/entrypoints/cli/serve.py
+++ b/vllm/entrypoints/cli/serve.py
@@ -21,7 +21,11 @@
 from vllm.v1.executor import Executor
 from vllm.v1.executor.multiproc_executor import MultiprocExecutor
 from vllm.v1.metrics.prometheus import setup_multiprocess_prometheus
-from vllm.v1.utils import APIServerProcessManager, wait_for_completion_or_failure
+from vllm.v1.utils import (
+    APIServerProcessManager,
+    RustFrontendProcessManager,
+    wait_for_completion_or_failure,
+)
 
 logger = init_logger(__name__)
 
@@ -81,11 +85,12 @@ def cmd(args: argparse.Namespace) -> None:
     )
 
     # Default api_server_count if not explicitly set.
+    # - Rust frontend: Use 1 (it is multithreaded; extra processes are unnecessary)
    # - External LB: Leave as 1 (external LB handles distribution)
    # - Hybrid LB: Use local DP size (internal LB for local ranks only)
    # - Internal LB: Use full DP size
     if args.api_server_count is None:
-        if is_external_lb:
+        if is_external_lb or envs.VLLM_RUST_FRONTEND_PATH:
             args.api_server_count = 1
         elif is_hybrid_lb:
             args.api_server_count = args.data_parallel_size_local or 1
@@ -102,6 +107,12 @@
                 "Defaulting api_server_count to data_parallel_size (%d).",
                 args.api_server_count,
             )
+        elif envs.VLLM_RUST_FRONTEND_PATH and args.api_server_count > 1:
+            logger.warning(
+                "Ignoring --api-server-count=%d when using the Rust frontend.",
+                args.api_server_count,
+            )
+            args.api_server_count = 1
 
     # Elastic EP currently only supports running with at most one API server.
     if getattr(args, "enable_elastic_ep", False) and args.api_server_count > 1:
@@ -114,7 +125,7 @@
 
     if args.api_server_count < 1:
         run_headless(args)
-    elif args.api_server_count > 1:
+    elif args.api_server_count > 1 or envs.VLLM_RUST_FRONTEND_PATH:
         run_multi_api_server(args)
     else:
         # Single API server (this process).
@@ -230,9 +241,15 @@ def signal_handler(signum, frame):
 
 def run_multi_api_server(args: argparse.Namespace):
     assert not args.headless
+    rust_frontend_path = envs.VLLM_RUST_FRONTEND_PATH
     num_api_servers: int = args.api_server_count
     assert num_api_servers > 0
 
+    if rust_frontend_path and num_api_servers > 1:
+        raise ValueError(
+            "VLLM_RUST_FRONTEND_PATH does not support api_server_count > 1"
+        )
+
     if num_api_servers > 1:
         setup_multiprocess_prometheus()
 
@@ -270,7 +287,9 @@
     dp_rank = parallel_config.data_parallel_rank
     assert parallel_config.local_engines_only or dp_rank == 0
 
-    api_server_manager: APIServerProcessManager | None = None
+    api_server_manager: APIServerProcessManager | RustFrontendProcessManager | None = (
+        None
+    )
 
     from vllm.v1.engine.utils import get_engine_zmq_addresses
 
@@ -279,23 +298,34 @@
     with launch_core_engines(
         vllm_config, executor_class, log_stats, addresses, num_api_servers
     ) as (local_engine_manager, coordinator, addresses, tensor_queue):
-        # Construct common args for the APIServerProcessManager up-front.
- stats_update_address = None - if coordinator: - stats_update_address = coordinator.get_stats_publish_address() - - # Start API servers. - api_server_manager = APIServerProcessManager( - listen_address=listen_address, - sock=sock, - args=args, - num_servers=num_api_servers, - input_addresses=addresses.inputs, - output_addresses=addresses.outputs, - stats_update_address=stats_update_address, - tensor_queue=tensor_queue, + stats_update_address = ( + coordinator.get_stats_publish_address() if coordinator else None ) + if rust_frontend_path: + # Start rust front-end process. + api_server_manager = RustFrontendProcessManager( + binary_path=rust_frontend_path, + sock=sock, + args=args, + input_address=addresses.inputs[0], + output_address=addresses.outputs[0], + engine_count=parallel_config.data_parallel_size, + stats_update_address=stats_update_address, + ) + else: + # Start API server(s). + api_server_manager = APIServerProcessManager( + listen_address=listen_address, + sock=sock, + args=args, + num_servers=num_api_servers, + input_addresses=addresses.inputs, + output_addresses=addresses.outputs, + stats_update_address=stats_update_address, + tensor_queue=tensor_queue, + ) + # Wait for API servers. try: wait_for_completion_or_failure( diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index da2ec10284c5..f2b23fa090a3 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -547,8 +547,7 @@ def validate_api_server_args(args): @instrument(span_name="API server setup") def setup_server(args): - """Validate API server args, set up signal handler, create socket - ready to serve.""" + """Validate API server args and create the server socket.""" log_version_and_model(logger, VLLM_VERSION, args.model) log_non_default_args(args) @@ -574,12 +573,6 @@ def setup_server(args): # many concurrent requests active set_ulimit() - def signal_handler(*_) -> None: - # Interrupt server on sigterm while initializing - raise KeyboardInterrupt("terminated") - - signal.signal(signal.SIGTERM, signal_handler) - if args.uds: listen_address = f"unix:{args.uds}" else: @@ -689,6 +682,13 @@ async def run_server(args, **uvicorn_kwargs) -> None: # Add process-specific prefix to stdout and stderr. decorate_logs("APIServer") + # Interrupt initialization if SIGTERM arrives before uvicorn installs its + # own signal handlers. Once uvicorn is running it replaces this. + def _interrupt_init(*_) -> None: + raise KeyboardInterrupt("terminated") + + signal.signal(signal.SIGTERM, _interrupt_init) + listen_address, sock = setup_server(args) await run_server_worker(listen_address, sock, args, **uvicorn_kwargs) diff --git a/vllm/entrypoints/utils.py b/vllm/entrypoints/utils.py index e3682280ec50..de3ff364cac5 100644 --- a/vllm/entrypoints/utils.py +++ b/vllm/entrypoints/utils.py @@ -9,6 +9,7 @@ from http import HTTPStatus from logging import Logger from string import Template +from typing import Any import regex as re from fastapi import Request @@ -203,7 +204,7 @@ def get_max_tokens( ) -def log_non_default_args(args: Namespace | EngineArgs): +def get_non_default_args(args: Namespace | EngineArgs) -> dict[str, Any]: from vllm.entrypoints.openai.cli_args import make_arg_parser non_default_args = {} @@ -230,6 +231,43 @@ def log_non_default_args(args: Namespace | EngineArgs): "Unsupported argument type. Must be Namespace or EngineArgs instance." 
) + return non_default_args + + +def _jsonify_arg_value(value: Any) -> Any: + if value is None or isinstance(value, bool | int | float | str): + return value + if dataclasses.is_dataclass(value) and not isinstance(value, type): + return { + key: _jsonify_arg_value(val) + for key, val in dataclasses.asdict(value).items() + } + if isinstance(value, dict): + return {str(key): _jsonify_arg_value(val) for key, val in value.items()} + if isinstance(value, tuple | list): + return [_jsonify_arg_value(item) for item in value] + if (model_dump := getattr(value, "model_dump", None)) is not None: + return _jsonify_arg_value(model_dump(mode="json")) + if (to_dict := getattr(value, "dict", None)) is not None: + return _jsonify_arg_value(to_dict()) + return repr(value) + + +def jsonify_non_default_args( + args: Namespace | EngineArgs, + *, + exclude: set[str] | None = None, +) -> dict[str, Any]: + non_default_args = get_non_default_args(args) + if exclude is not None: + for key in exclude: + non_default_args.pop(key, None) + + return {key: _jsonify_arg_value(value) for key, value in non_default_args.items()} + + +def log_non_default_args(args: Namespace | EngineArgs): + non_default_args = get_non_default_args(args) logger.info("non-default args: %s", non_default_args) diff --git a/vllm/envs.py b/vllm/envs.py index a5a4bfaffd59..c25c27a42dbf 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -85,6 +85,7 @@ MAX_JOBS: str | None = None NVCC_THREADS: str | None = None VLLM_USE_PRECOMPILED: bool = False + VLLM_USE_PRECOMPILED_RUST: bool = False VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX: bool = False VLLM_DOCKER_BUILD_CONTEXT: bool = False VLLM_KEEP_ALIVE_ON_ENGINE_DEATH: bool = False @@ -133,6 +134,8 @@ Q_SCALE_CONSTANT: int = 200 K_SCALE_CONSTANT: int = 200 V_SCALE_CONSTANT: int = 100 + VLLM_USE_RUST_FRONTEND: bool = False + VLLM_RUST_FRONTEND_PATH: str | None = "auto" VLLM_SERVER_DEV_MODE: bool = False VLLM_V1_OUTPUT_PROC_CHUNK_SIZE: int = 128 VLLM_MLA_DISABLE: bool = False @@ -505,6 +508,40 @@ def _get_or_set_default() -> str: logger = logging.getLogger(__name__) + +def _resolve_rust_frontend_path() -> str | None: + """Resolve the Rust frontend binary path. + + Returns None if VLLM_USE_RUST_FRONTEND is not enabled. + When enabled, resolves VLLM_RUST_FRONTEND_PATH ("auto" by default) + to the actual binary path. + """ + use_rust = bool(int(os.environ.get("VLLM_USE_RUST_FRONTEND", "0"))) + raw = os.environ.get("VLLM_RUST_FRONTEND_PATH", "auto") + + if not use_rust: + if os.environ.get("VLLM_RUST_FRONTEND_PATH") is not None: + logger.warning( + "VLLM_RUST_FRONTEND_PATH is set but VLLM_USE_RUST_FRONTEND " + "is not enabled. The Rust frontend will not be used. " + "Set VLLM_USE_RUST_FRONTEND=1 to enable it." + ) + return None + + if raw.lower() in ("auto", "1", "true"): + pkg_dir = os.path.dirname(os.path.abspath(__file__)) + candidate = os.path.join(pkg_dir, "vllm-rs") + if os.path.isfile(candidate) and os.access(candidate, os.X_OK): + return candidate + + raise FileNotFoundError( + "VLLM_RUST_FRONTEND_PATH=auto but the vllm-rs binary was " + f"not found at {candidate}. " + "Build with setuptools-rust or set the path explicitly." + ) + return raw + + environment_variables: dict[str, Callable[[], Any]] = { # ================== Installation Time Env Vars ================== # Target device of vLLM, supporting [cuda (by default), @@ -532,11 +569,15 @@ def _get_or_set_default() -> str: # By default this is 1. # If set, `MAX_JOBS` will be reduced to avoid oversubscribing the CPU. 
"NVCC_THREADS": lambda: os.getenv("NVCC_THREADS", None), - # If set, vllm will use precompiled binaries (*.so) + # If set, vllm will use precompiled native binaries (*.so) "VLLM_USE_PRECOMPILED": lambda: ( os.environ.get("VLLM_USE_PRECOMPILED", "").strip().lower() in ("1", "true") or bool(os.environ.get("VLLM_PRECOMPILED_WHEEL_LOCATION")) ), + # If set, vllm will use the precompiled Rust frontend binary (vllm-rs). + "VLLM_USE_PRECOMPILED_RUST": lambda: ( + os.environ.get("VLLM_USE_PRECOMPILED_RUST", "").strip().lower() in ("1", "true") + ), # If set, skip adding +precompiled suffix to version string "VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX": lambda: bool( int(os.environ.get("VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX", "0")) @@ -1118,6 +1159,15 @@ def _get_or_set_default() -> str: # If set to "0", disable LayerName opaque type for layer_name # parameters in custom ops. Defaults to enabled on torch >= 2.11. "VLLM_USE_LAYERNAME": lambda: bool(int(os.getenv("VLLM_USE_LAYERNAME", "1"))), + # If set, use the Rust frontend binary instead of the Python API server + # process(es). + "VLLM_USE_RUST_FRONTEND": lambda: bool( + int(os.getenv("VLLM_USE_RUST_FRONTEND", "0")) + ), + # Path to the Rust frontend binary. Defaults to "auto" which discovers + # the binary installed with the vllm package. Only used when + # VLLM_USE_RUST_FRONTEND=1. + "VLLM_RUST_FRONTEND_PATH": lambda: _resolve_rust_frontend_path(), # If set, vllm will run in development mode, which will enable # some additional endpoints for developing and debugging, # e.g. `/reset_prefix_cache` diff --git a/vllm/v1/utils.py b/vllm/v1/utils.py index eb81a3c88fb7..ba1db3282216 100644 --- a/vllm/v1/utils.py +++ b/vllm/v1/utils.py @@ -2,6 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import argparse import contextlib +import json import multiprocessing import threading import time @@ -233,6 +234,146 @@ def shutdown(self, timeout: float | None = None) -> None: shutdown(self.processes, timeout=timeout) +class RustFrontendProcessManager: + """Manages a single Rust frontend subprocess. + + Launches the Rust vllm-rs binary in 'frontend' mode, passing the + listening socket fd and ZMQ transport addresses. Provides the same + interface as APIServerProcessManager for process monitoring. 
+ """ + + def __init__( + self, + binary_path: str, + sock: Any, + args: argparse.Namespace, + input_address: str, + output_address: str, + engine_count: int, + stats_update_address: str | None = None, + ): + import os + import subprocess + + fd = sock.fileno() + os.set_inheritable(fd, True) + + cmd = [ + binary_path, + "frontend", + "--listen-fd", + str(fd), + "--input-address", + input_address, + "--output-address", + output_address, + "--engine-count", + str(engine_count), + ] + if stats_update_address is not None: + cmd.extend(["--coordinator-address", stats_update_address]) + from vllm.entrypoints.utils import jsonify_non_default_args + + args_json = json.dumps( + jsonify_non_default_args(args, exclude={"api_server_count"}), + sort_keys=True, + ) + cmd.extend(["--args-json", args_json]) + + logger.info("Launching Rust frontend: %s", " ".join(cmd)) + self._proc = subprocess.Popen(cmd, pass_fds=(fd,)) + + # Create a process wrapper with a sentinel fd for monitoring + self.processes: list[_SubprocessWrapper] = [ + _SubprocessWrapper(self._proc, "RustFrontend") + ] + + self._finalizer = weakref.finalize(self, _shutdown_subprocesses, self.processes) + + def shutdown(self, timeout: float | None = None) -> None: + if self._finalizer.detach() is not None: + _shutdown_subprocesses(self.processes, timeout=timeout) + + +class _SubprocessWrapper: + """Wraps subprocess.Popen to provide the BaseProcess-like interface + needed by wait_for_completion_or_failure.""" + + def __init__(self, proc, name: str): + self._proc = proc + self.name = name + self.pid = proc.pid + self._sentinel_conn: connection.Connection | None = None + self._sentinel_send: connection.Connection | None = None + + # Use a Pipe-based sentinel so subprocess monitoring works uniformly + # across platforms with multiprocessing.connection.wait(). 
+ recv, send = connection.Pipe(duplex=False) + self._sentinel_conn = recv + self._sentinel_send = send + + def monitor_subprocess() -> None: + try: + proc.wait() + finally: + with contextlib.suppress(Exception): + send.close() + + threading.Thread( + target=monitor_subprocess, daemon=True, name=f"{name}Monitor" + ).start() + + @property + def sentinel(self): + return self._sentinel_conn + + @property + def exitcode(self) -> int | None: + return self._proc.returncode if self._proc.poll() is not None else None + + def is_alive(self) -> bool: + return self._proc.poll() is None + + def terminate(self): + self._proc.terminate() + + def join(self, timeout=None): + with contextlib.suppress(Exception): + self._proc.wait(timeout=timeout) + + def __del__(self): + with contextlib.suppress(Exception): + if self._sentinel_conn is not None: + self._sentinel_conn.close() + if self._sentinel_send is not None: + self._sentinel_send.close() + + +def _shutdown_subprocesses( + procs: list[_SubprocessWrapper], timeout: float | None = None +) -> None: + """Shutdown subprocess wrappers (mirrors the shutdown() function).""" + if timeout is None: + timeout = 0.0 + timeout = max(timeout, 5.0) + + for proc in procs: + if proc.is_alive(): + proc.terminate() + + deadline = time.monotonic() + timeout + for proc in procs: + remaining = deadline - time.monotonic() + if remaining <= 0: + break + if proc.is_alive(): + proc.join(remaining) + + for proc in procs: + if proc.is_alive() and (pid := proc.pid) is not None: + kill_process_tree(pid) + + def run_api_server_worker_proc( listen_address, sock, args, client_config=None, **uvicorn_kwargs ) -> None: @@ -253,7 +394,7 @@ def run_api_server_worker_proc( def wait_for_completion_or_failure( - api_server_manager: APIServerProcessManager, + api_server_manager: "APIServerProcessManager | RustFrontendProcessManager", engine_manager: Union["CoreEngineProcManager", "CoreEngineActorManager"] | None = None, coordinator: "DPCoordinator | None" = None, @@ -274,7 +415,7 @@ def wait_for_completion_or_failure( logger.info("Waiting for API servers to complete ...") # Create a mapping of sentinels to their corresponding processes # for efficient lookup - sentinel_to_proc: dict[Any, BaseProcess] = { + sentinel_to_proc: dict[Any, BaseProcess | _SubprocessWrapper | None] = { proc.sentinel: proc for proc in api_server_manager.processes }
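A note on the design: `_SubprocessWrapper` exists because `wait_for_completion_or_failure()` watches its processes through their `sentinel` handles with `multiprocessing.connection.wait()`, and `subprocess.Popen` has no such sentinel. The wrapper manufactures one from a `Pipe`: a daemon thread waits on the child and closes the write end, which makes the read end become ready. A minimal standalone sketch of that trick (illustrative code, not part of the patch):

```python
import contextlib
import subprocess
import sys
import threading
from multiprocessing import connection

# A short-lived child process to monitor.
proc = subprocess.Popen([sys.executable, "-c", "import time; time.sleep(0.2)"])

# The read end plays the role of BaseProcess.sentinel.
recv, send = connection.Pipe(duplex=False)


def monitor() -> None:
    try:
        proc.wait()  # block until the child exits
    finally:
        with contextlib.suppress(Exception):
            send.close()  # EOF on the write end makes `recv` ready


threading.Thread(target=monitor, daemon=True).start()

# connection.wait() now treats the subprocess like any multiprocessing child:
# it returns once the sentinel is ready, i.e. once the process has exited.
ready = connection.wait([recv], timeout=5.0)
assert ready, "child did not exit in time"
```

This keeps the Rust frontend subprocess and the `multiprocessing`-based engine processes observable through a single `connection.wait()` call, which is exactly what the `sentinel_to_proc` mapping above relies on.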