7 changes: 7 additions & 0 deletions .buildkite/image_build/image_build.sh
@@ -222,6 +222,13 @@ echo "CACHE_FROM_MAIN: ${CACHE_FROM_MAIN}"

check_and_skip_if_image_exists

# The rust frontend lives in a git submodule under rust/. Buildkite's default
# checkout does not recurse submodules, and the Dockerfile only sees what's in
# the build context, so initialize the submodule here before invoking bake.
echo "--- :git: Initializing git submodules"
git submodule sync --recursive
git submodule update --init --recursive

echo "--- :docker: Setting up Docker buildx bake"
echo "Target: ${TARGET}"
echo "vLLM bake file: ${VLLM_BAKE_FILE_PATH}"
6 changes: 6 additions & 0 deletions .buildkite/image_build/image_build_cpu.sh
@@ -21,6 +21,12 @@ else
exit 0
fi

# The rust frontend lives in a git submodule under rust/. Buildkite's default
# checkout does not recurse submodules, and the Dockerfile only sees what's in
# the build context, so initialize the submodule here before building.
git submodule sync --recursive
git submodule update --init --recursive
Collaborator (@LucasWilkinson):

Until this point we've tried to avoid using git submodules, as the UX can be a bit rough; we've preferred CMake's FetchContent, and I think that would be worth looking into here. At the very least it's probably worth discussing the use of git submodules, given this would be the first one (personally I'm generally OK with it, but I know there are varying opinions in the community).

cc @tlrmchlsmth

Member (@tlrmchlsmth):

I do have pretty strong preferences against using submodules. At NeuralMagic in the DeepSparse days we found stale submodules to be a very annoying footgun.

If we're moving the code in-tree, it seems like we could avoid this. Or use FetchContent, like Lucas suggests.

Member (PR author):

@LucasWilkinson I also agree that we should avoid using a submodule. This was only done here to stage things initially so that we could keep/show the integration scaffolding in this PR and still allow folks to pull it and build/try it.

We still need a final decision on where the rust code should live; so far that's leaning towards a subdir of the main vllm repo, since it's tightly coupled and considered primarily an "internal" component (see the related discussion in #40846).

One thing @BugenZhao is concerned about is how we retain the commit history if we do move the code here, since there's already a fair amount of it in https://github.com/Inferact/vllm-frontend-rs. There's quite a lot of code now, and it is useful to be able to see the provenance of different parts of it.

If we do want to keep it, the commits can be recreated in this repo with updated paths, but that would mean landing a whole bunch of new commits on main in one go, or possibly doing a merge commit (rather than a squash-merge), which we haven't used in the past. Would welcome any opinions/thoughts on this.


# build
docker build --file docker/Dockerfile.cpu \
--build-arg max_jobs=16 \
6 changes: 6 additions & 0 deletions .buildkite/image_build/image_build_cpu_arm64.sh
@@ -21,6 +21,12 @@ else
exit 0
fi

# The rust frontend lives in a git submodule under rust/. Buildkite's default
# checkout does not recurse submodules, and the Dockerfile only sees what's in
# the build context, so initialize the submodule here before building.
git submodule sync --recursive
git submodule update --init --recursive

# build
docker build --file docker/Dockerfile.cpu \
--build-arg max_jobs=16 \
2 changes: 1 addition & 1 deletion .buildkite/image_build/image_build_hpu.sh
@@ -11,7 +11,7 @@ REPO=$2
BUILDKITE_COMMIT=$3

# authenticate with AWS ECR (non-fatal: a failed login should not abort the script, since the manifest check below decides whether a build is needed)
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY"
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY" || true

# skip build if image already exists
if [[ -z $(docker manifest inspect "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-hpu) ]]; then
4 changes: 2 additions & 2 deletions .buildkite/image_build/image_build_xpu.sh
@@ -11,8 +11,8 @@ REPO=$2
BUILDKITE_COMMIT=$3

# authenticate with AWS ECR (non-fatal: a failed login should not abort the script, since the manifest check below decides whether a build is needed)
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY"
aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin 936637512419.dkr.ecr.us-east-1.amazonaws.com
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY" || true
aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin 936637512419.dkr.ecr.us-east-1.amazonaws.com || true

# skip build if image already exists
if ! docker manifest inspect "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-xpu &> /dev/null; then
107 changes: 107 additions & 0 deletions .buildkite/test_areas/rust_frontend.yaml
@@ -0,0 +1,107 @@
group: Rust Frontend
depends_on:
- image-build
steps:
- label: Rust Frontend OpenAI Coverage
timeout_in_minutes: 90
device: h200_18gb
working_dir: "/vllm-workspace/tests"
source_file_dependencies:
- rust/
- vllm/benchmarks/
- vllm/entrypoints/openai/
- vllm/entrypoints/serve/
- vllm/v1/sample/
- tests/utils.py
- tests/benchmarks/test_serve_cli.py
- tests/entrypoints/openai/chat_completion/test_chat_completion.py
# - tests/entrypoints/openai/chat_completion/test_chat_logit_bias_validation.py
# - tests/entrypoints/openai/chat_completion/test_chat_with_tool_reasoning.py
# - tests/entrypoints/openai/completion/test_prompt_validation.py
- tests/entrypoints/openai/completion/test_shutdown.py
# - tests/entrypoints/openai/test_return_token_ids.py
# - tests/entrypoints/openai/test_uds.py
- tests/v1/sample/test_logprobs_e2e.py
commands:
- export VLLM_USE_RUST_FRONTEND=1
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -v -s benchmarks/test_serve_cli.py -k "not insecure and not (test_bench_serve and not test_bench_serve_chat)"
- pytest -v -s entrypoints/openai/chat_completion/test_chat_completion.py
# - pytest -v -s entrypoints/openai/chat_completion/test_chat_logit_bias_validation.py -k "not invalid"
# - pytest -v -s entrypoints/openai/chat_completion/test_chat_with_tool_reasoning.py
# - pytest -v -s entrypoints/openai/completion/test_prompt_validation.py -k "not prompt_embeds"
- pytest -v -s entrypoints/openai/completion/test_shutdown.py -k "not engine_failure"
# - pytest -v -s entrypoints/openai/test_return_token_ids.py
# - pytest -v -s entrypoints/openai/test_uds.py
- pytest -v -s v1/sample/test_logprobs_e2e.py -k "test_prompt_logprobs_e2e_server"

- label: Rust Frontend Serve/Admin Coverage
timeout_in_minutes: 60
device: h200_18gb
working_dir: "/vllm-workspace/tests"
source_file_dependencies:
- rust/
- vllm/entrypoints/openai/
- vllm/entrypoints/serve/
- vllm/v1/engine/
- tests/utils.py
# - tests/entrypoints/rpc/test_collective_rpc.py
- tests/entrypoints/serve/disagg/test_serving_tokens.py
- tests/entrypoints/serve/instrumentator/test_basic.py
- tests/entrypoints/serve/instrumentator/test_metrics.py
# - tests/entrypoints/serve/instrumentator/test_sleep.py
commands:
- export VLLM_USE_RUST_FRONTEND=1
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
# - pytest -v -s entrypoints/rpc/test_collective_rpc.py
- pytest -v -s entrypoints/serve/instrumentator/test_basic.py -k "not show_version and not server_load"
- pytest -v -s entrypoints/serve/disagg/test_serving_tokens.py -k "not stream and not lora and not test_generate_logprobs and not stop_string_workflow"
- pytest -v -s entrypoints/serve/instrumentator/test_metrics.py -k "text and not show and not run_batch and not test_metrics_counts and not test_metrics_exist"
# - pytest -v -s entrypoints/serve/instrumentator/test_sleep.py

- label: Rust Frontend Core Correctness
timeout_in_minutes: 30
device: h200_18gb
working_dir: "/vllm-workspace/tests"
source_file_dependencies:
- rust/
- vllm/entrypoints/openai/
- tests/utils.py
- tests/entrypoints/openai/correctness/test_lmeval.py
commands:
- export VLLM_USE_RUST_FRONTEND=1
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine

- label: Rust Frontend Tool Use
timeout_in_minutes: 60
working_dir: "/vllm-workspace/tests"
source_file_dependencies:
- rust/
- vllm/entrypoints/openai/
- vllm/tool_parsers/
- tests/utils.py
- tests/tool_use/
commands:
- export VLLM_USE_RUST_FRONTEND=1
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -v -s tool_use --ignore=tool_use/mistral --models llama3.2 toolACE -k "not test_response_format_with_tool_choice_required and not test_parallel_tool_calls_false and not test_tool_call_and_choice"

- label: Rust Frontend Distributed
timeout_in_minutes: 30
num_devices: 4
working_dir: "/vllm-workspace/tests"
source_file_dependencies:
- rust/
- vllm/distributed/
- vllm/engine/
- vllm/executor/
- vllm/v1/engine/
- vllm/v1/worker/
- tests/utils.py
- tests/v1/distributed/test_internal_lb_dp.py
commands:
- export VLLM_USE_RUST_FRONTEND=1
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- export NCCL_CUMEM_HOST_ENABLE=0
- TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py -k "not 4 and not server_info"
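
To reproduce one of these suites outside Buildkite, the pipeline boils down to two environment variables plus the pytest invocation. A minimal sketch, assuming a checkout with the Rust frontend built and the test dependencies installed:

```bash
# Mirrors the pipeline commands above for a single suite.
export VLLM_USE_RUST_FRONTEND=1
export VLLM_WORKER_MULTIPROC_METHOD=spawn
cd tests
pytest -v -s entrypoints/openai/chat_completion/test_chat_completion.py
```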
2 changes: 2 additions & 0 deletions .dockerignore
@@ -2,6 +2,7 @@
/build
dist
vllm/*.so
vllm/vllm-rs

# Byte-compiled / optimized / DLL files
__pycache__/
@@ -31,3 +32,4 @@ share/python-wheels/
.installed.cfg
*.egg
MANIFEST
rust/target/
3 changes: 3 additions & 0 deletions .gitignore
@@ -26,6 +26,9 @@ __pycache__/
# C extensions
*.so

# Rust binaries
vllm/vllm-rs

# Distribution / packaging
.Python
build/
3 changes: 3 additions & 0 deletions .gitmodules
@@ -0,0 +1,3 @@
[submodule "rust"]
path = rust
Member:

Nit: what about "rsrc" to match "csrc"? Or just put it as a subdir to avoid adding another top-level dir, like "csrc/rust/"?

Member (PR author):

Yeah, we can decide on the best name. I considered this but it seemed non-standard. Now I realize csrc isn't really standard either, so maybe you're right and rsrc would be better. I'm not keen on putting it under csrc.

Member:

Personally I don't find rsrc a common convention for naming a directory of Rust sources, and putting it under csrc would be more confusing. I would still lean toward rust for this.

url = git@github.com:Inferact/vllm-frontend-rs.git
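
One practical note on the SSH-style URL: CI hosts or contributors without GitHub SSH keys can rewrite it to HTTPS at fetch time rather than editing `.gitmodules`. A sketch, not part of this PR:

```bash
# Rewrite SSH GitHub URLs to HTTPS for this repository only, then initialize.
git config url."https://github.com/".insteadOf "git@github.com:"
git submodule sync --recursive
git submodule update --init --recursive
```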
43 changes: 43 additions & 0 deletions build_rust.sh
@@ -0,0 +1,43 @@
#!/bin/bash
# Build the vllm-rs Rust frontend binary and install it into the vllm package.
# Usage: ./build_rust.sh [--debug]
#
# By default builds in release mode. Pass --debug for faster compile times
# during development.

set -euo pipefail

REPO_ROOT="$(cd "$(dirname "$0")" && pwd)"
RUST_DIR="$REPO_ROOT/rust"
TARGET_PATH="$REPO_ROOT/vllm/vllm-rs"

# Read the required toolchain from rust-toolchain.toml.
TOOLCHAIN=$(grep '^channel' "$RUST_DIR/rust-toolchain.toml" | sed 's/.*= *"\(.*\)"/\1/')

# Ensure rustup and the required toolchain are available.
if ! command -v rustup &>/dev/null; then
echo "rustup not found, installing..."
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y --default-toolchain none
source "$HOME/.cargo/env"
fi

if ! rustup run "$TOOLCHAIN" rustc --version &>/dev/null; then
echo "Installing Rust toolchain: $TOOLCHAIN"
rustup toolchain install "$TOOLCHAIN"
fi

if [[ "${1:-}" == "--debug" ]]; then
PROFILE_ARGS=()
PROFILE_DIR="debug"
else
PROFILE_ARGS=(--release)
PROFILE_DIR="release"
fi

cargo +"$TOOLCHAIN" build "${PROFILE_ARGS[@]}" \
--manifest-path "$RUST_DIR/Cargo.toml" \
--bin vllm-rs \
--features native-tls-vendored

cp "$RUST_DIR/target/$PROFILE_DIR/vllm-rs" "$TARGET_PATH"
echo "Installed vllm-rs to $TARGET_PATH"
24 changes: 24 additions & 0 deletions docker/Dockerfile
@@ -107,6 +107,7 @@ RUN apt-get update -y \
sudo \
python3-pip \
libibverbs-dev \
protobuf-compiler libprotobuf-dev \
# Upgrade to GCC 10 to avoid https://gcc.gnu.org/bugzilla/show_bug.cgi?id=92519
# as it was causing spam when compiling the CUTLASS kernels
gcc-10 \
@@ -129,6 +130,11 @@ RUN apt-get update -y \
ENV PATH="/opt/venv/bin:/root/.local/bin:$PATH"
ENV VIRTUAL_ENV="/opt/venv"

# Compiler and linker environment: use GCC 10 for C/C++ and point cargo's per-target linkers at the same compiler
ENV CC=/usr/bin/gcc-10 CXX=/usr/bin/g++-10
ENV CARGO_TARGET_AARCH64_UNKNOWN_LINUX_GNU_LINKER=/usr/bin/gcc-10
ENV CARGO_TARGET_X86_64_UNKNOWN_LINUX_GNU_LINKER=/usr/bin/gcc-10

# Environment for uv
ENV UV_HTTP_TIMEOUT=500
ENV UV_INDEX_STRATEGY="unsafe-best-match"
@@ -380,12 +386,29 @@ RUN --mount=type=cache,target=/root/.cache/uv \
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.'); \
fi

# Install Rust toolchain for building Rust extensions.
RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | \
sh -s -- -y --profile minimal --default-toolchain none
ENV PATH="/root/.cargo/bin:${PATH}"

WORKDIR /workspace

# Copy pre-built csrc wheel directly
COPY --from=csrc-build /workspace/dist /precompiled-wheels
COPY . .

# Fail loudly if the rust submodule was not initialized on the host before
# `docker build`. The rust frontend source is brought in via `COPY . .`, so an
# uninitialized submodule would otherwise produce a confusing cargo failure.
RUN if [ ! -f rust/Cargo.toml ]; then \
echo "ERROR: rust/ submodule is not initialized."; \
echo "Run 'git submodule update --init --recursive' on the host before building."; \
exit 1; \
fi

# Require the Rust frontend to be successfully built into the final wheel.
ENV VLLM_REQUIRE_RUST_FRONTEND=1

ARG GIT_REPO_CHECK=0
RUN --mount=type=bind,source=.git,target=.git \
if [ "$GIT_REPO_CHECK" != "0" ]; then bash tools/check_repo.sh ; fi
@@ -406,6 +429,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,source=.git,target=.git \
if [ "${vllm_target_device}" = "cuda" ]; then \
export VLLM_USE_PRECOMPILED=1; \
export VLLM_PRECOMPILED_WHEEL_LOCATION=$(ls /precompiled-wheels/*.whl); \
fi && \
python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38
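
Putting the host-side submodule step and the in-image guard together, a from-scratch build looks roughly like this (the image tag is arbitrary; a sketch, not taken from this PR):

```bash
git clone https://github.com/vllm-project/vllm.git
cd vllm
# Without this, the rust/Cargo.toml guard above fails the build early:
git submodule update --init --recursive
docker build -f docker/Dockerfile -t vllm:dev .
```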
21 changes: 21 additions & 0 deletions docker/Dockerfile.cpu
@@ -33,10 +33,14 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
apt-get update -y \
&& apt-get install -y --no-install-recommends sudo ccache git curl wget ca-certificates \
gcc-12 g++-12 libtcmalloc-minimal4 libnuma-dev ffmpeg libsm6 libxext6 libgl1 jq lsof make xz-utils \
protobuf-compiler libprotobuf-dev \
&& update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12 \
&& curl -LsSf https://astral.sh/uv/install.sh | sh

# Compiler and linker environment: use GCC 12 for C/C++ and point cargo's per-target linkers at the same compiler
ENV CC=/usr/bin/gcc-12 CXX=/usr/bin/g++-12
ENV CARGO_TARGET_AARCH64_UNKNOWN_LINUX_GNU_LINKER=/usr/bin/gcc-12
ENV CARGO_TARGET_X86_64_UNKNOWN_LINUX_GNU_LINKER=/usr/bin/gcc-12
ENV CCACHE_DIR=/root/.cache/ccache
ENV CMAKE_CXX_COMPILER_LAUNCHER=ccache

@@ -112,8 +116,25 @@ COPY requirements/build/cpu.txt requirements/build/cpu.txt
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install -r requirements/build/cpu.txt

# Install Rust toolchain for building Rust extensions.
RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | \
sh -s -- -y --profile minimal --default-toolchain none
ENV PATH="/root/.cargo/bin:${PATH}"

# Require Rust frontend to be successfully built into the wheel
ENV VLLM_REQUIRE_RUST_FRONTEND=1

COPY . .

# Fail loudly if the rust submodule was not initialized on the host before
# `docker build`. The rust frontend source is brought in via `COPY . .`, so an
# uninitialized submodule would otherwise produce a confusing cargo failure.
RUN if [ ! -f rust/Cargo.toml ]; then \
echo "ERROR: rust/ submodule is not initialized."; \
echo "Run 'git submodule update --init --recursive' on the host before building."; \
exit 1; \
fi

RUN if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi

RUN --mount=type=cache,target=/root/.cache/uv \
7 changes: 7 additions & 0 deletions docs/contributing/README.md
@@ -43,6 +43,13 @@ If you are only developing vLLM's Python code, install vLLM using:
VLLM_USE_PRECOMPILED=1 uv pip install -e .
```

To rebuild only the Rust frontend binary:

```bash
./build_rust.sh # release build
./build_rust.sh --debug # faster build for development
```
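
A quick check that the rebuild landed where the package expects it, as a sketch:

```bash
./build_rust.sh --debug
file vllm/vllm-rs   # should report a native executable for your platform
```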

If you are developing vLLM's Python and CUDA/C++ code, install PyTorch first:

```bash
8 changes: 4 additions & 4 deletions docs/contributing/ci/nightly_builds.md
@@ -136,10 +136,10 @@ When installing vLLM with `VLLM_USE_PRECOMPILED=1`, the `setup.py` script:
3. **Selects compatible wheel** based on:
- Package name (`vllm`)
- Platform tag (architecture match)
4. **Downloads and extracts** precompiled binaries from the wheel:
- C++ extension modules (`.so` files)
- Flash Attention Python modules
- Triton kernel Python files
4. **Downloads and extracts** precompiled artifacts from the wheel:
- Native extension modules (`.so` files)
- The `vllm-rs` Rust frontend binary
- Flash Attention Python modules and Triton/FlashMLA Python files
5. **Patches package_data** to include extracted files in the installation
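
A quick way to confirm step 4 pulled in the Rust frontend is to look for the extracted binary after installing; a minimal sketch, assuming an editable install from the repo root:

```bash
VLLM_USE_PRECOMPILED=1 uv pip install -e .
ls -l vllm/vllm-rs   # the precompiled vllm-rs binary extracted from the wheel
```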

!!! note "What is the base commit?"
12 changes: 11 additions & 1 deletion docs/getting_started/installation/gpu.cuda.inc.md
@@ -101,12 +101,22 @@ This command will do the following:
1. Look for the current branch in your vLLM clone.
1. Identify the corresponding base commit in the main branch.
1. Download the pre-built wheel of the base commit.
1. Use its compiled libraries in the installation.
1. Use its compiled libraries and `vllm-rs` binary in the installation.

!!! note
1. If you change C++ or kernel code, you cannot use the Python-only build; otherwise you will see an import error about a library not found or an undefined symbol.
2. If you rebase your dev branch, it is recommended to uninstall vllm and re-run the above command to make sure your libraries are up to date.

!!! tip "Rebuilding the Rust frontend"
If you need to recompile the `vllm-rs` Rust frontend binary, you can rebuild and install it without re-running the full pip install:

```bash
./build_rust.sh # release build
./build_rust.sh --debug # faster build for development
```

This will install the required Rust toolchain if needed, build the binary, and place it in `vllm/vllm-rs`.

In case you see an error about wheel not found when running the above command, it might be because the commit you based on in the `main` branch was just merged and its precompiled wheel is not available yet. You can wait around an hour and retry, or set `VLLM_PRECOMPILED_WHEEL_COMMIT=nightly` to automatically select the most recent already-built commit on `main`.

```bash
1 change: 1 addition & 0 deletions pyproject.toml
@@ -6,6 +6,7 @@ requires = [
"packaging>=24.2",
"setuptools>=77.0.3,<81.0.0",
"setuptools-scm>=8.0",
"setuptools-rust>=1.9.0",
"torch == 2.11.0",
"wheel",
"jinja2",