diff --git a/.buildkite/image_build/image_build.sh b/.buildkite/image_build/image_build.sh index 10c03c3e1773..f93e86d9ec09 100755 --- a/.buildkite/image_build/image_build.sh +++ b/.buildkite/image_build/image_build.sh @@ -223,6 +223,13 @@ echo "CACHE_FROM_MAIN: ${CACHE_FROM_MAIN}" check_and_skip_if_image_exists +# The rust frontend lives in a git submodule under rust/. Buildkite's default +# checkout does not recurse submodules, and the Dockerfile only sees what's in +# the build context, so initialize the submodule here before invoking bake. +echo "--- :git: Initializing git submodules" +git submodule sync --recursive +git submodule update --init --recursive + echo "--- :docker: Setting up Docker buildx bake" echo "Target: ${TARGET}" echo "vLLM bake file: ${VLLM_BAKE_FILE_PATH}" diff --git a/.buildkite/image_build/image_build_cpu.sh b/.buildkite/image_build/image_build_cpu.sh index 035f070ab891..a8f76d5b2657 100755 --- a/.buildkite/image_build/image_build_cpu.sh +++ b/.buildkite/image_build/image_build_cpu.sh @@ -21,6 +21,12 @@ else exit 0 fi +# The rust frontend lives in a git submodule under rust/. Buildkite's default +# checkout does not recurse submodules, and the Dockerfile only sees what's in +# the build context, so initialize the submodule here before building. +git submodule sync --recursive +git submodule update --init --recursive + # build docker build --file docker/Dockerfile.cpu \ --build-arg max_jobs=16 \ diff --git a/.buildkite/image_build/image_build_cpu_arm64.sh b/.buildkite/image_build/image_build_cpu_arm64.sh index b561e2c2e463..9ee0b353aa8a 100755 --- a/.buildkite/image_build/image_build_cpu_arm64.sh +++ b/.buildkite/image_build/image_build_cpu_arm64.sh @@ -21,6 +21,12 @@ else exit 0 fi +# The rust frontend lives in a git submodule under rust/. Buildkite's default +# checkout does not recurse submodules, and the Dockerfile only sees what's in +# the build context, so initialize the submodule here before building. 
+git submodule sync --recursive +git submodule update --init --recursive + # build docker build --file docker/Dockerfile.cpu \ --build-arg max_jobs=16 \ diff --git a/.buildkite/image_build/image_build_hpu.sh b/.buildkite/image_build/image_build_hpu.sh index 60fa1789fa06..df900dc60342 100755 --- a/.buildkite/image_build/image_build_hpu.sh +++ b/.buildkite/image_build/image_build_hpu.sh @@ -11,7 +11,7 @@ REPO=$2 BUILDKITE_COMMIT=$3 # authenticate with AWS ECR -aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY" +aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY" || true # skip build if image already exists if [[ -z $(docker manifest inspect "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-hpu) ]]; then diff --git a/.buildkite/image_build/image_build_xpu.sh b/.buildkite/image_build/image_build_xpu.sh index c3734dce13ca..45417b7339be 100755 --- a/.buildkite/image_build/image_build_xpu.sh +++ b/.buildkite/image_build/image_build_xpu.sh @@ -11,8 +11,8 @@ REPO=$2 BUILDKITE_COMMIT=$3 # authenticate with AWS ECR -aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY" -aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin 936637512419.dkr.ecr.us-east-1.amazonaws.com +aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY" || true +aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin 936637512419.dkr.ecr.us-east-1.amazonaws.com || true # skip build if image already exists if ! docker manifest inspect "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-xpu &> /dev/null; then diff --git a/.buildkite/test_areas/rust_frontend.yaml b/.buildkite/test_areas/rust_frontend.yaml new file mode 100644 index 000000000000..4afafd856ed4 --- /dev/null +++ b/.buildkite/test_areas/rust_frontend.yaml @@ -0,0 +1,107 @@ +group: Rust Frontend +depends_on: + - image-build +steps: +- label: Rust Frontend OpenAI Coverage + timeout_in_minutes: 90 + device: h200_18gb + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - rust/ + - vllm/benchmarks/ + - vllm/entrypoints/openai/ + - vllm/entrypoints/serve/ + - vllm/v1/sample/ + - tests/utils.py + - tests/benchmarks/test_serve_cli.py + - tests/entrypoints/openai/chat_completion/test_chat_completion.py + # - tests/entrypoints/openai/chat_completion/test_chat_logit_bias_validation.py + # - tests/entrypoints/openai/chat_completion/test_chat_with_tool_reasoning.py + # - tests/entrypoints/openai/completion/test_prompt_validation.py + - tests/entrypoints/openai/completion/test_shutdown.py + # - tests/entrypoints/openai/test_return_token_ids.py + # - tests/entrypoints/openai/test_uds.py + - tests/v1/sample/test_logprobs_e2e.py + commands: + - export VLLM_USE_RUST_FRONTEND=1 + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -v -s benchmarks/test_serve_cli.py -k "not insecure and not (test_bench_serve and not test_bench_serve_chat)" + - pytest -v -s entrypoints/openai/chat_completion/test_chat_completion.py + # - pytest -v -s entrypoints/openai/chat_completion/test_chat_logit_bias_validation.py -k "not invalid" + # - pytest -v -s entrypoints/openai/chat_completion/test_chat_with_tool_reasoning.py + # - pytest -v -s entrypoints/openai/completion/test_prompt_validation.py -k "not prompt_embeds" + - pytest -v -s entrypoints/openai/completion/test_shutdown.py -k "not engine_failure 
and not test_abort_timeout_exits_quickly" + # - pytest -v -s entrypoints/openai/test_return_token_ids.py + # - pytest -v -s entrypoints/openai/test_uds.py + - pytest -v -s v1/sample/test_logprobs_e2e.py -k "test_prompt_logprobs_e2e_server" + +- label: Rust Frontend Serve/Admin Coverage + timeout_in_minutes: 60 + device: h200_18gb + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - rust/ + - vllm/entrypoints/openai/ + - vllm/entrypoints/serve/ + - vllm/v1/engine/ + - tests/utils.py + # - tests/entrypoints/rpc/test_collective_rpc.py + - tests/entrypoints/serve/disagg/test_serving_tokens.py + - tests/entrypoints/serve/instrumentator/test_basic.py + - tests/entrypoints/serve/instrumentator/test_metrics.py + # - tests/entrypoints/serve/instrumentator/test_sleep.py + commands: + - export VLLM_USE_RUST_FRONTEND=1 + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + # - pytest -v -s entrypoints/rpc/test_collective_rpc.py + - pytest -v -s entrypoints/serve/instrumentator/test_basic.py -k "not show_version and not server_load" + - pytest -v -s entrypoints/serve/disagg/test_serving_tokens.py -k "not stream and not lora and not test_generate_logprobs and not stop_string_workflow" + - pytest -v -s entrypoints/serve/instrumentator/test_metrics.py -k "text and not show and not run_batch and not test_metrics_counts and not test_metrics_exist" + # - pytest -v -s entrypoints/serve/instrumentator/test_sleep.py + +- label: Rust Frontend Core Correctness + timeout_in_minutes: 30 + device: h200_18gb + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - rust/ + - vllm/entrypoints/openai/ + - tests/utils.py + - tests/entrypoints/openai/correctness/test_lmeval.py + commands: + - export VLLM_USE_RUST_FRONTEND=1 + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine + +- label: Rust Frontend Tool Use + timeout_in_minutes: 60 + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - rust/ + - vllm/entrypoints/openai/ + - vllm/tool_parsers/ + - tests/utils.py + - tests/tool_use/ + commands: + - export VLLM_USE_RUST_FRONTEND=1 + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -v -s tool_use --ignore=tool_use/mistral --models llama3.2 toolACE -k "not test_response_format_with_tool_choice_required and not test_parallel_tool_calls_false and not test_tool_call_and_choice" + +- label: Rust Frontend Distributed + timeout_in_minutes: 30 + num_devices: 4 + working_dir: "/vllm-workspace/tests" + source_file_dependencies: + - rust/ + - vllm/distributed/ + - vllm/engine/ + - vllm/executor/ + - vllm/v1/engine/ + - vllm/v1/worker/ + - tests/utils.py + - tests/v1/distributed/test_internal_lb_dp.py + commands: + - export VLLM_USE_RUST_FRONTEND=1 + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - export NCCL_CUMEM_HOST_ENABLE=0 + - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py -k "not 4 and not server_info" diff --git a/.dockerignore b/.dockerignore index 3863656915d0..66447272e95a 100644 --- a/.dockerignore +++ b/.dockerignore @@ -2,6 +2,7 @@ /build dist vllm/*.so +vllm/vllm-rs # Byte-compiled / optimized / DLL files __pycache__/ @@ -31,3 +32,4 @@ share/python-wheels/ .installed.cfg *.egg MANIFEST +rust/target/ diff --git a/.gitignore b/.gitignore index e53d19b35340..2c4e135e58dc 100644 --- a/.gitignore +++ b/.gitignore @@ -26,6 +26,9 @@ __pycache__/ # C extensions *.so +# Rust binaries +vllm/vllm-rs + # Distribution / packaging .Python build/ diff --git a/.gitmodules 
b/.gitmodules new file mode 100644 index 000000000000..79d557ecd69e --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "rust"] + path = rust + url = https://github.com/Inferact/vllm-frontend-rs.git diff --git a/build_rust.sh b/build_rust.sh new file mode 100755 index 000000000000..fb4a589de4c0 --- /dev/null +++ b/build_rust.sh @@ -0,0 +1,43 @@ +#!/bin/bash +# Build the vllm-rs Rust frontend binary and install it into the vllm package. +# Usage: ./build_rust.sh [--debug] +# +# By default builds in release mode. Pass --debug for faster compile times +# during development. + +set -euo pipefail + +REPO_ROOT="$(cd "$(dirname "$0")" && pwd)" +RUST_DIR="$REPO_ROOT/rust" +TARGET_PATH="$REPO_ROOT/vllm/vllm-rs" + +# Read the required toolchain from rust-toolchain.toml. +TOOLCHAIN=$(grep '^channel' "$RUST_DIR/rust-toolchain.toml" | sed 's/.*= *"\(.*\)"/\1/') + +# Ensure rustup and the required toolchain are available. +if ! command -v rustup &>/dev/null; then + echo "rustup not found, installing..." + curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y --default-toolchain none + source "$HOME/.cargo/env" +fi + +if ! rustup run "$TOOLCHAIN" rustc --version &>/dev/null; then + echo "Installing Rust toolchain: $TOOLCHAIN" + rustup toolchain install "$TOOLCHAIN" +fi + +if [[ "${1:-}" == "--debug" ]]; then + PROFILE_ARGS=() + PROFILE_DIR="debug" +else + PROFILE_ARGS=(--release) + PROFILE_DIR="release" +fi + +cargo +"$TOOLCHAIN" build "${PROFILE_ARGS[@]}" \ + --manifest-path "$RUST_DIR/Cargo.toml" \ + --bin vllm-rs \ + --features native-tls-vendored + +cp "$RUST_DIR/target/$PROFILE_DIR/vllm-rs" "$TARGET_PATH" +echo "Installed vllm-rs to $TARGET_PATH" diff --git a/docker/Dockerfile b/docker/Dockerfile index fd0622e2416a..f2abc1db6bc2 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -119,6 +119,8 @@ RUN if [ "${BUILD_OS}" = "manylinux" ]; then \ curl \ sudo \ rdma-core-devel \ + protobuf-compiler \ + protobuf-devel \ && dnf clean all \ && rm -rf /var/cache/dnf; \ else \ @@ -131,6 +133,7 @@ RUN if [ "${BUILD_OS}" = "manylinux" ]; then \ sudo \ python3-pip \ libibverbs-dev \ + protobuf-compiler libprotobuf-dev \ # Upgrade to GCC 10 to avoid https://gcc.gnu.org/bugzilla/show_bug.cgi?id=92519 # as it was causing spam when compiling the CUTLASS kernels gcc-10 \ @@ -165,6 +168,11 @@ RUN curl -LsSf https://astral.sh/uv/install.sh | sh \ ENV PATH="/opt/venv/bin:/root/.local/bin:$PATH" ENV VIRTUAL_ENV="/opt/venv" +# Compiler and linker environment +ENV CC=/usr/bin/gcc-10 CXX=/usr/bin/g++-10 +ENV CARGO_TARGET_AARCH64_UNKNOWN_LINUX_GNU_LINKER=/usr/bin/gcc-10 +ENV CARGO_TARGET_X86_64_UNKNOWN_LINUX_GNU_LINKER=/usr/bin/gcc-10 + # Environment for uv ENV UV_HTTP_TIMEOUT=500 ENV UV_INDEX_STRATEGY="unsafe-best-match" @@ -416,12 +424,29 @@ RUN --mount=type=cache,target=/root/.cache/uv \ --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.'); \ fi +# Install Rust toolchain for building Rust extensions. +RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | \ + sh -s -- -y --profile minimal --default-toolchain none +ENV PATH="/root/.cargo/bin:${PATH}" + WORKDIR /workspace # Copy pre-built csrc wheel directly COPY --from=csrc-build /workspace/dist /precompiled-wheels COPY . . +# Fail loudly if the rust submodule was not initialized on the host before +# `docker build`. The rust frontend source is brought in via `COPY . .`, so an +# uninitialized submodule would otherwise produce a confusing cargo failure. +RUN if [ ! 
-f rust/Cargo.toml ]; then \ + echo "ERROR: rust/ submodule is not initialized."; \ + echo "Run 'git submodule update --init --recursive' on the host before building."; \ + exit 1; \ + fi + +# Require the Rust frontend to be successfully built into the final wheel. +ENV VLLM_REQUIRE_RUST_FRONTEND=1 + ARG GIT_REPO_CHECK=0 RUN --mount=type=bind,source=.git,target=.git \ if [ "$GIT_REPO_CHECK" != "0" ]; then bash tools/check_repo.sh ; fi @@ -442,6 +467,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \ RUN --mount=type=cache,target=/root/.cache/uv \ --mount=type=bind,source=.git,target=.git \ if [ "${vllm_target_device}" = "cuda" ]; then \ + export VLLM_USE_PRECOMPILED=1; \ export VLLM_PRECOMPILED_WHEEL_LOCATION=$(ls /precompiled-wheels/*.whl); \ fi && \ python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38 diff --git a/docker/Dockerfile.cpu b/docker/Dockerfile.cpu index d15ced8e0111..43f3da4cea98 100644 --- a/docker/Dockerfile.cpu +++ b/docker/Dockerfile.cpu @@ -33,10 +33,14 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \ apt-get update -y \ && apt-get install -y --no-install-recommends sudo ccache git curl wget ca-certificates \ gcc-12 g++-12 libtcmalloc-minimal4 libnuma-dev ffmpeg libsm6 libxext6 libgl1 jq lsof make xz-utils \ + protobuf-compiler libprotobuf-dev \ && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12 \ && curl -LsSf https://astral.sh/uv/install.sh | sh +# Compiler and linker environment ENV CC=/usr/bin/gcc-12 CXX=/usr/bin/g++-12 +ENV CARGO_TARGET_AARCH64_UNKNOWN_LINUX_GNU_LINKER=/usr/bin/gcc-12 +ENV CARGO_TARGET_X86_64_UNKNOWN_LINUX_GNU_LINKER=/usr/bin/gcc-12 ENV CCACHE_DIR=/root/.cache/ccache ENV CMAKE_CXX_COMPILER_LAUNCHER=ccache @@ -112,8 +116,25 @@ COPY requirements/build/cpu.txt requirements/build/cpu.txt RUN --mount=type=cache,target=/root/.cache/uv \ uv pip install -r requirements/build/cpu.txt +# Install Rust toolchain for building Rust extensions. +RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | \ + sh -s -- -y --profile minimal --default-toolchain none +ENV PATH="/root/.cargo/bin:${PATH}" + +# Require Rust frontend to be successfully built into the wheel +ENV VLLM_REQUIRE_RUST_FRONTEND=1 + COPY . . +# Fail loudly if the rust submodule was not initialized on the host before +# `docker build`. The rust frontend source is brought in via `COPY . .`, so an +# uninitialized submodule would otherwise produce a confusing cargo failure. +RUN if [ ! 
-f rust/Cargo.toml ]; then \ + echo "ERROR: rust/ submodule is not initialized."; \ + echo "Run 'git submodule update --init --recursive' on the host before building."; \ + exit 1; \ + fi + RUN if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi RUN --mount=type=cache,target=/root/.cache/uv \ diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm index 0ed12f11da94..07feff33e249 100644 --- a/docker/Dockerfile.rocm +++ b/docker/Dockerfile.rocm @@ -25,7 +25,8 @@ ENV PYTORCH_ROCM_ARCH=${ARG_PYTORCH_ROCM_ARCH:-${PYTORCH_ROCM_ARCH}} RUN apt-get update -q -y && apt-get install -q -y \ sqlite3 libsqlite3-dev libfmt-dev libmsgpack-dev libsuitesparse-dev \ apt-transport-https ca-certificates wget curl \ - libnuma-dev + libnuma-dev \ + protobuf-compiler libprotobuf-dev RUN python3 -m pip install --upgrade pip # Remove sccache only if not using sccache (it exists in base image from Dockerfile.rocm_base) ARG USE_SCCACHE @@ -48,6 +49,11 @@ ENV UV_INDEX_STRATEGY="unsafe-best-match" # Use copy mode to avoid hardlink failures with Docker cache mounts ENV UV_LINK_MODE=copy +# Install Rust toolchain for building Rust extensions. +RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | \ + sh -s -- -y --profile minimal --default-toolchain none +ENV PATH="/root/.cargo/bin:${PATH}" + # Install sccache if USE_SCCACHE is enabled (for release builds) ARG USE_SCCACHE ARG SCCACHE_DOWNLOAD_URL @@ -98,6 +104,7 @@ ONBUILD RUN git clone ${VLLM_REPO} \ && cd vllm \ && git fetch -v --prune -- origin ${VLLM_BRANCH} \ && git checkout FETCH_HEAD \ + && git submodule update --init --recursive \ && if [ ${VLLM_REPO} != "https://github.com/vllm-project/vllm.git" ] ; then \ git remote add upstream "https://github.com/vllm-project/vllm.git" \ && git fetch upstream ; fi @@ -106,6 +113,24 @@ FROM fetch_vllm_${REMOTE_VLLM} AS fetch_vllm # ----------------------- # vLLM build stages FROM fetch_vllm AS build_vllm + +# Fail loudly if the rust submodule was not initialized on the host before +# `docker build`. The rust frontend source is brought in via the fetch_vllm +# stage, so an uninitialized submodule would otherwise produce a confusing +# cargo failure. +RUN if [ ! -f vllm/rust/Cargo.toml ]; then \ + echo "ERROR: rust/ submodule is not initialized."; \ + echo "Run 'git submodule update --init --recursive' on the host before building."; \ + exit 1; \ + fi + +# Require the Rust frontend to be successfully built into the final wheel. +ENV VLLM_REQUIRE_RUST_FRONTEND=1 + +# Cap cargo parallelism to avoid exhausting the AMD CI host's open-file limit +# (rustc spawns enough concurrent processes to hit RLIMIT_NOFILE otherwise). +ENV CARGO_BUILD_JOBS=4 + # Build vLLM (setup.py auto-detects sccache in PATH) RUN cd vllm \ && python3 -m pip install -r requirements/rocm.txt \ @@ -272,6 +297,23 @@ FROM fetch_vllm AS build_vllm_wheel_release ARG COMMON_WORKDIR +# Fail loudly if the rust submodule was not initialized on the host before +# `docker build`. The rust frontend source is brought in via the fetch_vllm +# stage, so an uninitialized submodule would otherwise produce a confusing +# cargo failure. +RUN if [ ! -f vllm/rust/Cargo.toml ]; then \ + echo "ERROR: rust/ submodule is not initialized."; \ + echo "Run 'git submodule update --init --recursive' on the host before building."; \ + exit 1; \ + fi + +# Require the Rust frontend to be successfully built into the final wheel. 
+ENV VLLM_REQUIRE_RUST_FRONTEND=1
+
+# Cap cargo parallelism to avoid exhausting the AMD CI host's open-file limit
+# (rustc spawns enough concurrent processes to hit RLIMIT_NOFILE otherwise).
+ENV CARGO_BUILD_JOBS=4
+
 # Create /install directory for custom wheels
 RUN mkdir -p /install
 
diff --git a/docs/contributing/README.md b/docs/contributing/README.md
index a538a408e986..fd91c8978ac0 100644
--- a/docs/contributing/README.md
+++ b/docs/contributing/README.md
@@ -43,6 +43,13 @@ If you are only developing vLLM's Python code, install vLLM using:
 VLLM_USE_PRECOMPILED=1 uv pip install -e .
 ```
 
+To rebuild only the Rust frontend binary:
+
+```bash
+./build_rust.sh          # release build
+./build_rust.sh --debug  # faster build for development
+```
+
 If you are developing vLLM's Python and CUDA/C++ code, install Pytorch first:
 
 ```bash
diff --git a/docs/contributing/ci/nightly_builds.md b/docs/contributing/ci/nightly_builds.md
index a07b9c1c2fa4..8f3512db3d40 100644
--- a/docs/contributing/ci/nightly_builds.md
+++ b/docs/contributing/ci/nightly_builds.md
@@ -136,10 +136,10 @@ When installing vLLM with `VLLM_USE_PRECOMPILED=1`, the `setup.py` script:
 3. **Selects compatible wheel** based on:
    - Package name (`vllm`)
    - Platform tag (architecture match)
-4. **Downloads and extracts** precompiled binaries from the wheel:
-   - C++ extension modules (`.so` files)
-   - Flash Attention Python modules
-   - Triton kernel Python files
+4. **Downloads and extracts** precompiled artifacts from the wheel:
+   - Native extension modules (`.so` files)
+   - The `vllm-rs` Rust frontend binary
+   - Flash Attention Python modules and Triton/FlashMLA Python files
 5. **Patches package_data** to include extracted files in the installation
 
 !!! note "What is the base commit?"
diff --git a/docs/getting_started/installation/gpu.cuda.inc.md b/docs/getting_started/installation/gpu.cuda.inc.md
index 309dd671251e..ec333b3ee1bf 100644
--- a/docs/getting_started/installation/gpu.cuda.inc.md
+++ b/docs/getting_started/installation/gpu.cuda.inc.md
@@ -101,12 +101,22 @@ This command will do the following:
 1. Look for the current branch in your vLLM clone.
 1. Identify the corresponding base commit in the main branch.
 1. Download the pre-built wheel of the base commit.
-1. Use its compiled libraries in the installation.
+1. Use its compiled libraries and `vllm-rs` binary in the installation.
 
 !!! note
     1. If you change C++ or kernel code, you cannot use Python-only build; otherwise you will see an import error about library not found or undefined symbol.
     2. If you rebase your dev branch, it is recommended to uninstall vllm and re-run the above command to make sure your libraries are up to date.
 
+!!! tip "Rebuilding the Rust frontend"
+    If you need to recompile the `vllm-rs` Rust frontend binary, you can rebuild and install it without re-running the full pip install:
+
+    ```bash
+    ./build_rust.sh          # release build
+    ./build_rust.sh --debug  # faster build for development
+    ```
+
+    This will install the required Rust toolchain if needed, build the binary, and place it in `vllm/vllm-rs`.
+
 In case you see an error about wheel not found when running the above command, it might be because the commit you based on in the `main` branch was just merged and its precompiled wheel is not available yet. You can wait around an hour and retry, or set `VLLM_PRECOMPILED_WHEEL_COMMIT=nightly` to automatically select the most recent already-built commit on `main`.
```bash diff --git a/pyproject.toml b/pyproject.toml index b8d14463256d..568af0311753 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,6 +6,7 @@ requires = [ "packaging>=24.2", "setuptools>=77.0.3,<81.0.0", "setuptools-scm>=8.0", + "setuptools-rust>=1.9.0", "torch == 2.11.0", "wheel", "jinja2", diff --git a/requirements/build/cpu.txt b/requirements/build/cpu.txt index 16ada0572273..640432ddd8cc 100644 --- a/requirements/build/cpu.txt +++ b/requirements/build/cpu.txt @@ -4,6 +4,7 @@ ninja packaging>=24.2 setuptools==77.0.3 # this version can reuse CMake build dir setuptools-scm>=8 +setuptools-rust>=1.9.0 torch==2.11.0+cpu; platform_machine == "x86_64" or platform_machine == "s390x" or platform_machine == "aarch64" torch==2.11.0; platform_system == "Darwin" or platform_machine == "ppc64le" or platform_machine == "riscv64" wheel diff --git a/requirements/build/cuda.txt b/requirements/build/cuda.txt index 490b0bdbc530..70da484a4133 100644 --- a/requirements/build/cuda.txt +++ b/requirements/build/cuda.txt @@ -4,6 +4,7 @@ ninja packaging>=24.2 setuptools>=77.0.3,<81.0.0 setuptools-scm>=8 +setuptools-rust>=1.9.0 torch==2.11.0 wheel jinja2>=3.1.6 diff --git a/requirements/rocm.txt b/requirements/rocm.txt index 0b472b90c026..61fcbc07010c 100644 --- a/requirements/rocm.txt +++ b/requirements/rocm.txt @@ -15,6 +15,7 @@ tensorizer==2.10.1 packaging>=24.2 setuptools>=77.0.3,<80.0.0 setuptools-scm>=8 +setuptools-rust>=1.9.0 runai-model-streamer[s3,gcs,azure]==0.15.7 conch-triton-kernels==1.2.1 timm>=1.0.17 diff --git a/rust b/rust new file mode 160000 index 000000000000..eb2b54b1a0b3 --- /dev/null +++ b/rust @@ -0,0 +1 @@ +Subproject commit eb2b54b1a0b3ef2904a25538bc0211f058f7e95f diff --git a/rust-toolchain.toml b/rust-toolchain.toml new file mode 100644 index 000000000000..4933b3ba1707 --- /dev/null +++ b/rust-toolchain.toml @@ -0,0 +1,2 @@ +[toolchain] +channel = "1.95" diff --git a/setup.py b/setup.py index 7c226a72425f..9a5350fac73c 100644 --- a/setup.py +++ b/setup.py @@ -18,6 +18,8 @@ from packaging.version import Version, parse from setuptools import Extension, setup from setuptools.command.build_ext import build_ext +from setuptools_rust import Binding, RustExtension +from setuptools_rust.build import build_rust from setuptools_scm import get_version from torch.utils.cpp_extension import CUDA_HOME, ROCM_HOME @@ -33,11 +35,24 @@ def load_module_from_path(module_name, path): ROOT_DIR = Path(__file__).parent logger = logging.getLogger(__name__) +PRECOMPILED_RUST_FRONTEND_PATH = ROOT_DIR / "vllm" / "vllm-rs" + # cannot import envs directly because it depends on vllm, # which is not installed yet envs = load_module_from_path("envs", os.path.join(ROOT_DIR, "vllm", "envs.py")) VLLM_TARGET_DEVICE = envs.VLLM_TARGET_DEVICE +USE_PRECOMPILED_EXTENSIONS = envs.VLLM_USE_PRECOMPILED +# VLLM_USE_PRECOMPILED implies precompiled rust frontend too. 
+USE_PRECOMPILED_RUST_FRONTEND = ( + envs.VLLM_USE_PRECOMPILED or envs.VLLM_USE_PRECOMPILED_RUST +) + + +def should_require_rust_frontend() -> bool: + value = os.getenv("VLLM_REQUIRE_RUST_FRONTEND", "") + return value.lower() not in ("", "0", "false", "no") + if sys.platform.startswith("darwin") and VLLM_TARGET_DEVICE != "cpu": logger.warning("VLLM_TARGET_DEVICE automatically set to `cpu` due to macOS") @@ -405,6 +420,24 @@ def build_extensions(self) -> None: return +class precompiled_build_rust(build_rust): + """Skips local Rust builds when the precompiled wheel already ships vllm-rs.""" + + def run(self) -> None: + if PRECOMPILED_RUST_FRONTEND_PATH.exists(): + logger.info( + "Skipping local Rust build: using precompiled %s", + PRECOMPILED_RUST_FRONTEND_PATH, + ) + return + + logger.warning( + "Precompiled wheel did not provide %s; falling back to local Rust build.", + PRECOMPILED_RUST_FRONTEND_PATH, + ) + super().run() + + class precompiled_wheel_utils: """Extracts libraries and other files from an existing wheel.""" @@ -653,7 +686,11 @@ def determine_wheel_url() -> tuple[str, str | None]: @staticmethod def extract_precompiled_and_patch_package( - wheel_url_or_path: str, download_filename: str | None + wheel_url_or_path: str, + download_filename: str | None, + *, + extract_extensions: bool, + extract_rust_frontend: bool, ) -> dict: import tempfile import zipfile @@ -676,19 +713,25 @@ def extract_precompiled_and_patch_package( package_data_patch = {} with zipfile.ZipFile(wheel_path) as wheel: - files_to_copy = [ - "vllm/_C.abi3.so", - "vllm/_C_stable_libtorch.abi3.so", - "vllm/_moe_C.abi3.so", - "vllm/_flashmla_C.abi3.so", - "vllm/_flashmla_extension_C.abi3.so", - "vllm/_sparse_flashmla_C.abi3.so", - "vllm/vllm_flash_attn/_vllm_fa2_C.abi3.so", - "vllm/vllm_flash_attn/_vllm_fa3_C.abi3.so", - "vllm/cumem_allocator.abi3.so", - # ROCm-specific libraries - "vllm/_rocm_C.abi3.so", - ] + exact_members = set() + if extract_extensions: + exact_members.update( + { + "vllm/_C.abi3.so", + "vllm/_C_stable_libtorch.abi3.so", + "vllm/_moe_C.abi3.so", + "vllm/_flashmla_C.abi3.so", + "vllm/_flashmla_extension_C.abi3.so", + "vllm/_sparse_flashmla_C.abi3.so", + "vllm/vllm_flash_attn/_vllm_fa2_C.abi3.so", + "vllm/vllm_flash_attn/_vllm_fa3_C.abi3.so", + "vllm/cumem_allocator.abi3.so", + # ROCm-specific libraries + "vllm/_rocm_C.abi3.so", + } + ) + if extract_rust_frontend: + exact_members.add("vllm/vllm-rs") flash_attn_regex = re.compile( r"vllm/vllm_flash_attn/(?:[^/.][^/]*/)*(?!\.)[^/]*\.py" @@ -707,27 +750,23 @@ def extract_precompiled_and_patch_package( ) # DeepGEMM: extract all files (.py, .so, .cuh, .h, .hpp, etc.) 
deep_gemm_regex = re.compile(r"vllm/third_party/deep_gemm/.*") - file_members = list( - filter(lambda x: x.filename in files_to_copy, wheel.filelist) - ) - file_members += list( - filter( - lambda x: flash_attn_regex.match(x.filename) - and x.filename not in flash_attn_files_to_skip, - wheel.filelist, - ) - ) - file_members += list( - filter( - lambda x: triton_kernels_regex.match(x.filename), wheel.filelist - ) - ) - file_members += list( - filter(lambda x: flashmla_regex.match(x.filename), wheel.filelist) - ) - file_members += list( - filter(lambda x: deep_gemm_regex.match(x.filename), wheel.filelist) - ) + file_members = [] + for member in wheel.filelist: + if member.filename in exact_members: + file_members.append(member) + continue + + if not extract_extensions: + continue + + if ( + flash_attn_regex.match(member.filename) + and member.filename not in flash_attn_files_to_skip + or triton_kernels_regex.match(member.filename) + or flashmla_regex.match(member.filename) + or deep_gemm_regex.match(member.filename) + ): + file_members.append(member) for file in file_members: print(f"[extract] {file.filename}") @@ -738,6 +777,9 @@ def extract_precompiled_and_patch_package( open(target_path, "wb") as dst, ): shutil.copyfileobj(src, dst) + mode = file.external_attr >> 16 + if mode: + os.chmod(target_path, mode) pkg = os.path.dirname(file.filename).replace("/", ".") package_data_patch.setdefault(pkg, []).append( @@ -910,7 +952,7 @@ def get_vllm_version() -> str: if envs.VLLM_TARGET_DEVICE == "empty": version += f"{sep}empty" elif _is_cuda(): - if envs.VLLM_USE_PRECOMPILED and not envs.VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX: + if USE_PRECOMPILED_EXTENSIONS and not envs.VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX: version += f"{sep}precompiled" else: cuda_version = str(get_nvcc_cuda_version()) @@ -998,7 +1040,7 @@ def _read_requirements(filename: str) -> list[str]: if _is_cuda(): ext_modules.append(CMakeExtension(name="vllm.vllm_flash_attn._vllm_fa2_C")) - if envs.VLLM_USE_PRECOMPILED or ( + if USE_PRECOMPILED_EXTENSIONS or ( CUDA_HOME and get_nvcc_cuda_version() >= Version("12.3") ): # FA3 requires CUDA 12.3 or later @@ -1008,7 +1050,7 @@ def _read_requirements(filename: str) -> list[str]: ext_modules.append( CMakeExtension(name="vllm.vllm_flash_attn._vllm_fa4_cutedsl_C", optional=True) ) - if envs.VLLM_USE_PRECOMPILED or ( + if USE_PRECOMPILED_EXTENSIONS or ( CUDA_HOME and get_nvcc_cuda_version() >= Version("12.9") ): # FlashMLA requires CUDA 12.9 or later @@ -1059,11 +1101,14 @@ def _read_requirements(filename: str) -> list[str]: } -# If using precompiled, extract and patch package_data (in advance of setup) -if envs.VLLM_USE_PRECOMPILED: +# If using precompiled artifacts, extract and patch package_data in advance. 
+if USE_PRECOMPILED_RUST_FRONTEND:
     wheel_url, download_filename = precompiled_wheel_utils.determine_wheel_url()
     patch = precompiled_wheel_utils.extract_precompiled_and_patch_package(
-        wheel_url, download_filename
+        wheel_url,
+        download_filename,
+        extract_extensions=USE_PRECOMPILED_EXTENSIONS,
+        extract_rust_frontend=True,
     )
     for pkg, files in patch.items():
         package_data.setdefault(pkg, []).extend(files)
@@ -1076,14 +1121,32 @@ def _read_requirements(filename: str) -> list[str]:
 else:
     cmdclass = {
         "build_ext": precompiled_build_ext
-        if envs.VLLM_USE_PRECOMPILED
+        if USE_PRECOMPILED_EXTENSIONS
         else cmake_build_ext,
     }
+
+if USE_PRECOMPILED_RUST_FRONTEND:
+    cmdclass["build_rust"] = precompiled_build_rust
+
+# Rust frontend binary, built via setuptools-rust and installed into the
+# package directory alongside the Python modules.
+# TODO: we could use `RustBin` to install the binary directly into the `bin`
+# directory, but that requires extra work on using precompiled binaries.
+rust_extensions = [
+    RustExtension(
+        target="vllm.vllm-rs",
+        path="rust/src/cmd/Cargo.toml",
+        args=["--bin", "vllm-rs"],
+        features=["native-tls-vendored"],
+        binding=Binding.Exec,
+        optional=not should_require_rust_frontend(),
+    ),
+]
 
 setup(
     # static metadata should rather go in pyproject.toml
     version=get_vllm_version(),
     ext_modules=ext_modules,
+    rust_extensions=rust_extensions,
     install_requires=get_requirements(),
     extras_require={
         # AMD Zen CPU optimizations via zentorch
diff --git a/tests/entrypoints/openai/completion/test_shutdown.py b/tests/entrypoints/openai/completion/test_shutdown.py
index 966c9f869c44..04af1c32f918 100644
--- a/tests/entrypoints/openai/completion/test_shutdown.py
+++ b/tests/entrypoints/openai/completion/test_shutdown.py
@@ -300,7 +300,7 @@ async def test_abort_timeout_exits_quickly(wait_for_engine_idle: float):
     proc.send_signal(signal.SIGTERM)
 
     # abort timeout (0) should exit promptly
-    for _ in range(20):
+    for _ in range(40):
         if proc.poll() is not None:
             break
         time.sleep(0.1)
@@ -311,7 +311,7 @@
         pytest.fail("Process did not exit after SIGTERM with abort timeout")
 
     exit_time = time.time() - start_time
-    assert exit_time < 2.1, f"Default shutdown took too long: {exit_time:.1f}s"
+    assert exit_time < 4.1, f"Default shutdown took too long: {exit_time:.1f}s"
     assert proc.returncode in (0, -15, None), f"Unexpected: {proc.returncode}"
 
     await _assert_children_cleaned_up(child_pids)
diff --git a/tests/test_envs.py b/tests/test_envs.py
index 3f3add2ab764..40bbe87e8785 100644
--- a/tests/test_envs.py
+++ b/tests/test_envs.py
@@ -95,6 +95,20 @@ def test_is_envs_cache_enabled() -> None:
     assert not envs._is_envs_cache_enabled()
 
 
+def test_precompiled_install_flags_are_orthogonal() -> None:
+    with patch.dict(
+        os.environ,
+        {
+            "VLLM_USE_PRECOMPILED": "0",
+            "VLLM_PRECOMPILED_WHEEL_LOCATION": "",
+            "VLLM_USE_PRECOMPILED_RUST": "1",
+        },
+        clear=False,
+    ):
+        assert environment_variables["VLLM_USE_PRECOMPILED"]() is False
+        assert environment_variables["VLLM_USE_PRECOMPILED_RUST"]() is True
+
+
 class TestEnvWithChoices:
     """Test cases for env_with_choices function."""
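Taken together with the `setup.py` changes above, the intended semantics are: `VLLM_USE_PRECOMPILED` governs the native `.so` artifacts (and is also implied by `VLLM_PRECOMPILED_WHEEL_LOCATION`), while `VLLM_USE_PRECOMPILED_RUST` independently opts into the prebuilt `vllm-rs` binary. A minimal standalone sketch of that resolution logic, with the two entries from `vllm/envs.py` restated outside vLLM purely for illustration:

```python
import os
from unittest.mock import patch


def use_precompiled() -> bool:
    # Mirrors the VLLM_USE_PRECOMPILED entry in vllm/envs.py: the flag itself,
    # or an explicit wheel path, enables the precompiled native extensions.
    return (
        os.environ.get("VLLM_USE_PRECOMPILED", "").strip().lower() in ("1", "true")
        or bool(os.environ.get("VLLM_PRECOMPILED_WHEEL_LOCATION"))
    )


def use_precompiled_rust() -> bool:
    # Mirrors VLLM_USE_PRECOMPILED_RUST: a plain flag, no wheel-path implication.
    return (
        os.environ.get("VLLM_USE_PRECOMPILED_RUST", "").strip().lower()
        in ("1", "true")
    )


# Rust-only precompiled install: the Rust flag does not flip the native one.
with patch.dict(os.environ, {"VLLM_USE_PRECOMPILED_RUST": "1"}, clear=True):
    assert not use_precompiled() and use_precompiled_rust()

# An explicit wheel location still implies the native precompiled path on its own.
with patch.dict(os.environ, {"VLLM_PRECOMPILED_WHEEL_LOCATION": "x.whl"}, clear=True):
    assert use_precompiled() and not use_precompiled_rust()
```

In `setup.py`, `USE_PRECOMPILED_RUST_FRONTEND` is the OR of the two, so a full `VLLM_USE_PRECOMPILED=1` install extracts both the `.so` files and the `vllm-rs` binary, while `VLLM_USE_PRECOMPILED_RUST=1` alone extracts only the Rust frontend.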
diff --git a/vllm/entrypoints/cli/launch.py b/vllm/entrypoints/cli/launch.py
index 9871a27da381..0af9f32c3ee3 100644
--- a/vllm/entrypoints/cli/launch.py
+++ b/vllm/entrypoints/cli/launch.py
@@ -2,6 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import argparse
+import signal
 
 import uvloop
 
@@ -110,6 +111,14 @@ def cmd_init() -> list[CLISubcommand]:
 
 async def run_launch_fastapi(args: argparse.Namespace) -> None:
     """Run the online serving layer with FastAPI (no GPU inference)."""
+
+    # Interrupt initialization if SIGTERM arrives before uvicorn installs
+    # its own signal handlers. Once uvicorn is running it replaces this.
+    def _interrupt_init(*_) -> None:
+        raise KeyboardInterrupt("terminated")
+
+    signal.signal(signal.SIGTERM, _interrupt_init)
+
     # 1. Socket binding
     listen_address, sock = setup_server(args)
 
diff --git a/vllm/entrypoints/cli/serve.py b/vllm/entrypoints/cli/serve.py
index 8213184ad061..6d6cfa6cd764 100644
--- a/vllm/entrypoints/cli/serve.py
+++ b/vllm/entrypoints/cli/serve.py
@@ -21,7 +21,11 @@
 from vllm.v1.executor import Executor
 from vllm.v1.executor.multiproc_executor import MultiprocExecutor
 from vllm.v1.metrics.prometheus import setup_multiprocess_prometheus
-from vllm.v1.utils import APIServerProcessManager, wait_for_completion_or_failure
+from vllm.v1.utils import (
+    APIServerProcessManager,
+    RustFrontendProcessManager,
+    wait_for_completion_or_failure,
+)
 
 logger = init_logger(__name__)
 
@@ -81,11 +85,12 @@ def cmd(args: argparse.Namespace) -> None:
     )
 
     # Default api_server_count if not explicitly set.
+    # - Rust frontend: Use 1 (it is multithreaded; extra processes are unnecessary)
    # - External LB: Leave as 1 (external LB handles distribution)
    # - Hybrid LB: Use local DP size (internal LB for local ranks only)
    # - Internal LB: Use full DP size
     if args.api_server_count is None:
-        if is_external_lb:
+        if is_external_lb or envs.VLLM_RUST_FRONTEND_PATH:
             args.api_server_count = 1
         elif is_hybrid_lb:
             args.api_server_count = args.data_parallel_size_local or 1
@@ -102,6 +107,12 @@
                 "Defaulting api_server_count to data_parallel_size (%d).",
                 args.api_server_count,
             )
+        elif envs.VLLM_RUST_FRONTEND_PATH and args.api_server_count > 1:
+            logger.warning(
+                "Ignoring --api-server-count=%d when using the Rust frontend.",
+                args.api_server_count,
+            )
+            args.api_server_count = 1
 
     # Elastic EP currently only supports running with at most one API server.
     if getattr(args, "enable_elastic_ep", False) and args.api_server_count > 1:
@@ -114,7 +125,7 @@
 
     if args.api_server_count < 1:
         run_headless(args)
-    elif args.api_server_count > 1:
+    elif args.api_server_count > 1 or envs.VLLM_RUST_FRONTEND_PATH:
         run_multi_api_server(args)
     else:
         # Single API server (this process).
@@ -230,9 +241,15 @@ def signal_handler(signum, frame):
 
 def run_multi_api_server(args: argparse.Namespace):
     assert not args.headless
+    rust_frontend_path = envs.VLLM_RUST_FRONTEND_PATH
     num_api_servers: int = args.api_server_count
     assert num_api_servers > 0
 
+    if rust_frontend_path and num_api_servers > 1:
+        raise ValueError(
+            "VLLM_RUST_FRONTEND_PATH does not support api_server_count > 1"
+        )
+
     if num_api_servers > 1:
         setup_multiprocess_prometheus()
 
@@ -270,7 +287,9 @@
     dp_rank = parallel_config.data_parallel_rank
     assert parallel_config.local_engines_only or dp_rank == 0
 
-    api_server_manager: APIServerProcessManager | None = None
+    api_server_manager: APIServerProcessManager | RustFrontendProcessManager | None = (
+        None
+    )
 
     from vllm.v1.engine.utils import get_engine_zmq_addresses
 
@@ -279,23 +298,34 @@
     with launch_core_engines(
         vllm_config, executor_class, log_stats, addresses, num_api_servers
     ) as (local_engine_manager, coordinator, addresses, tensor_queue):
-        # Construct common args for the APIServerProcessManager up-front.
- stats_update_address = None - if coordinator: - stats_update_address = coordinator.get_stats_publish_address() - - # Start API servers. - api_server_manager = APIServerProcessManager( - listen_address=listen_address, - sock=sock, - args=args, - num_servers=num_api_servers, - input_addresses=addresses.inputs, - output_addresses=addresses.outputs, - stats_update_address=stats_update_address, - tensor_queue=tensor_queue, + stats_update_address = ( + coordinator.get_stats_publish_address() if coordinator else None ) + if rust_frontend_path: + # Start rust front-end process. + api_server_manager = RustFrontendProcessManager( + binary_path=rust_frontend_path, + sock=sock, + args=args, + input_address=addresses.inputs[0], + output_address=addresses.outputs[0], + engine_count=parallel_config.data_parallel_size, + stats_update_address=stats_update_address, + ) + else: + # Start API server(s). + api_server_manager = APIServerProcessManager( + listen_address=listen_address, + sock=sock, + args=args, + num_servers=num_api_servers, + input_addresses=addresses.inputs, + output_addresses=addresses.outputs, + stats_update_address=stats_update_address, + tensor_queue=tensor_queue, + ) + # Wait for API servers. try: wait_for_completion_or_failure( diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index da2ec10284c5..f2b23fa090a3 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -547,8 +547,7 @@ def validate_api_server_args(args): @instrument(span_name="API server setup") def setup_server(args): - """Validate API server args, set up signal handler, create socket - ready to serve.""" + """Validate API server args and create the server socket.""" log_version_and_model(logger, VLLM_VERSION, args.model) log_non_default_args(args) @@ -574,12 +573,6 @@ def setup_server(args): # many concurrent requests active set_ulimit() - def signal_handler(*_) -> None: - # Interrupt server on sigterm while initializing - raise KeyboardInterrupt("terminated") - - signal.signal(signal.SIGTERM, signal_handler) - if args.uds: listen_address = f"unix:{args.uds}" else: @@ -689,6 +682,13 @@ async def run_server(args, **uvicorn_kwargs) -> None: # Add process-specific prefix to stdout and stderr. decorate_logs("APIServer") + # Interrupt initialization if SIGTERM arrives before uvicorn installs its + # own signal handlers. Once uvicorn is running it replaces this. + def _interrupt_init(*_) -> None: + raise KeyboardInterrupt("terminated") + + signal.signal(signal.SIGTERM, _interrupt_init) + listen_address, sock = setup_server(args) await run_server_worker(listen_address, sock, args, **uvicorn_kwargs) diff --git a/vllm/entrypoints/utils.py b/vllm/entrypoints/utils.py index e3682280ec50..de3ff364cac5 100644 --- a/vllm/entrypoints/utils.py +++ b/vllm/entrypoints/utils.py @@ -9,6 +9,7 @@ from http import HTTPStatus from logging import Logger from string import Template +from typing import Any import regex as re from fastapi import Request @@ -203,7 +204,7 @@ def get_max_tokens( ) -def log_non_default_args(args: Namespace | EngineArgs): +def get_non_default_args(args: Namespace | EngineArgs) -> dict[str, Any]: from vllm.entrypoints.openai.cli_args import make_arg_parser non_default_args = {} @@ -230,6 +231,43 @@ def log_non_default_args(args: Namespace | EngineArgs): "Unsupported argument type. Must be Namespace or EngineArgs instance." 
) + return non_default_args + + +def _jsonify_arg_value(value: Any) -> Any: + if value is None or isinstance(value, bool | int | float | str): + return value + if dataclasses.is_dataclass(value) and not isinstance(value, type): + return { + key: _jsonify_arg_value(val) + for key, val in dataclasses.asdict(value).items() + } + if isinstance(value, dict): + return {str(key): _jsonify_arg_value(val) for key, val in value.items()} + if isinstance(value, tuple | list): + return [_jsonify_arg_value(item) for item in value] + if (model_dump := getattr(value, "model_dump", None)) is not None: + return _jsonify_arg_value(model_dump(mode="json")) + if (to_dict := getattr(value, "dict", None)) is not None: + return _jsonify_arg_value(to_dict()) + return repr(value) + + +def jsonify_non_default_args( + args: Namespace | EngineArgs, + *, + exclude: set[str] | None = None, +) -> dict[str, Any]: + non_default_args = get_non_default_args(args) + if exclude is not None: + for key in exclude: + non_default_args.pop(key, None) + + return {key: _jsonify_arg_value(value) for key, value in non_default_args.items()} + + +def log_non_default_args(args: Namespace | EngineArgs): + non_default_args = get_non_default_args(args) logger.info("non-default args: %s", non_default_args) diff --git a/vllm/envs.py b/vllm/envs.py index a5a4bfaffd59..c25c27a42dbf 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -85,6 +85,7 @@ MAX_JOBS: str | None = None NVCC_THREADS: str | None = None VLLM_USE_PRECOMPILED: bool = False + VLLM_USE_PRECOMPILED_RUST: bool = False VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX: bool = False VLLM_DOCKER_BUILD_CONTEXT: bool = False VLLM_KEEP_ALIVE_ON_ENGINE_DEATH: bool = False @@ -133,6 +134,8 @@ Q_SCALE_CONSTANT: int = 200 K_SCALE_CONSTANT: int = 200 V_SCALE_CONSTANT: int = 100 + VLLM_USE_RUST_FRONTEND: bool = False + VLLM_RUST_FRONTEND_PATH: str | None = "auto" VLLM_SERVER_DEV_MODE: bool = False VLLM_V1_OUTPUT_PROC_CHUNK_SIZE: int = 128 VLLM_MLA_DISABLE: bool = False @@ -505,6 +508,40 @@ def _get_or_set_default() -> str: logger = logging.getLogger(__name__) + +def _resolve_rust_frontend_path() -> str | None: + """Resolve the Rust frontend binary path. + + Returns None if VLLM_USE_RUST_FRONTEND is not enabled. + When enabled, resolves VLLM_RUST_FRONTEND_PATH ("auto" by default) + to the actual binary path. + """ + use_rust = bool(int(os.environ.get("VLLM_USE_RUST_FRONTEND", "0"))) + raw = os.environ.get("VLLM_RUST_FRONTEND_PATH", "auto") + + if not use_rust: + if os.environ.get("VLLM_RUST_FRONTEND_PATH") is not None: + logger.warning( + "VLLM_RUST_FRONTEND_PATH is set but VLLM_USE_RUST_FRONTEND " + "is not enabled. The Rust frontend will not be used. " + "Set VLLM_USE_RUST_FRONTEND=1 to enable it." + ) + return None + + if raw.lower() in ("auto", "1", "true"): + pkg_dir = os.path.dirname(os.path.abspath(__file__)) + candidate = os.path.join(pkg_dir, "vllm-rs") + if os.path.isfile(candidate) and os.access(candidate, os.X_OK): + return candidate + + raise FileNotFoundError( + "VLLM_RUST_FRONTEND_PATH=auto but the vllm-rs binary was " + f"not found at {candidate}. " + "Build with setuptools-rust or set the path explicitly." + ) + return raw + + environment_variables: dict[str, Callable[[], Any]] = { # ================== Installation Time Env Vars ================== # Target device of vLLM, supporting [cuda (by default), @@ -532,11 +569,15 @@ def _get_or_set_default() -> str: # By default this is 1. # If set, `MAX_JOBS` will be reduced to avoid oversubscribing the CPU. 
"NVCC_THREADS": lambda: os.getenv("NVCC_THREADS", None), - # If set, vllm will use precompiled binaries (*.so) + # If set, vllm will use precompiled native binaries (*.so) "VLLM_USE_PRECOMPILED": lambda: ( os.environ.get("VLLM_USE_PRECOMPILED", "").strip().lower() in ("1", "true") or bool(os.environ.get("VLLM_PRECOMPILED_WHEEL_LOCATION")) ), + # If set, vllm will use the precompiled Rust frontend binary (vllm-rs). + "VLLM_USE_PRECOMPILED_RUST": lambda: ( + os.environ.get("VLLM_USE_PRECOMPILED_RUST", "").strip().lower() in ("1", "true") + ), # If set, skip adding +precompiled suffix to version string "VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX": lambda: bool( int(os.environ.get("VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX", "0")) @@ -1118,6 +1159,15 @@ def _get_or_set_default() -> str: # If set to "0", disable LayerName opaque type for layer_name # parameters in custom ops. Defaults to enabled on torch >= 2.11. "VLLM_USE_LAYERNAME": lambda: bool(int(os.getenv("VLLM_USE_LAYERNAME", "1"))), + # If set, use the Rust frontend binary instead of the Python API server + # process(es). + "VLLM_USE_RUST_FRONTEND": lambda: bool( + int(os.getenv("VLLM_USE_RUST_FRONTEND", "0")) + ), + # Path to the Rust frontend binary. Defaults to "auto" which discovers + # the binary installed with the vllm package. Only used when + # VLLM_USE_RUST_FRONTEND=1. + "VLLM_RUST_FRONTEND_PATH": lambda: _resolve_rust_frontend_path(), # If set, vllm will run in development mode, which will enable # some additional endpoints for developing and debugging, # e.g. `/reset_prefix_cache` diff --git a/vllm/v1/utils.py b/vllm/v1/utils.py index eb81a3c88fb7..ba1db3282216 100644 --- a/vllm/v1/utils.py +++ b/vllm/v1/utils.py @@ -2,6 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import argparse import contextlib +import json import multiprocessing import threading import time @@ -233,6 +234,146 @@ def shutdown(self, timeout: float | None = None) -> None: shutdown(self.processes, timeout=timeout) +class RustFrontendProcessManager: + """Manages a single Rust frontend subprocess. + + Launches the Rust vllm-rs binary in 'frontend' mode, passing the + listening socket fd and ZMQ transport addresses. Provides the same + interface as APIServerProcessManager for process monitoring. 
+ """ + + def __init__( + self, + binary_path: str, + sock: Any, + args: argparse.Namespace, + input_address: str, + output_address: str, + engine_count: int, + stats_update_address: str | None = None, + ): + import os + import subprocess + + fd = sock.fileno() + os.set_inheritable(fd, True) + + cmd = [ + binary_path, + "frontend", + "--listen-fd", + str(fd), + "--input-address", + input_address, + "--output-address", + output_address, + "--engine-count", + str(engine_count), + ] + if stats_update_address is not None: + cmd.extend(["--coordinator-address", stats_update_address]) + from vllm.entrypoints.utils import jsonify_non_default_args + + args_json = json.dumps( + jsonify_non_default_args(args, exclude={"api_server_count"}), + sort_keys=True, + ) + cmd.extend(["--args-json", args_json]) + + logger.info("Launching Rust frontend: %s", " ".join(cmd)) + self._proc = subprocess.Popen(cmd, pass_fds=(fd,)) + + # Create a process wrapper with a sentinel fd for monitoring + self.processes: list[_SubprocessWrapper] = [ + _SubprocessWrapper(self._proc, "RustFrontend") + ] + + self._finalizer = weakref.finalize(self, _shutdown_subprocesses, self.processes) + + def shutdown(self, timeout: float | None = None) -> None: + if self._finalizer.detach() is not None: + _shutdown_subprocesses(self.processes, timeout=timeout) + + +class _SubprocessWrapper: + """Wraps subprocess.Popen to provide the BaseProcess-like interface + needed by wait_for_completion_or_failure.""" + + def __init__(self, proc, name: str): + self._proc = proc + self.name = name + self.pid = proc.pid + self._sentinel_conn: connection.Connection | None = None + self._sentinel_send: connection.Connection | None = None + + # Use a Pipe-based sentinel so subprocess monitoring works uniformly + # across platforms with multiprocessing.connection.wait(). 
+ recv, send = connection.Pipe(duplex=False) + self._sentinel_conn = recv + self._sentinel_send = send + + def monitor_subprocess() -> None: + try: + proc.wait() + finally: + with contextlib.suppress(Exception): + send.close() + + threading.Thread( + target=monitor_subprocess, daemon=True, name=f"{name}Monitor" + ).start() + + @property + def sentinel(self): + return self._sentinel_conn + + @property + def exitcode(self) -> int | None: + return self._proc.returncode if self._proc.poll() is not None else None + + def is_alive(self) -> bool: + return self._proc.poll() is None + + def terminate(self): + self._proc.terminate() + + def join(self, timeout=None): + with contextlib.suppress(Exception): + self._proc.wait(timeout=timeout) + + def __del__(self): + with contextlib.suppress(Exception): + if self._sentinel_conn is not None: + self._sentinel_conn.close() + if self._sentinel_send is not None: + self._sentinel_send.close() + + +def _shutdown_subprocesses( + procs: list[_SubprocessWrapper], timeout: float | None = None +) -> None: + """Shutdown subprocess wrappers (mirrors the shutdown() function).""" + if timeout is None: + timeout = 0.0 + timeout = max(timeout, 5.0) + + for proc in procs: + if proc.is_alive(): + proc.terminate() + + deadline = time.monotonic() + timeout + for proc in procs: + remaining = deadline - time.monotonic() + if remaining <= 0: + break + if proc.is_alive(): + proc.join(remaining) + + for proc in procs: + if proc.is_alive() and (pid := proc.pid) is not None: + kill_process_tree(pid) + + def run_api_server_worker_proc( listen_address, sock, args, client_config=None, **uvicorn_kwargs ) -> None: @@ -253,7 +394,7 @@ def run_api_server_worker_proc( def wait_for_completion_or_failure( - api_server_manager: APIServerProcessManager, + api_server_manager: "APIServerProcessManager | RustFrontendProcessManager", engine_manager: Union["CoreEngineProcManager", "CoreEngineActorManager"] | None = None, coordinator: "DPCoordinator | None" = None, @@ -274,7 +415,7 @@ def wait_for_completion_or_failure( logger.info("Waiting for API servers to complete ...") # Create a mapping of sentinels to their corresponding processes # for efficient lookup - sentinel_to_proc: dict[Any, BaseProcess] = { + sentinel_to_proc: dict[Any, BaseProcess | _SubprocessWrapper | None] = { proc.sentinel: proc for proc in api_server_manager.processes }
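A note on the design: `_SubprocessWrapper` exists because `wait_for_completion_or_failure()` watches its processes through their `sentinel` handles with `multiprocessing.connection.wait()`, and `subprocess.Popen` has no such sentinel. The wrapper manufactures one from a `Pipe`: a daemon thread waits on the child and closes the write end, which makes the read end become ready. A minimal standalone sketch of that trick (illustrative code, not part of the patch):

```python
import contextlib
import subprocess
import sys
import threading
from multiprocessing import connection

# A short-lived child process to monitor.
proc = subprocess.Popen([sys.executable, "-c", "import time; time.sleep(0.2)"])

# The read end plays the role of BaseProcess.sentinel.
recv, send = connection.Pipe(duplex=False)


def monitor() -> None:
    try:
        proc.wait()  # block until the child exits
    finally:
        with contextlib.suppress(Exception):
            send.close()  # EOF on the write end makes `recv` ready


threading.Thread(target=monitor, daemon=True).start()

# connection.wait() now treats the subprocess like any multiprocessing child:
# it returns once the sentinel is ready, i.e. once the process has exited.
ready = connection.wait([recv], timeout=5.0)
assert ready, "child did not exit in time"
```

This keeps the Rust frontend subprocess and the `multiprocessing`-based engine processes observable through a single `connection.wait()` call, which is exactly what the `sentinel_to_proc` mapping above relies on.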