pytorch
diff --git a/‎.ci/docker/build.sh‎
Lines changed: 2 additions & 2 deletions b/‎.ci/docker/build.sh‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎.ci/scripts/export_model_artifact.sh‎
Lines changed: 2 additions & 1 deletion b/‎.ci/scripts/export_model_artifact.sh‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎.ci/scripts/setup-vulkan-linux-deps.sh‎
Lines changed: 121 additions & 4 deletions b/‎.ci/scripts/setup-vulkan-linux-deps.sh‎
Lines changed: 121 additions & 4 deletions
diff --git a/‎.ci/scripts/setup-vulkan-windows-deps.ps1‎
Lines changed: 37 additions & 0 deletions b/‎.ci/scripts/setup-vulkan-windows-deps.ps1‎
Lines changed: 37 additions & 0 deletions
diff --git a/‎.ci/scripts/setup-windows-msvc-vulkan.ps1‎
Lines changed: 51 additions & 0 deletions b/‎.ci/scripts/setup-windows-msvc-vulkan.ps1‎
Lines changed: 51 additions & 0 deletions
diff --git a/‎.ci/scripts/test_backend.sh‎
Lines changed: 9 additions & 2 deletions b/‎.ci/scripts/test_backend.sh‎
Lines changed: 9 additions & 2 deletions
diff --git a/‎.ci/scripts/test_model_e2e.sh‎
Lines changed: 101 additions & 0 deletions b/‎.ci/scripts/test_model_e2e.sh‎
Lines changed: 101 additions & 0 deletions
@@ -89,10 +89,10 @@ case "${IMAGE_NAME}" in
     OS_VERSION=24.04
     GCC_VERSION=14
     ;;
-  executorch-ubuntu-26.04-gcc15)
+  executorch-ubuntu-26.04-gcc14)
     LINTRUNNER=""
     OS_VERSION=26.04
-    GCC_VERSION=15
+    GCC_VERSION=14
     ;;
   *)
     echo "Invalid image name ${IMAGE_NAME}"
 
@@ -422,8 +422,9 @@ if [ "$MODEL_NAME" = "qwen3_5_moe" ]; then
       --no-compile
   echo "::endgroup::"
 
-  # Copy tokenizer for the runner
+  # Copy tokenizer files for the runner and model-specific serving launcher.
   cp "$LOCAL_MODEL_DIR/tokenizer.json" "${OUTPUT_DIR}/tokenizer.json"
+  cp "$LOCAL_MODEL_DIR/tokenizer_config.json" "${OUTPUT_DIR}/tokenizer_config.json"
 
   # Export to .pte/.ptd (short cache dir avoids objcopy symbol length issues)
   echo "::group::Export"
 
@@ -1,4 +1,3 @@
-
 #!/bin/bash
 # Copyright (c) Meta Platforms, Inc. and affiliates.
 # All rights reserved.
@@ -22,7 +21,7 @@ install_swiftshader() {
   tar -C "${_swiftshader_dir}" -xzf "${_tmp_archive}"
 
   export VK_ICD_FILENAMES="${_swiftshader_dir}/swiftshader/build/Linux/vk_swiftshader_icd.json"
-  export LD_LIBRARY_PATH="${_swiftshader_dir}/swiftshader/build/Linux/"
+  export LD_LIBRARY_PATH="${_swiftshader_dir}/swiftshader/build/Linux/:${LD_LIBRARY_PATH:-}"
   export ETVK_USING_SWIFTSHADER=1
 }
 
@@ -43,7 +42,125 @@ install_vulkan_sdk() {
   export PATH="${PATH}:${_vulkan_sdk_dir}/${VULKAN_SDK_VERSION}/x86_64/bin/"
 }
 
+_maybe_sudo() {
+  # Run the given command line with root privileges: through sudo unless the
+  # current user is already uid 0. The exit status is that of the command run.
+  if ! [ "$(id -u)" -eq 0 ]; then
+    sudo "$@"
+    return
+  fi
+  "$@"
+}
+
+install_glslc() {
+  # Provide glslc from conda-forge's shaderc package instead of the LunarG SDK.
+  # The SDK's prebuilt glslc is dynamically linked against a newer
+  # glibc/libstdc++ than the manylinux_2_28 / AlmaLinux 8 CUDA runner image
+  # ships (glibc 2.28) and fails to load there with "GLIBC_2.29 not found".
+  # The conda-forge build targets an old sysroot, so it runs on that image, and
+  # it is recent enough for the GL_EXT_integer_dot_product /
+  # GL_KHR_cooperative_matrix extensions the Vulkan shaders use. It is installed
+  # into its own prefix so the base conda env that builds ExecuTorch stays
+  # untouched; the prefix's bin directory is then prepended to PATH.
+  _glslc_prefix=/tmp/shaderc
+  conda create -y -p "${_glslc_prefix}" -c conda-forge shaderc
+  PATH="${_glslc_prefix}/bin:${PATH}"
+  export PATH
+}
+
+install_vulkan_loader() {
+  # The CUDA builder image has no libvulkan.so.1 -- the Khronos loader that volk
+  # dlopen()s at runtime -- because it is not part of the NVIDIA driver. The
+  # vulkan-tools package supplies vulkaninfo for the device sanity check. Both
+  # ship as native el8 RPMs, so they are installed when dnf is available;
+  # without dnf this function does nothing and succeeds.
+  command -v dnf >/dev/null 2>&1 || return 0
+  _maybe_sudo dnf install -y vulkan-loader vulkan-tools
+}
+
+_find_nvidia_vulkan_library() {
+  # Print the path of libGLX_nvidia.so.0 (without a trailing newline), or print
+  # nothing when it cannot be located. NVIDIA implements its Vulkan ICD inside
+  # that library, and the NVIDIA container runtime mounts it into the container
+  # (pulled from the driver's ldcache when NVIDIA_DRIVER_CAPABILITIES includes
+  # graphics/all). The ldconfig cache is consulted first; the usual mount
+  # locations are probed only when the cache has no entry.
+  local cached candidate
+  cached="$(ldconfig -p 2>/dev/null | awk '/libGLX_nvidia\.so\.0/ {print $NF; exit}')"
+  if [ -n "${cached}" ]; then
+    printf '%s' "${cached}"
+    return
+  fi
+  for candidate in /usr/lib64/libGLX_nvidia.so.0 \
+      /usr/lib/x86_64-linux-gnu/libGLX_nvidia.so.0 \
+      /usr/lib/libGLX_nvidia.so.0; do
+    if [ -e "${candidate}" ]; then
+      printf '%s' "${candidate}"
+      return
+    fi
+  done
+  # Nothing found: emit an empty result, still with a zero status.
+  printf '%s' ''
+}
+
+_vulkan_has_real_device() {
+  # Succeeds (status 0) when vulkaninfo reports a hardware GPU, i.e. a deviceType
+  # of DISCRETE, INTEGRATED or VIRTUAL GPU, and fails otherwise. When vulkaninfo
+  # is not installed the check cannot run and the function also succeeds, so
+  # callers then proceed as if a GPU were present.
+  command -v vulkaninfo >/dev/null 2>&1 || return 0
+  # vulkaninfo can exit non-zero for unrelated reasons (no display/WSI), and
+  # grep -q closing a pipe early can end it with SIGPIPE. Capture the output
+  # first and discard its status so the verdict keys off the reported deviceType
+  # only, even if the sourcing shell runs with `set -o pipefail`.
+  local summary
+  summary="$(vulkaninfo --summary 2>/dev/null || true)"
+  grep -qE 'PHYSICAL_DEVICE_TYPE_(DISCRETE|INTEGRATED|VIRTUAL)_GPU' <<<"${summary}"
+}
+
+setup_real_gpu_icd() {
+  # Point the Vulkan loader at the real GPU when it is usable, otherwise fall
+  # back to SwiftShader. The CUDA CI image needs two workarounds:
+  #   1. The NVIDIA container runtime mounts the driver's Vulkan library but
+  #      registers no ICD manifest for it, so the loader cannot find the GPU by
+  #      itself. A manifest is written here and the loader is pinned to it.
+  #   2. Installing vulkan-loader/vulkan-tools pulls in mesa-vulkan-drivers,
+  #      whose Intel/AMD/lavapipe manifests describe hardware that is absent.
+  #      lavapipe fails vkCreateInstance on this image and, because the loader
+  #      walks every manifest in icd.d, that poisons device enumeration for the
+  #      whole process. With VK_ICD_FILENAMES pinned the loader ignores icd.d,
+  #      so the broken stubs cannot interfere.
+  local driver_lib manifest
+  driver_lib="$(_find_nvidia_vulkan_library)"
+  if [ -z "${driver_lib}" ]; then
+    echo "WARNING: no NVIDIA Vulkan driver library found; using SwiftShader."
+    install_swiftshader
+    return
+  fi
+
+  # Synthesize the ICD manifest and pin the loader to it.
+  manifest=/tmp/nvidia_icd.json
+  cat >"${manifest}" <<JSON
+{
+    "file_format_version": "1.0.0",
+    "ICD": {
+        "library_path": "${driver_lib}",
+        "api_version": "1.3.0"
+    }
+}
+JSON
+  export VK_ICD_FILENAMES="${manifest}"
+  unset ETVK_USING_SWIFTSHADER || true
+
+  if ! _vulkan_has_real_device; then
+    echo "WARNING: ${driver_lib} present but no GPU enumerated; using SwiftShader."
+    # Print loader diagnostics so CI logs show why the NVIDIA driver did not
+    # enumerate (e.g. a missing dependency of libGLX_nvidia, or no render node).
+    if command -v vulkaninfo >/dev/null 2>&1; then
+      echo "--- NVIDIA Vulkan ICD diagnostic ---"
+      VK_LOADER_DEBUG=warn vulkaninfo --summary 2>&1 | head -40 || true
+      echo "--- end diagnostic ---"
+    fi
+    # Drop the pin before handing over to the SwiftShader setup.
+    unset VK_ICD_FILENAMES
+    install_swiftshader
+    return
+  fi
+
+  echo "Real NVIDIA GPU selected; pinned Vulkan ICD to ${driver_lib}"
+}
+
 VULKAN_SDK_VERSION="1.4.321.1"
 
-install_swiftshader
-install_vulkan_sdk "${VULKAN_SDK_VERSION}"
+# Mode selection on the first argument. "real-gpu" prefers a real system ICD
+# when one is present; any other value, or no argument at all, installs
+# SwiftShader plus the LunarG SDK so the existing CPU-runner CI is unchanged.
+# NOTE(review): when this file is sourced without arguments, bash leaves the
+# caller's positional parameters in place, so $1 may be the caller's own first
+# argument; only the exact value "real-gpu" switches modes.
+if [ "${1:-swiftshader}" = "real-gpu" ]; then
+  # Do not download the LunarG SDK here: its prebuilt glslc cannot run on the
+  # old-glibc CUDA image. glslc comes from conda-forge and the loader from the
+  # system package manager instead.
+  install_vulkan_loader
+  install_glslc
+  setup_real_gpu_icd
+else
+  install_swiftshader
+  install_vulkan_sdk "${VULKAN_SDK_VERSION}"
+fi
@@ -0,0 +1,37 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# Install glslc (the Vulkan shader compiler) on Windows via conda-forge's
+# shaderc package, and make sure it is on PATH. glslc is the only build-time
+# Vulkan dependency -- the Vulkan headers and the volk loader come from the
+# in-tree submodules -- so this avoids depending on the heavyweight LunarG SDK
+# installer. Requires conda to be available (the callers create/activate an env).
+
+# Treat PowerShell errors as terminating for the rest of this script.
+# NOTE(review): under "Stop" the Write-Error calls below presumably already
+# abort the script, which would make the explicit `exit 1` lines a fallback --
+# confirm.
+$ErrorActionPreference = "Stop"
+
+Write-Host "Installing shaderc (provides glslc) from conda-forge..."
+conda install -y -c conda-forge shaderc
+# conda's outcome is checked through its exit code right after the call.
+if ($LASTEXITCODE -ne 0) {
+    Write-Error "Failed to install shaderc from conda-forge (exit ${LASTEXITCODE})"
+    exit 1
+}
+
+# Resolve glslc through PATH; SilentlyContinue suppresses the lookup error so a
+# missing binary can be reported with the message below instead.
+$glslc = Get-Command glslc -ErrorAction SilentlyContinue
+if (-not $glslc) {
+    Write-Error "glslc not found on PATH after installing shaderc"
+    exit 1
+}
+
+# Expose glslc to the current process and, when running as a GitHub Actions step,
+# to subsequent steps.
+$glslcDir = Split-Path -Parent $glslc.Source
+$env:PATH = "${glslcDir};${env:PATH}"
+# GITHUB_PATH is only consulted when set, so the script also works outside
+# GitHub Actions.
+if ($env:GITHUB_PATH) {
+    Add-Content -Path $env:GITHUB_PATH -Value $glslcDir
+}
+
+# Report the resolved location and print the compiler version as a final check
+# that the binary actually runs.
+Write-Host "glslc available at $($glslc.Source)"
+& glslc --version
@@ -0,0 +1,51 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# Build-validation for the Vulkan backend under MSVC on Windows. Mirrors
+# setup-windows-msvc.ps1 but installs glslc (the Vulkan shader compiler) and
+# configures/builds the vulkan_backend target. This is a bring-up job: it exists
+# to surface MSVC portability issues in the Vulkan/volk/VMA code, so it may need
+# iteration.
+
+# Create and activate a dedicated conda env named "et" with Python 3.12.
+# NOTE(review): the exit codes of the conda and pip commands in this script are
+# not checked; only the two cmake invocations test $LASTEXITCODE.
+conda create --yes --quiet -n et python=3.12
+conda activate et
+
+# Install cmake
+conda install -y cmake
+
+# Activate the VS environment - this is required for MSVC to work.
+& "C:\Program Files (x86)\Microsoft Visual Studio\2022\BuildTools\Common7\Tools\Launch-VsDevShell.ps1" -Arch amd64
+
+# Install glslc (via conda-forge shaderc) and put it on PATH in this process.
+.ci/scripts/setup-vulkan-windows-deps.ps1
+
+# Install CI requirements
+pip install -r .ci/docker/requirements-ci.txt
+
+# Start from an empty build directory so stale artifacts from a previous run
+# are not reused.
+$buildDir = "cmake-out-vulkan"
+if (Test-Path -Path $buildDir) {
+    Remove-Item -Path $buildDir -Recurse -Force
+}
+New-Item -Path $buildDir -ItemType Directory
+
+# Configure a Release build with the Vulkan backend enabled.
+cmake -S . -B $buildDir `
+    -DCMAKE_BUILD_TYPE=Release `
+    -DEXECUTORCH_BUILD_VULKAN=ON `
+    -DPYTHON_EXECUTABLE=python
+
+# Propagate cmake's own exit code so the CI step fails with it.
+if ($LASTEXITCODE -ne 0) {
+    Write-Host "CMake configuration failed. Exit code: $LASTEXITCODE."
+    exit $LASTEXITCODE
+}
+
+# Build only the vulkan_backend target, with 16 parallel jobs.
+cmake --build $buildDir --config Release --target vulkan_backend -j16
+
+if ($LASTEXITCODE -ne 0) {
+    Write-Host "Vulkan backend MSVC build failed. Exit code: $LASTEXITCODE."
+    exit $LASTEXITCODE
+}
+
+Write-Host "Vulkan backend MSVC build completed successfully!"
@@ -51,8 +51,15 @@ if [[ "$FLOW" == *qnn* ]]; then
 fi
 
 if [[ "$FLOW" == *vulkan* ]]; then
-    # Setup swiftshader and Vulkan SDK which are required to build the Vulkan delegate.
-    source .ci/scripts/setup-vulkan-linux-deps.sh
+    # Setup the Vulkan SDK and select an ICD: use the real system GPU ICD when one
+    # is present (real-GPU runner), otherwise fall back to SwiftShader (CPU
+    # runner). The Vulkan loader searches both standard ICD directories, so a
+    # manifest in either one counts. Each glob is probed on its own with
+    # compgen -G: a single `ls dirA/*.json dirB/*.json` exits non-zero as soon
+    # as one of the patterns has no match, which would ignore a manifest that
+    # exists in only one of the two directories.
+    if compgen -G "/etc/vulkan/icd.d/*.json" >/dev/null ||
+        compgen -G "/usr/share/vulkan/icd.d/*.json" >/dev/null; then
+        source .ci/scripts/setup-vulkan-linux-deps.sh "real-gpu"
+    else
+        source .ci/scripts/setup-vulkan-linux-deps.sh "swiftshader"
+    fi
 
     EXTRA_BUILD_ARGS+=" -DEXECUTORCH_BUILD_VULKAN=ON"
 fi
 
@@ -447,4 +447,105 @@ case "$MODEL_NAME" in
 esac
 echo "::endgroup::"
 
+# Serving smoke test, run only for the CUDA qwen3_5_moe configuration: launch
+# the model's OpenAI-style server in the background, probe it over HTTP from a
+# Python client, and fail the script when the probe fails.
+if [ "$DEVICE" = "cuda" ] && [ "$MODEL_NAME" = "qwen3_5_moe" ]; then
+  echo "::group::Run $MODEL_NAME OpenAI serving smoke"
+  # Server requirements plus a pinned transformers, then an editable install of
+  # this checkout without dependency resolution or build isolation.
+  pip install -r examples/llm_server/python/requirements.txt "transformers==5.0.0rc1"
+  python -m pip install --no-deps --no-build-isolation --editable . -v
+
+  # Pick a free loopback port by binding to port 0 and printing the port the OS
+  # assigned. The probing socket is closed again before the server starts.
+  PORT=$(python - <<'PY'
+import socket
+
+with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+    s.bind(("127.0.0.1", 0))
+    print(s.getsockname()[1])
+PY
+)
+  # Server stdout and stderr are collected in a temp file so they can be dumped
+  # on failure.
+  SERVER_LOG=$(mktemp)
+  WORKER_BIN="cmake-out/examples/models/qwen3_5_moe/qwen3_5_moe_worker"
+  # Start the server in the background and remember its PID for cleanup.
+  python -u -m executorch.examples.models.qwen3_5_moe.serve \
+    --model-path "${MODEL_DIR}/model.pte" \
+    --data-path "${MODEL_DIR}/aoti_cuda_blob.ptd" \
+    --tokenizer-path "${MODEL_DIR}/tokenizer.json" \
+    --hf-tokenizer "${MODEL_DIR}" \
+    --model-id qwen3.5-moe \
+    --max-context 4096 \
+    --max-sessions 2 \
+    --no-think \
+    --worker-bin "$WORKER_BIN" \
+    --host 127.0.0.1 \
+    --port "$PORT" >"$SERVER_LOG" 2>&1 &
+  SERVER_PID=$!
+
+  # Terminate the server if it is still alive, reap it, and remove the log file.
+  cleanup_qwen_server() {
+    if kill -0 "$SERVER_PID" 2>/dev/null; then
+      kill "$SERVER_PID" 2>/dev/null || true
+      wait "$SERVER_PID" 2>/dev/null || true
+    fi
+    rm -f "$SERVER_LOG"
+  }
+  # Run the cleanup on every exit path, including the `exit 1` further down.
+  trap cleanup_qwen_server EXIT
+
+  # Client probe. The heredoc delimiter is quoted, so the Python source is passed
+  # through without shell expansion; the port and log path arrive as argv. The
+  # client polls /health up to 180 times with a 1 s pause between attempts,
+  # checks that /v1/models lists qwen3.5-moe, then requires "Paris" in the
+  # answer to a temperature-0 chat completion.
+  if ! python - "$PORT" "$SERVER_LOG" <<'PY'
+import json
+import sys
+import time
+import urllib.request
+
+port = sys.argv[1]
+log_path = sys.argv[2]
+base = f"http://127.0.0.1:{port}"
+
+
+def request(path, payload=None):
+    data = None
+    headers = {}
+    if payload is not None:
+        data = json.dumps(payload).encode("utf-8")
+        headers["Content-Type"] = "application/json"
+    req = urllib.request.Request(base + path, data=data, headers=headers)
+    with urllib.request.urlopen(req, timeout=120) as resp:
+        return json.loads(resp.read().decode("utf-8"))
+
+
+last = None
+for _ in range(180):
+    try:
+        request("/health")
+        break
+    except Exception as e:
+        last = e
+        time.sleep(1)
+else:
+    print(open(log_path, encoding="utf-8", errors="replace").read())
+    raise RuntimeError(f"server did not become healthy: {last}")
+
+models = request("/v1/models")
+ids = {m["id"] for m in models["data"]}
+if "qwen3.5-moe" not in ids:
+    raise AssertionError(f"qwen3.5-moe missing from /v1/models: {ids}")
+
+body = {
+    "model": "qwen3.5-moe",
+    "messages": [{"role": "user", "content": "What is the capital of France?"}],
+    "max_tokens": 32,
+    "temperature": 0,
+}
+resp = request("/v1/chat/completions", body)
+content = resp["choices"][0]["message"].get("content") or ""
+if "Paris" not in content:
+    raise AssertionError(f"expected Paris in serving response, got: {content!r}")
+
+print("Qwen3.5-MoE serving smoke passed")
+PY
+  then
+    # Dump the server log before exiting; the EXIT trap then stops the server.
+    echo "Qwen3.5-MoE serving smoke failed; server log:"
+    cat "$SERVER_LOG"
+    exit 1
+  fi
+
+  # Success: clean up now and drop the EXIT trap so it does not run again.
+  cleanup_qwen_server
+  trap - EXIT
+  echo "::endgroup::"
+fi
+
 popd