qwen3_5_moe: add OpenAI serving entrypoint (#20313)

mergennachin · web-flow · commit c9ef42303625 · 2026-06-18T14:16:13.000-04:00
Add a model-specific OpenAI-compatible serving launcher for Qwen3.5-MoE.
The Python process stays as the control plane for HTTP, chat templating,
request validation, session affinity, and Qwen tool parsing; model
execution stays in the C++ qwen3_5_moe_worker process through the
generic examples/llm_server JSONL protocol.

This keeps Qwen-specific serving glue in examples/models/qwen3_5_moe
while reusing the generic server runtime. It also keeps the existing C++
runner path intact: the serving entrypoint is a wrapper around the
worker/engine path, not a replacement for main.cpp.

Add CUDA e2e serving smoke coverage for the Qwen artifact job. The
test-model-cuda-e2e job runs in a fresh environment after downloading
exported artifacts, so install ExecuTorch in editable mode before
invoking python -m executorch.examples.models.qwen3_5_moe.serve.

Validation:

- python -m pytest -q examples/models/qwen3_5_moe/test_serve.py: 8
passed

- python -m py_compile examples/models/qwen3_5_moe/serve.py
examples/models/qwen3_5_moe/test_serve.py

- bash -n .ci/scripts/test_model_e2e.sh

- Qwen BFCL serving slice with Pi-style session-affinity headers: 50/56
generated-slice pass rate (89.29%); parallel, parallel_multiple, and
irrelevance categories passed 100%; live_multiple was the weakest slice
at 4/6.

- Pi integration uses the OpenAI-compatible endpoint plus session_id /
x-session-affinity headers. Subagent fanout must run with enough
--max-sessions for the concurrent named sessions; otherwise the expected
behavior is a 429 capacity_exhausted response instead of silently
duplicating model weights.
diff --git a/.ci/scripts/export_model_artifact.sh b/.ci/scripts/export_model_artifact.sh
@@ -422,8 +422,9 @@ if [ "$MODEL_NAME" = "qwen3_5_moe" ]; then
       --no-compile
   echo "::endgroup::"
 
-  # Copy tokenizer for the runner
+  # Copy tokenizer files for the runner and model-specific serving launcher.
   cp "$LOCAL_MODEL_DIR/tokenizer.json" "${OUTPUT_DIR}/tokenizer.json"
+  cp "$LOCAL_MODEL_DIR/tokenizer_config.json" "${OUTPUT_DIR}/tokenizer_config.json"
 
   # Export to .pte/.ptd (short cache dir avoids objcopy symbol length issues)
   echo "::group::Export"
diff --git a/.ci/scripts/test_model_e2e.sh b/.ci/scripts/test_model_e2e.sh
@@ -447,4 +447,121 @@ case "$MODEL_NAME" in
 esac
 echo "::endgroup::"
 
+# OpenAI serving smoke (CUDA + qwen3_5_moe only): launch the Python control
+# plane against the exported artifacts, wait for /health, then check /v1/models
+# and one chat completion. On failure the server log is printed once, by the
+# shell branch below.
+if [ "$DEVICE" = "cuda" ] && [ "$MODEL_NAME" = "qwen3_5_moe" ]; then
+  echo "::group::Run $MODEL_NAME OpenAI serving smoke"
+  # Use "python -m pip" for both installs so they target the same interpreter
+  # that launches the server below.
+  python -m pip install -r examples/llm_server/python/requirements.txt "transformers==5.0.0rc1"
+  # This job runs in a fresh environment, so install ExecuTorch (editable)
+  # before invoking python -m executorch.examples.models.qwen3_5_moe.serve.
+  python -m pip install --no-deps --no-build-isolation --editable . -v
+
+  # Pick a free loopback port. The probe socket is closed before the server
+  # binds, so this is best-effort rather than a reservation.
+  PORT=$(python - <<'PY'
+import socket
+
+with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+    s.bind(("127.0.0.1", 0))
+    print(s.getsockname()[1])
+PY
+)
+  SERVER_LOG=$(mktemp)
+  WORKER_BIN="cmake-out/examples/models/qwen3_5_moe/qwen3_5_moe_worker"
+  python -u -m executorch.examples.models.qwen3_5_moe.serve \
+    --model-path "${MODEL_DIR}/model.pte" \
+    --data-path "${MODEL_DIR}/aoti_cuda_blob.ptd" \
+    --tokenizer-path "${MODEL_DIR}/tokenizer.json" \
+    --hf-tokenizer "${MODEL_DIR}" \
+    --model-id qwen3.5-moe \
+    --max-context 4096 \
+    --max-sessions 2 \
+    --no-think \
+    --worker-bin "$WORKER_BIN" \
+    --host 127.0.0.1 \
+    --port "$PORT" >"$SERVER_LOG" 2>&1 &
+  SERVER_PID=$!
+
+  # Stop the background server (if it is still running) and remove its log.
+  cleanup_qwen_server() {
+    if kill -0 "$SERVER_PID" 2>/dev/null; then
+      kill "$SERVER_PID" 2>/dev/null || true
+      wait "$SERVER_PID" 2>/dev/null || true
+    fi
+    rm -f "$SERVER_LOG"
+  }
+  trap cleanup_qwen_server EXIT
+
+  # Any exception in the probe exits non-zero and lands in the "then" branch.
+  if ! python - "$PORT" <<'PY'
+import json
+import sys
+import time
+import urllib.request
+
+port = sys.argv[1]
+base = f"http://127.0.0.1:{port}"
+
+
+def request(path, payload=None, timeout=120):
+    """GET `path`, or POST `payload` as JSON when given; return the decoded JSON body."""
+    data = None
+    headers = {}
+    if payload is not None:
+        data = json.dumps(payload).encode("utf-8")
+        headers["Content-Type"] = "application/json"
+    req = urllib.request.Request(base + path, data=data, headers=headers)
+    with urllib.request.urlopen(req, timeout=timeout) as resp:
+        return json.loads(resp.read().decode("utf-8"))
+
+
+# Poll /health up to 180 times, one second apart. Probes use a short timeout so
+# a server that accepts the connection but never answers cannot hold every
+# attempt for the full 120 s request timeout.
+last = None
+for _ in range(180):
+    try:
+        request("/health", timeout=10)
+        break
+    except Exception as e:
+        last = e
+        time.sleep(1)
+else:
+    # No log dump here: the calling shell prints the server log on failure.
+    raise RuntimeError(f"server did not become healthy: {last}")
+
+models = request("/v1/models")
+ids = {m["id"] for m in models["data"]}
+if "qwen3.5-moe" not in ids:
+    raise AssertionError(f"qwen3.5-moe missing from /v1/models: {ids}")
+
+# One short chat completion; the reply only has to mention Paris.
+body = {
+    "model": "qwen3.5-moe",
+    "messages": [{"role": "user", "content": "What is the capital of France?"}],
+    "max_tokens": 32,
+    "temperature": 0,
+}
+resp = request("/v1/chat/completions", body)
+content = resp["choices"][0]["message"].get("content") or ""
+if "Paris" not in content:
+    raise AssertionError(f"expected Paris in serving response, got: {content!r}")
+
+print("Qwen3.5-MoE serving smoke passed")
+PY
+  then
+    echo "Qwen3.5-MoE serving smoke failed; server log:"
+    cat "$SERVER_LOG"
+    exit 1
+  fi
+
+  cleanup_qwen_server
+  trap - EXIT
+  echo "::endgroup::"
+fi
+
 popd
diff --git a/.lintrunner.toml b/.lintrunner.toml
@@ -391,6 +391,7 @@ exclude_patterns = [
     '**/*.gif',
     'extension/llm/tokenizers',
     'extension/llm/tokenizers/**',
+    'examples/llm_server',
     'backends/cadence/utils/FACTO',
     'examples/cuda',
     'examples/qualcomm',
diff --git a/examples/models/qwen3_5_moe/CMakeLists.txt b/examples/models/qwen3_5_moe/CMakeLists.txt
@@ -69,9 +69,17 @@ target_include_directories(
 )
 target_link_libraries(qwen3_5_moe_runner PUBLIC ${link_libraries})
 
+add_executable(qwen3_5_moe_worker qwen35_moe_worker.cpp qwen35_moe_engine.cpp)
+target_include_directories(
+  qwen3_5_moe_worker PUBLIC ${_common_include_directories} ${_json_include}
+)
+target_link_libraries(qwen3_5_moe_worker PUBLIC ${link_libraries})
+
 if(NOT CMAKE_BUILD_TYPE STREQUAL "Debug")
   target_link_options_gc_sections(qwen3_5_moe_runner)
   target_link_options(qwen3_5_moe_runner PRIVATE "LINKER:-s")
+  target_link_options_gc_sections(qwen3_5_moe_worker)
+  target_link_options(qwen3_5_moe_worker PRIVATE "LINKER:-s")
 endif()
 
 if(EXECUTORCH_BUILD_CUDA)
diff --git a/examples/models/qwen3_5_moe/CMakePresets.json b/examples/models/qwen3_5_moe/CMakePresets.json
@@ -41,15 +41,19 @@
     "buildPresets": [
         {
             "name": "qwen3-5-moe-cuda",
-            "displayName": "Build Qwen3.5 MoE runner + no-bleed test (CUDA)",
+            "displayName": "Build Qwen3.5 MoE runner, worker, and no-bleed test (CUDA)",
             "configurePreset": "qwen3-5-moe-cuda",
-            "targets": ["qwen3_5_moe_runner", "test_qwen35_moe_nobleed"]
+            "targets": [
+                "qwen3_5_moe_runner",
+                "qwen3_5_moe_worker",
+                "test_qwen35_moe_nobleed"
+            ]
         },
         {
             "name": "qwen3-5-moe-metal",
-            "displayName": "Build Qwen3.5 MoE runner (Metal)",
+            "displayName": "Build Qwen3.5 MoE runner and worker (Metal)",
             "configurePreset": "qwen3-5-moe-metal",
-            "targets": ["qwen3_5_moe_runner"]
+            "targets": ["qwen3_5_moe_runner", "qwen3_5_moe_worker"]
         }
     ],
     "workflowPresets": [
diff --git a/examples/models/qwen3_5_moe/README.md b/examples/models/qwen3_5_moe/README.md
@@ -147,6 +147,56 @@ cmake-out/examples/models/qwen3_5_moe/qwen3_5_moe_runner \
 `--cuda_graph` is intentionally single-session only. CUDA graph replay captures
 device pointers, so it is not combined with per-session mutable-state rebinding.
 
+## OpenAI-compatible serving
+
+The CUDA build also produces `qwen3_5_moe_worker`, a C++ model-execution worker
+used by the generic `examples/llm_server` control plane. The Qwen launcher wires
+in the model's Hugging Face chat template and Qwen XML tool-call parser:
+
+```bash
+python -m executorch.examples.models.qwen3_5_moe.serve \
+    --model-path qwen35_moe_exports/model.pte \
+    --data-path qwen35_moe_exports/aoti_cuda_blob.ptd \
+    --tokenizer-path ~/models/Qwen3.5-35B-A3B/tokenizer.json \
+    --hf-tokenizer ~/models/Qwen3.5-35B-A3B \
+    --model-id qwen3.5-moe \
+    --max-context 4096 \
+    --max-sessions 4 \
+    --no-think
+```
+
+`--max-sessions` controls how many isolated sessions the worker can host on one
+weight load. One slot is reserved for anonymous requests; clients should send a
+stable `session_id` (or an `x-session-affinity` header) to get per-conversation
+isolation and warm append-only resume.
+
+### Use from pi
+
+Point pi at the Qwen server via `~/.pi/agent/models.json`:
+
+```json
+{
+  "providers": {
+    "executorch": {
+      "baseUrl": "http://127.0.0.1:8000/v1",
+      "api": "openai-completions",
+      "apiKey": "x",
+      "models": [
+        {
+          "id": "qwen3.5-moe",
+          "compat": { "sendSessionAffinityHeaders": true }
+        }
+      ]
+    }
+  }
+}
+```
+
+The model id must match `--model-id`. `sendSessionAffinityHeaders` lets pi route
+each conversation or subagent to a stable server session; without it, requests
+use the anonymous scratch session and do not get per-conversation isolation or
+warm resume.
+
 ### CUDA no-bleed test
 
 The CUDA build also produces `test_qwen35_moe_nobleed`, which validates that two
diff --git a/examples/models/qwen3_5_moe/qwen35_moe_worker.cpp b/examples/models/qwen3_5_moe/qwen35_moe_worker.cpp
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+// Model-execution worker for Qwen3.5 MoE (CUDA/AOTI).
+//
+// The Python OpenAI control plane spawns this process and drives it over the
+// generic examples/llm_server JSONL worker protocol. This file is intentionally
+// model-specific only where it constructs Qwen35MoEEngine.
+
+#include <gflags/gflags.h>
+
+#include <executorch/examples/llm_server/cpp/worker_loop.h>
+#include <executorch/examples/models/qwen3_5_moe/qwen35_moe_engine.h>
+#include <executorch/runtime/platform/log.h>
+
+#include <cinttypes>
+#include <cstdint>
+#include <utility>
+
+DEFINE_string(model_path, "", "Model .pte file path.");
+DEFINE_string(tokenizer_path, "", "HuggingFace tokenizer.json path.");
+DEFINE_string(data_path, "", "Data file (.ptd) for the CUDA backend.");
+DEFINE_int32(
+    max_sessions,
+    1,
+    "Max physical sessions to host on one weight allocation. Clamped to 1 if "
+    "the backend cannot isolate per-session mutable state.");
+DEFINE_bool(
+    warm_resume,
+    true,
+    "Warm append-only resume for named sessions. Off resets every request.");
+
+namespace {
+namespace llm = ::executorch::extension::llm;
+using ::executorch::runtime::Error;
+} // namespace
+
+// Entry point: parse flags, build the engine, then hand control to the stdio
+// worker loop and return its result. Returns 1 when a required flag is missing
+// or the engine cannot be created.
+int main(int argc, char** argv) {
+  gflags::ParseCommandLineFlags(&argc, &argv, true);
+
+  if (FLAGS_model_path.empty() || FLAGS_tokenizer_path.empty()) {
+    ET_LOG(
+        Error, "qwen35_moe_worker: --model_path and --tokenizer_path required");
+    return 1;
+  }
+
+  llm::Qwen35MoEConfig config;
+  config.model_path = FLAGS_model_path;
+  config.data_path = FLAGS_data_path;
+  config.tokenizer_path = FLAGS_tokenizer_path;
+  config.max_sessions = FLAGS_max_sessions;
+
+  auto engine_result = llm::Qwen35MoEEngine::create(config);
+  const Error status = engine_result.error();
+  if (status != Error::Ok) {
+    // Include the runtime error code so engine-creation failures can be told
+    // apart in the logs.
+    ET_LOG(
+        Error,
+        "qwen35_moe_worker: failed to create engine (error 0x%" PRIx32 ")",
+        static_cast<uint32_t>(status));
+    return 1;
+  }
+  auto engine = std::move(engine_result.get());
+
+  return llm::run_worker_stdio_loop(
+      *engine, *engine->tokenizer(), engine->metadata(), FLAGS_warm_resume);
+}
diff --git a/examples/models/qwen3_5_moe/serve.py b/examples/models/qwen3_5_moe/serve.py
diff --git a/examples/models/qwen3_5_moe/test_serve.py b/examples/models/qwen3_5_moe/test_serve.py
diff --git a/src/executorch/examples/llm_server b/src/executorch/examples/llm_server