Skip to content

Commit c9ef423

Browse files
authored
qwen3_5_moe: add OpenAI serving entrypoint (#20313)
Add a model-specific OpenAI-compatible serving launcher for Qwen3.5-MoE. The Python process stays as the control plane for HTTP, chat templating, request validation, session affinity, and Qwen tool parsing; model execution stays in the C++ qwen3_5_moe_worker process through the generic examples/llm_server JSONL protocol. This keeps Qwen-specific serving glue in examples/models/qwen3_5_moe while reusing the generic server runtime. It also keeps the existing C++ runner path intact: the serving entrypoint is a wrapper around the worker/engine path, not a replacement for main.cpp. Add CUDA e2e serving smoke coverage for the Qwen artifact job. The test-model-cuda-e2e job runs in a fresh environment after downloading exported artifacts, so install ExecuTorch in editable mode before invoking python -m executorch.examples.models.qwen3_5_moe.serve. Validation: - python -m pytest -q examples/models/qwen3_5_moe/test_serve.py: 8 passed - python -m py_compile examples/models/qwen3_5_moe/serve.py examples/models/qwen3_5_moe/test_serve.py - bash -n .ci/scripts/test_model_e2e.sh - Qwen BFCL serving slice with Pi-style session-affinity headers: 50/56 generated-slice pass rate (89.29%); parallel, parallel_multiple, and irrelevance categories passed 100%; live_multiple was the weakest slice at 4/6. - Pi integration uses the OpenAI-compatible endpoint plus session_id / x-session-affinity headers. Subagent fanout must run with enough --max-sessions for the concurrent named sessions; otherwise the expected behavior is a 429 capacity_exhausted response instead of silently duplicating model weights.
1 parent 4557df5 commit c9ef423

10 files changed

Lines changed: 576 additions & 5 deletions

File tree

.ci/scripts/export_model_artifact.sh

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -422,8 +422,9 @@ if [ "$MODEL_NAME" = "qwen3_5_moe" ]; then
422422
--no-compile
423423
echo "::endgroup::"
424424

425-
# Copy tokenizer for the runner
425+
# Copy tokenizer files for the runner and model-specific serving launcher.
426426
cp "$LOCAL_MODEL_DIR/tokenizer.json" "${OUTPUT_DIR}/tokenizer.json"
427+
cp "$LOCAL_MODEL_DIR/tokenizer_config.json" "${OUTPUT_DIR}/tokenizer_config.json"
427428

428429
# Export to .pte/.ptd (short cache dir avoids objcopy symbol length issues)
429430
echo "::group::Export"

.ci/scripts/test_model_e2e.sh

Lines changed: 101 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -447,4 +447,105 @@ case "$MODEL_NAME" in
447447
esac
448448
echo "::endgroup::"
449449

450+
if [ "$DEVICE" = "cuda" ] && [ "$MODEL_NAME" = "qwen3_5_moe" ]; then
  # End-to-end smoke test for the Qwen3.5-MoE OpenAI-compatible server:
  # launch the serving entrypoint in the background, wait for /health, then
  # check /v1/models and one /v1/chat/completions round trip.
  echo "::group::Run $MODEL_NAME OpenAI serving smoke"
  # Use `python -m pip` for both installs so the packages land in the same
  # interpreter that runs the server below.
  python -m pip install -r examples/llm_server/python/requirements.txt "transformers==5.0.0rc1"
  # Editable install so `python -m executorch.examples...` resolves in this job.
  python -m pip install --no-deps --no-build-isolation --editable . -v

  # Ask the kernel for a free ephemeral port. The probe socket is closed before
  # the server binds, so a collision is possible but unlikely.
  PORT=$(python - <<'PY'
import socket

with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
    s.bind(("127.0.0.1", 0))
    print(s.getsockname()[1])
PY
  )
  SERVER_LOG=$(mktemp)
  WORKER_BIN="cmake-out/examples/models/qwen3_5_moe/qwen3_5_moe_worker"
  python -u -m executorch.examples.models.qwen3_5_moe.serve \
    --model-path "${MODEL_DIR}/model.pte" \
    --data-path "${MODEL_DIR}/aoti_cuda_blob.ptd" \
    --tokenizer-path "${MODEL_DIR}/tokenizer.json" \
    --hf-tokenizer "${MODEL_DIR}" \
    --model-id qwen3.5-moe \
    --max-context 4096 \
    --max-sessions 2 \
    --no-think \
    --worker-bin "$WORKER_BIN" \
    --host 127.0.0.1 \
    --port "$PORT" >"$SERVER_LOG" 2>&1 &
  SERVER_PID=$!

  # Stop the background server (if it is still running) and remove its log.
  # Installed as an EXIT trap so a failing step cannot leak the server process.
  cleanup_qwen_server() {
    if kill -0 "$SERVER_PID" 2>/dev/null; then
      kill "$SERVER_PID" 2>/dev/null || true
      wait "$SERVER_PID" 2>/dev/null || true
    fi
    rm -f "$SERVER_LOG"
  }
  trap cleanup_qwen_server EXIT

  # The client only reports pass/fail; on failure the server log is printed
  # once, by the shell branch below.
  if ! python - "$PORT" <<'PY'
import json
import sys
import time
import urllib.request

port = sys.argv[1]
base = f"http://127.0.0.1:{port}"


def request(path, payload=None, timeout=120):
    """GET `path` (or POST `payload` as JSON) and return the decoded JSON body."""
    data = None
    headers = {}
    if payload is not None:
        data = json.dumps(payload).encode("utf-8")
        headers["Content-Type"] = "application/json"
    req = urllib.request.Request(base + path, data=data, headers=headers)
    with urllib.request.urlopen(req, timeout=timeout) as resp:
        return json.loads(resp.read().decode("utf-8"))


# Poll /health up to 180 times. Each probe uses a short timeout so that a
# server which accepts connections but never answers cannot stretch the wait
# to 180 x 120 seconds.
last = None
for _ in range(180):
    try:
        request("/health", timeout=5)
        break
    except Exception as e:
        last = e
        time.sleep(1)
else:
    raise RuntimeError(f"server did not become healthy: {last}")

models = request("/v1/models")
ids = {m["id"] for m in models["data"]}
if "qwen3.5-moe" not in ids:
    raise AssertionError(f"qwen3.5-moe missing from /v1/models: {ids}")

body = {
    "model": "qwen3.5-moe",
    "messages": [{"role": "user", "content": "What is the capital of France?"}],
    "max_tokens": 32,
    "temperature": 0,
}
resp = request("/v1/chat/completions", body)
content = resp["choices"][0]["message"].get("content") or ""
if "Paris" not in content:
    raise AssertionError(f"expected Paris in serving response, got: {content!r}")

print("Qwen3.5-MoE serving smoke passed")
PY
  then
    echo "Qwen3.5-MoE serving smoke failed; server log:"
    cat "$SERVER_LOG"
    exit 1
  fi

  cleanup_qwen_server
  trap - EXIT
  echo "::endgroup::"
fi
550+
450551
popd

.lintrunner.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -391,6 +391,7 @@ exclude_patterns = [
391391
'**/*.gif',
392392
'extension/llm/tokenizers',
393393
'extension/llm/tokenizers/**',
394+
'examples/llm_server',
394395
'backends/cadence/utils/FACTO',
395396
'examples/cuda',
396397
'examples/qualcomm',

examples/models/qwen3_5_moe/CMakeLists.txt

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,9 +69,17 @@ target_include_directories(
6969
)
7070
target_link_libraries(qwen3_5_moe_runner PUBLIC ${link_libraries})
7171

72+
# Worker executable for the OpenAI-compatible serving path: the worker entry
# point (qwen35_moe_worker.cpp) plus the Qwen3.5 MoE engine. It links against
# the same ${link_libraries} as qwen3_5_moe_runner.
add_executable(qwen3_5_moe_worker qwen35_moe_worker.cpp qwen35_moe_engine.cpp)
73+
target_include_directories(
74+
qwen3_5_moe_worker PUBLIC ${_common_include_directories} ${_json_include}
75+
)
76+
target_link_libraries(qwen3_5_moe_worker PUBLIC ${link_libraries})
77+
7278
# Non-Debug builds: both executables get the project's gc-sections link helper
# (presumably linker section garbage collection -- confirm in the helper's
# definition) and have their symbols stripped via the linker's -s option.
if(NOT CMAKE_BUILD_TYPE STREQUAL "Debug")
7379
target_link_options_gc_sections(qwen3_5_moe_runner)
7480
target_link_options(qwen3_5_moe_runner PRIVATE "LINKER:-s")
81+
target_link_options_gc_sections(qwen3_5_moe_worker)
82+
target_link_options(qwen3_5_moe_worker PRIVATE "LINKER:-s")
7583
endif()
7684

7785
if(EXECUTORCH_BUILD_CUDA)

examples/models/qwen3_5_moe/CMakePresets.json

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -41,15 +41,19 @@
4141
"buildPresets": [
4242
{
4343
"name": "qwen3-5-moe-cuda",
44-
"displayName": "Build Qwen3.5 MoE runner + no-bleed test (CUDA)",
44+
"displayName": "Build Qwen3.5 MoE runner, worker, and no-bleed test (CUDA)",
4545
"configurePreset": "qwen3-5-moe-cuda",
46-
"targets": ["qwen3_5_moe_runner", "test_qwen35_moe_nobleed"]
46+
"targets": [
47+
"qwen3_5_moe_runner",
48+
"qwen3_5_moe_worker",
49+
"test_qwen35_moe_nobleed"
50+
]
4751
},
4852
{
4953
"name": "qwen3-5-moe-metal",
50-
"displayName": "Build Qwen3.5 MoE runner (Metal)",
54+
"displayName": "Build Qwen3.5 MoE runner and worker (Metal)",
5155
"configurePreset": "qwen3-5-moe-metal",
52-
"targets": ["qwen3_5_moe_runner"]
56+
"targets": ["qwen3_5_moe_runner", "qwen3_5_moe_worker"]
5357
}
5458
],
5559
"workflowPresets": [

examples/models/qwen3_5_moe/README.md

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -147,6 +147,56 @@ cmake-out/examples/models/qwen3_5_moe/qwen3_5_moe_runner \
147147
`--cuda_graph` is intentionally single-session only. CUDA graph replay captures
148148
device pointers, so it is not combined with per-session mutable-state rebinding.
149149

150+
## OpenAI-compatible serving
151+
152+
The CUDA build also produces `qwen3_5_moe_worker`, a C++ model-execution worker
153+
used by the generic `examples/llm_server` control plane. The Qwen launcher wires
154+
in the model's Hugging Face chat template and Qwen XML tool-call parser:
155+
156+
```bash
157+
python -m executorch.examples.models.qwen3_5_moe.serve \
158+
--model-path qwen35_moe_exports/model.pte \
159+
--data-path qwen35_moe_exports/aoti_cuda_blob.ptd \
160+
--tokenizer-path ~/models/Qwen3.5-35B-A3B/tokenizer.json \
161+
--hf-tokenizer ~/models/Qwen3.5-35B-A3B \
162+
--model-id qwen3.5-moe \
163+
--max-context 4096 \
164+
--max-sessions 4 \
165+
--no-think
166+
```
167+
168+
`--max-sessions` controls how many isolated sessions the worker can host on one
169+
weight load. One slot is reserved for anonymous requests; clients should send a
170+
stable `session_id` (or session-affinity header) to get per-conversation
171+
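isolation and warm append-only resume. If more named sessions are active at once than `--max-sessions` allows, the server responds with HTTP 429 `capacity_exhausted` rather than loading another copy of the weights.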
172+
173+
### Use from pi
174+
175+
Point pi at the Qwen server via `~/.pi/agent/models.json`:
176+
177+
```json
178+
{
179+
"providers": {
180+
"executorch": {
181+
"baseUrl": "http://127.0.0.1:8000/v1",
182+
"api": "openai-completions",
183+
"apiKey": "x",
184+
"models": [
185+
{
186+
"id": "qwen3.5-moe",
187+
"compat": { "sendSessionAffinityHeaders": true }
188+
}
189+
]
190+
}
191+
}
192+
}
193+
```
194+
195+
The model id must match `--model-id`. `sendSessionAffinityHeaders` lets pi route
196+
each conversation or subagent to a stable server session; without it, requests
197+
use the anonymous scratch session and do not get per-conversation isolation or
198+
warm resume.
199+
150200
### CUDA no-bleed test
151201

152202
The CUDA build also produces `test_qwen35_moe_nobleed`, which validates that two
Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
/*
2+
* Copyright (c) Meta Platforms, Inc. and affiliates.
3+
* All rights reserved.
4+
*
5+
* This source code is licensed under the BSD-style license found in the
6+
* LICENSE file in the root directory of this source tree.
7+
*/
8+
9+
// Model-execution worker for Qwen3.5 MoE (CUDA/AOTI).
10+
//
11+
// The Python OpenAI control plane spawns this process and drives it over the
12+
// generic examples/llm_server JSONL worker protocol. This file is intentionally
13+
// model-specific only where it constructs Qwen35MoEEngine.
14+
15+
#include <gflags/gflags.h>
16+
17+
#include <executorch/examples/llm_server/cpp/worker_loop.h>
18+
#include <executorch/examples/models/qwen3_5_moe/qwen35_moe_engine.h>
19+
#include <executorch/runtime/platform/log.h>
20+
21+
#include <cstdint>
22+
23+
// Command-line flags for the worker process. main() rejects an empty
// --model_path or --tokenizer_path; --data_path is copied into the engine
// config as given (it may be empty).
DEFINE_string(model_path, "", "Model .pte file path.");
24+
DEFINE_string(tokenizer_path, "", "HuggingFace tokenizer.json path.");
25+
DEFINE_string(data_path, "", "Data file (.ptd) for the CUDA backend.");
26+
// NOTE(review): the clamp described in the help text is not performed in this
// file; main() forwards the value unchanged, so presumably the engine applies
// it -- confirm against Qwen35MoEEngine::create.
DEFINE_int32(
27+
max_sessions,
28+
1,
29+
"Max physical sessions to host on one weight allocation. Clamped to 1 if "
30+
"the backend cannot isolate per-session mutable state.");
31+
// Passed straight through to run_worker_stdio_loop as its last argument.
DEFINE_bool(
32+
warm_resume,
33+
true,
34+
"Warm append-only resume for named sessions. Off resets every request.");
35+
36+
// File-local shorthand for the ExecuTorch LLM extension namespace and the
// runtime Error type used by main().
namespace {
37+
namespace llm = ::executorch::extension::llm;
38+
using ::executorch::runtime::Error;
39+
} // namespace
40+
41+
int main(int argc, char** argv) {
42+
gflags::ParseCommandLineFlags(&argc, &argv, true);
43+
44+
if (FLAGS_model_path.empty() || FLAGS_tokenizer_path.empty()) {
45+
ET_LOG(
46+
Error, "qwen35_moe_worker: --model_path and --tokenizer_path required");
47+
return 1;
48+
}
49+
50+
llm::Qwen35MoEConfig config;
51+
config.model_path = FLAGS_model_path;
52+
config.data_path = FLAGS_data_path;
53+
config.tokenizer_path = FLAGS_tokenizer_path;
54+
config.max_sessions = FLAGS_max_sessions;
55+
56+
auto engine_result = llm::Qwen35MoEEngine::create(config);
57+
if (engine_result.error() != Error::Ok) {
58+
ET_LOG(Error, "qwen35_moe_worker: failed to create engine");
59+
return 1;
60+
}
61+
auto engine = std::move(engine_result.get());
62+
63+
return llm::run_worker_stdio_loop(
64+
*engine, *engine->tokenizer(), engine->metadata(), FLAGS_warm_resume);
65+
}

0 commit comments

Comments
 (0)