diff --git a/docs/configuration/server.md b/docs/configuration/server.md index 86b05e955..32053eb59 100644 --- a/docs/configuration/server.md +++ b/docs/configuration/server.md @@ -109,7 +109,10 @@ about. | `--tool-call-parser` | Parser for OpenAI-compatible tool-call payloads (handled by the smg gateway). | | `--enable-custom-logit-processor` | Allow custom logit processors. Keep disabled unless the deployment needs it. | -Common parser values include `kimi_k2` and `gpt-oss`. +Common reasoning parser values include `kimi_k25`, `base`, `qwen3`, and +`deepseek_r1`. Common tool-call parser values include `kimik2`, `qwen`, `json`, +and `passthrough`. The parser names are validated by the SMG gateway, so use +the values accepted by the bundled `tokenspeed-smg` package. ## Speculative Decoding diff --git a/docs/guides/launching.md b/docs/guides/launching.md index ae6b52f8f..6a2d76b02 100644 --- a/docs/guides/launching.md +++ b/docs/guides/launching.md @@ -31,8 +31,8 @@ tokenspeed serve nvidia/Kimi-K2.5-NVFP4 \ --max-num-seqs 256 \ --attention-backend trtllm_mla \ --moe-backend flashinfer_trtllm \ - --reasoning-parser kimi_k2 \ - --tool-call-parser kimi_k2 + --reasoning-parser kimi_k25 \ + --tool-call-parser kimik2 ``` ## Launch Checklist diff --git a/docs/recipes/models.md b/docs/recipes/models.md index b0e648398..57f32a5b6 100644 --- a/docs/recipes/models.md +++ b/docs/recipes/models.md @@ -24,8 +24,8 @@ tokenspeed serve nvidia/Kimi-K2.5-NVFP4 \ --max-num-seqs 256 \ --attention-backend trtllm_mla \ --moe-backend flashinfer_trtllm \ - --reasoning-parser kimi_k2 \ - --tool-call-parser kimi_k2 \ + --reasoning-parser kimi_k25 \ + --tool-call-parser kimik2 \ --host 0.0.0.0 \ --port 8000 ``` @@ -44,8 +44,7 @@ tokenspeed serve openai/gpt-oss-20b \ --tensor-parallel-size 1 \ --max-model-len 131072 \ --chunked-prefill-size 8192 \ - --reasoning-parser gpt-oss \ - --tool-call-parser gpt-oss \ + --reasoning-parser base \ --host 0.0.0.0 \ --port 8000 ``` @@ -58,8 +57,7 @@ tokenspeed serve openai/gpt-oss-120b \ --kv-cache-dtype fp8 \ --chunked-prefill-size 8192 \ --max-num-seqs 256 \ - --reasoning-parser gpt-oss \ - --tool-call-parser gpt-oss \ + --reasoning-parser base \ --host 0.0.0.0 \ --port 8000 ``` diff --git a/python/tokenspeed/runtime/utils/server_args.py b/python/tokenspeed/runtime/utils/server_args.py index e5b1eaf4c..772b4eba6 100755 --- a/python/tokenspeed/runtime/utils/server_args.py +++ b/python/tokenspeed/runtime/utils/server_args.py @@ -1312,7 +1312,7 @@ def add_cli_args(parser: argparse.ArgumentParser): type=str, default=ServerArgs.reasoning_parser, help=( - "Reasoning parser name (e.g. 'minimax', 'gpt-oss'). " + "Reasoning parser name (e.g. 'minimax', 'kimi_k25'). " "Used to defer json_schema grammars past the model's " "reasoning channel." ),