Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
59 changes: 59 additions & 0 deletions .github/configs/nvidia-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8758,6 +8758,65 @@ dsv4-fp4-gb300-dynamo-vllm:
ep: 16
dp-attn: true

# GB300 disaggregated dynamo-vllm benchmark entry for DeepSeek-V4-Pro (fp4).
# Every search-space entry below sets spec-decoding to mtp and selects its
# recipe file through a CONFIG_FILE additional-setting on the prefill side.
dsv4-fp4-gb300-dynamo-vllm-mtp2:
  image: vllm/vllm-openai:v0.21.0-ubuntu2404
  model: deepseek-ai/DeepSeek-V4-Pro
  model-prefix: dsv4
  runner: gb300-nv
  precision: fp4
  framework: dynamo-vllm
  multinode: true
  disagg: true
  scenarios:
    fixed-seq-len:
    - isl: 8192
      osl: 1024
      search-space:
      # 1 prefill worker (tp 4 / ep 4, dp-attn on) feeding 1 decode worker
      # (tp 4 / ep 1, dp-attn off) at concurrencies 1, 4 and 8.
      - conc-list: [1, 4, 8]
        spec-decoding: mtp
        prefill:
          num-worker: 1
          tp: 4
          ep: 4
          dp-attn: true
          additional-settings:
          - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p1d-dep4-tp4-mtp2.yaml"
        decode:
          num-worker: 1
          tp: 4
          ep: 1
          dp-attn: false

      # 1 prefill worker (tp 4 / ep 4) feeding 1 decode worker (tp 8 / ep 8),
      # dp-attn on for both, at concurrencies 64, 128 and 1024.
      - conc-list: [64, 128, 1024]
        spec-decoding: mtp
        prefill:
          num-worker: 1
          tp: 4
          ep: 4
          dp-attn: true
          additional-settings:
          - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb300-1p1d-dep4-dep8-megamoe-mtp2.yaml"
        decode:
          num-worker: 1
          tp: 8
          ep: 8
          dp-attn: true

      # 2 prefill workers (tp 4 / ep 4 each) feeding 1 decode worker
      # (tp 8 / ep 8) at concurrencies 512 and 1024.
      # NOTE(review): 1024 also appears in the previous entry's conc-list;
      # confirm the overlap between the 1p1d and 2p1d sweeps is intended.
      - conc-list: [512, 1024]
        spec-decoding: mtp
        prefill:
          num-worker: 2
          tp: 4
          ep: 4
          dp-attn: true
          additional-settings:
          - "CONFIG_FILE=recipes/vllm/deepseek-v4/8k1k/disagg-gb300-2p1d-dep4-dep8-megamoe-mtp2.yaml"
        decode:
          num-worker: 1
          tp: 8
          ep: 8
          dp-attn: true

dsv4-fp4-gb300-dynamo-sglang:
image: lmsysorg/sglang:nightly-dev-cu13-20260520-425dffbd
model: deepseek-ai/DeepSeek-V4-Pro
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
# Recipe header: identity, container, scheduler limits and node layout for the
# 1-prefill / 1-decode GB300 run (4-GPU prefill worker, 8-GPU decode worker).
name: "svf-vllm-disagg-gb300-1p1d-dep4-dep8-megamoe-mtp2"

model:
  path: "deepseek-v4-pro"
  container: "vllm/vllm-openai:v0.21.0-ubuntu2404"
  precision: "fp4"

dynamo:
  install: true
  # Quoted so the dotted version string is never re-typed by the parser.
  wheel: "1.2.0.dev20260426"

setup_script: vllm-container-deps.sh

slurm:
  # Quoted: an unquoted digits-and-colons value can be read as a number by
  # YAML 1.1 parsers.
  time_limit: "8:00:00"

health_check:
  # 1440 attempts at a 10-second interval.
  max_attempts: 1440
  interval_seconds: 10

resources:
  gpu_type: "gb300"
  gpus_per_node: 4
  # 1 node x 4 GPUs for the single prefill worker;
  # 2 nodes x 4 GPUs = 8 GPUs for the single decode worker.
  prefill_nodes: 1
  decode_nodes: 2
  prefill_workers: 1
  decode_workers: 1
  gpus_per_prefill: 4
  gpus_per_decode: 8

infra:
  etcd_nats_dedicated_node: true

frontend:
  type: dynamo
  enable_multiple_frontends: false

# vLLM backend settings: per-role environment variables followed by the
# per-role vLLM arguments under vllm_config.
backend:
  type: vllm
  connector: null
  # Environment for prefill workers; decode_environment carries the same
  # seven entries with the same values.
  prefill_environment:
    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
    TILELANG_CLEANUP_TEMP_FILES: "1"
    VLLM_USE_NCCL_SYMM_MEM: "1"
    NCCL_CUMEM_ENABLE: "1"
    NCCL_MNNVL_ENABLE: "1"
    NCCL_NVLS_ENABLE: "1"
    TORCH_SYMMMEM: "NVSHMEM"
  decode_environment:
    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
    TILELANG_CLEANUP_TEMP_FILES: "1"
    VLLM_USE_NCCL_SYMM_MEM: "1"
    NCCL_CUMEM_ENABLE: "1"
    NCCL_MNNVL_ENABLE: "1"
    NCCL_NVLS_ENABLE: "1"
    TORCH_SYMMMEM: "NVSHMEM"

  vllm_config:
    # Prefill worker arguments: data-parallel-size 4 with tensor-parallel-size 1
    # and expert parallelism enabled (4 ranks, matching gpus_per_prefill: 4).
    prefill:
      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
      kv-cache-dtype: "fp8"
      tensor-parallel-size: 1
      pipeline-parallel-size: 1
      data-parallel-size: 4
      data-parallel-rpc-port: 13345
      enable-expert-parallel: true
      enable-ep-weight-filter: true
      moe-backend: deep_gemm_mega_moe
      enforce-eager: true
      # Same MTP setting as vllm_config.decode in this recipe, so both ends of
      # the NixlConnector kv_both transfer are launched with the same
      # speculative method. It was previously set on decode only.
      # NOTE(review): if prefill-without-MTP is intentional on GB300, remove
      # this line and document the reason here instead.
      speculative-config: '{"method":"mtp","num_speculative_tokens":2}'
      attention-config: '{"use_fp4_indexer_cache":true}'
      max-model-len: 16384
      max-num-seqs: 256
      max-num-batched-tokens: 16384
      trust-remote-code: true
      no-enable-prefix-caching: true
      no-enable-flashinfer-autotune: true
      no-async-scheduling: true
      block-size: 256
      gpu-memory-utilization: 0.9
      no-disable-hybrid-kv-cache-manager: true
      enable-sleep-mode: true
      tokenizer-mode: deepseek_v4
Comment on lines +59 to +83
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🔴 All three new GB300 disagg MTP recipes set speculative-config: '{"method":"mtp","num_speculative_tokens":2}' only on the decode worker, while every pre-existing DSV4 MTP disagg recipe in this directory (disagg-gb200-low-latency-mtp2.yaml, disagg-gb200-mid-curve-megamoe-mtp2.yaml, disagg-gb200-high-tpt-megamoe-mtp2.yaml) sets it on both prefill and decode. This looks like a copy-paste oversight where the option was added to decode but missed on the prefill side. Affects disagg-gb300-1p1d-dep4-tp4-mtp2.yaml (line 91), disagg-gb300-1p1d-dep4-dep8-megamoe-mtp2.yaml (line 95) and disagg-gb300-2p1d-dep4-dep8-megamoe-mtp2.yaml (line 95) — fix by adding the same speculative-config line to each vllm_config.prefill block (the GB200 recipes place it right after enforce-eager: true).

Extended reasoning...

What the bug is

The three newly added GB300 disagg MTP recipe files use NixlConnector with kv_role: kv_both to transfer the KV cache from prefill workers to decode workers, but only the decode vllm_config block contains speculative-config: '{"method":"mtp","num_speculative_tokens":2}'. The prefill vllm_config block has no MTP setting at all, so the prefill worker boots a model graph without the MTP modules while the decode worker boots one with them.

How the pattern was established

All three pre-existing DSV4 MTP disagg recipes in the same directory consistently set speculative-config in both prefill and decode sections:

  • disagg-gb200-low-latency-mtp2.yaml: lines 78 (prefill) and 99 (decode)
  • disagg-gb200-mid-curve-megamoe-mtp2.yaml: lines 85 (prefill) and 112 (decode)
  • disagg-gb200-high-tpt-megamoe-mtp2.yaml: lines 87 (prefill) and 113 (decode)

The new GB300 files only set it in decode:

  • disagg-gb300-1p1d-dep4-tp4-mtp2.yaml: line 91 (decode only)
  • disagg-gb300-1p1d-dep4-dep8-megamoe-mtp2.yaml: line 95 (decode only)
  • disagg-gb300-2p1d-dep4-dep8-megamoe-mtp2.yaml: line 95 (decode only)

The structural skeleton of the GB300 prefill blocks (same enforce-eager: true, moe-backend: deep_gemm_mega_moe, enable-expert-parallel) is the same as the GB200 MTP recipes — they look exactly like a copy of the existing GB300 non-MTP recipe with MTP added only on the decode side.

Why it matters

For DeepSeek MTP, the speculative module(s) are extra transformer blocks layered on top of the base model. Enabling MTP affects (a) which layers are loaded and (b) the KV-cache layout exposed to the connector. With kv_role: kv_both over NixlConnector the prefill side serializes KV state for the decode side to consume, so the two sides must agree on the model graph and its KV layout. If prefill runs without MTP while decode runs with MTP, the transferred KV blocks either cover fewer layers than decode expects or differ in shape — at best this breaks the benchmark, at worst it silently produces incorrect numbers from a file literally named *-mtp2.yaml. Even in cases where vLLM tolerates the divergence at runtime, this is a benchmark recipe and the resulting numbers would not represent the configuration the filename advertises.

Proof / step-by-step

  1. Take disagg-gb300-1p1d-dep4-dep8-megamoe-mtp2.yaml. The backend.vllm_config.prefill section runs from lines 59–83; speculative-config is absent.
  2. The backend.vllm_config.decode section runs from lines 84–110; line 95 sets speculative-config: '{"method":"mtp","num_speculative_tokens":2}'.
  3. Both sections set kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}' — i.e. prefill→decode KV transfer is active.
  4. Compare to disagg-gb200-high-tpt-megamoe-mtp2.yaml: same disagg-MTP shape, but speculative-config is present on the prefill side (line 87) and decode side (line 113).
  5. The same omission is present at line 91 of disagg-gb300-1p1d-dep4-tp4-mtp2.yaml and line 95 of disagg-gb300-2p1d-dep4-dep8-megamoe-mtp2.yaml. There is no obvious GB300-specific reason for the divergence; the difference is GPU type, not MTP semantics.

How to fix

For each of the three new files, add the line

speculative-config: '{"method":"mtp","num_speculative_tokens":2}'

to the backend.vllm_config.prefill block (matching the convention in the GB200 MTP recipes, where it appears immediately after enforce-eager: true). If the omission is intentional for some GB300-specific reason, a brief comment in the recipe explaining why would prevent future maintainers from "fixing" it.

# Decode-worker vLLM arguments for the 1p1d dep4 -> dep8 megamoe MTP recipe.
# This mapping is the decode sibling of the prefill block and belongs under
# backend.vllm_config; the extraction dropped its indentation.
decode:
# Same connector and role as the prefill side.
kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
kv-cache-dtype: "fp8"
# data-parallel-size 8 with tensor-parallel-size 1 and expert parallelism on.
tensor-parallel-size: 1
pipeline-parallel-size: 1
data-parallel-size: 8
data-parallel-rpc-port: 13345
enable-expert-parallel: true
enable-ep-weight-filter: true
moe-backend: deep_gemm_mega_moe
# MTP speculative decoding with 2 speculative tokens (the "mtp2" in the name).
speculative-config: '{"method":"mtp","num_speculative_tokens":2}'
attention-config: '{"use_fp4_indexer_cache":true}'
max-model-len: 16384
# max-cudagraph-capture-size is set to the same value as max-num-seqs.
max-num-seqs: 512
max-cudagraph-capture-size: 512
max-num-batched-tokens: 1024
trust-remote-code: true
no-enable-prefix-caching: true
no-enable-flashinfer-autotune: true
block-size: 256
# Unlike prefill (enforce-eager: true), decode sets a cudagraph mode here.
compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}'
gpu-memory-utilization: 0.9
stream-interval: 50
no-disable-hybrid-kv-cache-manager: true
enable-sleep-mode: true
tokenizer-mode: deepseek_v4

# sa-bench load settings for the 8192-in / 1024-out scenario.
benchmark:
  type: "sa-bench"
  isl: 8192
  osl: 1024
  # Same three values (64, 128, 1024) as the conc-list of the master-config
  # entry that points at this recipe; presumably an "x"-separated list.
  concurrencies: "64x128x1024"
  req_rate: "inf"
  use_chat_template: true
  custom_tokenizer: "sa_bench_tokenizers.vllm_deepseek_v4.VLLMDeepseekV4Tokenizer"

Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
# Recipe header: identity, container, scheduler limits and node layout for the
# 1-prefill / 1-decode GB300 run with 4-GPU prefill and 4-GPU decode workers.
name: "svf-vllm-disagg-gb300-1p1d-dep4-tp4-mtp2"

model:
  path: "deepseek-v4-pro"
  container: "vllm/vllm-openai:v0.21.0-ubuntu2404"
  precision: "fp4"

dynamo:
  install: true
  # Quoted so the dotted version string is never re-typed by the parser.
  wheel: "1.2.0.dev20260426"

setup_script: vllm-container-deps.sh

slurm:
  # Quoted: an unquoted digits-and-colons value can be read as a number by
  # YAML 1.1 parsers.
  time_limit: "8:00:00"

health_check:
  # 1440 attempts at a 10-second interval.
  max_attempts: 1440
  interval_seconds: 10

resources:
  gpu_type: "gb300"
  gpus_per_node: 4
  # One 4-GPU node per role: a single prefill worker and a single decode worker.
  prefill_nodes: 1
  decode_nodes: 1
  prefill_workers: 1
  decode_workers: 1
  gpus_per_prefill: 4
  gpus_per_decode: 4

infra:
  etcd_nats_dedicated_node: true

frontend:
  type: dynamo
  enable_multiple_frontends: false

# vLLM backend settings: per-role environment variables followed by the
# per-role vLLM arguments under vllm_config.
backend:
  type: vllm
  connector: null
  # Environment for prefill workers; decode_environment carries the same
  # seven entries with the same values.
  prefill_environment:
    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
    TILELANG_CLEANUP_TEMP_FILES: "1"
    VLLM_USE_NCCL_SYMM_MEM: "1"
    NCCL_CUMEM_ENABLE: "1"
    NCCL_MNNVL_ENABLE: "1"
    NCCL_NVLS_ENABLE: "1"
    TORCH_SYMMMEM: "NVSHMEM"
  decode_environment:
    VLLM_ENGINE_READY_TIMEOUT_S: "3600"
    TILELANG_CLEANUP_TEMP_FILES: "1"
    VLLM_USE_NCCL_SYMM_MEM: "1"
    NCCL_CUMEM_ENABLE: "1"
    NCCL_MNNVL_ENABLE: "1"
    NCCL_NVLS_ENABLE: "1"
    TORCH_SYMMMEM: "NVSHMEM"

  vllm_config:
    # Prefill worker arguments: data-parallel-size 4 with tensor-parallel-size 1
    # and expert parallelism enabled (4 ranks, matching gpus_per_prefill: 4).
    prefill:
      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
      kv-cache-dtype: "fp8"
      tensor-parallel-size: 1
      pipeline-parallel-size: 1
      data-parallel-size: 4
      data-parallel-rpc-port: 13345
      enable-expert-parallel: true
      enable-ep-weight-filter: true
      moe-backend: deep_gemm_mega_moe
      enforce-eager: true
      # Same MTP setting as the decode block below, so both ends of the
      # NixlConnector kv_both transfer are launched with the same speculative
      # method. It was previously set on decode only.
      # NOTE(review): if prefill-without-MTP is intentional on GB300, remove
      # this line and document the reason here instead.
      speculative-config: '{"method":"mtp","num_speculative_tokens":2}'
      attention-config: '{"use_fp4_indexer_cache":true}'
      max-model-len: 16384
      max-num-seqs: 256
      max-num-batched-tokens: 16384
      trust-remote-code: true
      no-enable-prefix-caching: true
      no-enable-flashinfer-autotune: true
      no-async-scheduling: true
      block-size: 256
      gpu-memory-utilization: 0.9
      no-disable-hybrid-kv-cache-manager: true
      enable-sleep-mode: true
      tokenizer-mode: deepseek_v4
    # Decode worker arguments: tensor-parallel-size 4 with expert parallelism
    # off (matching gpus_per_decode: 4).
    decode:
      kv-transfer-config: '{"kv_connector": "NixlConnector", "kv_role": "kv_both"}'
      served-model-name: "deepseek-ai/DeepSeek-V4-Pro"
      kv-cache-dtype: "fp8"
      tensor-parallel-size: 4
      pipeline-parallel-size: 1
      enable-expert-parallel: false
      # MTP speculative decoding with 2 speculative tokens (the "mtp2" in the
      # recipe name).
      speculative-config: '{"method":"mtp","num_speculative_tokens":2}'
      attention-config: '{"use_fp4_indexer_cache":true}'
      max-model-len: 16384
      # max-cudagraph-capture-size is set to the same value as max-num-seqs.
      max-num-seqs: 512
      max-cudagraph-capture-size: 512
      max-num-batched-tokens: 1024
      trust-remote-code: true
      no-enable-prefix-caching: true
      no-enable-flashinfer-autotune: true
      block-size: 256
      # Unlike prefill (enforce-eager: true), decode sets a cudagraph mode here.
      compilation-config: '{"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}'
      gpu-memory-utilization: 0.9
      stream-interval: 50
      no-disable-hybrid-kv-cache-manager: true
      enable-sleep-mode: true
      tokenizer-mode: deepseek_v4

# sa-bench load settings for the 8192-in / 1024-out scenario.
benchmark:
  type: "sa-bench"
  isl: 8192
  osl: 1024
  # Same three values (1, 4, 8) as the conc-list of the master-config entry
  # that points at this recipe; presumably an "x"-separated list.
  concurrencies: "1x4x8"
  req_rate: "inf"
  use_chat_template: true
  custom_tokenizer: "sa_bench_tokenizers.vllm_deepseek_v4.VLLMDeepseekV4Tokenizer"

Loading
Loading