Skip to content
26 changes: 11 additions & 15 deletions .github/configs/nvidia-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2000,26 +2000,22 @@ dsr1-fp8-b300-sglang:
- { tp: 8, ep: 1, conc-start: 1, conc-end: 4 }
- { tp: 4, ep: 1, conc-start: 1, conc-end: 32 }

# NOTE: https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4
# lists B200 (not B300) as the Blackwell target. This config reuses the
# B200 Pro FP4 Max-Throughput recipe (DP=8 + DeepEP, no MTP) on B300
# until a B300-specific recipe ships. Prefix caching is disabled.
# Parallelisms and concurrency ranges mirror dsv4-fp4-b200-vllm.
# DeepSeek-V4-Pro on B300 with sglang (non-MTP).
# Uses nightly image with megamoe backend for high-concurrency profiles.
dsv4-fp4-b300-sglang:
image: lmsysorg/sglang:deepseek-v4-b300@sha256:2fec8d7958bb0d53b50d7bf04d6ae6a7de8a35503775826e0550a45dd8c3ee15
image: lmsysorg/sglang:nightly-dev-cu13-20260529-a8cfae0b
model: deepseek-ai/DeepSeek-V4-Pro
model-prefix: dsv4
runner: b300
precision: fp4
framework: sglang
multinode: false
# Three recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4
# are selected inside benchmarks/single_node/dsv4_fp4_b300_sglang.sh by CONC:
# low-latency (CONC <= 32): TP-only
# balanced (32 < CONC <= 128): + DP-attn
# max-throughput (CONC > 128): + DP-attn
# Split so result filenames (ep=, dpa=) accurately reflect the recipe.
# ep is implicit in sglang: --moe-a2a-backend deepep forces ep_size=tp_size,
# Recipes are selected inside benchmarks/single_node/dsv4_fp4_b300_sglang.sh
# by CONC:
# CONC 1 or 32: TP-only, flashinfer_mxfp4
# CONC 512: DP-attn, flashinfer_mxfp4
# CONC 2048, 4096, or 8192: DP-attn, megamoe
# Any other CONC value makes the script exit with an error.
# ep is implicit in sglang: --moe-a2a-backend megamoe forces ep_size=tp_size,
# while the flashinfer_mxfp4 profiles (CONC 1, 32, and 512) leave ep_size at
# the default of 1.
scenarios:
fixed-seq-len:
Expand All @@ -2028,14 +2024,14 @@ dsv4-fp4-b300-sglang:
search-space:
- { tp: 8, ep: 1, conc-start: 1, conc-end: 1 }
- { tp: 4, ep: 1, conc-start: 32, conc-end: 32 }
- { tp: 4, ep: 4, dp-attn: true, conc-start: 512, conc-end: 512 }
- { tp: 4, ep: 1, dp-attn: true, conc-start: 512, conc-end: 512 }
- { tp: 8, ep: 8, dp-attn: true, conc-start: 8192, conc-end: 8192 }
- isl: 8192
osl: 1024
search-space:
- { tp: 8, ep: 1, conc-start: 1, conc-end: 1 }
- { tp: 4, ep: 1, conc-start: 32, conc-end: 32 }
- { tp: 4, ep: 4, dp-attn: true, conc-start: 512, conc-end: 512 }
- { tp: 4, ep: 1, dp-attn: true, conc-start: 512, conc-end: 512 }
- { tp: 8, ep: 8, dp-attn: true, conc-start: 2048, conc-end: 2048 }
- { tp: 8, ep: 8, dp-attn: true, conc-start: 4096, conc-end: 4096 }

Expand Down
201 changes: 89 additions & 112 deletions benchmarks/single_node/dsv4_fp4_b300_sglang.sh
Original file line number Diff line number Diff line change
Expand Up @@ -30,20 +30,9 @@ fi

nvidia-smi

# Common SGLANG env vars (apply to every config).
# ─── Common env vars (all profiles) ───────────────────────────────────────────
export SGLANG_JIT_DEEPGEMM_PRECOMPILE=0
export SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT=1
export SGLANG_OPT_USE_JIT_NORM=1
export SGLANG_OPT_USE_JIT_INDEXER_METADATA=1
export SGLANG_OPT_USE_TOPK_V2=1
export SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2=1

# TODO(Cam): the deepseek-v4 sglang images install sglang editable at
# /workspace/sglang/python; prior sglang tags used /sgl-workspace/sglang.
# The runner mounts our repo at a non-/workspace path for these images so the
# editable install stays visible. Paths in this script are $PWD-relative for
# that reason. Drop the runner conditional once lmsys moves sglang back out of
# /workspace.

SERVER_LOG="$PWD/server.log"
PORT=${PORT:-8888}
Expand All @@ -58,114 +47,101 @@ fi

start_gpu_monitor --output "$PWD/gpu_metrics.csv"

# 1k inputs need more SWA cache headroom on B300 than 8k inputs do; 0.5 was
# ─── Per-concurrency launch profile ──────────────────────────────────────────
# Each block sets: PARALLEL_ARGS, MEM_FRACTION_STATIC, SWA_FULL_TOKENS_RATIO,
# and optionally MAX_RUNNING_REQUESTS plus profile-specific env vars.
#
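# SWA ratio: in the flashinfer_mxfp4 profiles (CONC 1, 32, 512) it depends on
# ISL: 1k inputs need more SWA cache headroom than 8k inputs, so ISL=1024 uses
# 0.5 (tuned empirically for the 1k1k recipe) and every other ISL uses 0.1 (the
# cookbook default). The megamoe profiles set their own fixed ratios.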
if [[ "$ISL" == "1024" ]]; then
SWA_FULL_TOKENS_RATIO=0.5
else
SWA_FULL_TOKENS_RATIO=0.1
fi

# Pick the parallelism + MoE backend based on DP_ATTENTION (mirrors the vllm
# script's pattern). DP-attention runs the empirically-tuned high-concurrency
# recipe (flashinfer_mxfp4 runner + halved prefill chunks + prefill-delayer);
# single-instance uses flashinfer_mxfp4 with the cookbook defaults.
DEEPEP_CONFIG='{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}'

# Default; the DP-attn branch below overrides to 0.94.
MEM_FRACTION_STATIC=0.90
if [ "$CONC" = "1" ] || [ "$CONC" = "32" ]; then
# TP-only, no DP attention
MEM_FRACTION_STATIC=0.90
SWA_FULL_TOKENS_RATIO=$([[ "$ISL" == "1024" ]] && echo 0.5 || echo 0.1)
PARALLEL_ARGS=(
--moe-runner-backend flashinfer_mxfp4
--chunked-prefill-size 8192
--disable-flashinfer-autotune
)

if [ "${DP_ATTENTION}" = "true" ]; then
elif [ "$CONC" = "512" ]; then
# DP attention, flashinfer_mxfp4
export SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN=1
export SGLANG_OPT_USE_FAST_MASK_EP=1
export SGLANG_OPT_FIX_MEGA_MOE_MEMORY=1
export SGLANG_OPT_FIX_NEXTN_MEGA_MOE=1
export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=0
# ep=8 in the yaml signals the mega_moe deepep backend; check high-conc
# recipes first (they also have ep=8) so they aren't shadowed by the
# medium-conc EP_SIZE=8 branch below.
if [ "$CONC" = "2048" ] || [ "$CONC" = "4096" ] || [ "$CONC" = "8192" ]; then
export NVSHMEM_DISABLE_IB=1
export SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW=1
export SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE=1
export SGLANG_OPT_FIX_HASH_MEGA_MOE=1
if [ "$CONC" = "2048" ]; then
export SGLANG_LOG_FORWARD_ITERS=1
export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=8320
CUDA_GRAPH_MAX_BS=288
MAX_RUNNING_REQUESTS=2560
MEM_FRACTION_STATIC=0.87
SWA_FULL_TOKENS_RATIO=0.06
TOKENIZER_WORKER_NUM=4
elif [ "$CONC" = "4096" ]; then
export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=8320
CUDA_GRAPH_MAX_BS=544
MAX_RUNNING_REQUESTS=4352
MEM_FRACTION_STATIC=0.835
SWA_FULL_TOKENS_RATIO=0.075
TOKENIZER_WORKER_NUM=8
else
export SGLANG_OPT_USE_ONLINE_COMPRESS=1
export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=8256
CUDA_GRAPH_MAX_BS=1088
MAX_RUNNING_REQUESTS=8192
MEM_FRACTION_STATIC=0.80
SWA_FULL_TOKENS_RATIO=0.3
TOKENIZER_WORKER_NUM=16
fi
PARALLEL_ARGS=(
--dp-size "$TP"
--enable-dp-attention
--moe-a2a-backend deepep
--cuda-graph-max-bs "$CUDA_GRAPH_MAX_BS"
--deepep-config "$DEEPEP_CONFIG"
--chunked-prefill-size 65536
--tokenizer-worker-num "$TOKENIZER_WORKER_NUM"
--enable-prefill-delayer
)
if [ "$CONC" = "4096" ]; then
PARALLEL_ARGS+=(--decode-log-interval 5)
fi
if [ "$CONC" = "8192" ]; then
PARALLEL_ARGS+=(--stream-interval 30)
fi
elif [ "${EP_SIZE}" = "8" ]; then
export NVSHMEM_DISABLE_IB=1
export SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE=1
export SGLANG_OPT_FIX_HASH_MEGA_MOE=1
export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=550
PARALLEL_ARGS=(
--dp-size "$TP"
--enable-dp-attention
--moe-a2a-backend deepep
--cuda-graph-max-bs 550
--deepep-config "$DEEPEP_CONFIG"
--chunked-prefill-size 16384
--enable-prefill-delayer
)
MAX_RUNNING_REQUESTS=768
MEM_FRACTION_STATIC=0.94
else
export SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE=0
export SGLANG_OPT_FIX_HASH_MEGA_MOE=0
export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=4096
PARALLEL_ARGS=(
--dp-size "$TP"
--enable-dp-attention
--moe-runner-backend flashinfer_mxfp4
--disable-flashinfer-autotune
--deepep-config "$DEEPEP_CONFIG"
--chunked-prefill-size 16384
--enable-prefill-delayer
)
MEM_FRACTION_STATIC=0.94
fi
else
MEM_FRACTION_STATIC=0.94
SWA_FULL_TOKENS_RATIO=$([[ "$ISL" == "1024" ]] && echo 0.5 || echo 0.1)
PARALLEL_ARGS=(
--dp-size "$TP"
--enable-dp-attention
--moe-runner-backend flashinfer_mxfp4
--chunked-prefill-size 8192
--disable-flashinfer-autotune
--chunked-prefill-size 16384
--enable-prefill-delayer
)

elif [ "$CONC" = "2048" ]; then
# DP attention, megamoe
export SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN=1
export NVSHMEM_DISABLE_IB=1
export SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW=1
export SGLANG_LOG_FORWARD_ITERS=1
export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=8320
MEM_FRACTION_STATIC=0.87
SWA_FULL_TOKENS_RATIO=0.06
MAX_RUNNING_REQUESTS=2560
PARALLEL_ARGS=(
--dp-size "$TP"
--enable-dp-attention
--moe-a2a-backend megamoe
--cuda-graph-max-bs 288
--chunked-prefill-size 65536
--tokenizer-worker-num 4
--enable-prefill-delayer
)

elif [ "$CONC" = "4096" ]; then
# DP attention, megamoe
export SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN=1
export NVSHMEM_DISABLE_IB=1
export SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW=1
export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=8320
MEM_FRACTION_STATIC=0.835
SWA_FULL_TOKENS_RATIO=0.075
MAX_RUNNING_REQUESTS=4352
PARALLEL_ARGS=(
--dp-size "$TP"
--enable-dp-attention
--moe-a2a-backend megamoe
--cuda-graph-max-bs 544
--chunked-prefill-size 65536
--tokenizer-worker-num 8
--enable-prefill-delayer
--decode-log-interval 5
)

elif [ "$CONC" = "8192" ]; then
# DP attention, megamoe
export SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN=1
export NVSHMEM_DISABLE_IB=1
export SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW=1
export SGLANG_OPT_USE_ONLINE_COMPRESS=1
export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=8256
MEM_FRACTION_STATIC=0.80
SWA_FULL_TOKENS_RATIO=0.3
MAX_RUNNING_REQUESTS=8192
PARALLEL_ARGS=(
--dp-size "$TP"
--enable-dp-attention
--moe-a2a-backend megamoe
--cuda-graph-max-bs 1088
--chunked-prefill-size 65536
--tokenizer-worker-num 16
--enable-prefill-delayer
--stream-interval 30
)

else
echo "ERROR: unsupported CONC=$CONC" >&2
exit 1
fi

# Print all SGLANG_* env vars to both the CI step log and server.log so the
Expand Down Expand Up @@ -193,6 +169,7 @@ SERVER_PID=$!
wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"

pip install -q datasets pandas
pip install -q --upgrade transformers

run_benchmark_serving \
--model "$MODEL" \
Expand Down
13 changes: 13 additions & 0 deletions perf-changelog.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3201,6 +3201,19 @@
- "MoRI conn.py overlay (48e459bd) via job.slurm; launcher qwen3.5_fp4_mi355x_sglang-disagg.sh"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1579

- config-keys:
- dsv4-fp4-b300-sglang
description:
- "Update sglang image to nightly-dev-cu13-20260529-a8cfae0b"
- "Refactor benchmark script to dispatch by CONC instead of nested DP_ATTENTION/CONC/EP_SIZE"
- "Switch CONC 2048/4096/8192 from --moe-a2a-backend deepep to megamoe"
- "Remove env vars deleted from sglang main (SGLANG_OPT_USE_JIT_NORM, SGLANG_OPT_USE_FAST_MASK_EP, SGLANG_OPT_FIX_NEXTN_MEGA_MOE, SGLANG_OPT_FIX_HASH_MEGA_MOE)"
- "Remove env vars redundant with sglang defaults (SGLANG_OPT_USE_JIT_INDEXER_METADATA, SGLANG_OPT_USE_TOPK_V2, SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2)"
- "Remove env vars auto-set by megamoe backend (SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE, SGLANG_OPT_FIX_MEGA_MOE_MEMORY)"
- "Remove --deepep-config and SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK (unused by megamoe/StandardDispatcher)"
- "Fix CONC=512 yaml ep from 4 to 1 (flashinfer_mxfp4 does not set ep=tp)"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1506

- config-keys:
- dsr1-fp4-mi355x-sglang-disagg-8k1k-mtp
description:
Expand Down
17 changes: 17 additions & 0 deletions runners/launch_b300-nv.sh
Original file line number Diff line number Diff line change
Expand Up @@ -358,6 +358,23 @@ else
echo "Squash file already exists and is valid, skipping import"
else
rm -f "$SQUASH_FILE"
# enroot's working dirs are pinned to NFS /scratch by
# /etc/enroot/enroot.conf, but enroot-aufs2ovlfs unpacks the image's
# root-owned whiteout markers into a sticky /tmp and then can't unlink
# them over NFS -- root-squash strips the CAP_FOWNER it would need, so
# it fails with "failed to remove aufs whiteout: Operation not
# permitted" and writes no .sqsh. Run the import on local disk, where
# the extracted files are owned by us and removable. Scoped to this
# subshell (and cleaned up on exit), so the salloc/srun below and the
# compute node's own /scratch are unaffected.
enroot_local="$(mktemp -d /tmp/enroot-import.XXXXXX)"
trap 'rm -rf "$enroot_local"' EXIT
export ENROOT_TEMP_PATH="$enroot_local/tmp"
export ENROOT_CACHE_PATH="$enroot_local/cache"
export ENROOT_DATA_PATH="$enroot_local/data"
export ENROOT_RUNTIME_PATH="$enroot_local/run"
mkdir -p "$ENROOT_TEMP_PATH" "$ENROOT_CACHE_PATH" \
"$ENROOT_DATA_PATH" "$ENROOT_RUNTIME_PATH"
enroot import -o "$SQUASH_FILE" "docker://$IMAGE"
fi
)
Expand Down
Loading