Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions .github/configs/amd-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1813,7 +1813,7 @@ dsr1-fp4-mi355x-sglang-disagg-mtp:
# image tag, so bumping sglang is just an image tag bump here. Sweeps
# DP-attention on/off and EP=8.
dsv4-fp4-mi355x-sglang:
image: rocm/sgl-dev:rocm720-mi35x-8c3b5aa-20260521-DSv4
image: rocm/sgl-dev:rocm720-mi35x-f96ac98-20260526-DSv4
model: deepseek-ai/DeepSeek-V4-Pro
model-prefix: dsv4
runner: mi355x
Expand All @@ -1825,12 +1825,12 @@ dsv4-fp4-mi355x-sglang:
- isl: 1024
osl: 1024
search-space:
- { tp: 8, dp-attn: true, conc-start: 64, conc-end: 1024 }
- { tp: 8, dp-attn: true, conc-start: 64, conc-end: 2048 }
- { tp: 8, dp-attn: false, conc-start: 1 , conc-end: 32 }
- isl: 8192
osl: 1024
search-space:
- { tp: 8, dp-attn: true, conc-start: 64, conc-end: 512 }
- { tp: 8, dp-attn: true, conc-start: 64, conc-end: 2048 }
- { tp: 8, dp-attn: false, conc-start: 1, conc-end: 32 }

# DSv4 on MI355X via vLLM, using the official vllm/vllm-openai-rocm
Expand Down
4 changes: 3 additions & 1 deletion benchmarks/single_node/dsv4_fp4_mi355x_sglang.sh
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ export SGLANG_FORCE_TRITON_MOE_FP8=0
export SGLANG_HACK_FLASHMLA_BACKEND=triton
export SGLANG_OPT_USE_TILELANG_INDEXER=true
export SGLANG_OPT_USE_TRITON_SWA_PREPARE=true
export AITER_BF16_FP8_MOE_BOUND=1
export AITER_BF16_FP8_MOE_BOUND=0
export SGLANG_OPT_FUSE_WQA_WKV=true
export SGLANG_OPT_USE_FUSED_PAGED_COMPRESS=true
export SGLANG_OPT_USE_MULTI_STREAM_OVERLAP=0
Expand Down Expand Up @@ -116,6 +116,8 @@ python3 -m sglang.launch_server \
--disable-radix-cache \
--attention-backend compressed \
--max-running-requests ${CONC} \
--mem-fraction-static 0.90 \
--swa-full-tokens-ratio 0.15 \
--page-size 256 \
--context-length $MAX_MODEL_LEN \
--chunked-prefill-size 8192 \
Expand Down
7 changes: 7 additions & 0 deletions perf-changelog.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3146,6 +3146,13 @@
- "Set vLLM serving knobs in benchmarks/single_node/minimaxm2.5_fp8_h200.sh: generated benchmark max-model-len, previous eval max-model-len handling, fp8 KV cache, FlashInfer attention/autotune, Triton MoE, and MiniMax QK norm fusion"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1354

- config-keys:
- dsv4-fp4-mi355x-sglang
description:
- "Bump image to rocm/sgl-dev:rocm720-mi35x-f96ac98-20260526-DSv4"
- "Add args to avoid kvcache pool full issue on high conc"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1568

- config-keys:
- qwen3.5-fp8-h200-sglang
- dsr1-fp8-mi355x-sglang
Expand Down