SemiAnalysisAI · Oseltamivir · May 27, 2026 · May 27, 2026 · May 27, 2026 · May 27, 2026
@@ -1774,7 +1774,7 @@ dsr1-fp4-mi355x-sglang-disagg-mtp:
 # image tag, so bumping sglang is just an image tag bump here. Sweeps
 # DP-attention on/off and EP=8.
 dsv4-fp4-mi355x-sglang:
-  image: rocm/sgl-dev:rocm720-mi35x-8c3b5aa-20260521-DSv4
+  image: rocm/sgl-dev:rocm720-mi35x-f96ac98-20260526-DSv4
   model: deepseek-ai/DeepSeek-V4-Pro
   model-prefix: dsv4
   runner: mi355x
@@ -1786,12 +1786,12 @@ dsv4-fp4-mi355x-sglang:
     - isl: 1024
       osl: 1024
       search-space:
-      - { tp: 8, dp-attn: true, conc-start: 64, conc-end: 1024 }
+      - { tp: 8, dp-attn: true, conc-start: 64, conc-end: 4096 }
       - { tp: 8, dp-attn: false, conc-start: 1 , conc-end: 32 }
     - isl: 8192
       osl: 1024
       search-space:
-      - { tp: 8, dp-attn: true, conc-start: 64, conc-end: 512 }
+      - { tp: 8, dp-attn: true, conc-start: 64, conc-end: 2048 }
       - { tp: 8, dp-attn: false, conc-start: 1, conc-end: 32 }
 
 # DSv4 on MI355X via vLLM, using the official vllm/vllm-openai-rocm

diff --git a/benchmarks/single_node/dsv4_fp4_mi355x_sglang.sh b/benchmarks/single_node/dsv4_fp4_mi355x_sglang.sh
@@ -116,6 +116,8 @@ python3 -m sglang.launch_server \
     --disable-radix-cache \
     --attention-backend compressed \
     --max-running-requests ${CONC} \
+    --mem-fraction-static 0.90 \
+    --swa-full-tokens-ratio 0.1 \
     --page-size 256 \
     --context-length $MAX_MODEL_LEN \
     --chunked-prefill-size 8192 \

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
@@ -3138,8 +3138,15 @@
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1354
 
 - config-keys:
+    - dsv4-fp4-mi355x-sglang
+  description:
+    - "Bump image to rocm/sgl-dev:rocm720-mi35x-f96ac98-20260526-DSv4"
+    - "Add args to avoid kvcache pool full issue on high conc"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1568
+
+- config-keys:
     - qwen3.5-fp8-h200-sglang
     - dsr1-fp8-mi355x-sglang
   description:
     - "Validates measured-power aggregation pipeline (PR #1558) on both NVIDIA (H200) and AMD (MI355X) hardware — different SMI tools (nvidia-smi vs amd-smi), different CSV schemas (power.draw [W] vs socket_power), same aggregator. No config change. Entry intentionally kept past merge so run-sweep produces canonical agg JSONs with avg_power_w + joules_per_output_token on main for both vendors, seeding the dashboard's day-zero data."
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1558