From 58752561d7144ef4681d06ef0d4f3c078adc304a Mon Sep 17 00:00:00 2001
From: Xiake Sun <xiake.sun@amd.com>
Date: Wed, 10 Dec 2025 17:13:41 +0800
Subject: [PATCH 01/16] CI: Enable Qwen3-Omni performance test

Signed-off-by: Xiake Sun <xiake.sun@amd.com>
---
 .../workflows/sglang_benchmark_workflow.yaml  |  1 +
 scripts/ci/sglang_benchmark_workflow.sh       | 30 +++++++++++--------
 2 files changed, 18 insertions(+), 13 deletions(-)

diff --git a/.github/workflows/sglang_benchmark_workflow.yaml b/.github/workflows/sglang_benchmark_workflow.yaml
index b250505d1e11..68d330be8ada 100644
--- a/.github/workflows/sglang_benchmark_workflow.yaml
+++ b/.github/workflows/sglang_benchmark_workflow.yaml
@@ -143,6 +143,7 @@ jobs:
             docker exec sglang_test bash -c "scripts/ci/sglang_benchmark_workflow.sh evaluation $model_name /models/Qwen/Qwen3-Next-80B-A3B-Instruct/ 4 1"
           elif [ "$model_name" == "Qwen3-Omni" ]; then
             docker exec sglang_test bash -c "scripts/ci/sglang_benchmark_workflow.sh evaluation $model_name /models/Qwen/Qwen3-Omni-30B-A3B-Instruct/ 4 2"
+            docker exec sglang_test bash -c "scripts/ci/sglang_benchmark_workflow.sh performance $model_name /models/Qwen/Qwen3-Omni-30B-A3B-Instruct/ 4 2"
           else
             echo "Unknown model_name: ${model_name}"
             exit 1
diff --git a/scripts/ci/sglang_benchmark_workflow.sh b/scripts/ci/sglang_benchmark_workflow.sh
index 6a59aa7d336c..b84a2e77f346 100755
--- a/scripts/ci/sglang_benchmark_workflow.sh
+++ b/scripts/ci/sglang_benchmark_workflow.sh
@@ -60,6 +60,8 @@ if [[ "${TYPE}" == "launch" ]]; then
     elif [[ "${model_name}" == "Qwen3-Omni" ]]; then
         echo "Qwen3-Omni-Server Launch"
         export SGLANG_USE_AITER=1
+        export SGLANG_ROCM_USE_AITER_PA_ASM_PRESHUFFLE_LAYOUT=0
+        export SGLANG_VLM_CACHE_SIZE_MB=0
         python3 -m sglang.launch_server \
             --model-path "${model_path}" \
             --host localhost \
@@ -68,11 +70,11 @@ if [[ "${TYPE}" == "launch" ]]; then
             --ep-size ${EP} \
             --trust-remote-code \
             --mm-attention-backend "aiter_attn"\
-            --chunked-prefill-size 16384 \
+            --chunked-prefill-size 32768 \
             --mem-fraction-static 0.85 \
             --disable-radix-cache \
-            --max-prefill-tokens 16384 \
-            --cuda-graph-max-bs 64 \
+            --max-prefill-tokens 32768 \
+            --cuda-graph-max-bs 8 \
             --page-size 64 &
         sglang_pid=$!
     else
@@ -144,16 +146,18 @@ elif [[ "${TYPE}" == "evaluation" ]]; then
 elif [[ "${TYPE}" == "performance" ]]; then
     echo
     echo "========== STARTING PERFORMANCE BENCHMARK =========="
-    python3 -m sglang.bench_serving \
-        --backend sglang-oai-chat \
-        --dataset-name image \
-        --image-count 1 \
-        --image-resolution 800x800 \
-        --random-input-len 1000 \
-        --random-output-len 2000 \
-        --max-concurrency 64 \
-        --num-prompts 192 \
-        | tee performance_benchmark_${model_name}_TP${TP}_EP${EP}.log
+    if [[ "${model_name}" == "Qwen3-Omni" ]]; then
+        python3 -m sglang.bench_serving \
+            --backend sglang-oai-chat \
+            --dataset-name image \
+            --image-count 20 \
+            --image-resolution 960x1280 \
+            --random-input-len 8000 \
+            --random-output-len 500 \
+            --max-concurrency 2 \
+            --num-prompts 128 \
+            --skip-special-tokens \
+            | tee performance_benchmark_${model_name}_TP${TP}_EP${EP}.log
 
 else
     echo "Unknown TYPE: ${TYPE}"

From d1c32571028411db859d0dfd2447a92fb32974d2 Mon Sep 17 00:00:00 2001
From: Xiake Sun <xiake.sun@amd.com>
Date: Wed, 10 Dec 2025 17:31:52 +0800
Subject: [PATCH 02/16] Create separate performance benchmark task

Signed-off-by: Xiake Sun <xiake.sun@amd.com>
---
 .github/workflows/sglang_benchmark_workflow.yaml | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/.github/workflows/sglang_benchmark_workflow.yaml b/.github/workflows/sglang_benchmark_workflow.yaml
index 68d330be8ada..5f202bbda408 100644
--- a/.github/workflows/sglang_benchmark_workflow.yaml
+++ b/.github/workflows/sglang_benchmark_workflow.yaml
@@ -143,6 +143,17 @@ jobs:
             docker exec sglang_test bash -c "scripts/ci/sglang_benchmark_workflow.sh evaluation $model_name /models/Qwen/Qwen3-Next-80B-A3B-Instruct/ 4 1"
           elif [ "$model_name" == "Qwen3-Omni" ]; then
             docker exec sglang_test bash -c "scripts/ci/sglang_benchmark_workflow.sh evaluation $model_name /models/Qwen/Qwen3-Omni-30B-A3B-Instruct/ 4 2"
+          else
+            echo "Unknown model_name: ${model_name}"
+            exit 1
+          fi
+
+      - name: Run performance benchmark
+        continue-on-error: true
+        timeout-minutes: 60
+        run: |
+          model_name=${{ matrix.model }}
+          if [ "$model_name" == "Qwen3-Omni" ]; then
             docker exec sglang_test bash -c "scripts/ci/sglang_benchmark_workflow.sh performance $model_name /models/Qwen/Qwen3-Omni-30B-A3B-Instruct/ 4 2"
           else
             echo "Unknown model_name: ${model_name}"

From dfe35f036a353af8e099046c4632c4596cd05e56 Mon Sep 17 00:00:00 2001
From: Xiake Sun <xiake.sun@amd.com>
Date: Wed, 10 Dec 2025 17:43:41 +0800
Subject: [PATCH 03/16] Apply review comments

Signed-off-by: Xiake Sun <xiake.sun@amd.com>
---
 scripts/ci/sglang_benchmark_workflow.sh | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/scripts/ci/sglang_benchmark_workflow.sh b/scripts/ci/sglang_benchmark_workflow.sh
index b84a2e77f346..7b65e7dcb36a 100755
--- a/scripts/ci/sglang_benchmark_workflow.sh
+++ b/scripts/ci/sglang_benchmark_workflow.sh
@@ -69,7 +69,7 @@ if [[ "${TYPE}" == "launch" ]]; then
             --tp-size ${TP} \
             --ep-size ${EP} \
             --trust-remote-code \
-            --mm-attention-backend "aiter_attn"\
+            --mm-attention-backend "aiter_attn" \
             --chunked-prefill-size 32768 \
             --mem-fraction-static 0.85 \
             --disable-radix-cache \
@@ -158,6 +158,10 @@ elif [[ "${TYPE}" == "performance" ]]; then
             --num-prompts 128 \
             --skip-special-tokens \
             | tee performance_benchmark_${model_name}_TP${TP}_EP${EP}.log
+    else
+        echo "Unknown model_name: ${model_name}"
+        exit 1
+    fi
 
 else
     echo "Unknown TYPE: ${TYPE}"

From c1f4ba28db6bbc30a5df52e10d53f9a5155d4ebd Mon Sep 17 00:00:00 2001
From: Xiake Sun <xiake.sun@amd.com>
Date: Wed, 10 Dec 2025 18:36:09 +0800
Subject: [PATCH 04/16] Add --dp-size option for CI test

Signed-off-by: Xiake Sun <xiake.sun@amd.com>
---
 .../workflows/sglang_benchmark_workflow.yaml   | 18 +++++++++---------
 scripts/ci/sglang_benchmark_workflow.sh        | 11 ++++++++---
 2 files changed, 17 insertions(+), 12 deletions(-)

diff --git a/.github/workflows/sglang_benchmark_workflow.yaml b/.github/workflows/sglang_benchmark_workflow.yaml
index 5f202bbda408..043e5b99368c 100644
--- a/.github/workflows/sglang_benchmark_workflow.yaml
+++ b/.github/workflows/sglang_benchmark_workflow.yaml
@@ -122,11 +122,11 @@ jobs:
           set -ex
           model_name=${{ matrix.model }}
           if [ "$model_name" == "Qwen3-VL-235B" ]; then
-            docker exec sglang_test bash -c "scripts/ci/sglang_benchmark_workflow.sh launch $model_name /models/RedHatAI/Qwen3-VL-235B-A22B-Instruct-FP8-dynamic/ 8 8"
+            docker exec sglang_test bash -c "scripts/ci/sglang_benchmark_workflow.sh launch $model_name /models/RedHatAI/Qwen3-VL-235B-A22B-Instruct-FP8-dynamic/ 8 8 1"
           elif [ "$model_name" == "Qwen3-next" ]; then
-            docker exec sglang_test bash -c "scripts/ci/sglang_benchmark_workflow.sh launch $model_name /models/Qwen/Qwen3-Next-80B-A3B-Instruct/ 4 1"
+            docker exec sglang_test bash -c "scripts/ci/sglang_benchmark_workflow.sh launch $model_name /models/Qwen/Qwen3-Next-80B-A3B-Instruct/ 4 1 1"
           elif [ "$model_name" == "Qwen3-Omni" ]; then
-            docker exec sglang_test bash -c "scripts/ci/sglang_benchmark_workflow.sh launch $model_name /models/Qwen/Qwen3-Omni-30B-A3B-Instruct/ 4 2"
+            docker exec sglang_test bash -c "scripts/ci/sglang_benchmark_workflow.sh launch $model_name /models/Qwen/Qwen3-Omni-30B-A3B-Instruct/ 4 4 2"
           else
             echo "Unknown model_name: ${model_name}"
             exit 1
@@ -138,11 +138,11 @@ jobs:
         run: |
           model_name=${{ matrix.model }}
           if [ "$model_name" == "Qwen3-VL-235B" ]; then
-            docker exec sglang_test bash -c "scripts/ci/sglang_benchmark_workflow.sh evaluation $model_name /models/RedHatAI/Qwen3-VL-235B-A22B-Instruct-FP8-dynamic/ 8 8"
+            docker exec sglang_test bash -c "scripts/ci/sglang_benchmark_workflow.sh evaluation $model_name /models/RedHatAI/Qwen3-VL-235B-A22B-Instruct-FP8-dynamic/ 8 8 1"
           elif [ "$model_name" == "Qwen3-next" ]; then
-            docker exec sglang_test bash -c "scripts/ci/sglang_benchmark_workflow.sh evaluation $model_name /models/Qwen/Qwen3-Next-80B-A3B-Instruct/ 4 1"
+            docker exec sglang_test bash -c "scripts/ci/sglang_benchmark_workflow.sh evaluation $model_name /models/Qwen/Qwen3-Next-80B-A3B-Instruct/ 4 1 1"
           elif [ "$model_name" == "Qwen3-Omni" ]; then
-            docker exec sglang_test bash -c "scripts/ci/sglang_benchmark_workflow.sh evaluation $model_name /models/Qwen/Qwen3-Omni-30B-A3B-Instruct/ 4 2"
+            docker exec sglang_test bash -c "scripts/ci/sglang_benchmark_workflow.sh evaluation $model_name /models/Qwen/Qwen3-Omni-30B-A3B-Instruct/ 4 4 2"
           else
             echo "Unknown model_name: ${model_name}"
             exit 1
@@ -154,10 +154,10 @@ jobs:
         run: |
           model_name=${{ matrix.model }}
           if [ "$model_name" == "Qwen3-Omni" ]; then
-            docker exec sglang_test bash -c "scripts/ci/sglang_benchmark_workflow.sh performance $model_name /models/Qwen/Qwen3-Omni-30B-A3B-Instruct/ 4 2"
+            docker exec sglang_test bash -c "scripts/ci/sglang_benchmark_workflow.sh performance $model_name /models/Qwen/Qwen3-Omni-30B-A3B-Instruct/ 4 4 2"
           else
-            echo "Unknown model_name: ${model_name}"
-            exit 1
+            echo "Skip performance benchmark for model_name: ${model_name}"
+            exit 0
           fi
 
       - name: Clean Up
diff --git a/scripts/ci/sglang_benchmark_workflow.sh b/scripts/ci/sglang_benchmark_workflow.sh
index 7b65e7dcb36a..3513afa1cfee 100755
--- a/scripts/ci/sglang_benchmark_workflow.sh
+++ b/scripts/ci/sglang_benchmark_workflow.sh
@@ -7,6 +7,7 @@ model_name=${2:-Qwen3-VL-235B}
 model_path=${3:-/models/Qwen3-VL-235B-A22B-Instruct-FP8-dynamic/}
 TP=${4:-8}
 EP=${5:-8}
+DP=${6:-8}
 
 export SGLANG_TORCH_PROFILER_DIR=./
 export SGLANG_PROFILE_WITH_STACK=1
@@ -17,6 +18,7 @@ echo "Detect model_name: ${model_name}"
 echo "Detect model_path ${model_path}"
 echo "Detect TP ${TP}"
 echo "Detect EP ${EP}"
+echo "Detect DP ${EP}"
 
 
 if [[ "${TYPE}" == "launch" ]]; then
@@ -31,6 +33,7 @@ if [[ "${TYPE}" == "launch" ]]; then
             --port 9000 \
             --tp-size "${TP}" \
             --ep-size "${EP}" \
+            --dp-size "${DP}" \
             --trust-remote-code \
             --chunked-prefill-size 32768 \
             --mem-fraction-static 0.6 \
@@ -47,6 +50,7 @@ if [[ "${TYPE}" == "launch" ]]; then
             --port 9000 \
             --tp-size ${TP} \
             --ep-size ${EP} \
+            --dp-size ${DP} \
             --trust-remote-code \
             --chunked-prefill-size 32768 \
             --mem-fraction-static 0.85 \
@@ -68,6 +72,7 @@ if [[ "${TYPE}" == "launch" ]]; then
             --port 9000 \
             --tp-size ${TP} \
             --ep-size ${EP} \
+            --dp-size ${DP} \
             --trust-remote-code \
             --mm-attention-backend "aiter_attn" \
             --chunked-prefill-size 32768 \
@@ -141,7 +146,7 @@ elif [[ "${TYPE}" == "evaluation" ]]; then
     python3 benchmark/mmmu/bench_sglang.py \
         --port 9000 \
         --concurrency 16 \
-        | tee vision_model_evaluation_${model_name}_TP${TP}_EP${EP}.log
+        | tee vision_model_evaluation_${model_name}_TP${TP}_EP${EP}_DP${DP}.log
 
 elif [[ "${TYPE}" == "performance" ]]; then
     echo
@@ -157,7 +162,7 @@ elif [[ "${TYPE}" == "performance" ]]; then
             --max-concurrency 2 \
             --num-prompts 128 \
             --skip-special-tokens \
-            | tee performance_benchmark_${model_name}_TP${TP}_EP${EP}.log
+            | tee performance_benchmark_${model_name}_TP${TP}_EP${EP}_DP${DP}.log
     else
         echo "Unknown model_name: ${model_name}"
         exit 1
@@ -165,7 +170,7 @@ elif [[ "${TYPE}" == "performance" ]]; then
 
 else
     echo "Unknown TYPE: ${TYPE}"
-    echo "Usage: $0 {launch|evaluation|performance} [model_name] [model_path] [TP] [EP]"
+    echo "Usage: $0 {launch|evaluation|performance} [model_name] [model_path] [TP] [EP] [DP]"
     exit 1
 fi
 

From d833cb8abb95cb979e64edd073f9c6117848b700 Mon Sep 17 00:00:00 2001
From: Xiake Sun <xiake.sun@amd.com>
Date: Wed, 10 Dec 2025 22:48:30 +0800
Subject: [PATCH 05/16] Minor fix

Signed-off-by: Xiake Sun <xiake.sun@amd.com>
---
 scripts/ci/sglang_benchmark_workflow.sh | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/scripts/ci/sglang_benchmark_workflow.sh b/scripts/ci/sglang_benchmark_workflow.sh
index 3513afa1cfee..f350d48bbedd 100755
--- a/scripts/ci/sglang_benchmark_workflow.sh
+++ b/scripts/ci/sglang_benchmark_workflow.sh
@@ -18,7 +18,7 @@ echo "Detect model_name: ${model_name}"
 echo "Detect model_path ${model_path}"
 echo "Detect TP ${TP}"
 echo "Detect EP ${EP}"
-echo "Detect DP ${EP}"
+echo "Detect DP ${DP}"
 
 
 if [[ "${TYPE}" == "launch" ]]; then
@@ -154,6 +154,9 @@ elif [[ "${TYPE}" == "performance" ]]; then
     if [[ "${model_name}" == "Qwen3-Omni" ]]; then
         python3 -m sglang.bench_serving \
             --backend sglang-oai-chat \
+            --host localhost \
+            --port 9000 \
+            --model "${model_path}" \
             --dataset-name image \
             --image-count 20 \
             --image-resolution 960x1280 \
@@ -161,8 +164,9 @@ elif [[ "${TYPE}" == "performance" ]]; then
             --random-output-len 500 \
             --max-concurrency 2 \
             --num-prompts 128 \
+            --flush-cache \
             --skip-special-tokens \
-            | tee performance_benchmark_${model_name}_TP${TP}_EP${EP}_DP${DP}.log
+            2>&1 | tee performance_benchmark_${model_name}_TP${TP}_EP${EP}_DP${DP}.log
     else
         echo "Unknown model_name: ${model_name}"
         exit 1

From 5d2c60304b56e7f9559bfeff9d92e3458348ece8 Mon Sep 17 00:00:00 2001
From: Xiake Sun <xiake.sun@amd.com>
Date: Wed, 10 Dec 2025 23:26:11 +0800
Subject: [PATCH 06/16] Install sglang python package

Signed-off-by: Xiake Sun <xiake.sun@amd.com>
---
 .github/workflows/sglang_benchmark_workflow.yaml | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/sglang_benchmark_workflow.yaml b/.github/workflows/sglang_benchmark_workflow.yaml
index 043e5b99368c..96161e6f1263 100644
--- a/.github/workflows/sglang_benchmark_workflow.yaml
+++ b/.github/workflows/sglang_benchmark_workflow.yaml
@@ -72,7 +72,10 @@ jobs:
               pip install --upgrade pip && \
               pip install "transformers>=4.57.0" && \
               cd sgl-kernel && \
-              python3 setup_rocm.py install
+              python3 setup_rocm.py install && \
+              cd .. && \
+              rm -rf python/pyproject.toml && mv python/pyproject_other.toml python/pyproject.toml && \
+              pip install -e "python[all_hip]"
           EOF
 
       - name: Show dockerfile

From d538dcbf19514d7fbd5e16a62f735b806d735bfa Mon Sep 17 00:00:00 2001
From: Xiake Sun <xiake.sun@amd.com>
Date: Thu, 11 Dec 2025 08:38:48 +0800
Subject: [PATCH 07/16] Debug launch server error

---
 .github/workflows/sglang_benchmark_workflow.yaml | 2 +-
 scripts/ci/sglang_benchmark_workflow.sh          | 9 ++++++---
 2 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/sglang_benchmark_workflow.yaml b/.github/workflows/sglang_benchmark_workflow.yaml
index 96161e6f1263..a569192372b9 100644
--- a/.github/workflows/sglang_benchmark_workflow.yaml
+++ b/.github/workflows/sglang_benchmark_workflow.yaml
@@ -75,7 +75,7 @@ jobs:
               python3 setup_rocm.py install && \
               cd .. && \
               rm -rf python/pyproject.toml && mv python/pyproject_other.toml python/pyproject.toml && \
-              pip install -e "python[all_hip]"
+              PYTORCH_ROCM_ARCH="gfx942" pip install -e "python[all_hip]"
           EOF
 
       - name: Show dockerfile
diff --git a/scripts/ci/sglang_benchmark_workflow.sh b/scripts/ci/sglang_benchmark_workflow.sh
index f350d48bbedd..61559d6af1f3 100755
--- a/scripts/ci/sglang_benchmark_workflow.sh
+++ b/scripts/ci/sglang_benchmark_workflow.sh
@@ -39,7 +39,8 @@ if [[ "${TYPE}" == "launch" ]]; then
             --mem-fraction-static 0.6 \
             --disable-radix-cache \
             --max-prefill-tokens 32768 \
-            --cuda-graph-max-bs 128 &
+            --cuda-graph-max-bs 128 \
+        2>&1 | tee launch_server_${model_name}_TP${TP}_EP${EP}_DP${DP}.log &
         sglang_pid=$!
     elif [[ "${model_name}" == "Qwen3-next" ]]; then
         export SGLANG_USE_AITER=1
@@ -59,7 +60,8 @@ if [[ "${TYPE}" == "launch" ]]; then
             --cuda-graph-max-bs 256 \
             --page-size 64 \
             --attention-backend triton \
-            --max-running-requests 128 &
+            --max-running-requests 128 \
+        2>&1 | tee launch_server_${model_name}_TP${TP}_EP${EP}_DP${DP}.log &
         sglang_pid=$!
     elif [[ "${model_name}" == "Qwen3-Omni" ]]; then
         echo "Qwen3-Omni-Server Launch"
@@ -80,7 +82,8 @@ if [[ "${TYPE}" == "launch" ]]; then
             --disable-radix-cache \
             --max-prefill-tokens 32768 \
             --cuda-graph-max-bs 8 \
-            --page-size 64 &
+            --page-size 64 \
+        2>&1 | tee launch_server_${model_name}_TP${TP}_EP${EP}_DP${DP}.log &
         sglang_pid=$!
     else
         echo "Unknown model_name: ${model_name}"

From a88b0629abd4b01fa5f289e3cadcaad88568cb86 Mon Sep 17 00:00:00 2001
From: Xiake Sun <xiake.sun@amd.com>
Date: Thu, 11 Dec 2025 11:06:56 +0800
Subject: [PATCH 08/16] Adjust page size for Qwen3-VL for pa asm with layout
 preshuffle

Signed-off-by: Xiake Sun <xiake.sun@amd.com>
---
 scripts/ci/sglang_benchmark_workflow.sh | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/scripts/ci/sglang_benchmark_workflow.sh b/scripts/ci/sglang_benchmark_workflow.sh
index 61559d6af1f3..ec853576c354 100755
--- a/scripts/ci/sglang_benchmark_workflow.sh
+++ b/scripts/ci/sglang_benchmark_workflow.sh
@@ -40,11 +40,12 @@ if [[ "${TYPE}" == "launch" ]]; then
             --disable-radix-cache \
             --max-prefill-tokens 32768 \
             --cuda-graph-max-bs 128 \
-        2>&1 | tee launch_server_${model_name}_TP${TP}_EP${EP}_DP${DP}.log &
+            --page-size 64 \
+            2>&1 | tee launch_server_${model_name}_TP${TP}_EP${EP}_DP${DP}.log &
         sglang_pid=$!
     elif [[ "${model_name}" == "Qwen3-next" ]]; then
         export SGLANG_USE_AITER=1
-        export SGLANG_ROCM_USE_AITER_PA_ASM_PRESHUFFLE_LAYOUT=1
+        export SGLANG_ROCM_USE_AITER_PA_ASM_PRESHUFFLE_LAYOUT=0
         python3 -m sglang.launch_server \
             --model-path "${model_path}" \
             --host localhost \
@@ -61,7 +62,7 @@ if [[ "${TYPE}" == "launch" ]]; then
             --page-size 64 \
             --attention-backend triton \
             --max-running-requests 128 \
-        2>&1 | tee launch_server_${model_name}_TP${TP}_EP${EP}_DP${DP}.log &
+            2>&1 | tee launch_server_${model_name}_TP${TP}_EP${EP}_DP${DP}.log &
         sglang_pid=$!
     elif [[ "${model_name}" == "Qwen3-Omni" ]]; then
         echo "Qwen3-Omni-Server Launch"
@@ -83,7 +84,7 @@ if [[ "${TYPE}" == "launch" ]]; then
             --max-prefill-tokens 32768 \
             --cuda-graph-max-bs 8 \
             --page-size 64 \
-        2>&1 | tee launch_server_${model_name}_TP${TP}_EP${EP}_DP${DP}.log &
+            2>&1 | tee launch_server_${model_name}_TP${TP}_EP${EP}_DP${DP}.log &
         sglang_pid=$!
     else
         echo "Unknown model_name: ${model_name}"

From 64a7ceef99e9612d6ce65287a9c83e625f2c86ef Mon Sep 17 00:00:00 2001
From: Xiake Sun <xiake.sun@amd.com>
Date: Thu, 11 Dec 2025 11:58:21 +0800
Subject: [PATCH 09/16] set --page-size 16 for Qwen3-VL

Signed-off-by: Xiake Sun <xiake.sun@amd.com>
---
 scripts/ci/sglang_benchmark_workflow.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/ci/sglang_benchmark_workflow.sh b/scripts/ci/sglang_benchmark_workflow.sh
index ec853576c354..e8ddb5ef8071 100755
--- a/scripts/ci/sglang_benchmark_workflow.sh
+++ b/scripts/ci/sglang_benchmark_workflow.sh
@@ -40,7 +40,7 @@ if [[ "${TYPE}" == "launch" ]]; then
             --disable-radix-cache \
             --max-prefill-tokens 32768 \
             --cuda-graph-max-bs 128 \
-            --page-size 64 \
+            --page-size 16 \
             2>&1 | tee launch_server_${model_name}_TP${TP}_EP${EP}_DP${DP}.log &
         sglang_pid=$!
     elif [[ "${model_name}" == "Qwen3-next" ]]; then

From 0052dcd37bdd1eada124ce8111a41c470158c5b4 Mon Sep 17 00:00:00 2001
From: Xin Huang <Xin.Huang@amd.com>
Date: Mon, 15 Dec 2025 17:06:57 +0800
Subject: [PATCH 10/16] CI: Update image in CI

---
 .github/workflows/sglang_benchmark_workflow.yaml | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/sglang_benchmark_workflow.yaml b/.github/workflows/sglang_benchmark_workflow.yaml
index 46fbad283b67..f6695646beea 100644
--- a/.github/workflows/sglang_benchmark_workflow.yaml
+++ b/.github/workflows/sglang_benchmark_workflow.yaml
@@ -11,15 +11,16 @@ on:
         required: false
         default: '["Qwen3-VL-235B","Qwen3-next","Qwen3-Omni"]'
       image:
-        description: 'Image to use for the benchmark (default: "rocm/ali-private:ubuntu22.04_rocm7.0.1.42_vllm_e858fc9_sglang_890fc1c_aiter_1b3efa9_torch2.8.0_20251125")'
+        description: 'Image to use for the benchmark (default: "rocm/ali-private:ubuntu22.04_rocm6.4.3.127_sglang_f25820a_aiter_5fad56f_20251212")'
         required: true
-        default: "rocm/ali-private:ubuntu22.04_rocm7.0.1.42_vllm_e858fc9_sglang_890fc1c_aiter_1b3efa9_torch2.8.0_20251125"
+        default: "rocm/ali-private:ubuntu22.04_rocm6.4.3.127_sglang_f25820a_aiter_5fad56f_20251212"
   
 concurrency:
   group: ${{ github.workflow }}-${{ github.ref }}
   cancel-in-progress: true
 
 env:
+  IMAGE: ${{ github.event.inputs.image || 'rocm/ali-private:ubuntu22.04_rocm6.4.3.127_sglang_f25820a_aiter_5fad56f_20251212' }}
   SGLANG_GITHUB_REPO_URL: ${{ github.event.pull_request.head.repo.clone_url || 'https://github.com/zejunchen-zejun/sglang.git' }}
   SGLANG_GITHUB_COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.event.head_commit.id }}
 
@@ -39,7 +40,7 @@ jobs:
         run: |
           docker login -u rocmshared -p ${{ secrets.DOCKER_PASSWORD }} || true
           for i in {1..3}; do
-            docker pull ${{ github.event.inputs.image || 'rocm/ali-private:ubuntu22.04_rocm7.0.1.42_vllm_e858fc9_sglang_890fc1c_aiter_1b3efa9_torch2.8.0_20251125' }} && break || {
+            docker pull ${{ env.IMAGE }} && break || {
               echo "docker pull failed, retrying ($i/3)..."
               sleep 5
             }
@@ -48,7 +49,7 @@ jobs:
       - name: Generate Dockerfile
         run: |
           cat <<'EOF' > Dockerfile.mod
-          FROM ${{ github.event.inputs.image || 'rocm/ali-private:ubuntu22.04_rocm7.0.1.42_vllm_e858fc9_sglang_890fc1c_aiter_1b3efa9_torch2.8.0_20251125' }}
+          FROM ${{ env.IMAGE }}
           ENV GPU_ARCHS="gfx942"
           ENV PYTORCH_ROCM_ARCH="gfx942"
           RUN echo "=== Aiter version BEFORE uninstall ===" && pip show aiter || true

From c93275b44d71ad5fee3a1268b30989ea64db7a85 Mon Sep 17 00:00:00 2001
From: Xin Huang <Xin.Huang@amd.com>
Date: Tue, 16 Dec 2025 10:43:12 +0800
Subject: [PATCH 11/16] Use latest qwen3vl-project branch

---
 .github/workflows/sglang_benchmark_workflow.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/sglang_benchmark_workflow.yaml b/.github/workflows/sglang_benchmark_workflow.yaml
index f6695646beea..2a424151b8dd 100644
--- a/.github/workflows/sglang_benchmark_workflow.yaml
+++ b/.github/workflows/sglang_benchmark_workflow.yaml
@@ -62,7 +62,7 @@ jobs:
 
           RUN git clone https://github.com/ZLkanyo009/aiter.git /aiter && \
             cd /aiter && \
-            git checkout caa3a9b404b757f15effb7acfafb1bfa7573ecac && \
+            git checkout qwen3vl-project && \
             git submodule sync && git submodule update --init --recursive && \
             python3 setup.py develop
 

From e3c3b6f7e77e5e208d03a0a6b468e03fcea77c90 Mon Sep 17 00:00:00 2001
From: Xin Huang <Xin.Huang@amd.com>
Date: Tue, 16 Dec 2025 15:38:35 +0800
Subject: [PATCH 12/16] Optimize VL model command

---
 scripts/ci/sglang_benchmark_workflow.sh | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/scripts/ci/sglang_benchmark_workflow.sh b/scripts/ci/sglang_benchmark_workflow.sh
index 5b1acd4e61cc..130a8ceef2cf 100755
--- a/scripts/ci/sglang_benchmark_workflow.sh
+++ b/scripts/ci/sglang_benchmark_workflow.sh
@@ -24,20 +24,22 @@ if [[ "${TYPE}" == "launch" ]]; then
     echo "========== LAUNCHING SERVER ========"
     if [[ "${model_name}" == "Qwen3-VL-235B" ]]; then
         export SGLANG_USE_AITER=1
-        export SGLANG_ROCM_USE_AITER_PA_ASM_PRESHUFFLE_LAYOUT=1
         python3 -m sglang.launch_server \
             --model-path "${model_path}" \
             --host localhost \
             --port 9000 \
             --tp-size "${TP}" \
-            --ep-size "${EP}" \
             --trust-remote-code \
             --chunked-prefill-size 32768 \
-            --mem-fraction-static 0.6 \
+            --mem-fraction-static 0.90 \
             --disable-radix-cache \
             --max-prefill-tokens 32768 \
             --cuda-graph-max-bs 128 \
             --page-size 16 \
+            --attention-backend aiter_attn \
+            --mm-enable-dp-encoder \
+            --enable-aiter-allreduce-fusion \
+            --mm-processor-kwargs '{"max_pixels": 1638400, "min_pixels": 740}' \
             --watchdog-timeout 1200 &
         sglang_pid=$!
     elif [[ "${model_name}" == "Qwen3-next" ]]; then

From c2233dcf68fceb2507c95b152cee29f6d40c5461 Mon Sep 17 00:00:00 2001
From: Xin Huang <Xin.Huang@amd.com>
Date: Tue, 16 Dec 2025 15:53:10 +0800
Subject: [PATCH 13/16] Test

---
 scripts/ci/sglang_benchmark_workflow.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/ci/sglang_benchmark_workflow.sh b/scripts/ci/sglang_benchmark_workflow.sh
index 130a8ceef2cf..0e52d61b5b23 100755
--- a/scripts/ci/sglang_benchmark_workflow.sh
+++ b/scripts/ci/sglang_benchmark_workflow.sh
@@ -36,7 +36,7 @@ if [[ "${TYPE}" == "launch" ]]; then
             --max-prefill-tokens 32768 \
             --cuda-graph-max-bs 128 \
             --page-size 16 \
-            --attention-backend aiter_attn \
+            --mm-attention-backend aiter_attn \
             --mm-enable-dp-encoder \
             --enable-aiter-allreduce-fusion \
             --mm-processor-kwargs '{"max_pixels": 1638400, "min_pixels": 740}' \

From d6c0d8a32b81a788131b98efba53b9dc5a98c32c Mon Sep 17 00:00:00 2001
From: Xin Huang <Xin.Huang@amd.com>
Date: Tue, 16 Dec 2025 16:06:32 +0800
Subject: [PATCH 14/16] Update Omni command

---
 scripts/ci/sglang_benchmark_workflow.sh | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/scripts/ci/sglang_benchmark_workflow.sh b/scripts/ci/sglang_benchmark_workflow.sh
index 0e52d61b5b23..a2acbc1e1355 100755
--- a/scripts/ci/sglang_benchmark_workflow.sh
+++ b/scripts/ci/sglang_benchmark_workflow.sh
@@ -64,22 +64,26 @@ if [[ "${TYPE}" == "launch" ]]; then
         sglang_pid=$!
     elif [[ "${model_name}" == "Qwen3-Omni" ]]; then
         echo "Qwen3-Omni-Server Launch"
+        export SGLANG_USE_CUDA_IPC_TRANSPORT=1
+        export SGLANG_VLM_CACHE_SIZE_MB=0
         export SGLANG_USE_AITER=1
+        export USE_PA=1
         export SGLANG_ROCM_USE_AITER_PA_ASM_PRESHUFFLE_LAYOUT=0
+        export SGLANG_ROCM_USE_AITER_LINEAR_SHUFFLE=1
         python3 -m sglang.launch_server \
             --model-path "${model_path}" \
             --host localhost \
             --port 9000 \
             --tp-size ${TP} \
-            --ep-size ${EP} \
             --trust-remote-code \
             --mm-attention-backend "aiter_attn"\
-            --chunked-prefill-size 16384 \
+            --chunked-prefill-size 32768 \
             --mem-fraction-static 0.85 \
             --disable-radix-cache \
-            --max-prefill-tokens 16384 \
-            --cuda-graph-max-bs 64 \
+            --max-prefill-tokens 32768 \
+            --cuda-graph-max-bs 8 \
             --page-size 64  \
+            --mm-enable-dp-encoder \
             --watchdog-timeout 1200 &
         sglang_pid=$!
     else

From 59eb42d6e8799c09a02c6be0bd2e985777074a5b Mon Sep 17 00:00:00 2001
From: Xiake Sun <xiake.sun@amd.com>
Date: Tue, 16 Dec 2025 21:26:53 +0800
Subject: [PATCH 15/16] Update qwen3-omni benchmark arguments, remove dp
 setting

---
 .github/workflows/sglang_benchmark_workflow.yaml | 10 +++++-----
 scripts/ci/sglang_benchmark_workflow.sh          | 10 ++++------
 2 files changed, 9 insertions(+), 11 deletions(-)

diff --git a/.github/workflows/sglang_benchmark_workflow.yaml b/.github/workflows/sglang_benchmark_workflow.yaml
index 1e2ef5af2e5e..80ea95602b32 100644
--- a/.github/workflows/sglang_benchmark_workflow.yaml
+++ b/.github/workflows/sglang_benchmark_workflow.yaml
@@ -133,9 +133,9 @@ jobs:
           set -ex
           model_name=${{ matrix.model }}
           if [ "$model_name" == "Qwen3-VL-235B" ]; then
-            docker exec sglang_test bash -c "scripts/ci/sglang_benchmark_workflow.sh launch $model_name /models/RedHatAI/Qwen3-VL-235B-A22B-Instruct-FP8-dynamic/ 8 8 1"
+            docker exec sglang_test bash -c "scripts/ci/sglang_benchmark_workflow.sh launch $model_name /models/RedHatAI/Qwen3-VL-235B-A22B-Instruct-FP8-dynamic/ 8 8"
           elif [ "$model_name" == "Qwen3-next" ]; then
-            docker exec sglang_test bash -c "scripts/ci/sglang_benchmark_workflow.sh launch $model_name /models/Qwen/Qwen3-Next-80B-A3B-Instruct/ 4 1 1"
+            docker exec sglang_test bash -c "scripts/ci/sglang_benchmark_workflow.sh launch $model_name /models/Qwen/Qwen3-Next-80B-A3B-Instruct/ 4 1"
           elif [ "$model_name" == "Qwen3-Omni" ]; then
             docker exec sglang_test bash -c "scripts/ci/sglang_benchmark_workflow.sh launch $model_name /models/Qwen/Qwen3-Omni-30B-A3B-Instruct/ 4 1"
           else
@@ -149,9 +149,9 @@ jobs:
         run: |
           model_name=${{ matrix.model }}
           if [ "$model_name" == "Qwen3-VL-235B" ]; then
-            docker exec sglang_test bash -c "scripts/ci/sglang_benchmark_workflow.sh evaluation $model_name /models/RedHatAI/Qwen3-VL-235B-A22B-Instruct-FP8-dynamic/ 8 8 1"
+            docker exec sglang_test bash -c "scripts/ci/sglang_benchmark_workflow.sh evaluation $model_name /models/RedHatAI/Qwen3-VL-235B-A22B-Instruct-FP8-dynamic/ 8 8"
           elif [ "$model_name" == "Qwen3-next" ]; then
-            docker exec sglang_test bash -c "scripts/ci/sglang_benchmark_workflow.sh evaluation $model_name /models/Qwen/Qwen3-Next-80B-A3B-Instruct/ 4 1 1"
+            docker exec sglang_test bash -c "scripts/ci/sglang_benchmark_workflow.sh evaluation $model_name /models/Qwen/Qwen3-Next-80B-A3B-Instruct/ 4 1"
           elif [ "$model_name" == "Qwen3-Omni" ]; then
             docker exec sglang_test bash -c "scripts/ci/sglang_benchmark_workflow.sh evaluation $model_name /models/Qwen/Qwen3-Omni-30B-A3B-Instruct/ 4 1"
           else
@@ -165,7 +165,7 @@ jobs:
         run: |
           model_name=${{ matrix.model }}
           if [ "$model_name" == "Qwen3-Omni" ]; then
-            docker exec sglang_test bash -c "scripts/ci/sglang_benchmark_workflow.sh performance $model_name /models/Qwen/Qwen3-Omni-30B-A3B-Instruct/ 4 4 2"
+            docker exec sglang_test bash -c "scripts/ci/sglang_benchmark_workflow.sh performance $model_name /models/Qwen/Qwen3-Omni-30B-A3B-Instruct/ 4 1"
           else
             echo "Skip performance benchmark for model_name: ${model_name}"
             exit 0
diff --git a/scripts/ci/sglang_benchmark_workflow.sh b/scripts/ci/sglang_benchmark_workflow.sh
index b9bec024a640..3f98d8964101 100755
--- a/scripts/ci/sglang_benchmark_workflow.sh
+++ b/scripts/ci/sglang_benchmark_workflow.sh
@@ -18,7 +18,6 @@ echo "Detect model_name: ${model_name}"
 echo "Detect model_path ${model_path}"
 echo "Detect TP ${TP}"
 echo "Detect EP ${EP}"
-echo "Detect DP ${DP}"
 
 
 if [[ "${TYPE}" == "launch" ]]; then
@@ -53,7 +52,6 @@ if [[ "${TYPE}" == "launch" ]]; then
             --port 9000 \
             --tp-size ${TP} \
             --ep-size ${EP} \
-            --dp-size ${DP} \
             --trust-remote-code \
             --chunked-prefill-size 32768 \
             --mem-fraction-static 0.85 \
@@ -163,15 +161,15 @@ elif [[ "${TYPE}" == "performance" ]]; then
             --port 9000 \
             --model "${model_path}" \
             --dataset-name image \
-            --image-count 20 \
+            --image-count 10 \
             --image-resolution 960x1280 \
             --random-input-len 8000 \
             --random-output-len 500 \
-            --max-concurrency 2 \
+            --max-concurrency 1 \
             --num-prompts 128 \
             --flush-cache \
             --skip-special-tokens \
-            2>&1 | tee performance_benchmark_${model_name}_TP${TP}_EP${EP}_DP${DP}.log
+            2>&1 | tee performance_benchmark_${model_name}_TP${TP}_EP${EP}.log
     else
         echo "Unknown model_name: ${model_name}"
         exit 1
@@ -179,7 +177,7 @@ elif [[ "${TYPE}" == "performance" ]]; then
 
 else
     echo "Unknown TYPE: ${TYPE}"
-    echo "Usage: $0 {launch|evaluation|performance} [model_name] [model_path] [TP] [EP] [DP]"
+    echo "Usage: $0 {launch|evaluation|performance} [model_name] [model_path] [TP] [EP]"
     exit 1
 fi
 

From ce41b7f39069aec789e9cb031ea72d2184c1fd25 Mon Sep 17 00:00:00 2001
From: Xiake Sun <xiake.sun@amd.com>
Date: Tue, 16 Dec 2025 21:28:51 +0800
Subject: [PATCH 16/16] Remove dp setting

---
 scripts/ci/sglang_benchmark_workflow.sh | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/scripts/ci/sglang_benchmark_workflow.sh b/scripts/ci/sglang_benchmark_workflow.sh
index 3f98d8964101..458eddfeb4ab 100755
--- a/scripts/ci/sglang_benchmark_workflow.sh
+++ b/scripts/ci/sglang_benchmark_workflow.sh
@@ -7,7 +7,6 @@ model_name=${2:-Qwen3-VL-235B}
 model_path=${3:-/models/Qwen3-VL-235B-A22B-Instruct-FP8-dynamic/}
 TP=${4:-8}
 EP=${5:-8}
-DP=${6:-8}
 
 export SGLANG_TORCH_PROFILER_DIR=./
 export SGLANG_PROFILE_WITH_STACK=1
@@ -149,7 +148,7 @@ elif [[ "${TYPE}" == "evaluation" ]]; then
     python3 benchmark/mmmu/bench_sglang.py \
         --port 9000 \
         --concurrency 16 \
-        | tee vision_model_evaluation_${model_name}_TP${TP}_EP${EP}_DP${DP}.log
+        | tee vision_model_evaluation_${model_name}_TP${TP}_EP${EP}.log
 
 elif [[ "${TYPE}" == "performance" ]]; then
     echo