From 58752561d7144ef4681d06ef0d4f3c078adc304a Mon Sep 17 00:00:00 2001 From: Xiake Sun Date: Wed, 10 Dec 2025 17:13:41 +0800 Subject: [PATCH 01/16] CI: Enable Qwen3-Omni performance test Signed-off-by: Xiake Sun --- .../workflows/sglang_benchmark_workflow.yaml | 1 + scripts/ci/sglang_benchmark_workflow.sh | 30 +++++++++++-------- 2 files changed, 18 insertions(+), 13 deletions(-) diff --git a/.github/workflows/sglang_benchmark_workflow.yaml b/.github/workflows/sglang_benchmark_workflow.yaml index b250505d1e11..68d330be8ada 100644 --- a/.github/workflows/sglang_benchmark_workflow.yaml +++ b/.github/workflows/sglang_benchmark_workflow.yaml @@ -143,6 +143,7 @@ jobs: docker exec sglang_test bash -c "scripts/ci/sglang_benchmark_workflow.sh evaluation $model_name /models/Qwen/Qwen3-Next-80B-A3B-Instruct/ 4 1" elif [ "$model_name" == "Qwen3-Omni" ]; then docker exec sglang_test bash -c "scripts/ci/sglang_benchmark_workflow.sh evaluation $model_name /models/Qwen/Qwen3-Omni-30B-A3B-Instruct/ 4 2" + docker exec sglang_test bash -c "scripts/ci/sglang_benchmark_workflow.sh performance $model_name /models/Qwen/Qwen3-Omni-30B-A3B-Instruct/ 4 2" else echo "Unknown model_name: ${model_name}" exit 1 diff --git a/scripts/ci/sglang_benchmark_workflow.sh b/scripts/ci/sglang_benchmark_workflow.sh index 6a59aa7d336c..b84a2e77f346 100755 --- a/scripts/ci/sglang_benchmark_workflow.sh +++ b/scripts/ci/sglang_benchmark_workflow.sh @@ -60,6 +60,8 @@ if [[ "${TYPE}" == "launch" ]]; then elif [[ "${model_name}" == "Qwen3-Omni" ]]; then echo "Qwen3-Omni-Server Launch" export SGLANG_USE_AITER=1 + export SGLANG_ROCM_USE_AITER_PA_ASM_PRESHUFFLE_LAYOUT=0 + export SGLANG_VLM_CACHE_SIZE_MB=0 python3 -m sglang.launch_server \ --model-path "${model_path}" \ --host localhost \ @@ -68,11 +70,11 @@ if [[ "${TYPE}" == "launch" ]]; then --ep-size ${EP} \ --trust-remote-code \ --mm-attention-backend "aiter_attn"\ - --chunked-prefill-size 16384 \ + --chunked-prefill-size 32768 \ --mem-fraction-static 0.85 \
--disable-radix-cache \ - --max-prefill-tokens 16384 \ - --cuda-graph-max-bs 64 \ + --max-prefill-tokens 32768 \ + --cuda-graph-max-bs 8 \ --page-size 64 & sglang_pid=$! else @@ -144,16 +146,18 @@ elif [[ "${TYPE}" == "evaluation" ]]; then elif [[ "${TYPE}" == "performance" ]]; then echo echo "========== STARTING PERFORMANCE BENCHMARK ==========" - python3 -m sglang.bench_serving \ - --backend sglang-oai-chat \ - --dataset-name image \ - --image-count 1 \ - --image-resolution 800x800 \ - --random-input-len 1000 \ - --random-output-len 2000 \ - --max-concurrency 64 \ - --num-prompts 192 \ - | tee performance_benchmark_${model_name}_TP${TP}_EP${EP}.log + if [[ "${model_name}" == "Qwen3-Omni" ]]; then + python3 -m sglang.bench_serving \ + --backend sglang-oai-chat \ + --dataset-name image \ + --image-count 20 \ + --image-resolution 960x1280 \ + --random-input-len 8000 \ + --random-output-len 500 \ + --max-concurrency 2 \ + --num-prompts 128 \ + --skip-special-tokens \ + | tee performance_benchmark_${model_name}_TP${TP}_EP${EP}.log else echo "Unknown TYPE: ${TYPE}" From d1c32571028411db859d0dfd2447a92fb32974d2 Mon Sep 17 00:00:00 2001 From: Xiake Sun Date: Wed, 10 Dec 2025 17:31:52 +0800 Subject: [PATCH 02/16] Create separate performance benchmark task Signed-off-by: Xiake Sun --- .github/workflows/sglang_benchmark_workflow.yaml | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/.github/workflows/sglang_benchmark_workflow.yaml b/.github/workflows/sglang_benchmark_workflow.yaml index 68d330be8ada..5f202bbda408 100644 --- a/.github/workflows/sglang_benchmark_workflow.yaml +++ b/.github/workflows/sglang_benchmark_workflow.yaml @@ -143,6 +143,17 @@ jobs: docker exec sglang_test bash -c "scripts/ci/sglang_benchmark_workflow.sh evaluation $model_name /models/Qwen/Qwen3-Next-80B-A3B-Instruct/ 4 1" elif [ "$model_name" == "Qwen3-Omni" ]; then docker exec sglang_test bash -c "scripts/ci/sglang_benchmark_workflow.sh evaluation $model_name 
/models/Qwen/Qwen3-Omni-30B-A3B-Instruct/ 4 2" + else + echo "Unknown model_name: ${model_name}" + exit 1 + fi + + - name: Run performance benchmark + continue-on-error: true + timeout-minutes: 60 + run: | + model_name=${{ matrix.model }} + if [ "$model_name" == "Qwen3-Omni" ]; then docker exec sglang_test bash -c "scripts/ci/sglang_benchmark_workflow.sh performance $model_name /models/Qwen/Qwen3-Omni-30B-A3B-Instruct/ 4 2" else echo "Unknown model_name: ${model_name}" From dfe35f036a353af8e099046c4632c4596cd05e56 Mon Sep 17 00:00:00 2001 From: Xiake Sun Date: Wed, 10 Dec 2025 17:43:41 +0800 Subject: [PATCH 03/16] Apply review comments Signed-off-by: Xiake Sun --- scripts/ci/sglang_benchmark_workflow.sh | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/scripts/ci/sglang_benchmark_workflow.sh b/scripts/ci/sglang_benchmark_workflow.sh index b84a2e77f346..7b65e7dcb36a 100755 --- a/scripts/ci/sglang_benchmark_workflow.sh +++ b/scripts/ci/sglang_benchmark_workflow.sh @@ -69,7 +69,7 @@ if [[ "${TYPE}" == "launch" ]]; then --tp-size ${TP} \ --ep-size ${EP} \ --trust-remote-code \ - --mm-attention-backend "aiter_attn"\ + --mm-attention-backend "aiter_attn" \ --chunked-prefill-size 32768 \ --mem-fraction-static 0.85 \ --disable-radix-cache \ @@ -158,6 +158,10 @@ elif [[ "${TYPE}" == "performance" ]]; then --num-prompts 128 \ --skip-special-tokens \ | tee performance_benchmark_${model_name}_TP${TP}_EP${EP}.log + else + echo "Unknown model_name: ${model_name}" + exit 1 + fi else echo "Unknown TYPE: ${TYPE}" From c1f4ba28db6bbc30a5df52e10d53f9a5155d4ebd Mon Sep 17 00:00:00 2001 From: Xiake Sun Date: Wed, 10 Dec 2025 18:36:09 +0800 Subject: [PATCH 04/16] Add --dp-size option for CI test Signed-off-by: Xiake Sun --- .../workflows/sglang_benchmark_workflow.yaml | 18 +++++++++--------- scripts/ci/sglang_benchmark_workflow.sh | 11 ++++++++--- 2 files changed, 17 insertions(+), 12 deletions(-) diff --git a/.github/workflows/sglang_benchmark_workflow.yaml 
b/.github/workflows/sglang_benchmark_workflow.yaml index 5f202bbda408..043e5b99368c 100644 --- a/.github/workflows/sglang_benchmark_workflow.yaml +++ b/.github/workflows/sglang_benchmark_workflow.yaml @@ -122,11 +122,11 @@ jobs: set -ex model_name=${{ matrix.model }} if [ "$model_name" == "Qwen3-VL-235B" ]; then - docker exec sglang_test bash -c "scripts/ci/sglang_benchmark_workflow.sh launch $model_name /models/RedHatAI/Qwen3-VL-235B-A22B-Instruct-FP8-dynamic/ 8 8" + docker exec sglang_test bash -c "scripts/ci/sglang_benchmark_workflow.sh launch $model_name /models/RedHatAI/Qwen3-VL-235B-A22B-Instruct-FP8-dynamic/ 8 8 1" elif [ "$model_name" == "Qwen3-next" ]; then - docker exec sglang_test bash -c "scripts/ci/sglang_benchmark_workflow.sh launch $model_name /models/Qwen/Qwen3-Next-80B-A3B-Instruct/ 4 1" + docker exec sglang_test bash -c "scripts/ci/sglang_benchmark_workflow.sh launch $model_name /models/Qwen/Qwen3-Next-80B-A3B-Instruct/ 4 1 1" elif [ "$model_name" == "Qwen3-Omni" ]; then - docker exec sglang_test bash -c "scripts/ci/sglang_benchmark_workflow.sh launch $model_name /models/Qwen/Qwen3-Omni-30B-A3B-Instruct/ 4 2" + docker exec sglang_test bash -c "scripts/ci/sglang_benchmark_workflow.sh launch $model_name /models/Qwen/Qwen3-Omni-30B-A3B-Instruct/ 4 4 2" else echo "Unknown model_name: ${model_name}" exit 1 @@ -138,11 +138,11 @@ jobs: run: | model_name=${{ matrix.model }} if [ "$model_name" == "Qwen3-VL-235B" ]; then - docker exec sglang_test bash -c "scripts/ci/sglang_benchmark_workflow.sh evaluation $model_name /models/RedHatAI/Qwen3-VL-235B-A22B-Instruct-FP8-dynamic/ 8 8" + docker exec sglang_test bash -c "scripts/ci/sglang_benchmark_workflow.sh evaluation $model_name /models/RedHatAI/Qwen3-VL-235B-A22B-Instruct-FP8-dynamic/ 8 8 1" elif [ "$model_name" == "Qwen3-next" ]; then - docker exec sglang_test bash -c "scripts/ci/sglang_benchmark_workflow.sh evaluation $model_name /models/Qwen/Qwen3-Next-80B-A3B-Instruct/ 4 1" + docker exec sglang_test bash 
-c "scripts/ci/sglang_benchmark_workflow.sh evaluation $model_name /models/Qwen/Qwen3-Next-80B-A3B-Instruct/ 4 1 1" elif [ "$model_name" == "Qwen3-Omni" ]; then - docker exec sglang_test bash -c "scripts/ci/sglang_benchmark_workflow.sh evaluation $model_name /models/Qwen/Qwen3-Omni-30B-A3B-Instruct/ 4 2" + docker exec sglang_test bash -c "scripts/ci/sglang_benchmark_workflow.sh evaluation $model_name /models/Qwen/Qwen3-Omni-30B-A3B-Instruct/ 4 4 2" else echo "Unknown model_name: ${model_name}" exit 1 @@ -154,10 +154,10 @@ jobs: run: | model_name=${{ matrix.model }} if [ "$model_name" == "Qwen3-Omni" ]; then - docker exec sglang_test bash -c "scripts/ci/sglang_benchmark_workflow.sh performance $model_name /models/Qwen/Qwen3-Omni-30B-A3B-Instruct/ 4 2" + docker exec sglang_test bash -c "scripts/ci/sglang_benchmark_workflow.sh performance $model_name /models/Qwen/Qwen3-Omni-30B-A3B-Instruct/ 4 4 2" else - echo "Unknown model_name: ${model_name}" - exit 1 + echo "Skip performance benchmark for model_name: ${model_name}" + exit 0 fi - name: Clean Up diff --git a/scripts/ci/sglang_benchmark_workflow.sh b/scripts/ci/sglang_benchmark_workflow.sh index 7b65e7dcb36a..3513afa1cfee 100755 --- a/scripts/ci/sglang_benchmark_workflow.sh +++ b/scripts/ci/sglang_benchmark_workflow.sh @@ -7,6 +7,7 @@ model_name=${2:-Qwen3-VL-235B} model_path=${3:-/models/Qwen3-VL-235B-A22B-Instruct-FP8-dynamic/} TP=${4:-8} EP=${5:-8} +DP=${6:-8} export SGLANG_TORCH_PROFILER_DIR=./ export SGLANG_PROFILE_WITH_STACK=1 @@ -17,6 +18,7 @@ echo "Detect model_name: ${model_name}" echo "Detect model_path ${model_path}" echo "Detect TP ${TP}" echo "Detect EP ${EP}" +echo "Detect DP ${EP}" if [[ "${TYPE}" == "launch" ]]; then @@ -31,6 +33,7 @@ if [[ "${TYPE}" == "launch" ]]; then --port 9000 \ --tp-size "${TP}" \ --ep-size "${EP}" \ + --dp-size "${DP}" \ --trust-remote-code \ --chunked-prefill-size 32768 \ --mem-fraction-static 0.6 \ @@ -47,6 +50,7 @@ if [[ "${TYPE}" == "launch" ]]; then --port 9000 \ 
--tp-size ${TP} \ --ep-size ${EP} \ + --dp-size ${DP} \ --trust-remote-code \ --chunked-prefill-size 32768 \ --mem-fraction-static 0.85 \ @@ -68,6 +72,7 @@ if [[ "${TYPE}" == "launch" ]]; then --port 9000 \ --tp-size ${TP} \ --ep-size ${EP} \ + --dp-size ${DP} \ --trust-remote-code \ --mm-attention-backend "aiter_attn" \ --chunked-prefill-size 32768 \ @@ -141,7 +146,7 @@ elif [[ "${TYPE}" == "evaluation" ]]; then python3 benchmark/mmmu/bench_sglang.py \ --port 9000 \ --concurrency 16 \ - | tee vision_model_evaluation_${model_name}_TP${TP}_EP${EP}.log + | tee vision_model_evaluation_${model_name}_TP${TP}_EP${EP}_DP${DP}.log elif [[ "${TYPE}" == "performance" ]]; then echo @@ -157,7 +162,7 @@ elif [[ "${TYPE}" == "performance" ]]; then --max-concurrency 2 \ --num-prompts 128 \ --skip-special-tokens \ - | tee performance_benchmark_${model_name}_TP${TP}_EP${EP}.log + | tee performance_benchmark_${model_name}_TP${TP}_EP${EP}_DP${DP}.log else echo "Unknown model_name: ${model_name}" exit 1 @@ -165,7 +170,7 @@ elif [[ "${TYPE}" == "performance" ]]; then else echo "Unknown TYPE: ${TYPE}" - echo "Usage: $0 {launch|evaluation|performance} [model_name] [model_path] [TP] [EP]" + echo "Usage: $0 {launch|evaluation|performance} [model_name] [model_path] [TP] [EP] [DP]" exit 1 fi From d833cb8abb95cb979e64edd073f9c6117848b700 Mon Sep 17 00:00:00 2001 From: Xiake Sun Date: Wed, 10 Dec 2025 22:48:30 +0800 Subject: [PATCH 05/16] Minor fix Signed-off-by: Xiake Sun --- scripts/ci/sglang_benchmark_workflow.sh | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/scripts/ci/sglang_benchmark_workflow.sh b/scripts/ci/sglang_benchmark_workflow.sh index 3513afa1cfee..f350d48bbedd 100755 --- a/scripts/ci/sglang_benchmark_workflow.sh +++ b/scripts/ci/sglang_benchmark_workflow.sh @@ -18,7 +18,7 @@ echo "Detect model_name: ${model_name}" echo "Detect model_path ${model_path}" echo "Detect TP ${TP}" echo "Detect EP ${EP}" -echo "Detect DP ${EP}" +echo "Detect DP ${DP}" if [[ 
"${TYPE}" == "launch" ]]; then @@ -154,6 +154,9 @@ elif [[ "${TYPE}" == "performance" ]]; then if [[ "${model_name}" == "Qwen3-Omni" ]]; then python3 -m sglang.bench_serving \ --backend sglang-oai-chat \ + --host localhost \ + --port 9000 \ + --model "${model_path}" \ --dataset-name image \ --image-count 20 \ --image-resolution 960x1280 \ @@ -161,8 +164,9 @@ elif [[ "${TYPE}" == "performance" ]]; then --random-output-len 500 \ --max-concurrency 2 \ --num-prompts 128 \ + --flush-cache \ --skip-special-tokens \ - | tee performance_benchmark_${model_name}_TP${TP}_EP${EP}_DP${DP}.log + 2>&1 | tee performance_benchmark_${model_name}_TP${TP}_EP${EP}_DP${DP}.log else echo "Unknown model_name: ${model_name}" exit 1 From 5d2c60304b56e7f9559bfeff9d92e3458348ece8 Mon Sep 17 00:00:00 2001 From: Xiake Sun Date: Wed, 10 Dec 2025 23:26:11 +0800 Subject: [PATCH 06/16] Install sglang python package Signed-off-by: Xiake Sun --- .github/workflows/sglang_benchmark_workflow.yaml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/workflows/sglang_benchmark_workflow.yaml b/.github/workflows/sglang_benchmark_workflow.yaml index 043e5b99368c..96161e6f1263 100644 --- a/.github/workflows/sglang_benchmark_workflow.yaml +++ b/.github/workflows/sglang_benchmark_workflow.yaml @@ -72,7 +72,10 @@ jobs: pip install --upgrade pip && \ pip install "transformers>=4.57.0" && \ cd sgl-kernel && \ - python3 setup_rocm.py install + python3 setup_rocm.py install && \ + cd .. 
&& \ + rm -rf python/pyproject.toml && mv python/pyproject_other.toml python/pyproject.toml && \ + pip install -e "python[all_hip]" EOF - name: Show dockerfile From d538dcbf19514d7fbd5e16a62f735b806d735bfa Mon Sep 17 00:00:00 2001 From: Xiake Sun Date: Thu, 11 Dec 2025 08:38:48 +0800 Subject: [PATCH 07/16] Debug launch server error --- .github/workflows/sglang_benchmark_workflow.yaml | 2 +- scripts/ci/sglang_benchmark_workflow.sh | 9 ++++++--- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/.github/workflows/sglang_benchmark_workflow.yaml b/.github/workflows/sglang_benchmark_workflow.yaml index 96161e6f1263..a569192372b9 100644 --- a/.github/workflows/sglang_benchmark_workflow.yaml +++ b/.github/workflows/sglang_benchmark_workflow.yaml @@ -75,7 +75,7 @@ jobs: python3 setup_rocm.py install && \ cd .. && \ rm -rf python/pyproject.toml && mv python/pyproject_other.toml python/pyproject.toml && \ - pip install -e "python[all_hip]" + PYTORCH_ROCM_ARCH="gfx942" pip install -e "python[all_hip]" EOF - name: Show dockerfile diff --git a/scripts/ci/sglang_benchmark_workflow.sh b/scripts/ci/sglang_benchmark_workflow.sh index f350d48bbedd..61559d6af1f3 100755 --- a/scripts/ci/sglang_benchmark_workflow.sh +++ b/scripts/ci/sglang_benchmark_workflow.sh @@ -39,7 +39,8 @@ if [[ "${TYPE}" == "launch" ]]; then --mem-fraction-static 0.6 \ --disable-radix-cache \ --max-prefill-tokens 32768 \ - --cuda-graph-max-bs 128 & + --cuda-graph-max-bs 128 \ + 2>&1 | tee launch_server_${model_name}_TP${TP}_EP${EP}_DP${DP}.log & sglang_pid=$! elif [[ "${model_name}" == "Qwen3-next" ]]; then export SGLANG_USE_AITER=1 @@ -59,7 +60,8 @@ if [[ "${TYPE}" == "launch" ]]; then --cuda-graph-max-bs 256 \ --page-size 64 \ --attention-backend triton \ - --max-running-requests 128 & + --max-running-requests 128 \ + 2>&1 | tee launch_server_${model_name}_TP${TP}_EP${EP}_DP${DP}.log & sglang_pid=$!
elif [[ "${model_name}" == "Qwen3-Omni" ]]; then echo "Qwen3-Omni-Server Launch" @@ -80,7 +82,8 @@ if [[ "${TYPE}" == "launch" ]]; then --disable-radix-cache \ --max-prefill-tokens 32768 \ --cuda-graph-max-bs 8 \ - --page-size 64 & + --page-size 64 \ + 2>&1 | tee launch_server_${model_name}_TP${TP}_EP${EP}_DP${DP}.log & sglang_pid=$! else echo "Unknown model_name: ${model_name}" From a88b0629abd4b01fa5f289e3cadcaad88568cb86 Mon Sep 17 00:00:00 2001 From: Xiake Sun Date: Thu, 11 Dec 2025 11:06:56 +0800 Subject: [PATCH 08/16] Adjust page size for Qwen3-VL for pa asm with layout preshuffle Signed-off-by: Xiake Sun --- scripts/ci/sglang_benchmark_workflow.sh | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/scripts/ci/sglang_benchmark_workflow.sh b/scripts/ci/sglang_benchmark_workflow.sh index 61559d6af1f3..ec853576c354 100755 --- a/scripts/ci/sglang_benchmark_workflow.sh +++ b/scripts/ci/sglang_benchmark_workflow.sh @@ -40,11 +40,12 @@ if [[ "${TYPE}" == "launch" ]]; then --disable-radix-cache \ --max-prefill-tokens 32768 \ --cuda-graph-max-bs 128 \ - 2>&1 | tee launch_server_${model_name}_TP${TP}_EP${EP}_DP${DP}.log & + --page-size 64 \ + 2>&1 | tee launch_server_${model_name}_TP${TP}_EP${EP}_DP${DP}.log & sglang_pid=$! elif [[ "${model_name}" == "Qwen3-next" ]]; then export SGLANG_USE_AITER=1 - export SGLANG_ROCM_USE_AITER_PA_ASM_PRESHUFFLE_LAYOUT=1 + export SGLANG_ROCM_USE_AITER_PA_ASM_PRESHUFFLE_LAYOUT=0 python3 -m sglang.launch_server \ --model-path "${model_path}" \ --host localhost \ @@ -61,7 +62,7 @@ if [[ "${TYPE}" == "launch" ]]; then --page-size 64 \ --attention-backend triton \ --max-running-requests 128 \ - 2>&1 | tee launch_server_${model_name}_TP${TP}_EP${EP}_DP${DP}.log & + 2>&1 | tee launch_server_${model_name}_TP${TP}_EP${EP}_DP${DP}.log & sglang_pid=$! 
elif [[ "${model_name}" == "Qwen3-Omni" ]]; then echo "Qwen3-Omni-Server Launch" @@ -83,7 +84,7 @@ if [[ "${TYPE}" == "launch" ]]; then --max-prefill-tokens 32768 \ --cuda-graph-max-bs 8 \ --page-size 64 \ - 2>&1 | tee launch_server_${model_name}_TP${TP}_EP${EP}_DP${DP}.log & + 2>&1 | tee launch_server_${model_name}_TP${TP}_EP${EP}_DP${DP}.log & sglang_pid=$! else echo "Unknown model_name: ${model_name}" From 64a7ceef99e9612d6ce65287a9c83e625f2c86ef Mon Sep 17 00:00:00 2001 From: Xiake Sun Date: Thu, 11 Dec 2025 11:58:21 +0800 Subject: [PATCH 09/16] set --page-size 64 for Qwen3-VL Signed-off-by: Xiake Sun --- scripts/ci/sglang_benchmark_workflow.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/ci/sglang_benchmark_workflow.sh b/scripts/ci/sglang_benchmark_workflow.sh index ec853576c354..e8ddb5ef8071 100755 --- a/scripts/ci/sglang_benchmark_workflow.sh +++ b/scripts/ci/sglang_benchmark_workflow.sh @@ -40,7 +40,7 @@ if [[ "${TYPE}" == "launch" ]]; then --disable-radix-cache \ --max-prefill-tokens 32768 \ --cuda-graph-max-bs 128 \ - --page-size 64 \ + --page-size 16 \ 2>&1 | tee launch_server_${model_name}_TP${TP}_EP${EP}_DP${DP}.log & sglang_pid=$! 
elif [[ "${model_name}" == "Qwen3-next" ]]; then From 0052dcd37bdd1eada124ce8111a41c470158c5b4 Mon Sep 17 00:00:00 2001 From: Xin Huang Date: Mon, 15 Dec 2025 17:06:57 +0800 Subject: [PATCH 10/16] CI: Update image in CI --- .github/workflows/sglang_benchmark_workflow.yaml | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/.github/workflows/sglang_benchmark_workflow.yaml b/.github/workflows/sglang_benchmark_workflow.yaml index 46fbad283b67..f6695646beea 100644 --- a/.github/workflows/sglang_benchmark_workflow.yaml +++ b/.github/workflows/sglang_benchmark_workflow.yaml @@ -11,15 +11,16 @@ on: required: false default: '["Qwen3-VL-235B","Qwen3-next","Qwen3-Omni"]' image: - description: 'Image to use for the benchmark (default: "rocm/ali-private:ubuntu22.04_rocm7.0.1.42_vllm_e858fc9_sglang_890fc1c_aiter_1b3efa9_torch2.8.0_20251125")' + description: 'Image to use for the benchmark (default: "rocm/ali-private:ubuntu22.04_rocm6.4.3.127_sglang_f25820a_aiter_5fad56f_20251212")' required: true - default: "rocm/ali-private:ubuntu22.04_rocm7.0.1.42_vllm_e858fc9_sglang_890fc1c_aiter_1b3efa9_torch2.8.0_20251125" + default: "rocm/ali-private:ubuntu22.04_rocm6.4.3.127_sglang_f25820a_aiter_5fad56f_20251212" concurrency: group: ${{ github.workflow }}-${{ github.ref }} cancel-in-progress: true env: + IMAGE: ${{ github.event.inputs.image || 'rocm/ali-private:ubuntu22.04_rocm6.4.3.127_sglang_f25820a_aiter_5fad56f_20251212' }} SGLANG_GITHUB_REPO_URL: ${{ github.event.pull_request.head.repo.clone_url || 'https://github.com/zejunchen-zejun/sglang.git' }} SGLANG_GITHUB_COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.event.head_commit.id }} @@ -39,7 +40,7 @@ jobs: run: | docker login -u rocmshared -p ${{ secrets.DOCKER_PASSWORD }} || true for i in {1..3}; do - docker pull ${{ github.event.inputs.image || 'rocm/ali-private:ubuntu22.04_rocm7.0.1.42_vllm_e858fc9_sglang_890fc1c_aiter_1b3efa9_torch2.8.0_20251125' }} && break || { + docker pull ${{ env.IMAGE }} 
&& break || { echo "docker pull failed, retrying ($i/3)..." sleep 5 } @@ -48,7 +49,7 @@ jobs: - name: Generate Dockerfile run: | cat <<'EOF' > Dockerfile.mod - FROM ${{ github.event.inputs.image || 'rocm/ali-private:ubuntu22.04_rocm7.0.1.42_vllm_e858fc9_sglang_890fc1c_aiter_1b3efa9_torch2.8.0_20251125' }} + FROM ${{ env.IMAGE }} ENV GPU_ARCHS="gfx942" ENV PYTORCH_ROCM_ARCH="gfx942" RUN echo "=== Aiter version BEFORE uninstall ===" && pip show aiter || true From c93275b44d71ad5fee3a1268b30989ea64db7a85 Mon Sep 17 00:00:00 2001 From: Xin Huang Date: Tue, 16 Dec 2025 10:43:12 +0800 Subject: [PATCH 11/16] Use latest qwen3vl-project branch --- .github/workflows/sglang_benchmark_workflow.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/sglang_benchmark_workflow.yaml b/.github/workflows/sglang_benchmark_workflow.yaml index f6695646beea..2a424151b8dd 100644 --- a/.github/workflows/sglang_benchmark_workflow.yaml +++ b/.github/workflows/sglang_benchmark_workflow.yaml @@ -62,7 +62,7 @@ jobs: RUN git clone https://github.com/ZLkanyo009/aiter.git /aiter && \ cd /aiter && \ - git checkout caa3a9b404b757f15effb7acfafb1bfa7573ecac && \ + git checkout qwen3vl-project && \ git submodule sync && git submodule update --init --recursive && \ python3 setup.py develop From e3c3b6f7e77e5e208d03a0a6b468e03fcea77c90 Mon Sep 17 00:00:00 2001 From: Xin Huang Date: Tue, 16 Dec 2025 15:38:35 +0800 Subject: [PATCH 12/16] Optimize VL model command --- scripts/ci/sglang_benchmark_workflow.sh | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/scripts/ci/sglang_benchmark_workflow.sh b/scripts/ci/sglang_benchmark_workflow.sh index 5b1acd4e61cc..130a8ceef2cf 100755 --- a/scripts/ci/sglang_benchmark_workflow.sh +++ b/scripts/ci/sglang_benchmark_workflow.sh @@ -24,20 +24,22 @@ if [[ "${TYPE}" == "launch" ]]; then echo "========== LAUNCHING SERVER ========" if [[ "${model_name}" == "Qwen3-VL-235B" ]]; then export SGLANG_USE_AITER=1 - export 
SGLANG_ROCM_USE_AITER_PA_ASM_PRESHUFFLE_LAYOUT=1 python3 -m sglang.launch_server \ --model-path "${model_path}" \ --host localhost \ --port 9000 \ --tp-size "${TP}" \ - --ep-size "${EP}" \ --trust-remote-code \ --chunked-prefill-size 32768 \ - --mem-fraction-static 0.6 \ + --mem-fraction-static 0.90 \ --disable-radix-cache \ --max-prefill-tokens 32768 \ --cuda-graph-max-bs 128 \ --page-size 16 \ + --attention-backend aiter_attn \ + --mm-enable-dp-encoder \ + --enable-aiter-allreduce-fusion \ + --mm-processor-kwargs '{"max_pixels": 1638400, "min_pixels": 740}' \ --watchdog-timeout 1200 & sglang_pid=$! elif [[ "${model_name}" == "Qwen3-next" ]]; then From c2233dcf68fceb2507c95b152cee29f6d40c5461 Mon Sep 17 00:00:00 2001 From: Xin Huang Date: Tue, 16 Dec 2025 15:53:10 +0800 Subject: [PATCH 13/16] Test --- scripts/ci/sglang_benchmark_workflow.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/ci/sglang_benchmark_workflow.sh b/scripts/ci/sglang_benchmark_workflow.sh index 130a8ceef2cf..0e52d61b5b23 100755 --- a/scripts/ci/sglang_benchmark_workflow.sh +++ b/scripts/ci/sglang_benchmark_workflow.sh @@ -36,7 +36,7 @@ if [[ "${TYPE}" == "launch" ]]; then --max-prefill-tokens 32768 \ --cuda-graph-max-bs 128 \ --page-size 16 \ - --attention-backend aiter_attn \ + --mm-attention-backend aiter_attn \ --mm-enable-dp-encoder \ --enable-aiter-allreduce-fusion \ --mm-processor-kwargs '{"max_pixels": 1638400, "min_pixels": 740}' \ From d6c0d8a32b81a788131b98efba53b9dc5a98c32c Mon Sep 17 00:00:00 2001 From: Xin Huang Date: Tue, 16 Dec 2025 16:06:32 +0800 Subject: [PATCH 14/16] Update Omni command --- scripts/ci/sglang_benchmark_workflow.sh | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/scripts/ci/sglang_benchmark_workflow.sh b/scripts/ci/sglang_benchmark_workflow.sh index 0e52d61b5b23..a2acbc1e1355 100755 --- a/scripts/ci/sglang_benchmark_workflow.sh +++ b/scripts/ci/sglang_benchmark_workflow.sh @@ -64,22 +64,26 @@ if [[ 
"${TYPE}" == "launch" ]]; then sglang_pid=$! elif [[ "${model_name}" == "Qwen3-Omni" ]]; then echo "Qwen3-Omni-Server Launch" + export SGLANG_USE_CUDA_IPC_TRANSPORT=1 + export SGLANG_VLM_CACHE_SIZE_MB=0 export SGLANG_USE_AITER=1 + export USE_PA=1 export SGLANG_ROCM_USE_AITER_PA_ASM_PRESHUFFLE_LAYOUT=0 + export SGLANG_ROCM_USE_AITER_LINEAR_SHUFFLE=1 python3 -m sglang.launch_server \ --model-path "${model_path}" \ --host localhost \ --port 9000 \ --tp-size ${TP} \ - --ep-size ${EP} \ --trust-remote-code \ --mm-attention-backend "aiter_attn"\ - --chunked-prefill-size 16384 \ + --chunked-prefill-size 32768 \ --mem-fraction-static 0.85 \ --disable-radix-cache \ - --max-prefill-tokens 16384 \ - --cuda-graph-max-bs 64 \ + --max-prefill-tokens 32768 \ + --cuda-graph-max-bs 8 \ --page-size 64 \ + --mm-enable-dp-encoder \ --watchdog-timeout 1200 & sglang_pid=$! else From 59eb42d6e8799c09a02c6be0bd2e985777074a5b Mon Sep 17 00:00:00 2001 From: Xiake Sun Date: Tue, 16 Dec 2025 21:26:53 +0800 Subject: [PATCH 15/16] Update qwen3-omni benchmark arguments, remove dp setting --- .github/workflows/sglang_benchmark_workflow.yaml | 10 +++++----- scripts/ci/sglang_benchmark_workflow.sh | 10 ++++------ 2 files changed, 9 insertions(+), 11 deletions(-) diff --git a/.github/workflows/sglang_benchmark_workflow.yaml b/.github/workflows/sglang_benchmark_workflow.yaml index 1e2ef5af2e5e..80ea95602b32 100644 --- a/.github/workflows/sglang_benchmark_workflow.yaml +++ b/.github/workflows/sglang_benchmark_workflow.yaml @@ -133,9 +133,9 @@ jobs: set -ex model_name=${{ matrix.model }} if [ "$model_name" == "Qwen3-VL-235B" ]; then - docker exec sglang_test bash -c "scripts/ci/sglang_benchmark_workflow.sh launch $model_name /models/RedHatAI/Qwen3-VL-235B-A22B-Instruct-FP8-dynamic/ 8 8 1" + docker exec sglang_test bash -c "scripts/ci/sglang_benchmark_workflow.sh launch $model_name /models/RedHatAI/Qwen3-VL-235B-A22B-Instruct-FP8-dynamic/ 8 8" elif [ "$model_name" == "Qwen3-next" ]; then - docker exec 
sglang_test bash -c "scripts/ci/sglang_benchmark_workflow.sh launch $model_name /models/Qwen/Qwen3-Next-80B-A3B-Instruct/ 4 1 1" + docker exec sglang_test bash -c "scripts/ci/sglang_benchmark_workflow.sh launch $model_name /models/Qwen/Qwen3-Next-80B-A3B-Instruct/ 4 1" elif [ "$model_name" == "Qwen3-Omni" ]; then docker exec sglang_test bash -c "scripts/ci/sglang_benchmark_workflow.sh launch $model_name /models/Qwen/Qwen3-Omni-30B-A3B-Instruct/ 4 1" else @@ -149,9 +149,9 @@ jobs: run: | model_name=${{ matrix.model }} if [ "$model_name" == "Qwen3-VL-235B" ]; then - docker exec sglang_test bash -c "scripts/ci/sglang_benchmark_workflow.sh evaluation $model_name /models/RedHatAI/Qwen3-VL-235B-A22B-Instruct-FP8-dynamic/ 8 8 1" + docker exec sglang_test bash -c "scripts/ci/sglang_benchmark_workflow.sh evaluation $model_name /models/RedHatAI/Qwen3-VL-235B-A22B-Instruct-FP8-dynamic/ 8 8" elif [ "$model_name" == "Qwen3-next" ]; then - docker exec sglang_test bash -c "scripts/ci/sglang_benchmark_workflow.sh evaluation $model_name /models/Qwen/Qwen3-Next-80B-A3B-Instruct/ 4 1 1" + docker exec sglang_test bash -c "scripts/ci/sglang_benchmark_workflow.sh evaluation $model_name /models/Qwen/Qwen3-Next-80B-A3B-Instruct/ 4 1" elif [ "$model_name" == "Qwen3-Omni" ]; then docker exec sglang_test bash -c "scripts/ci/sglang_benchmark_workflow.sh evaluation $model_name /models/Qwen/Qwen3-Omni-30B-A3B-Instruct/ 4 1" else @@ -165,7 +165,7 @@ jobs: run: | model_name=${{ matrix.model }} if [ "$model_name" == "Qwen3-Omni" ]; then - docker exec sglang_test bash -c "scripts/ci/sglang_benchmark_workflow.sh performance $model_name /models/Qwen/Qwen3-Omni-30B-A3B-Instruct/ 4 4 2" + docker exec sglang_test bash -c "scripts/ci/sglang_benchmark_workflow.sh performance $model_name /models/Qwen/Qwen3-Omni-30B-A3B-Instruct/ 4 1" else echo "Skip performance benchmark for model_name: ${model_name}" exit 0 diff --git a/scripts/ci/sglang_benchmark_workflow.sh b/scripts/ci/sglang_benchmark_workflow.sh 
index b9bec024a640..3f98d8964101 100755 --- a/scripts/ci/sglang_benchmark_workflow.sh +++ b/scripts/ci/sglang_benchmark_workflow.sh @@ -18,7 +18,6 @@ echo "Detect model_name: ${model_name}" echo "Detect model_path ${model_path}" echo "Detect TP ${TP}" echo "Detect EP ${EP}" -echo "Detect DP ${DP}" if [[ "${TYPE}" == "launch" ]]; then @@ -53,7 +52,6 @@ if [[ "${TYPE}" == "launch" ]]; then --port 9000 \ --tp-size ${TP} \ --ep-size ${EP} \ - --dp-size ${DP} \ --trust-remote-code \ --chunked-prefill-size 32768 \ --mem-fraction-static 0.85 \ @@ -163,15 +161,15 @@ elif [[ "${TYPE}" == "performance" ]]; then --port 9000 \ --model "${model_path}" \ --dataset-name image \ - --image-count 20 \ + --image-count 10 \ --image-resolution 960x1280 \ --random-input-len 8000 \ --random-output-len 500 \ - --max-concurrency 2 \ + --max-concurrency 1 \ --num-prompts 128 \ --flush-cache \ --skip-special-tokens \ - 2>&1 | tee performance_benchmark_${model_name}_TP${TP}_EP${EP}_DP${DP}.log + 2>&1 | tee performance_benchmark_${model_name}_TP${TP}_EP${EP}.log else echo "Unknown model_name: ${model_name}" exit 1 @@ -179,7 +177,7 @@ elif [[ "${TYPE}" == "performance" ]]; then else echo "Unknown TYPE: ${TYPE}" - echo "Usage: $0 {launch|evaluation|performance} [model_name] [model_path] [TP] [EP] [DP]" + echo "Usage: $0 {launch|evaluation|performance} [model_name] [model_path] [TP] [EP]" exit 1 fi From ce41b7f39069aec789e9cb031ea72d2184c1fd25 Mon Sep 17 00:00:00 2001 From: Xiake Sun Date: Tue, 16 Dec 2025 21:28:51 +0800 Subject: [PATCH 16/16] Remove dp setting --- scripts/ci/sglang_benchmark_workflow.sh | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/scripts/ci/sglang_benchmark_workflow.sh b/scripts/ci/sglang_benchmark_workflow.sh index 3f98d8964101..458eddfeb4ab 100755 --- a/scripts/ci/sglang_benchmark_workflow.sh +++ b/scripts/ci/sglang_benchmark_workflow.sh @@ -7,7 +7,6 @@ model_name=${2:-Qwen3-VL-235B} model_path=${3:-/models/Qwen3-VL-235B-A22B-Instruct-FP8-dynamic/} 
TP=${4:-8} EP=${5:-8} -DP=${6:-8} export SGLANG_TORCH_PROFILER_DIR=./ export SGLANG_PROFILE_WITH_STACK=1 @@ -149,7 +148,7 @@ elif [[ "${TYPE}" == "evaluation" ]]; then python3 benchmark/mmmu/bench_sglang.py \ --port 9000 \ --concurrency 16 \ - | tee vision_model_evaluation_${model_name}_TP${TP}_EP${EP}_DP${DP}.log + | tee vision_model_evaluation_${model_name}_TP${TP}_EP${EP}.log elif [[ "${TYPE}" == "performance" ]]; then echo