diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index c396e678..55651efe 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -176,3 +176,73 @@ gptoss-fp4-mi355x-vllm:
     - { tp: 1, conc-start: 4, conc-end: 128 }
     - { tp: 4, conc-start: 4, conc-end: 4 }
     - { tp: 8, conc-start: 4, conc-end: 8 }
+
+dsr1-fp4-mi355x-atom:
+  image: rocm/atom:rocm7.1.1-ubuntu24.04-pytorch2.9-atom0.1.1-MI350x
+  model: amd/DeepSeek-R1-0528-MXFP4-Preview
+  model-prefix: dsr1
+  runner: mi355x-m15-17
+  precision: fp4
+  framework: atom
+  multinode: false
+  seq-len-configs:
+  - isl: 1024
+    osl: 1024
+    search-space:
+    - { tp: 4, conc-start: 4, conc-end: 128 }
+    - { tp: 8, conc-start: 4, conc-end: 128 }
+  - isl: 1024
+    osl: 8192
+    search-space:
+    - { tp: 8, conc-start: 4, conc-end: 128 }
+  - isl: 8192
+    osl: 1024
+    search-space:
+    - { tp: 8, conc-start: 4, conc-end: 128 }
+
+dsr1-fp8-mi355x-atom:
+  image: rocm/atom:rocm7.1.1-ubuntu24.04-pytorch2.9-atom0.1.1-MI350x
+  model: deepseek-ai/DeepSeek-R1-0528
+  model-prefix: dsr1
+  runner: mi355x-m15-17
+  precision: fp8
+  framework: atom
+  multinode: false
+  seq-len-configs:
+  - isl: 1024
+    osl: 1024
+    search-space:
+    - { tp: 8, conc-start: 4, conc-end: 128 }
+  - isl: 1024
+    osl: 8192
+    search-space:
+    - { tp: 8, conc-start: 4, conc-end: 128 }
+  - isl: 8192
+    osl: 1024
+    search-space:
+    - { tp: 8, conc-start: 4, conc-end: 128 }
+
+gptoss-fp4-mi355x-atom:
+  image: rocm/atom:rocm7.1.1-ubuntu24.04-pytorch2.9-atom0.1.1-MI350x
+  model: openai/gpt-oss-120b
+  model-prefix: gptoss
+  runner: mi355x-m15-17
+  precision: fp4
+  framework: atom
+  multinode: false
+  seq-len-configs:
+  - isl: 1024
+    osl: 1024
+    search-space:
+    - { tp: 1, conc-start: 4, conc-end: 128 }
+    - { tp: 8, conc-start: 4, conc-end: 128 }
+  - isl: 1024
+    osl: 8192
+    search-space:
+    - { tp: 1, conc-start: 4, conc-end: 128 }
+    - { tp: 8, conc-start: 4, conc-end: 128 }
+  - isl: 8192
+    osl: 1024
+    search-space:
+    - { tp: 1, conc-start: 4, conc-end: 128 }
+    - { tp: 8, conc-start: 4, conc-end: 128 }
diff --git a/.github/configs/runners.yaml b/.github/configs/runners.yaml
index e414af03..88c47bd3 100644
--- a/.github/configs/runners.yaml
+++ b/.github/configs/runners.yaml
@@ -53,5 +53,8 @@ mi355x:
 - 'mi355x-amd_1'
 - 'mi355x-amd_2'
 - 'mi355x-amd_3'
+mi355x-m15-17:
+- 'mi355x-amdatom'
+- 'mi355x-amdatomtw'
 gb200:
 - gb200-nv_0
diff --git a/.github/workflows/benchmark-tmpl.yml b/.github/workflows/benchmark-tmpl.yml
index 27ebd215..963366ab 100644
--- a/.github/workflows/benchmark-tmpl.yml
+++ b/.github/workflows/benchmark-tmpl.yml
@@ -120,13 +120,10 @@ jobs:
             # nvidia-smi --gpu-reset -i 0,1,2,3,4,5,6,7 2>/dev/null || true
           fi
         else
-          echo "[Docker] Cleaning up resources ..."
-          docker ps -aq | xargs -r docker rm -f
-          docker network prune -f
-          while [ -n "$(docker ps -aq)" ]; do
-            docker ps -a
-            sleep 5
-          done
+          echo "[Docker] Cleaning up bmk-server container ..."
+          docker stop bmk-server 2>/dev/null || true
+          docker rm -f bmk-server 2>/dev/null || true
+          docker network rm bmk-net 2>/dev/null || true
        fi
      fi
      if command -v squeue >/dev/null 2>&1; then
diff --git a/benchmarks/dsr1_fp4_mi355x_atom_docker.sh b/benchmarks/dsr1_fp4_mi355x_atom_docker.sh
new file mode 100755
index 00000000..105e3539
--- /dev/null
+++ b/benchmarks/dsr1_fp4_mi355x_atom_docker.sh
@@ -0,0 +1,24 @@
+#!/usr/bin/env bash
+
+# ========= Required Env Vars =========
+# HF_TOKEN
+# HF_HUB_CACHE
+# MODEL
+# PORT
+# TP
+# CONC
+# MAX_MODEL_LEN
+
+# Calculate max-model-len based on ISL and OSL
+if [ "$ISL" = "1024" ] && [ "$OSL" = "1024" ]; then
+    CALCULATED_MAX_MODEL_LEN=""
+else
+    CALCULATED_MAX_MODEL_LEN=" --max-model-len 10240 "
+fi
+
+set -x
+python3 -m atom.entrypoints.openai_server \
+    --model $MODEL \
+    --server-port $PORT \
+    -tp $TP \
+    --kv_cache_dtype fp8 $CALCULATED_MAX_MODEL_LEN
diff --git a/benchmarks/dsr1_fp8_mi355x_atom_docker.sh b/benchmarks/dsr1_fp8_mi355x_atom_docker.sh
new file mode 100755
index 00000000..032291b9
--- /dev/null
+++ b/benchmarks/dsr1_fp8_mi355x_atom_docker.sh
@@ -0,0 +1,27 @@
+#!/usr/bin/env bash
+
+# ========= Required Env Vars =========
+# HF_TOKEN
+# HF_HUB_CACHE
+# MODEL
+# PORT
+# TP
+# CONC
+# MAX_MODEL_LEN
+
+# Calculate max-model-len based on ISL and OSL
+if [ "$ISL" = "1024" ] && [ "$OSL" = "1024" ]; then
+    CALCULATED_MAX_MODEL_LEN=""
+else
+    CALCULATED_MAX_MODEL_LEN=" --max-model-len 10240 "
+fi
+
+set -x
+
+BLOCK_SIZE=${BLOCK_SIZE:-16}
+python3 -m atom.entrypoints.openai_server \
+    --model $MODEL \
+    --server-port $PORT \
+    -tp $TP \
+    --kv_cache_dtype fp8 $CALCULATED_MAX_MODEL_LEN \
+    --block-size $BLOCK_SIZE
diff --git a/benchmarks/gptoss_fp4_mi355x_atom_docker.sh b/benchmarks/gptoss_fp4_mi355x_atom_docker.sh
new file mode 100755
index 00000000..4fc5d21c
--- /dev/null
+++ b/benchmarks/gptoss_fp4_mi355x_atom_docker.sh
@@ -0,0 +1,25 @@
+#!/usr/bin/env bash
+
+# ========= Required Env Vars =========
+# HF_TOKEN
+# HF_HUB_CACHE
+# MODEL
+# PORT
+# TP
+# CONC
+# MAX_MODEL_LEN
+
+# Calculate max-model-len based on ISL and OSL
+if [ "$ISL" = "1024" ] && [ "$OSL" = "1024" ]; then
+    CALCULATED_MAX_MODEL_LEN=""
+else
+    CALCULATED_MAX_MODEL_LEN=" --max-model-len 10240 "
+fi
+
+set -x
+export HSA_NO_SCRATCH_RECLAIM=1
+python3 -m atom.entrypoints.openai_server \
+    --model $MODEL \
+    --server-port $PORT \
+    -tp $TP \
+    --kv_cache_dtype fp8 $CALCULATED_MAX_MODEL_LEN
diff --git a/runners/launch_mi355x-amdatom.sh b/runners/launch_mi355x-amdatom.sh
new file mode 100755
index 00000000..cae55690
--- /dev/null
+++ b/runners/launch_mi355x-amdatom.sh
@@ -0,0 +1,110 @@
+#!/usr/bin/env bash
+
+# === Workflow-defined Env Vars ===
+# IMAGE
+# MODEL
+# TP
+# HF_HUB_CACHE
+# ISL
+# OSL
+# MAX_MODEL_LEN
+# RANDOM_RANGE_RATIO
+# CONC
+# GITHUB_WORKSPACE
+# RESULT_FILENAME
+# HF_TOKEN
+# FRAMEWORK
+
+HF_HUB_CACHE_MOUNT="/mnt/hf_hub_cache/" # Temp solution
+PORT=8888
+
+# Determine framework suffix for benchmark script
+FRAMEWORK_SUFFIX=$([[ "$FRAMEWORK" == "atom" ]] && printf '_atom' || printf '')
+
+network_name="bmk-net"
+server_name="bmk-server"
+client_name="bmk-client"
+
+# Cleanup: stop server container and remove network
+docker stop $server_name 2>/dev/null || true
+docker rm $server_name 2>/dev/null || true
+docker network rm $network_name 2>/dev/null || true
+
+docker network create $network_name
+
+set -x
+docker pull $IMAGE
+DIGEST=$(docker inspect --format='{{index .RepoDigests 0}}' "$IMAGE" | cut -d'@' -f2)
+echo "The image digest is: $DIGEST"
+
+set -x
+docker run --rm -d --ipc=host --shm-size=16g --network=$network_name --name=$server_name \
+--privileged --cap-add=CAP_SYS_ADMIN --device=/dev/kfd --device=/dev/dri --device=/dev/mem \
+--cap-add=SYS_PTRACE --security-opt seccomp=unconfined \
+-v $HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \
+-v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \
+-e HF_TOKEN -e HF_HUB_CACHE -e MODEL -e TP -e CONC -e MAX_MODEL_LEN -e PORT=$PORT \
+-e ISL -e OSL \
+--entrypoint=/bin/bash \
+$IMAGE \
+benchmarks/"${EXP_NAME%%_*}_${PRECISION}_mi355x${FRAMEWORK_SUFFIX}_docker.sh"
+
+set +x
+while IFS= read -r line; do
+    printf '%s\n' "$line"
+    if [[ "$line" =~ Application\ startup\ complete ]]; then
+        break
+    fi
+done < <(docker logs -f --tail=0 $server_name 2>&1)
+
+if [[ "$MODEL" == "amd/DeepSeek-R1-0528-MXFP4-Preview" || "$MODEL" == "deepseek-ai/DeepSeek-R1-0528" ]]; then
+    if [[ "$OSL" == "8192" ]]; then
+        #NUM_PROMPTS=$(( CONC * 20 ))
+        NUM_PROMPTS=$(( CONC * 2 )) # atom has little compilation overhead for dsr1
+    else
+        #NUM_PROMPTS=$(( CONC * 50 ))
+        NUM_PROMPTS=$(( CONC * 10 )) # atom has little compilation overhead for dsr1
+    fi
+else
+    if [[ "$OSL" == "8192" ]]; then
+        NUM_PROMPTS=$(( CONC * 2 ))
+    else
+        NUM_PROMPTS=$(( CONC * 10 ))
+    fi
+fi
+
+set -x
+echo $GITHUB_WORKSPACE
+git clone https://github.com/kimbochen/bench_serving.git
+git clone https://github.com/kimbochen/bench_serving.git $GITHUB_WORKSPACE/bench_serving
+
+sleep 5
+
+set -x
+docker run --rm --network=$network_name --name=$client_name \
+-v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ \
+-e HF_TOKEN -e PYTHONPYCACHEPREFIX=/tmp/pycache/ \
+--entrypoint=/bin/bash \
+$(echo "$IMAGE" | sed 's/#/\//') \
+-lc "pip install -q datasets pandas && \
+python3 bench_serving/benchmark_serving.py \
+--model=$MODEL --backend=vllm --base-url="http://$server_name:$PORT" \
+--dataset-name=random \
+--random-input-len=$ISL --random-output-len=$OSL --random-range-ratio=$RANDOM_RANGE_RATIO \
+--num-prompts=$NUM_PROMPTS \
+--max-concurrency=$CONC \
+--trust-remote-code \
+--request-rate=inf --ignore-eos \
+--save-result --percentile-metrics="ttft,tpot,itl,e2el" \
+--result-dir=/workspace/ --result-filename=$RESULT_FILENAME.json"
+
+if ls gpucore.* 1> /dev/null 2>&1; then
+    echo "gpucore files exist. not good"
+    rm -f gpucore.*
+fi
+
+
+# Cleanup: stop server container and remove network
+docker stop $server_name 2>/dev/null || true
+docker rm $server_name 2>/dev/null || true
+docker network rm $network_name 2>/dev/null || true