diff --git a/examples/nixl/run_accuracy_test.sh b/examples/nixl/run_accuracy_test.sh index 23745263..d9d87899 100755 --- a/examples/nixl/run_accuracy_test.sh +++ b/examples/nixl/run_accuracy_test.sh @@ -2,12 +2,12 @@ #set -xe # Models to run -MODELS=( - "Qwen/Qwen3-0.6B" -) #MODELS=( -# "meta-llama/Llama-3.1-8B" +# "Qwen/Qwen3-0.6B" #) +MODELS=( + "meta-llama/Llama-3.1-8B-Instruct" +) export VLLM_USE_V1=1 export VLLM_SKIP_WARMUP="true" @@ -103,7 +103,7 @@ run_tests_for_model() { echo "Starting prefill instance $i on GPU $GPU_ID, port $PORT" # Build the command with or without model-specific args - BASE_CMD="RANK=0 UCX_TLS=tcp VLLM_NIXL_SIDE_CHANNEL_PORT=$SIDE_CHANNEL_PORT vllm serve $model_name \ + BASE_CMD="RANK=0 UCX_TLS=rc,ud,ib VLLM_NIXL_SIDE_CHANNEL_PORT=$SIDE_CHANNEL_PORT vllm serve $model_name \ --port $PORT \ --enforce-eager \ --max_num_batched_tokens 8192 \ @@ -136,7 +136,7 @@ run_tests_for_model() { echo "Starting decode instance $i on GPU $GPU_ID, port $PORT" # Build the command with or without model-specific args - BASE_CMD="RANK=1 UCX_TLS=tcp VLLM_NIXL_SIDE_CHANNEL_PORT=$SIDE_CHANNEL_PORT vllm serve $model_name \ + BASE_CMD="RANK=1 UCX_TLS=rc,ud,ib VLLM_NIXL_SIDE_CHANNEL_PORT=$SIDE_CHANNEL_PORT vllm serve $model_name \ --port $PORT \ --enforce-eager \ --max_num_batched_tokens 8192 \ diff --git a/examples/nixl/run_benchmark_test.sh b/examples/nixl/run_benchmark_test.sh index c9b5ba19..dd20d600 100755 --- a/examples/nixl/run_benchmark_test.sh +++ b/examples/nixl/run_benchmark_test.sh @@ -11,16 +11,26 @@ set -xe MODELS=( - "/root/software/data/pytorch/huggingface/hub/models--meta-llama--Llama-3.1-8B-Instruct/snapshots/0e9e39f249a16976918f6564b8830bc894c89659/" + "/software/data/pytorch/huggingface/hub/models--meta-llama--Llama-3.1-8B-Instruct/snapshots/0e9e39f249a16976918f6564b8830bc894c89659/" ) + +export VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS=1000000 +export VLLM_RPC_TIMEOUT=1000000000 +export NIXL_LOG_LEVEL=debug +#export UCX_LOG_LEVEL=debug export 
VLLM_USE_V1=1 -#export VLLM_SKIP_WARMUP=True +export VLLM_SKIP_WARMUP=True export PT_HPU_LAZY_MODE=1 export VLLM_EXPONENTIAL_BUCKETING=False #export VLLM_PROMPT_BS_BUCKET_MIN=1 #export VLLM_PROMPT_SEQ_BUCKET_MIN=1 -#export VLLM_PROMPT_SEQ_BUCKET_STEP=8192 -#export VLLM_PROMPT_SEQ_BUCKET_MAX=8192 +export VLLM_PROMPT_SEQ_BUCKET_MIN=8192 +export VLLM_PROMPT_SEQ_BUCKET_STEP=8192 +export VLLM_PROMPT_SEQ_BUCKET_MAX=8192 +export VLLM_DECODE_BLOCK_BUCKET_MIN=1024 +export VLLM_DECODE_BLOCK_BUCKET_MAX=1184 +export VLLM_USE_PADDING_AWARE_SCHEDULING=1 +export DECODER_TP_RATIO=2 # Number of prefill and decode instances to create NUM_PREFILL_INSTANCES=${NUM_PREFILL_INSTANCES:-1} # Default to 1 @@ -28,6 +38,7 @@ NUM_DECODE_INSTANCES=${NUM_DECODE_INSTANCES:-1} # Default to 1 PREFILLER_TP_SIZE=${PREFILLER_TP_SIZE:-1} DECODER_TP_SIZE=${DECODER_TP_SIZE:-1} + # Find the git repository root directory #GIT_ROOT=$(git rev-parse --show-toplevel) GIT_ROOT="/home/vllm-nixl/vllm" @@ -98,17 +109,17 @@ run_tests_for_model() { # Calculate port number (base port + instance number) PORT=$((8300 + i)) # Calculate side channel port. Avoid clash with with TP workers. 
- SIDE_CHANNEL_PORT=$((6559 + i)) + SIDE_CHANNEL_PORT=$((5559 + i)) echo "Starting prefill instance $i on GPU $GPU_ID, port $PORT" # Build the command with or without model-specific args - BASE_CMD="RANK=0 UCX_TLS=rc,ud,ib VLLM_NIXL_SIDE_CHANNEL_PORT=$SIDE_CHANNEL_PORT vllm serve $model_name \ + BASE_CMD="HABANA_VISIBLE_DEVICES=1 MY_ROLE=PREFILL UCX_TLS=rc,ud,ib VLLM_NIXL_SIDE_CHANNEL_PORT=$SIDE_CHANNEL_PORT vllm serve $model_name \ --port $PORT \ --long_prefill_token_threshold 8192 \ --max_num_batched_tokens 8192 \ - --gpu-memory-utilization 0.3 \ --disable-log-requests \ + --gpu-memory-utilization 0.3 \ --tensor-parallel-size $PREFILLER_TP_SIZE \ --kv-transfer-config '{\"kv_connector\":\"NixlConnector\",\"kv_role\":\"kv_both\",\"kv_buffer_device\":\"cpu\"}'" @@ -132,12 +143,12 @@ run_tests_for_model() { # Calculate port number (base port + instance number) PORT=$((8400 + i)) # Calculate side channel port - SIDE_CHANNEL_PORT=$((5659 + i * $DECODER_TP_SIZE)) + SIDE_CHANNEL_PORT=$((4659 + i * $DECODER_TP_SIZE)) echo "Starting decode instance $i on GPU $GPU_ID, port $PORT" # Build the command with or without model-specific args - BASE_CMD="RANK=1 UCX_TLS=rc,ud,ib VLLM_NIXL_SIDE_CHANNEL_PORT=$SIDE_CHANNEL_PORT vllm serve $model_name \ + BASE_CMD="HABANA_VISIBLE_DEVICES=2,3 MY_ROLE=DECODE UCX_TLS=rc,ud,ib VLLM_NIXL_SIDE_CHANNEL_PORT=$SIDE_CHANNEL_PORT vllm serve $model_name \ --port $PORT \ --gpu-memory-utilization 0.3 \ --tensor-parallel-size $DECODER_TP_SIZE \ @@ -171,7 +182,7 @@ run_tests_for_model() { done # Build the command for the proxy server with all the hosts and ports - PROXY_CMD="python toy_proxy_server.py --port 9192" + PROXY_CMD="python toy_proxy_server.py --port 9111" # Add all prefill hosts and ports PROXY_CMD+=" --prefiller-hosts ${PREFILL_HOSTS[@]}" @@ -188,7 +199,7 @@ run_tests_for_model() { # Wait for the proxy to start sleep 10 -# curl -X POST -s http://localhost:9192/v1/completions \ +# curl -X POST -s http://localhost:9111/v1/completions \ # -H 
"Content-Type: application/json" \ # -d '{ # "model": "meta-llama/Llama-3.1-8B", @@ -198,42 +209,99 @@ run_tests_for_model() { # }' # sleep 5 # echo "--------------------===================-------------" -#curl -X POST -s http://localhost:9192/v1/completions \ -# -H "Content-Type: application/json" \ -# -d '{ -# "model": "/root/software/data/pytorch/huggingface/hub/models--meta-llama--Llama-3.1-8B-Instruct/snapshots/0e9e39f249a16976918f6564b8830bc894c89659/", -# "prompt": "Mark Elliot Zuckerberg is an American businessman who co-founded the social media service Facebook and its parent company Meta Platforms, of which he is the chairman, chief executive officer, and controlling shareholder. Zuckerberg has been the subject of multiple lawsuits regarding the creation and ownership of the website as well as issues such as user privacy. Born in White Plains, New York, Zuckerberg briefly attended Harvard College, where he launched Facebook in February 2004 with his roommates Eduardo Saverin, Andrew McCollum, Dustin Moskovitz and Chris Hughes. Zuckerberg took the company public in May 2012 with majority shares. He became the worlds youngest self-made billionaire[a] in 2008, at age 23, and has consistently ranked among the worlds wealthiest individuals. According to Forbes, Zuckerbergs estimated net worth stood at US$221.2 billion as of May 2025, making him the second-richest individual in the world.[2] Intel opened its first international manufacturing facility in 1972, in Malaysia, which would host multiple Intel operations, before opening assembly facilities and semiconductor plants in Singapore and Jerusalem in the early 1980s, and manufacturing and development centers in China, India, and Costa Rica in the 1990s.[31] By the early 1980s, its business was dominated by DRAM chips. However, increased competition from Japanese semiconductor manufacturers had, by 1983, dramatically reduced the profitability of this market. 
The growing success of the IBM personal computer, based on an Intel microprocessor, was among factors that convinced Gordon Moore (CEO since 1975) to shift the companys focus to microprocessors and to change fundamental aspects of that business model. Moores decision to sole-source Intels 386 chip played into the companys continuing success.", -# "max_tokens": 5, -# "temperature": 0 -# }' - #curl -X POST -s http://localhost:9192/v1/completions \ - # -H "Content-Type: application/json" \ - # -d '{ - # "model": "/root/software/data/pytorch/huggingface/hub/models--meta-llama--Llama-3.1-8B-Instruct/snapshots/0e9e39f249a16976918f6564b8830bc894c89659/", - # "prompt": ["This was a few months ago. It was my day off and the only thing I had to do was pick my girlfriend up from work at 9:00 pm. Other than that, I was free to loaf on the couch from morning to night, which is what I did. Around 8:00, I decided to shower before I left the house. Now, I have short hair that dries pretty quickly, but I am deeply vain about it, so I always dry it with the hairdryer right after I shower to ensure my hair doesnt get flat and weird. I never skip this step. So, I get out of the shower, start drying my hair... And then I wake up in bed. Its half an hour later. I feel like garbage, my entire body mysteriously hurts, and I am slowly realizing that I dont remember exiting the bathroom. My only clear thought is: oh shit, its 9:00! I have to pick up my girlfriend! Better shake myself awake. I dragged my aching carcass back to the bathroom, and this was when I noticed the massive blisters forming all over my hand. I was still pretty out of it, but I knew that this was a hospital visit kind of burn. My girlfriend then called to check in because I was running late and, despite my undoubtedly convincing argument that I was still perfectly fine to drive, she immediately knew something was wrong. She cabbed home and we got a ride to the ER. Turns out, I had my first ever seizure! 
It seems like during the seizure, I clenched the hairdryer in my fist and had it pointed at my other hand long enough to thoroughly cook it. The tissue loss is pretty deep in some areas and there was concerns about me retaining my mobility, but its been healing well so far.", - # "Mark Elliot Zuckerberg is an American businessman who co-founded the social media service Facebook and its parent company Meta Platforms, of which he is the chairman, chief executive officer, and controlling shareholder. Zuckerberg has been the subject of multiple lawsuits regarding the creation and ownership of the website as well as issues such as user privacy. Born in White Plains, New York, Zuckerberg briefly attended Harvard College, where he launched Facebook in February 2004 with his roommates Eduardo Saverin, Andrew McCollum, Dustin Moskovitz and Chris Hughes. Zuckerberg took the company public in May 2012 with majority shares. He became the worlds youngest self-made billionaire[a] in 2008, at age 23, and has consistently ranked among the worlds wealthiest individuals. According to Forbes, Zuckerbergs estimated net worth stood at US$221.2 billion as of May 2025, making him the second-richest individual in the world.[2]"], - # "max_tokens": 2, - # "temperature": 0 - # }' - sleep 2 +curl -X POST -s http://localhost:9111/v1/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "/software/data/pytorch/huggingface/hub/models--meta-llama--Llama-3.1-8B-Instruct/snapshots/0e9e39f249a16976918f6564b8830bc894c89659/", + "prompt": "Mark Elliot Zuckerberg is an American businessman who co-founded the social media service Facebook and its parent company Meta Platforms, of which he is the chairman, chief executive officer, and controlling shareholder. Zuckerberg has been the subject of multiple lawsuits regarding the creation and ownership of the website as well as issues such as user privacy. 
Born in White Plains, New York, Zuckerberg briefly attended Harvard College, where he launched Facebook in February 2004 with his roommates Eduardo Saverin, Andrew McCollum, Dustin Moskovitz and Chris Hughes. Zuckerberg took the company public in May 2012 with majority shares. He became the worlds youngest self-made billionaire[a] in 2008, at age 23, and has consistently ranked among the worlds wealthiest individuals. According to Forbes, Zuckerbergs estimated net worth stood at US$221.2 billion as of May 2025, making him the second-richest individual in the world.[2] Intel opened its first international manufacturing facility in 1972, in Malaysia, which would host multiple Intel operations, before opening assembly facilities and semiconductor plants in Singapore and Jerusalem in the early 1980s, and manufacturing and development centers in China, India, and Costa Rica in the 1990s.[31] By the early 1980s, its business was dominated by DRAM chips. However, increased competition from Japanese semiconductor manufacturers had, by 1983, dramatically reduced the profitability of this market. The growing success of the IBM personal computer, based on an Intel microprocessor, was among factors that convinced Gordon Moore (CEO since 1975) to shift the companys focus to microprocessors and to change fundamental aspects of that business model. Moores decision to sole-source Intels 386 chip played into the companys continuing success.", + "max_tokens": 100, + "temperature": 0 + }' +#curl -X POST -s http://localhost:9111/v1/completions \ +# -H "Content-Type: application/json" \ +# -d '{ +# "model": "/root/software/data/pytorch/huggingface/hub/models--meta-llama--Llama-3.1-8B-Instruct/snapshots/0e9e39f249a16976918f6564b8830bc894c89659/", +# "prompt": ["This was a few months ago. It was my day off and the only thing I had to do was pick my girlfriend up from work at 9:00 pm. Other than that, I was free to loaf on the couch from morning to night, which is what I did. 
Around 8:00, I decided to shower before I left the house. Now, I have short hair that dries pretty quickly, but I am deeply vain about it, so I always dry it with the hairdryer right after I shower to ensure my hair doesnt get flat and weird. I never skip this step. So, I get out of the shower, start drying my hair... And then I wake up in bed. Its half an hour later. I feel like garbage, my entire body mysteriously hurts, and I am slowly realizing that I dont remember exiting the bathroom. My only clear thought is: oh shit, its 9:00! I have to pick up my girlfriend! Better shake myself awake. I dragged my aching carcass back to the bathroom, and this was when I noticed the massive blisters forming all over my hand. I was still pretty out of it, but I knew that this was a hospital visit kind of burn. My girlfriend then called to check in because I was running late and, despite my undoubtedly convincing argument that I was still perfectly fine to drive, she immediately knew something was wrong. She cabbed home and we got a ride to the ER. Turns out, I had my first ever seizure! It seems like during the seizure, I clenched the hairdryer in my fist and had it pointed at my other hand long enough to thoroughly cook it. The tissue loss is pretty deep in some areas and there was concerns about me retaining my mobility, but its been healing well so far.", +# "Mark Elliot Zuckerberg is an American businessman who co-founded the social media service Facebook and its parent company Meta Platforms, of which he is the chairman, chief executive officer, and controlling shareholder. Zuckerberg has been the subject of multiple lawsuits regarding the creation and ownership of the website as well as issues such as user privacy. Born in White Plains, New York, Zuckerberg briefly attended Harvard College, where he launched Facebook in February 2004 with his roommates Eduardo Saverin, Andrew McCollum, Dustin Moskovitz and Chris Hughes. 
Zuckerberg took the company public in May 2012 with majority shares. He became the worlds youngest self-made billionaire[a] in 2008, at age 23, and has consistently ranked among the worlds wealthiest individuals. According to Forbes, Zuckerbergs estimated net worth stood at US$221.2 billion as of May 2025, making him the second-richest individual in the world.[2]"], +# "max_tokens": 100, +# "temperature": 0 +# }' + #sleep 2 # Run lm eval for this model - echo "Running tests for $model_name" + #echo "Running tests for $model_name" #TEST_MODEL=$model_name python -m pytest -s -x test_accuracy.py - python3 ../../../../benchmarks/benchmark_serving.py \ - --port 9192 \ - --seed "$(date +%s)" \ - --model /root/software/data/pytorch/huggingface/hub/models--meta-llama--Llama-3.1-8B-Instruct/snapshots/0e9e39f249a16976918f6564b8830bc894c89659/ \ - --dataset-name random \ - --random-input-len 8192 \ - --random-output-len 200 \ - --num-prompts 100 \ - --burstiness 100 \ - --request-rate 3.6 \ - --metric-percentiles 95 \ - --percentile-metrics ttft,tpot,itl,e2el \ - --backend openai \ - --endpoint /v1/completions \ - --ignore-eos + #python3 ../../../../benchmarks/benchmark_serving.py \ + # --port 9111 \ + # --seed "$(date +%s)" \ + # --model /root/software/data/pytorch/huggingface/hub/models--meta-llama--Llama-3.1-8B-Instruct/snapshots/0e9e39f249a16976918f6564b8830bc894c89659/ \ + # --dataset-name random \ + # --random-input-len 8192 \ + # --random-output-len 256 \ + # --num-prompts 32 \ + # --burstiness 100 \ + # --request-rate 3.6 \ + # --metric-percentiles 95 \ + # --percentile-metrics ttft,tpot,itl,e2el \ + # --backend openai \ + # --endpoint /v1/completions \ + # --ignore-eos + + #sleep 100 + #python3 ../../../../benchmarks/benchmark_serving.py \ + # --port 8300 \ + # --seed "$(date +%s)" \ + # --model /root/software/data/pytorch/huggingface/hub/models--meta-llama--Llama-3.1-8B-Instruct/snapshots/0e9e39f249a16976918f6564b8830bc894c89659/ \ + # --dataset-name random \ + # 
--random-input-len 8192 \ + # --random-output-len 200 \ + # --num-prompts 100 \ + # --burstiness 100 \ + # --request-rate 3.6 \ + # --metric-percentiles 95 \ + # --percentile-metrics ttft,tpot,itl,e2el \ + # --backend openai \ + # --endpoint /v1/completions \ + # --ignore-eos + qps=(0.5) #(0.1 0.25 0.5 1 2 3 4) # 5) + # explicit num_prompts mapping (must have same length as qps[]) + num_prompts=(32) #(32 64 128 256 256 256 256) # 256) + input_len=8192 + output_len=256 #56 + + # just sanity‐check lengths + #if [ "${#qps[@]}" -ne "${#num_prompts[@]}" ]; then + # echo "❌ qps[] and num_prompts[] must be the same length" + # exit 1 + #fi + + #for i in "${!qps[@]}"; do + #q=${qps[$i]} + #np=${num_prompts[$i]} + + #ts=$(date +"%Y%m%d_%H%M%S") + #logf="./nixlresult/run_in${input_len}_out${output_len}_qps${q//./p}_$ts.log" + + #echo "[$(date +"%Y-%m-%d %H:%M:%S")] input=${input_len}, output=${output_len}, qps=${q}, num_prompts=${np}" \ + # | tee "$logf" + + #python3 ../../../../benchmarks/benchmark_serving.py \ + # --port 9111 \ + # --seed "$(date +%s)" \ + # --model /root/software/data/pytorch/huggingface/hub/models--meta-llama--Llama-3.1-8B-Instruct/snapshots/0e9e39f249a16976918f6564b8830bc894c89659/ \ + # --tokenizer /root/software/data/pytorch/huggingface/hub/models--meta-llama--Llama-3.1-8B-Instruct/snapshots/0e9e39f249a16976918f6564b8830bc894c89659/ \ + # --dataset-name random \ + # --random-input-len "$input_len" \ + # --random-output-len 256 \ + # --num-prompts "$np" \ + # --request-rate "$q" \ + # --percentile-metrics ttft,tpot,itl,e2el \ + # --burstiness 100 \ + # --backend openai \ + # --endpoint /v1/completions \ + # --ignore-eos \ + # 2>&1 | tee -a "$logf" + + #done # Clean up before running next model cleanup_instances diff --git a/examples/nixl/run_benchmark_test_heter.sh b/examples/nixl/run_benchmark_test_heter.sh new file mode 100644 index 00000000..f1a1e1d2 --- /dev/null +++ b/examples/nixl/run_benchmark_test_heter.sh @@ -0,0 +1,318 @@ +#!/bin/bash +set 
-xe + +# Models to run +#MODELS=( +# "Qwen/Qwen3-0.6B" +#) +#MODELS=( +# "meta-llama/Llama-3.1-8B" +#) +MODELS=("meta-llama/Llama-3.1-8B-Instruct") + +#MODELS=( +# "/root/software/data/pytorch/huggingface/hub/models--meta-llama--Llama-3.1-8B-Instruct/snapshots/0e9e39f249a16976918f6564b8830bc894c89659/" +#) +#MODELS=( +# "Qwen/Qwen3-0.6B" +#) +export VLLM_EXECUTE_MODEL_TIMEOUT_SECONDS=1000000 +export VLLM_RPC_TIMEOUT=1000000000 +export NIXL_LOG_LEVEL=debug +#export UCX_LOG_LEVEL=debug +export VLLM_USE_V1=1 +export VLLM_SKIP_WARMUP=True +export PT_HPU_LAZY_MODE=1 +export VLLM_EXPONENTIAL_BUCKETING=False +export VLLM_PROMPT_BS_BUCKET_MIN=1 +export VLLM_PROMPT_SEQ_BUCKET_MIN=1 +export VLLM_PROMPT_SEQ_BUCKET_MIN=8192 +export VLLM_PROMPT_SEQ_BUCKET_STEP=8192 +export VLLM_PROMPT_SEQ_BUCKET_MAX=8192 +export VLLM_DECODE_BLOCK_BUCKET_MIN=1024 +export VLLM_DECODE_BLOCK_BUCKET_MAX=1184 +export VLLM_USE_PADDING_AWARE_SCHEDULING=1 +export DECODER_TP_RATIO=2 + +# Number of prefill and decode instances to create +NUM_PREFILL_INSTANCES=${NUM_PREFILL_INSTANCES:-1} # Default to 1 +NUM_DECODE_INSTANCES=${NUM_DECODE_INSTANCES:-1} # Default to 1 +PREFILLER_TP_SIZE=${PREFILLER_TP_SIZE:-1} +DECODER_TP_SIZE=${DECODER_TP_SIZE:-1} + + +# Find the git repository root directory +#GIT_ROOT=$(git rev-parse --show-toplevel) +GIT_ROOT="/home/vllm-nixl/vllm" + +#SMI_BIN=$(which nvidia-smi || which rocm-smi) + +# Trap the SIGINT signal (triggered by Ctrl+C) +trap 'kill $(jobs -pr)' SIGINT SIGTERM EXIT + +# Waits for vLLM to start. +wait_for_server() { + local port=$1 + timeout 1200 bash -c " + until curl -s localhost:${port}/v1/completions > /dev/null; do + sleep 1 + done" && return 0 || return 1 +} + +# Function to clean up previous instances +cleanup_instances() { + echo "Cleaning up any running vLLM instances..." 
+ pkill -f "vllm serve" || true + sleep 2 +} + +# Handle to get model-specific arguments for deepseek +get_model_args() { + local model_name=$1 + local extra_args="" + + if [[ "$model_name" == "deepseek-ai/deepseek-vl2-tiny" ]]; then + extra_args="--hf_overrides '{\"architectures\": [\"DeepseekVLV2ForCausalLM\"]}' --trust-remote-code" + fi + + echo "$extra_args" +} + +get_num_gpus() { + if [[ "$SMI_BIN" == *"nvidia"* ]]; then + echo "$($SMI_BIN --query-gpu=name --format=csv,noheader | wc -l)" + else + echo "$($SMI_BIN -l | grep GPU | wc -l)" + fi +} + +# Function to run tests for a specific model +run_tests_for_model() { + local model_name=$1 + echo "================================" + echo "Testing model: $model_name" + echo "================================" + + # Get model-specific arguments + local model_args=$(get_model_args "$model_name") + + # Arrays to store all hosts and ports + PREFILL_HOSTS=() + PREFILL_PORTS=() + DECODE_HOSTS=() + DECODE_PORTS=() + + # Start prefill instances + for i in $(seq 0 $((NUM_PREFILL_INSTANCES-1))); do + # Calculate GPU ID - we'll distribute across available GPUs + #GPU_ID=$((i % $(get_num_gpus))) + GPU_ID=2 + + # Calculate port number (base port + instance number) + PORT=$((8300 + i)) + # Calculate side channel port. Avoid clash with with TP workers. 
+ SIDE_CHANNEL_PORT=$((5559 + i)) + + echo "Starting prefill instance $i on GPU $GPU_ID, port $PORT" + + # Build the command with or without model-specific args + BASE_CMD="MY_ROLE=PREFILL UCX_TLS=rc,ud,ib VLLM_NIXL_SIDE_CHANNEL_PORT=$SIDE_CHANNEL_PORT vllm serve $model_name \ + --port $PORT \ + --long_prefill_token_threshold 8192 \ + --max_num_batched_tokens 8192 \ + --disable-log-requests \ + --gpu-memory-utilization 0.3 \ + --tensor-parallel-size $PREFILLER_TP_SIZE \ + --kv-transfer-config '{\"kv_connector\":\"NixlConnector\",\"kv_role\":\"kv_both\",\"kv_buffer_device\":\"cpu\"}'" + + if [ -n "$model_args" ]; then + FULL_CMD="$BASE_CMD $model_args" + else + FULL_CMD="$BASE_CMD" + fi + + eval "$FULL_CMD &" + + # Store host and port for proxy configuration + PREFILL_HOSTS+=("localhost") + PREFILL_PORTS+=($PORT) + done + + # Start decode instances + for i in $(seq 0 $((NUM_DECODE_INSTANCES-1))); do + # Calculate GPU ID - we'll distribute across available GPUs, starting from after prefill GPUs + #GPU_ID=$(((i + NUM_PREFILL_INSTANCES) % $(get_num_gpus))) + # Calculate port number (base port + instance number) + PORT=$((8400 + i)) + # Calculate side channel port + SIDE_CHANNEL_PORT=$((4659 + i * $DECODER_TP_SIZE)) + + echo "Starting decode instance $i on GPU $GPU_ID, port $PORT" + + # Build the command with or without model-specific args + BASE_CMD="MY_ROLE=DECODE UCX_TLS=rc,ud,ib VLLM_NIXL_SIDE_CHANNEL_PORT=$SIDE_CHANNEL_PORT vllm serve $model_name \ + --port $PORT \ + --gpu-memory-utilization 0.3 \ + --tensor-parallel-size $DECODER_TP_SIZE \ + --long_prefill_token_threshold 8192 \ + --max_num_batched_tokens 8192 \ + --disable-log-requests \ + --kv-transfer-config '{\"kv_connector\":\"NixlConnector\",\"kv_role\":\"kv_both\",\"kv_buffer_device\":\"cpu\"}'" + + if [ -n "$model_args" ]; then + FULL_CMD="$BASE_CMD $model_args" + else + FULL_CMD="$BASE_CMD" + fi + + eval "$FULL_CMD &" + + # Store host and port for proxy configuration + DECODE_HOSTS+=("localhost") + 
DECODE_PORTS+=($PORT) + done + + # Wait for all instances to start + for PORT in "${PREFILL_PORTS[@]}"; do + echo "Waiting for prefill instance on port $PORT to start..." + wait_for_server $PORT + done + + for PORT in "${DECODE_PORTS[@]}"; do + echo "Waiting for decode instance on port $PORT to start..." + wait_for_server $PORT + done + + # Build the command for the proxy server with all the hosts and ports + PROXY_CMD="python toy_proxy_server.py --port 9111" + + # Add all prefill hosts and ports + PROXY_CMD+=" --prefiller-hosts ${PREFILL_HOSTS[@]}" + PROXY_CMD+=" --prefiller-ports ${PREFILL_PORTS[@]}" + + # Add all decode hosts and ports + PROXY_CMD+=" --decoder-hosts ${DECODE_HOSTS[@]}" + PROXY_CMD+=" --decoder-ports ${DECODE_PORTS[@]}" + + # Start the proxy server + echo "Starting proxy server with command: $PROXY_CMD" + $PROXY_CMD & + + # Wait for the proxy to start + sleep 10 + +# curl -X POST -s http://localhost:9111/v1/completions \ +# -H "Content-Type: application/json" \ +# -d '{ +# "model": "meta-llama/Llama-3.1-8B", +# "prompt": "Mark Elliot Zuckerberg is an American businessman who co-founded the social media service Facebook and its parent company Meta Platforms, of which he is the chairman, chief executive officer, and controlling shareholder. Zuckerberg has been the subject of multiple lawsuits regarding the creation and ownership of the website as well as issues such as user privacy. Born in White Plains, New York, Zuckerberg briefly attended Harvard College, where he launched Facebook in February 2004 with his roommates Eduardo Saverin, Andrew McCollum, Dustin Moskovitz and Chris Hughes. Zuckerberg took the company public in May 2012 with majority shares. He became the worlds youngest self-made billionaire[a] in 2008, at age 23, and has consistently ranked among the worlds wealthiest individuals. 
According to Forbes, Zuckerbergs estimated net worth stood at US$221.2 billion as of May 2025, making him the second-richest individual in the world.[2]", +# "max_tokens": 5, +# "temperature": 0 +# }' +# sleep 5 +# echo "--------------------===================-------------" +curl -X POST -s http://localhost:9111/v1/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "meta-llama/Llama-3.1-8B-Instruct", + "prompt": "Mark Elliot Zuckerberg is an American businessman who co-founded the social media service Facebook and its parent company Meta Platforms, of which he is the chairman, chief executive officer, and controlling shareholder. Zuckerberg has been the subject of multiple lawsuits regarding the creation and ownership of the website as well as issues such as user privacy. Born in White Plains, New York, Zuckerberg briefly attended Harvard College, where he launched Facebook in February 2004 with his roommates Eduardo Saverin, Andrew McCollum, Dustin Moskovitz and Chris Hughes. Zuckerberg took the company public in May 2012 with majority shares. He became the worlds youngest self-made billionaire[a] in 2008, at age 23, and has consistently ranked among the worlds wealthiest individuals. According to Forbes, Zuckerbergs estimated net worth stood at US$221.2 billion as of May 2025, making him the second-richest individual in the world.[2] Intel opened its first international manufacturing facility in 1972, in Malaysia, which would host multiple Intel operations, before opening assembly facilities and semiconductor plants in Singapore and Jerusalem in the early 1980s, and manufacturing and development centers in China, India, and Costa Rica in the 1990s.[31] By the early 1980s, its business was dominated by DRAM chips. However, increased competition from Japanese semiconductor manufacturers had, by 1983, dramatically reduced the profitability of this market. 
The growing success of the IBM personal computer, based on an Intel microprocessor, was among factors that convinced Gordon Moore (CEO since 1975) to shift the companys focus to microprocessors and to change fundamental aspects of that business model. Moores decision to sole-source Intels 386 chip played into the companys continuing success.", + "max_tokens": 50, + "temperature": 0 + }' +#curl -X POST -s http://localhost:9111/v1/completions \ +# -H "Content-Type: application/json" \ +# -d '{ +# "model": "/root/software/data/pytorch/huggingface/hub/models--meta-llama--Llama-3.1-8B-Instruct/snapshots/0e9e39f249a16976918f6564b8830bc894c89659/", +# "prompt": ["This was a few months ago. It was my day off and the only thing I had to do was pick my girlfriend up from work at 9:00 pm. Other than that, I was free to loaf on the couch from morning to night, which is what I did. Around 8:00, I decided to shower before I left the house. Now, I have short hair that dries pretty quickly, but I am deeply vain about it, so I always dry it with the hairdryer right after I shower to ensure my hair doesnt get flat and weird. I never skip this step. So, I get out of the shower, start drying my hair... And then I wake up in bed. Its half an hour later. I feel like garbage, my entire body mysteriously hurts, and I am slowly realizing that I dont remember exiting the bathroom. My only clear thought is: oh shit, its 9:00! I have to pick up my girlfriend! Better shake myself awake. I dragged my aching carcass back to the bathroom, and this was when I noticed the massive blisters forming all over my hand. I was still pretty out of it, but I knew that this was a hospital visit kind of burn. My girlfriend then called to check in because I was running late and, despite my undoubtedly convincing argument that I was still perfectly fine to drive, she immediately knew something was wrong. She cabbed home and we got a ride to the ER. Turns out, I had my first ever seizure! 
It seems like during the seizure, I clenched the hairdryer in my fist and had it pointed at my other hand long enough to thoroughly cook it. The tissue loss is pretty deep in some areas and there was concerns about me retaining my mobility, but its been healing well so far.", +# "Mark Elliot Zuckerberg is an American businessman who co-founded the social media service Facebook and its parent company Meta Platforms, of which he is the chairman, chief executive officer, and controlling shareholder. Zuckerberg has been the subject of multiple lawsuits regarding the creation and ownership of the website as well as issues such as user privacy. Born in White Plains, New York, Zuckerberg briefly attended Harvard College, where he launched Facebook in February 2004 with his roommates Eduardo Saverin, Andrew McCollum, Dustin Moskovitz and Chris Hughes. Zuckerberg took the company public in May 2012 with majority shares. He became the worlds youngest self-made billionaire[a] in 2008, at age 23, and has consistently ranked among the worlds wealthiest individuals. 
According to Forbes, Zuckerbergs estimated net worth stood at US$221.2 billion as of May 2025, making him the second-richest individual in the world.[2]"], +# "max_tokens": 100, +# "temperature": 0 +# }' + #sleep 2 + # Run lm eval for this model + #echo "Running tests for $model_name" + #TEST_MODEL=$model_name python -m pytest -s -x test_accuracy.py + #python3 ../../../../benchmarks/benchmark_serving.py \ + # --port 9111 \ + # --seed "$(date +%s)" \ + # --model /root/software/data/pytorch/huggingface/hub/models--meta-llama--Llama-3.1-8B-Instruct/snapshots/0e9e39f249a16976918f6564b8830bc894c89659/ \ + # --dataset-name random \ + # --random-input-len 8192 \ + # --random-output-len 256 \ + # --num-prompts 32 \ + # --burstiness 100 \ + # --request-rate 3.6 \ + # --metric-percentiles 95 \ + # --percentile-metrics ttft,tpot,itl,e2el \ + # --backend openai \ + # --endpoint /v1/completions \ + # --ignore-eos + + #sleep 100 + #python3 ../../../../benchmarks/benchmark_serving.py \ + # --port 8300 \ + # --seed "$(date +%s)" \ + # --model /root/software/data/pytorch/huggingface/hub/models--meta-llama--Llama-3.1-8B-Instruct/snapshots/0e9e39f249a16976918f6564b8830bc894c89659/ \ + # --dataset-name random \ + # --random-input-len 8192 \ + # --random-output-len 200 \ + # --num-prompts 100 \ + # --burstiness 100 \ + # --request-rate 3.6 \ + # --metric-percentiles 95 \ + # --percentile-metrics ttft,tpot,itl,e2el \ + # --backend openai \ + # --endpoint /v1/completions \ + # --ignore-eos + qps=(0.5) #(0.1 0.25 0.5 1 2 3 4) # 5) + # explicit num_prompts mapping (must have same length as qps[]) + num_prompts=(32) #(32 64 128 256 256 256 256) # 256) + input_len=8192 + output_len=256 #56 + + # just sanity‐check lengths + #if [ "${#qps[@]}" -ne "${#num_prompts[@]}" ]; then + # echo "❌ qps[] and num_prompts[] must be the same length" + # exit 1 + #fi + + #for i in "${!qps[@]}"; do + #q=${qps[$i]} + #np=${num_prompts[$i]} + + #ts=$(date +"%Y%m%d_%H%M%S") + 
#logf="./nixlresult/run_in${input_len}_out${output_len}_qps${q//./p}_$ts.log" + + #echo "[$(date +"%Y-%m-%d %H:%M:%S")] input=${input_len}, output=${output_len}, qps=${q}, num_prompts=${np}" \ + # | tee "$logf" + + #python3 ../../../../benchmarks/benchmark_serving.py \ + # --port 9111 \ + # --seed "$(date +%s)" \ + # --model /root/software/data/pytorch/huggingface/hub/models--meta-llama--Llama-3.1-8B-Instruct/snapshots/0e9e39f249a16976918f6564b8830bc894c89659/ \ + # --tokenizer /root/software/data/pytorch/huggingface/hub/models--meta-llama--Llama-3.1-8B-Instruct/snapshots/0e9e39f249a16976918f6564b8830bc894c89659/ \ + # --dataset-name random \ + # --random-input-len "$input_len" \ + # --random-output-len 256 \ + # --num-prompts "$np" \ + # --request-rate "$q" \ + # --percentile-metrics ttft,tpot,itl,e2el \ + # --burstiness 100 \ + # --backend openai \ + # --endpoint /v1/completions \ + # --ignore-eos \ + # 2>&1 | tee -a "$logf" + + #done + + # Clean up before running next model + cleanup_instances + sleep 3 +} + +# Run tests for each model +for model in "${MODELS[@]}"; do + run_tests_for_model "$model" +done + +echo "All tests completed!" 
# Value of the DECODER_TP_RATIO environment variable (default 1), read once at
# import time. execute_model only invokes the layout rewrite below when it is > 1.
decoder_tp_ratio = int(os.getenv('DECODER_TP_RATIO', 1))


def rewrite_kv_based_on_transfer_layout(self, scheduler_output: "SchedulerOutput"):
    """Regroup, in place, the heads of every KV block that is about to be saved.

    For each request in ``kv_connector_metadata.reqs_to_save`` and each layer
    in ``self.kv_caches``, the request's blocks of both the key and the value
    cache are viewed as ``(blocks, block_size, heads, head_dim)``. The heads
    are split into ``decoder_tp_ratio`` equal groups and each block is
    rewritten so that group 0 (all tokens) comes first in the block's storage,
    then group 1, and so on.

    NOTE(review): presumably this lets each decode-side TP rank fetch its own
    head group as one contiguous region of the block -- confirm against the
    NIXL connector's read path.

    Args:
        scheduler_output: Scheduler output whose ``kv_connector_metadata``
            (may be falsy, in which case nothing happens) exposes
            ``reqs_to_save``, a mapping whose values carry ``local_block_ids``.

    Raises:
        ValueError: If the number of heads is not divisible by
            ``decoder_tp_ratio`` (the regrouping would be ill-defined).
    """
    metadata = scheduler_output.kv_connector_metadata
    if not metadata:
        return

    ratio = decoder_tp_ratio
    for meta in metadata.reqs_to_save.values():
        block_ids = meta.local_block_ids
        # An empty id list would become a float index tensor and make
        # index_select fail; there is nothing to rewrite for it anyway.
        if len(block_ids) == 0:
            continue

        for layer_cache in self.kv_caches:
            k = layer_cache[0]
            v = layer_cache[1]
            slots, heads, head_dim = v.shape
            if heads % ratio != 0:
                raise ValueError(f"number of KV heads ({heads}) must be divisible by "
                                 f"DECODER_TP_RATIO ({ratio})")

            indices = torch.tensor(block_ids, device=v.device)
            blocked_shape = [slots // self.block_size, self.block_size, heads, head_dim]

            for kv_tensor in (k, v):
                # The write-back below only reaches the real cache when this
                # reshape yields a view (i.e. the cache tensor is contiguous).
                kv = kv_tensor.reshape(blocked_shape)
                selected = torch.index_select(kv, 0, indices)
                block_count = selected.shape[0]

                # Split along the head axis into `ratio` groups (the original
                # hard-coded 2 here, which only worked for a ratio of 2), then
                # lay the groups out back to back inside each block.
                groups = torch.chunk(selected, ratio, dim=2)
                flattened = [group.reshape(block_count, -1) for group in groups]
                regrouped = torch.cat(flattened, dim=1).reshape(selected.shape)

                kv.index_copy_(0, indices, regrouped)
value_cache.index_select(0, src_slot_mapping).to(target_device)) if direction == "d2h": dst_kv_caches[layer_name] = dst_kv_caches[layer_name].unflatten(1, (-1, block_size))