From a6998aea823f1e0b9c5f5ad867797152f201ab99 Mon Sep 17 00:00:00 2001 From: cui36 Date: Fri, 31 Oct 2025 22:39:59 +0000 Subject: [PATCH 1/5] Add max-num-batched-tokens control --- .claude/settings.local.json | 10 +++++++++ .../bench_latency_benefit/bench-config.yaml | 21 +++++++++++-------- .../start_vllm_server.sh | 5 +++-- 3 files changed, 25 insertions(+), 11 deletions(-) create mode 100644 .claude/settings.local.json diff --git a/.claude/settings.local.json b/.claude/settings.local.json new file mode 100644 index 00000000..f126d354 --- /dev/null +++ b/.claude/settings.local.json @@ -0,0 +1,10 @@ +{ + "permissions": { + "allow": [ + "Bash(python:*)", + "Read(//root/.cache/vllm/**)" + ], + "deny": [], + "ask": [] + } +} diff --git a/benchmarks/bench_latency_benefit/bench-config.yaml b/benchmarks/bench_latency_benefit/bench-config.yaml index 417b4b77..ef46ef4f 100644 --- a/benchmarks/bench_latency_benefit/bench-config.yaml +++ b/benchmarks/bench_latency_benefit/bench-config.yaml @@ -22,7 +22,7 @@ launch_delay_seconds: 30 # Delay between launching each instance instances: # instances configuration - name: instance1 - model: meta-llama/Llama-3.1-8B-Instruct + model: Qwen/Qwen2-7B engine: vllm using_venv: true venv_path: ../engine_integration/vllm-v0.9.2/.venv @@ -38,12 +38,13 @@ instances: # instances configuration - "--no-enable-prefix-caching" - "--host=localhost" - "--port=12346" - # - "--gpu-memory-utilization 0.31" + - "--gpu-memory-utilization=0.31" - "--enable-sleep-mode" - - "--max-model-len 62000" + - "--max-model-len=62000" + - "--max-num-batched-tokens=8192" - name: instance2 - model: meta-llama/Llama-3.1-8B-Instruct + model: Qwen/Qwen2-7B engine: vllm using_venv: true venv_path: ../engine_integration/vllm-v0.9.2/.venv @@ -59,12 +60,13 @@ instances: # instances configuration - "--no-enable-prefix-caching" - "--host=localhost" - "--port=30000" - # - "--gpu-memory-utilization 0.31" + - "--gpu-memory-utilization=0.31" - "--enable-sleep-mode" - - 
"--max-model-len 62000" + - "--max-model-len=62000" + - "--max-num-batched-tokens=8192" - name: instance3 - model: meta-llama/Llama-3.1-8B-Instruct + model: Qwen/Qwen2-7B engine: vllm using_venv: true venv_path: ../engine_integration/vllm-v0.9.2/.venv @@ -80,6 +82,7 @@ instances: # instances configuration - "--no-enable-prefix-caching" - "--host=localhost" - "--port=40000" - # - "--gpu-memory-utilization 0.31" + - "--gpu-memory-utilization=0.31" - "--enable-sleep-mode" - - "--max-model-len 62000" \ No newline at end of file + - "--max-model-len=62000" + - "--max-num-batched-tokens=8192" \ No newline at end of file diff --git a/benchmarks/bench_latency_benefit/start_vllm_server.sh b/benchmarks/bench_latency_benefit/start_vllm_server.sh index 783f8359..9a08f61e 100644 --- a/benchmarks/bench_latency_benefit/start_vllm_server.sh +++ b/benchmarks/bench_latency_benefit/start_vllm_server.sh @@ -8,7 +8,7 @@ export VLLM_USE_V1=1 export VLLM_ATTENTION_BACKEND=FLASH_ATTN # Model configuration -MODEL="meta-llama/Llama-3.2-1B" # Use smaller model for testing +MODEL="meta-llama/Llama-3.1-8B-Instruct" PORT=8000 # Start vLLM server @@ -17,4 +17,5 @@ vllm serve "$MODEL" \ --no-enable-prefix-caching \ --gpu-memory-utilization 0.5 \ --port="$PORT" \ - --tensor-parallel-size=1 + --tensor-parallel-size=1 \ + --max-num-batched-tokens 16384 From 5c9cbaadc1f321f741689fa7974965427f6fe763 Mon Sep 17 00:00:00 2001 From: cui36 Date: Sun, 2 Nov 2025 17:33:58 +0000 Subject: [PATCH 2/5] working on issue-197 --- benchmarks/bench_latency_benefit/run.sh | 12 ++ .../run_benchmark_fixed_rate.sh | 130 ++++++++++++++++++ 2 files changed, 142 insertions(+) create mode 100755 benchmarks/bench_latency_benefit/run.sh create mode 100755 benchmarks/bench_latency_benefit/run_benchmark_fixed_rate.sh diff --git a/benchmarks/bench_latency_benefit/run.sh b/benchmarks/bench_latency_benefit/run.sh new file mode 100755 index 00000000..f078e4a6 --- /dev/null +++ b/benchmarks/bench_latency_benefit/run.sh @@ -0,0 +1,12 
@@ +# Configuration +# Adjust num_prompts as needed (leave empty to use default calculation) +num_prompts=2000 +prompt_len=4096 # Default prompt length + +for max_rps in 1; do + for completion_len in 5; do +# for max_rps in 1 2 3 4 5 6 7 8 9 10 15 20 25 30 40; do +# for completion_len in 64 128 256; do + ./run_benchmark_fixed_rate.sh $max_rps $completion_len $num_prompts "" "" "" $prompt_len + done +done \ No newline at end of file diff --git a/benchmarks/bench_latency_benefit/run_benchmark_fixed_rate.sh b/benchmarks/bench_latency_benefit/run_benchmark_fixed_rate.sh new file mode 100755 index 00000000..a8faced2 --- /dev/null +++ b/benchmarks/bench_latency_benefit/run_benchmark_fixed_rate.sh @@ -0,0 +1,130 @@ +#!/bin/bash +set -ex + +# Usage: ./run_benchmark_fixed_rate.sh FIXED_RPS COMPLETION_LEN [NUM_PROMPTS] [DURATION] [MODEL_DELAY] [BURSTINESS] [PROMPT_LEN] +# Example: ./run_benchmark_fixed_rate.sh 12 256 720 60 30 10.0 4096 +# +# Parameters: +# FIXED_RPS - Fixed request rate (requests per second) +# COMPLETION_LEN - Completion length +# NUM_PROMPTS - Total number of prompts (optional, will calculate from FIXED_RPS * DURATION if not provided) +# DURATION - Duration in seconds (default: 0) +# MODEL_DELAY - Delay between models in seconds (default: DURATION) +# BURSTINESS - Higher values = more uniform timing (default: 10000.0) +# Use high values like 10-100 for near-constant intervals +# PROMPT_LEN - Prompt length (default: 4096) + +# Set environment variables +export KVCACHED_IPC_NAME=VLLM + +# Add vLLM benchmarks and kvcached to Python path +export PYTHONPATH="../../engine_integration/vllm-v0.9.2/benchmarks:../../:../../benchmarks:$PYTHONPATH" + +# Benchmark parameters +PROMPT_LEN=${7:-4096} +COMPLETION_LEN=$2 +BACKEND="vllm" +# Fixed request rate parameters +FIXED_RPS=$1 # Fixed request rate (requests per second) +DURATION=${4:-0} # Duration in seconds (default: 0) +BURSTINESS=${6:-10000.0} # Higher burstiness for more uniform requests (default: 10000.0) + +# Calculate total number 
of requests +if [ -n "$3" ]; then + NUM_PROMPTS=$3 + echo "Using provided NUM_PROMPTS: $NUM_PROMPTS" +else + NUM_PROMPTS=$((FIXED_RPS * DURATION)) + echo "Calculated NUM_PROMPTS: $NUM_PROMPTS (fixed rate: ${FIXED_RPS} RPS for ${DURATION}s)" +fi + +mkdir -p results results/metrics + +# Define models and their configurations +MODELS=( + "meta-llama/Llama-3.1-8B-Instruct:12346" + "meta-llama/Llama-3.1-8B-Instruct:30000" + "meta-llama/Llama-3.1-8B-Instruct:40000" +) +NUM_MODELS=${#MODELS[@]} + +# Record unified start time +UNIFIED_START_TIME=$(date +%s.%N) +echo "Unified benchmark start time: $UNIFIED_START_TIME" + +# Model delay (can be adjusted if needed) +MODEL_DELAY=${5:-$DURATION} # Delay in seconds before starting next model (default: DURATION) + +# Arrays to store PIDs and result files +PIDS=() +RESULT_FILES=() + +# Run benchmarks for each model +for i in "${!MODELS[@]}"; do + # Parse model and port + MODEL=$(echo "${MODELS[$i]}" | cut -d':' -f1) + PORT=$(echo "${MODELS[$i]}" | cut -d':' -f2) + + # Generate model name and result file + MODEL_NAME=$(echo "$MODEL" | tr '/' '-') + MODEL_INDEX=$((i + 1)) + + # Generate result file name for fixed rate strategy + RESULT_FILE="results/metrics/${BACKEND}-${MODEL_NAME}-fixed-rate-${FIXED_RPS}rps-duration-${DURATION}s-burstiness-${BURSTINESS}-prompt_${PROMPT_LEN}-completion_${COMPLETION_LEN}-${MODEL_INDEX}-delay-${MODEL_DELAY}-model-num-${NUM_MODELS}-num-prompt-${NUM_PROMPTS}.json" + + # Add delay before starting next model (except for the first one) + if [ $i -gt 0 ] && [ "$MODEL_DELAY" -gt 0 ]; then + echo "Waiting ${MODEL_DELAY} seconds before starting Model ${MODEL_INDEX}..." + sleep $MODEL_DELAY + fi + + echo "Starting benchmark for $MODEL (Model ${MODEL_INDEX}) on port $PORT..." 
+ + # Use fixed rate strategy + echo "Using fixed rate strategy: ${FIXED_RPS} RPS for ${DURATION} seconds (burstiness: ${BURSTINESS})" + + python bench_kvcached_vllm.py \ + --backend "$BACKEND" \ + --model "$MODEL" \ + --dataset-name random \ + --random-input-len "$PROMPT_LEN" \ + --random-output-len "$COMPLETION_LEN" \ + --num-prompts "$NUM_PROMPTS" \ + --host "localhost" \ + --port "$PORT" \ + --endpoint "/v1/completions" \ + --save-result \ + --result-filename "$RESULT_FILE" \ + --metadata "unified_start_time=$UNIFIED_START_TIME" \ + --request-rate "$FIXED_RPS" \ + --burstiness "$BURSTINESS" & + + # Store PID and result file + PIDS+=($!) + RESULT_FILES+=("$RESULT_FILE") + + echo "Started Model ${MODEL_INDEX} with PID ${PIDS[$i]}" +done + +# Wait for all benchmarks to complete +echo "Waiting for all benchmarks to complete..." +EXIT_CODES=() + +for i in "${!PIDS[@]}"; do + wait ${PIDS[$i]} + EXIT_CODE=$? + EXIT_CODES+=($EXIT_CODE) + echo "Model $((i + 1)) benchmark exit code: $EXIT_CODE" +done + +echo "All benchmarks completed!" 
+echo "Results saved to:" +for result_file in "${RESULT_FILES[@]}"; do + echo " - $result_file" +done + +# Summary of exit codes +echo "Exit code summary:" +for i in "${!EXIT_CODES[@]}"; do + echo " Model $((i + 1)): ${EXIT_CODES[$i]}" +done \ No newline at end of file From 49a285745024ca67f5a77a081adcaa274afacbeb Mon Sep 17 00:00:00 2001 From: cui36 Date: Sun, 2 Nov 2025 17:53:36 +0000 Subject: [PATCH 3/5] Remove .claude and ignore it --- .claude/settings.local.json | 10 ---------- 1 file changed, 10 deletions(-) delete mode 100644 .claude/settings.local.json diff --git a/.claude/settings.local.json b/.claude/settings.local.json deleted file mode 100644 index f126d354..00000000 --- a/.claude/settings.local.json +++ /dev/null @@ -1,10 +0,0 @@ -{ - "permissions": { - "allow": [ - "Bash(python:*)", - "Read(//root/.cache/vllm/**)" - ], - "deny": [], - "ask": [] - } -} From 9725bfc27608ad954f3abf7dc6197a9701a76563 Mon Sep 17 00:00:00 2001 From: cui36 Date: Sun, 2 Nov 2025 18:26:41 +0000 Subject: [PATCH 4/5] try with llama3-8B --- benchmarks/bench_latency_benefit/bench-config.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/benchmarks/bench_latency_benefit/bench-config.yaml b/benchmarks/bench_latency_benefit/bench-config.yaml index ef46ef4f..4f917392 100644 --- a/benchmarks/bench_latency_benefit/bench-config.yaml +++ b/benchmarks/bench_latency_benefit/bench-config.yaml @@ -22,7 +22,7 @@ launch_delay_seconds: 30 # Delay between launching each instance instances: # instances configuration - name: instance1 - model: Qwen/Qwen2-7B + model: meta-llama/Llama-3.1-8B-Instruct engine: vllm using_venv: true venv_path: ../engine_integration/vllm-v0.9.2/.venv @@ -44,7 +44,7 @@ instances: # instances configuration - "--max-num-batched-tokens=8192" - name: instance2 - model: Qwen/Qwen2-7B + model: meta-llama/Llama-3.1-8B-Instruct engine: vllm using_venv: true venv_path: ../engine_integration/vllm-v0.9.2/.venv @@ -66,7 +66,7 @@ instances: # instances 
configuration - "--max-num-batched-tokens=8192" - name: instance3 - model: Qwen/Qwen2-7B + model: meta-llama/Llama-3.1-8B-Instruct engine: vllm using_venv: true venv_path: ../engine_integration/vllm-v0.9.2/.venv From 39cb7c21377416d15afb21d92ca92ffa528fbd13 Mon Sep 17 00:00:00 2001 From: cui36 Date: Tue, 4 Nov 2025 04:00:56 +0000 Subject: [PATCH 5/5] adjust config to the issue --- .../bench_latency_benefit/bench-config.yaml | 15 +++++++++------ benchmarks/bench_latency_benefit/run.sh | 9 ++++----- .../run_benchmark_fixed_rate.sh | 6 +++--- 3 files changed, 16 insertions(+), 14 deletions(-) diff --git a/benchmarks/bench_latency_benefit/bench-config.yaml b/benchmarks/bench_latency_benefit/bench-config.yaml index 4f917392..dc3ae7ed 100644 --- a/benchmarks/bench_latency_benefit/bench-config.yaml +++ b/benchmarks/bench_latency_benefit/bench-config.yaml @@ -22,7 +22,8 @@ launch_delay_seconds: 30 # Delay between launching each instance instances: # instances configuration - name: instance1 - model: meta-llama/Llama-3.1-8B-Instruct + # model: meta-llama/Llama-3.1-8B-Instruct + model: Qwen/Qwen2-7B-Instruct engine: vllm using_venv: true venv_path: ../engine_integration/vllm-v0.9.2/.venv @@ -40,11 +41,12 @@ instances: # instances configuration - "--port=12346" - "--gpu-memory-utilization=0.31" - "--enable-sleep-mode" - - "--max-model-len=62000" + # - "--max-model-len=62000" - "--max-num-batched-tokens=8192" - name: instance2 - model: meta-llama/Llama-3.1-8B-Instruct + # model: meta-llama/Llama-3.1-8B-Instruct + model: Qwen/Qwen2-7B-Instruct engine: vllm using_venv: true venv_path: ../engine_integration/vllm-v0.9.2/.venv @@ -62,11 +64,12 @@ instances: # instances configuration - "--port=30000" - "--gpu-memory-utilization=0.31" - "--enable-sleep-mode" - - "--max-model-len=62000" + # - "--max-model-len=62000" - "--max-num-batched-tokens=8192" - name: instance3 - model: meta-llama/Llama-3.1-8B-Instruct + # model: meta-llama/Llama-3.1-8B-Instruct + model: Qwen/Qwen2-7B-Instruct 
engine: vllm using_venv: true venv_path: ../engine_integration/vllm-v0.9.2/.venv @@ -84,5 +87,5 @@ instances: # instances configuration - "--port=40000" - "--gpu-memory-utilization=0.31" - "--enable-sleep-mode" - - "--max-model-len=62000" + # - "--max-model-len=62000" - "--max-num-batched-tokens=8192" \ No newline at end of file diff --git a/benchmarks/bench_latency_benefit/run.sh b/benchmarks/bench_latency_benefit/run.sh index f078e4a6..3c5bfe95 100755 --- a/benchmarks/bench_latency_benefit/run.sh +++ b/benchmarks/bench_latency_benefit/run.sh @@ -1,10 +1,9 @@ # Configuration -# Adjust num_prompts as needed (leave empty to use default calculation) -num_prompts=2000 -prompt_len=4096 # Default prompt length +num_prompts=300000 +prompt_len=512 # Default prompt length -for max_rps in 1; do - for completion_len in 5; do +for max_rps in 50; do + for completion_len in 1; do # for max_rps in 1 2 3 4 5 6 7 8 9 10 15 20 25 30 40; do # for completion_len in 64 128 256; do ./run_benchmark_fixed_rate.sh $max_rps $completion_len $num_prompts "" "" "" $prompt_len diff --git a/benchmarks/bench_latency_benefit/run_benchmark_fixed_rate.sh b/benchmarks/bench_latency_benefit/run_benchmark_fixed_rate.sh index a8faced2..296453fc 100755 --- a/benchmarks/bench_latency_benefit/run_benchmark_fixed_rate.sh +++ b/benchmarks/bench_latency_benefit/run_benchmark_fixed_rate.sh @@ -42,9 +42,9 @@ mkdir -p results results/metrics # Define models and their configurations MODELS=( - "meta-llama/Llama-3.1-8B-Instruct:12346" - "meta-llama/Llama-3.1-8B-Instruct:30000" - "meta-llama/Llama-3.1-8B-Instruct:40000" + "Qwen/Qwen2-7B-Instruct:12346" + "Qwen/Qwen2-7B-Instruct:30000" + "Qwen/Qwen2-7B-Instruct:40000" ) NUM_MODELS=${#MODELS[@]}