diff --git a/benchmarks/bench_latency_benefit/bench-config.yaml b/benchmarks/bench_latency_benefit/bench-config.yaml index 417b4b77..dc3ae7ed 100644 --- a/benchmarks/bench_latency_benefit/bench-config.yaml +++ b/benchmarks/bench_latency_benefit/bench-config.yaml @@ -22,7 +22,8 @@ launch_delay_seconds: 30 # Delay between launching each instance instances: # instances configuration - name: instance1 - model: meta-llama/Llama-3.1-8B-Instruct + # model: meta-llama/Llama-3.1-8B-Instruct + model: Qwen/Qwen2-7B-Instruct engine: vllm using_venv: true venv_path: ../engine_integration/vllm-v0.9.2/.venv @@ -38,12 +39,14 @@ instances: # instances configuration - "--no-enable-prefix-caching" - "--host=localhost" - "--port=12346" - # - "--gpu-memory-utilization 0.31" + - "--gpu-memory-utilization=0.31" - "--enable-sleep-mode" - - "--max-model-len 62000" + # - "--max-model-len=62000" + - "--max-num-batched-tokens=8192" - name: instance2 - model: meta-llama/Llama-3.1-8B-Instruct + # model: meta-llama/Llama-3.1-8B-Instruct + model: Qwen/Qwen2-7B-Instruct engine: vllm using_venv: true venv_path: ../engine_integration/vllm-v0.9.2/.venv @@ -59,12 +62,14 @@ instances: # instances configuration - "--no-enable-prefix-caching" - "--host=localhost" - "--port=30000" - # - "--gpu-memory-utilization 0.31" + - "--gpu-memory-utilization=0.31" - "--enable-sleep-mode" - - "--max-model-len 62000" + # - "--max-model-len=62000" + - "--max-num-batched-tokens=8192" - name: instance3 - model: meta-llama/Llama-3.1-8B-Instruct + # model: meta-llama/Llama-3.1-8B-Instruct + model: Qwen/Qwen2-7B-Instruct engine: vllm using_venv: true venv_path: ../engine_integration/vllm-v0.9.2/.venv @@ -80,6 +85,7 @@ instances: # instances configuration - "--no-enable-prefix-caching" - "--host=localhost" - "--port=40000" - # - "--gpu-memory-utilization 0.31" + - "--gpu-memory-utilization=0.31" - "--enable-sleep-mode" - - "--max-model-len 62000" \ No newline at end of file + # - "--max-model-len=62000" + - 
"--max-num-batched-tokens=8192" \ No newline at end of file diff --git a/benchmarks/bench_latency_benefit/run.sh b/benchmarks/bench_latency_benefit/run.sh new file mode 100755 index 00000000..3c5bfe95 --- /dev/null +++ b/benchmarks/bench_latency_benefit/run.sh @@ -0,0 +1,13 @@ +#!/bin/bash +# Configuration +num_prompts=300000 +prompt_len=512 # Default prompt length + +for max_rps in 50; do + for completion_len in 1; do +# for max_rps in 1 2 3 4 5 6 7 8 9 10 15 20 25 30 40; do +# for completion_len in 64 128 256; do + # Empty arguments select the defaults for DURATION, MODEL_DELAY and BURSTINESS + ./run_benchmark_fixed_rate.sh "$max_rps" "$completion_len" "$num_prompts" "" "" "" "$prompt_len" + done +done diff --git a/benchmarks/bench_latency_benefit/run_benchmark_fixed_rate.sh b/benchmarks/bench_latency_benefit/run_benchmark_fixed_rate.sh new file mode 100755 index 00000000..296453fc --- /dev/null +++ b/benchmarks/bench_latency_benefit/run_benchmark_fixed_rate.sh @@ -0,0 +1,137 @@ +#!/bin/bash +set -ex + +# Usage: ./run_benchmark_fixed_rate.sh FIXED_RPS COMPLETION_LEN [NUM_PROMPTS] [DURATION] [MODEL_DELAY] [BURSTINESS] [PROMPT_LEN] +# Example: ./run_benchmark_fixed_rate.sh 12 256 720 60 30 10.0 4096 +# +# Parameters: +# FIXED_RPS - Fixed request rate (requests per second); no default +# COMPLETION_LEN - Completion length; no default +# NUM_PROMPTS - Total number of prompts (optional, calculated as FIXED_RPS * DURATION if empty) +# DURATION - Duration in seconds (default: 0); used to derive NUM_PROMPTS, the default MODEL_DELAY and the result file name +# MODEL_DELAY - Delay before starting each subsequent model, in seconds (default: DURATION) +# BURSTINESS - Higher values = more uniform timing (default: 10000.0) +# Use high values like 10-100 for near-constant intervals +# PROMPT_LEN - Prompt length (default: 4096) +# +# Exit status: non-zero if any benchmark process failed. + +# Set environment variables +export KVCACHED_IPC_NAME=VLLM + +# Add vLLM benchmarks and kvcached to Python path (entries are relative to the working directory) +export PYTHONPATH="../../engine_integration/vllm-v0.9.2/benchmarks:../../:../../benchmarks${PYTHONPATH:+:$PYTHONPATH}" + +# Benchmark parameters +PROMPT_LEN=${7:-4096} +COMPLETION_LEN=$2 +BACKEND="vllm" +# Fixed request rate parameters +FIXED_RPS=$1 # Fixed 
request rate (requests per second) +DURATION=${4:-0} # Duration in seconds (default: 0) +BURSTINESS=${6:-10000.0} # Higher burstiness for more uniform requests (default: 10000.0) + +# Calculate total number of requests (integer arithmetic: FIXED_RPS must be an integer when NUM_PROMPTS is derived) +if [ -n "$3" ]; then + NUM_PROMPTS=$3 + echo "Using provided NUM_PROMPTS: $NUM_PROMPTS" +else + NUM_PROMPTS=$((FIXED_RPS * DURATION)) + echo "Calculated NUM_PROMPTS: $NUM_PROMPTS (fixed rate: ${FIXED_RPS} RPS for ${DURATION}s)" +fi + +mkdir -p results results/metrics + +# Define models as MODEL:PORT entries +MODELS=( + "Qwen/Qwen2-7B-Instruct:12346" + "Qwen/Qwen2-7B-Instruct:30000" + "Qwen/Qwen2-7B-Instruct:40000" +) +NUM_MODELS=${#MODELS[@]} + +# Record unified start time +UNIFIED_START_TIME=$(date +%s.%N) +echo "Unified benchmark start time: $UNIFIED_START_TIME" + +# Model delay (can be adjusted if needed) +MODEL_DELAY=${5:-$DURATION} # Delay in seconds before starting next model (default: DURATION) + +# Arrays to store PIDs and result files +PIDS=() +RESULT_FILES=() + +# Run benchmarks for each model +for i in "${!MODELS[@]}"; do + # Split the MODEL:PORT entry without spawning subshells + MODEL=${MODELS[$i]%%:*} + PORT=${MODELS[$i]##*:} + + # Generate model name (slashes replaced by dashes) and 1-based index + MODEL_NAME=${MODEL//\//-} + MODEL_INDEX=$((i + 1)) + + # Generate result file name for fixed rate strategy + RESULT_FILE="results/metrics/${BACKEND}-${MODEL_NAME}-fixed-rate-${FIXED_RPS}rps-duration-${DURATION}s-burstiness-${BURSTINESS}-prompt_${PROMPT_LEN}-completion_${COMPLETION_LEN}-${MODEL_INDEX}-delay-${MODEL_DELAY}-model-num-${NUM_MODELS}-num-prompt-${NUM_PROMPTS}.json" + + # Add delay before starting next model (except for the first one) + if [ "$i" -gt 0 ] && [ "$MODEL_DELAY" -gt 0 ]; then + echo "Waiting ${MODEL_DELAY} seconds before starting Model ${MODEL_INDEX}..." + sleep "$MODEL_DELAY" + fi + + echo "Starting benchmark for $MODEL (Model ${MODEL_INDEX}) on port $PORT..." 
+ + # Use fixed rate strategy + echo "Using fixed rate strategy: ${FIXED_RPS} RPS for ${DURATION} seconds (burstiness: ${BURSTINESS})" + + python bench_kvcached_vllm.py \ + --backend "$BACKEND" \ + --model "$MODEL" \ + --dataset-name random \ + --random-input-len "$PROMPT_LEN" \ + --random-output-len "$COMPLETION_LEN" \ + --num-prompts "$NUM_PROMPTS" \ + --host "localhost" \ + --port "$PORT" \ + --endpoint "/v1/completions" \ + --save-result \ + --result-filename "$RESULT_FILE" \ + --metadata "unified_start_time=$UNIFIED_START_TIME" \ + --request-rate "$FIXED_RPS" \ + --burstiness "$BURSTINESS" & + + # Store PID and result file + PIDS+=("$!") + RESULT_FILES+=("$RESULT_FILE") + + echo "Started Model ${MODEL_INDEX} with PID ${PIDS[$i]}" +done + +# Wait for all benchmarks to complete. Under `set -e` a bare failing `wait` would abort the script, so each status is captured with `||`. +echo "Waiting for all benchmarks to complete..." +EXIT_CODES=() + +for i in "${!PIDS[@]}"; do + EXIT_CODE=0 + wait "${PIDS[$i]}" || EXIT_CODE=$? + EXIT_CODES+=("$EXIT_CODE") + echo "Model $((i + 1)) benchmark exit code: $EXIT_CODE" +done + +echo "All benchmarks completed!" 
+echo "Results saved to:" +for result_file in "${RESULT_FILES[@]}"; do + echo " - $result_file" +done + +# Summary of exit codes +echo "Exit code summary:" +OVERALL_STATUS=0 +for i in "${!EXIT_CODES[@]}"; do + echo " Model $((i + 1)): ${EXIT_CODES[$i]}" + [ "${EXIT_CODES[$i]}" -eq 0 ] || OVERALL_STATUS=1 +done + +# Report failure to the caller if any benchmark exited non-zero +exit "$OVERALL_STATUS" diff --git a/benchmarks/bench_latency_benefit/start_vllm_server.sh b/benchmarks/bench_latency_benefit/start_vllm_server.sh index 783f8359..9a08f61e 100644 --- a/benchmarks/bench_latency_benefit/start_vllm_server.sh +++ b/benchmarks/bench_latency_benefit/start_vllm_server.sh @@ -8,7 +8,7 @@ export VLLM_USE_V1=1 export VLLM_ATTENTION_BACKEND=FLASH_ATTN # Model configuration -MODEL="meta-llama/Llama-3.2-1B" # Use smaller model for testing +MODEL="meta-llama/Llama-3.1-8B-Instruct" PORT=8000 # Start vLLM server @@ -17,4 +17,5 @@ vllm serve "$MODEL" \ --no-enable-prefix-caching \ --gpu-memory-utilization 0.5 \ --port="$PORT" \ - --tensor-parallel-size=1 + --tensor-parallel-size=1 \ + --max-num-batched-tokens 16384