Skip to content
Open
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions .claude/settings.local.json
Comment thread
cui36 marked this conversation as resolved.
Outdated
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
{
"permissions": {
"allow": [
"Bash(python:*)",
"Read(//root/.cache/vllm/**)"
],
"deny": [],
"ask": []
}
}
21 changes: 12 additions & 9 deletions benchmarks/bench_latency_benefit/bench-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ launch_delay_seconds: 30 # Delay between launching each instance

instances: # instances configuration
- name: instance1
model: meta-llama/Llama-3.1-8B-Instruct
model: Qwen/Qwen2-7B
engine: vllm
using_venv: true
venv_path: ../engine_integration/vllm-v0.9.2/.venv
Expand All @@ -38,12 +38,13 @@ instances: # instances configuration
- "--no-enable-prefix-caching"
- "--host=localhost"
- "--port=12346"
# - "--gpu-memory-utilization 0.31"
- "--gpu-memory-utilization=0.31"
- "--enable-sleep-mode"
- "--max-model-len 62000"
- "--max-model-len=62000"
- "--max-num-batched-tokens=8192"

- name: instance2
model: meta-llama/Llama-3.1-8B-Instruct
model: Qwen/Qwen2-7B
engine: vllm
using_venv: true
venv_path: ../engine_integration/vllm-v0.9.2/.venv
Expand All @@ -59,12 +60,13 @@ instances: # instances configuration
- "--no-enable-prefix-caching"
- "--host=localhost"
- "--port=30000"
# - "--gpu-memory-utilization 0.31"
- "--gpu-memory-utilization=0.31"
- "--enable-sleep-mode"
- "--max-model-len 62000"
- "--max-model-len=62000"
- "--max-num-batched-tokens=8192"

- name: instance3
model: meta-llama/Llama-3.1-8B-Instruct
model: Qwen/Qwen2-7B
engine: vllm
using_venv: true
venv_path: ../engine_integration/vllm-v0.9.2/.venv
Expand All @@ -80,6 +82,7 @@ instances: # instances configuration
- "--no-enable-prefix-caching"
- "--host=localhost"
- "--port=40000"
# - "--gpu-memory-utilization 0.31"
- "--gpu-memory-utilization=0.31"
- "--enable-sleep-mode"
- "--max-model-len 62000"
- "--max-model-len=62000"
- "--max-num-batched-tokens=8192"
12 changes: 12 additions & 0 deletions benchmarks/bench_latency_benefit/run.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Configuration
# Adjust num_prompts as needed (leave empty to use default calculation)
num_prompts=2000
prompt_len=4096 # Default prompt length

for max_rps in 1; do
for completion_len in 5; do
# for max_rps in 1 2 3 4 5 6 7 8 9 10 15 20 25 30 40; do
# for completion_len in 64 128 256; do
./run_benchmark_fixed_rate.sh $max_rps $completion_len $num_prompts "" "" "" $prompt_len
done
done
130 changes: 130 additions & 0 deletions benchmarks/bench_latency_benefit/run_benchmark_fixed_rate.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
#!/bin/bash
set -ex

# Usage: ./run_benchmark_fixed_rate.sh <FIXED_RPS> <COMPLETION_LEN> [NUM_PROMPTS] [DURATION] [MODEL_DELAY] [BURSTINESS] [PROMPT_LEN]
# Example: ./run_benchmark_fixed_rate.sh 12 256 720 60 30 10.0 4096
#
# Parameters:
# FIXED_RPS - Fixed request rate (requests per second)
# COMPLETION_LEN - Completion length
# NUM_PROMPTS - Total number of prompts (optional, will calculate from FIXED_RPS * DURATION if not provided)
# DURATION - Duration in seconds (default: 30)
# MODEL_DELAY - Delay between models in seconds (default: DURATION)
# BURSTINESS - Higher values = more uniform timing (default: 10000.0)
# Use high values like 10-100 for near-constant intervals
# PROMPT_LEN - Prompt length (default: 4096)

# Set environment variables
export KVCACHED_IPC_NAME=VLLM

# Add vLLM benchmarks and kvcached to Python path
export PYTHONPATH="../../engine_integration/vllm-v0.9.2/benchmarks:../../:../../benchmarks:$PYTHONPATH"

# Benchmark parameters
PROMPT_LEN=${7:-4096}
COMPLETION_LEN=$2
BACKEND="vllm"
# Fixed request rate parameters
FIXED_RPS=$1 # Fixed request rate (requests per second)
DURATION=${4:-0} # Duration in seconds (default: 30s)
BURSTINESS=${6:-10000.0} # Higher burstiness for more uniform requests (default: 10000.0)

# Calculate total number of requests
if [ -n "$3" ]; then
NUM_PROMPTS=$3
echo "Using provided NUM_PROMPTS: $NUM_PROMPTS"
else
NUM_PROMPTS=$((FIXED_RPS * DURATION))
echo "Calculated NUM_PROMPTS: $NUM_PROMPTS (fixed rate: ${FIXED_RPS} RPS for ${DURATION}s)"
fi

mkdir -p results results/metrics

# Define models and their configurations
MODELS=(
"meta-llama/Llama-3.1-8B-Instruct:12346"
"meta-llama/Llama-3.1-8B-Instruct:30000"
"meta-llama/Llama-3.1-8B-Instruct:40000"
)
NUM_MODELS=${#MODELS[@]}

# Record unified start time
UNIFIED_START_TIME=$(date +%s.%N)
echo "Unified benchmark start time: $UNIFIED_START_TIME"

# Model delay (can be adjusted if needed)
MODEL_DELAY=${5:-$DURATION} # Delay in seconds before starting next model (default: DURATION)

# Arrays to store PIDs and result files
PIDS=()
RESULT_FILES=()

# Run benchmarks for each model
for i in "${!MODELS[@]}"; do
# Parse model and port
MODEL=$(echo "${MODELS[$i]}" | cut -d':' -f1)
PORT=$(echo "${MODELS[$i]}" | cut -d':' -f2)

# Generate model name and result file
MODEL_NAME=$(echo "$MODEL" | tr '/' '-')
MODEL_INDEX=$((i + 1))

# Generate result file name for fixed rate strategy
RESULT_FILE="results/metrics/${BACKEND}-${MODEL_NAME}-fixed-rate-${FIXED_RPS}rps-duration-${DURATION}s-burstiness-${BURSTINESS}-prompt_${PROMPT_LEN}-completion_${COMPLETION_LEN}-${MODEL_INDEX}-delay-${MODEL_DELAY}-model-num-${NUM_MODELS}-num-prompt-${NUM_PROMPTS}.json"

# Add delay before starting next model (except for the first one)
if [ $i -gt 0 ] && [ "$MODEL_DELAY" -gt 0 ]; then
echo "Waiting ${MODEL_DELAY} seconds before starting Model ${MODEL_INDEX}..."
sleep $MODEL_DELAY
fi

echo "Starting benchmark for $MODEL (Model ${MODEL_INDEX}) on port $PORT..."

# Use fixed rate strategy
echo "Using fixed rate strategy: ${FIXED_RPS} RPS for ${DURATION} seconds (burstiness: ${BURSTINESS})"

python bench_kvcached_vllm.py \
--backend "$BACKEND" \
--model "$MODEL" \
--dataset-name random \
--random-input-len "$PROMPT_LEN" \
--random-output-len "$COMPLETION_LEN" \
--num-prompts "$NUM_PROMPTS" \
--host "localhost" \
--port "$PORT" \
--endpoint "/v1/completions" \
--save-result \
--result-filename "$RESULT_FILE" \
--metadata "unified_start_time=$UNIFIED_START_TIME" \
--request-rate "$FIXED_RPS" \
--burstiness "$BURSTINESS" &

# Store PID and result file
PIDS+=($!)
RESULT_FILES+=("$RESULT_FILE")

echo "Started Model ${MODEL_INDEX} with PID ${PIDS[$i]}"
done

# Wait for all benchmarks to complete
echo "Waiting for all benchmarks to complete..."
EXIT_CODES=()

for i in "${!PIDS[@]}"; do
wait ${PIDS[$i]}
EXIT_CODE=$?
EXIT_CODES+=($EXIT_CODE)
echo "Model $((i + 1)) benchmark exit code: $EXIT_CODE"
done

echo "All benchmarks completed!"
echo "Results saved to:"
for result_file in "${RESULT_FILES[@]}"; do
echo " - $result_file"
done

# Summary of exit codes
echo "Exit code summary:"
for i in "${!EXIT_CODES[@]}"; do
echo " Model $((i + 1)): ${EXIT_CODES[$i]}"
done
5 changes: 3 additions & 2 deletions benchmarks/bench_latency_benefit/start_vllm_server.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ export VLLM_USE_V1=1
export VLLM_ATTENTION_BACKEND=FLASH_ATTN

# Model configuration
MODEL="meta-llama/Llama-3.2-1B" # Use smaller model for testing
MODEL="meta-llama/Llama-3.1-8B-Instruct"
PORT=8000

# Start vLLM server
Expand All @@ -17,4 +17,5 @@ vllm serve "$MODEL" \
--no-enable-prefix-caching \
--gpu-memory-utilization 0.5 \
--port="$PORT" \
--tensor-parallel-size=1
--tensor-parallel-size=1 \
--max-num-batched-tokens 16384