From a6998aea823f1e0b9c5f5ad867797152f201ab99 Mon Sep 17 00:00:00 2001
From: cui36 <starlingcui.110@gmail.com>
Date: Fri, 31 Oct 2025 22:39:59 +0000
Subject: [PATCH 1/5] Add max-num-batched-tokens control

---
 .claude/settings.local.json                   | 10 +++++++++
 .../bench_latency_benefit/bench-config.yaml   | 21 +++++++++++--------
 .../start_vllm_server.sh                      |  5 +++--
 3 files changed, 25 insertions(+), 11 deletions(-)
 create mode 100644 .claude/settings.local.json

diff --git a/.claude/settings.local.json b/.claude/settings.local.json
new file mode 100644
index 00000000..f126d354
--- /dev/null
+++ b/.claude/settings.local.json
@@ -0,0 +1,10 @@
+{
+  "permissions": {
+    "allow": [
+      "Bash(python:*)",
+      "Read(//root/.cache/vllm/**)"
+    ],
+    "deny": [],
+    "ask": []
+  }
+}
diff --git a/benchmarks/bench_latency_benefit/bench-config.yaml b/benchmarks/bench_latency_benefit/bench-config.yaml
index 417b4b77..ef46ef4f 100644
--- a/benchmarks/bench_latency_benefit/bench-config.yaml
+++ b/benchmarks/bench_latency_benefit/bench-config.yaml
@@ -22,7 +22,7 @@ launch_delay_seconds: 30  # Delay between launching each instance
 
 instances: # instances configuration
   - name: instance1
-    model: meta-llama/Llama-3.1-8B-Instruct
+    model: Qwen/Qwen2-7B
     engine: vllm
     using_venv: true
     venv_path:  ../engine_integration/vllm-v0.9.2/.venv
@@ -38,12 +38,13 @@ instances: # instances configuration
       - "--no-enable-prefix-caching"
       - "--host=localhost"
       - "--port=12346"
-      # - "--gpu-memory-utilization 0.31"
+      - "--gpu-memory-utilization=0.31"
       - "--enable-sleep-mode"
-      - "--max-model-len 62000"
+      - "--max-model-len=62000"
+      - "--max-num-batched-tokens=8192"
 
   - name: instance2
-    model: meta-llama/Llama-3.1-8B-Instruct
+    model: Qwen/Qwen2-7B
     engine: vllm
     using_venv: true
     venv_path:  ../engine_integration/vllm-v0.9.2/.venv
@@ -59,12 +60,13 @@ instances: # instances configuration
       - "--no-enable-prefix-caching"
       - "--host=localhost"
       - "--port=30000"
-      # - "--gpu-memory-utilization 0.31"
+      - "--gpu-memory-utilization=0.31"
       - "--enable-sleep-mode"
-      - "--max-model-len 62000"
+      - "--max-model-len=62000"
+      - "--max-num-batched-tokens=8192"
 
   - name: instance3
-    model: meta-llama/Llama-3.1-8B-Instruct
+    model: Qwen/Qwen2-7B
     engine: vllm
     using_venv: true
     venv_path:  ../engine_integration/vllm-v0.9.2/.venv
@@ -80,6 +82,7 @@ instances: # instances configuration
       - "--no-enable-prefix-caching"
       - "--host=localhost"
       - "--port=40000"
-      # - "--gpu-memory-utilization 0.31"
+      - "--gpu-memory-utilization=0.31"
       - "--enable-sleep-mode"
-      - "--max-model-len 62000"
\ No newline at end of file
+      - "--max-model-len=62000"
+      - "--max-num-batched-tokens=8192"
\ No newline at end of file
diff --git a/benchmarks/bench_latency_benefit/start_vllm_server.sh b/benchmarks/bench_latency_benefit/start_vllm_server.sh
index 783f8359..9a08f61e 100644
--- a/benchmarks/bench_latency_benefit/start_vllm_server.sh
+++ b/benchmarks/bench_latency_benefit/start_vllm_server.sh
@@ -8,7 +8,7 @@ export VLLM_USE_V1=1
 export VLLM_ATTENTION_BACKEND=FLASH_ATTN
 
 # Model configuration
-MODEL="meta-llama/Llama-3.2-1B"  # Use smaller model for testing
+MODEL="meta-llama/Llama-3.1-8B-Instruct"
 PORT=8000
 
 # Start vLLM server
@@ -17,4 +17,5 @@ vllm serve "$MODEL" \
     --no-enable-prefix-caching \
     --gpu-memory-utilization 0.5 \
     --port="$PORT" \
-    --tensor-parallel-size=1
+    --tensor-parallel-size=1 \
+    --max-num-batched-tokens 16384

From 5c9cbaadc1f321f741689fa7974965427f6fe763 Mon Sep 17 00:00:00 2001
From: cui36 <starlingcui.110@gmail.com>
Date: Sun, 2 Nov 2025 17:33:58 +0000
Subject: [PATCH 2/5] working on issue-197

---
 benchmarks/bench_latency_benefit/run.sh       |  12 ++
 .../run_benchmark_fixed_rate.sh               | 130 ++++++++++++++++++
 2 files changed, 142 insertions(+)
 create mode 100755 benchmarks/bench_latency_benefit/run.sh
 create mode 100755 benchmarks/bench_latency_benefit/run_benchmark_fixed_rate.sh

diff --git a/benchmarks/bench_latency_benefit/run.sh b/benchmarks/bench_latency_benefit/run.sh
new file mode 100755
index 00000000..f078e4a6
--- /dev/null
+++ b/benchmarks/bench_latency_benefit/run.sh
@@ -0,0 +1,12 @@
+# Configuration
+# Adjust num_prompts as needed (leave empty to use default calculation)
+num_prompts=2000
+prompt_len=4096  # Default prompt length
+
+for max_rps in 1; do
+    for completion_len in 5; do
+# for max_rps in 1 2 3 4 5 6 7 8 9 10 15 20 25 30 40; do
+#     for completion_len in 64 128 256; do
+        ./run_benchmark_fixed_rate.sh $max_rps $completion_len $num_prompts "" "" "" $prompt_len
+    done
+done
\ No newline at end of file
diff --git a/benchmarks/bench_latency_benefit/run_benchmark_fixed_rate.sh b/benchmarks/bench_latency_benefit/run_benchmark_fixed_rate.sh
new file mode 100755
index 00000000..a8faced2
--- /dev/null
+++ b/benchmarks/bench_latency_benefit/run_benchmark_fixed_rate.sh
@@ -0,0 +1,130 @@
+#!/bin/bash
+set -ex
+
+# Usage: ./run_benchmark_fixed_rate.sh <FIXED_RPS> <COMPLETION_LEN> [NUM_PROMPTS] [DURATION] [MODEL_DELAY] [BURSTINESS] [PROMPT_LEN]
+# Example: ./run_benchmark_fixed_rate.sh 12 256 720 60 30 10.0 4096
+#
+# Parameters:
+#   FIXED_RPS      - Fixed request rate (requests per second)
+#   COMPLETION_LEN - Completion length
+#   NUM_PROMPTS    - Total number of prompts (optional, will calculate from FIXED_RPS * DURATION if not provided)
+#   DURATION       - Duration in seconds (default: 0)
+#   MODEL_DELAY    - Delay between models in seconds (default: DURATION)
+#   BURSTINESS     - Higher values = more uniform timing (default: 10000.0)
+#                    Use high values like 10-100 for near-constant intervals
+#   PROMPT_LEN     - Prompt length (default: 4096)
+
+# Set environment variables
+export KVCACHED_IPC_NAME=VLLM
+
+# Add vLLM benchmarks and kvcached to Python path
+export PYTHONPATH="../../engine_integration/vllm-v0.9.2/benchmarks:../../:../../benchmarks:$PYTHONPATH"
+
+# Benchmark parameters
+PROMPT_LEN=${7:-4096}
+COMPLETION_LEN=$2
+BACKEND="vllm"
+# Fixed request rate parameters
+FIXED_RPS=$1               # Fixed request rate (requests per second)
+DURATION=${4:-0}          # Duration in seconds (default: 0)
+BURSTINESS=${6:-10000.0}   # Higher burstiness for more uniform requests (default: 10000.0)
+
+# Calculate total number of requests
+if [ -n "$3" ]; then
+    NUM_PROMPTS=$3
+    echo "Using provided NUM_PROMPTS: $NUM_PROMPTS"
+else
+    NUM_PROMPTS=$((FIXED_RPS * DURATION))
+    echo "Calculated NUM_PROMPTS: $NUM_PROMPTS (fixed rate: ${FIXED_RPS} RPS for ${DURATION}s)"
+fi
+
+mkdir -p results results/metrics
+
+# Define models and their configurations
+MODELS=(
+    "meta-llama/Llama-3.1-8B-Instruct:12346"
+    "meta-llama/Llama-3.1-8B-Instruct:30000"
+    "meta-llama/Llama-3.1-8B-Instruct:40000"
+)
+NUM_MODELS=${#MODELS[@]}
+
+# Record unified start time
+UNIFIED_START_TIME=$(date +%s.%N)
+echo "Unified benchmark start time: $UNIFIED_START_TIME"
+
+# Model delay (can be adjusted if needed)
+MODEL_DELAY=${5:-$DURATION}       # Delay in seconds before starting next model (default: DURATION)
+
+# Arrays to store PIDs and result files
+PIDS=()
+RESULT_FILES=()
+
+# Run benchmarks for each model
+for i in "${!MODELS[@]}"; do
+    # Parse model and port
+    MODEL=$(echo "${MODELS[$i]}" | cut -d':' -f1)
+    PORT=$(echo "${MODELS[$i]}" | cut -d':' -f2)
+
+    # Generate model name and result file
+    MODEL_NAME=$(echo "$MODEL" | tr '/' '-')
+    MODEL_INDEX=$((i + 1))
+
+    # Generate result file name for fixed rate strategy
+    RESULT_FILE="results/metrics/${BACKEND}-${MODEL_NAME}-fixed-rate-${FIXED_RPS}rps-duration-${DURATION}s-burstiness-${BURSTINESS}-prompt_${PROMPT_LEN}-completion_${COMPLETION_LEN}-${MODEL_INDEX}-delay-${MODEL_DELAY}-model-num-${NUM_MODELS}-num-prompt-${NUM_PROMPTS}.json"
+
+    # Add delay before starting next model (except for the first one)
+    if [ $i -gt 0 ] && [ "$MODEL_DELAY" -gt 0 ]; then
+        echo "Waiting ${MODEL_DELAY} seconds before starting Model ${MODEL_INDEX}..."
+        sleep $MODEL_DELAY
+    fi
+
+    echo "Starting benchmark for $MODEL (Model ${MODEL_INDEX}) on port $PORT..."
+
+    # Use fixed rate strategy
+    echo "Using fixed rate strategy: ${FIXED_RPS} RPS for ${DURATION} seconds (burstiness: ${BURSTINESS})"
+
+    python bench_kvcached_vllm.py \
+        --backend "$BACKEND" \
+        --model "$MODEL" \
+        --dataset-name random \
+        --random-input-len "$PROMPT_LEN" \
+        --random-output-len "$COMPLETION_LEN" \
+        --num-prompts "$NUM_PROMPTS" \
+        --host "localhost" \
+        --port "$PORT" \
+        --endpoint "/v1/completions" \
+        --save-result \
+        --result-filename "$RESULT_FILE" \
+        --metadata "unified_start_time=$UNIFIED_START_TIME" \
+        --request-rate "$FIXED_RPS" \
+        --burstiness "$BURSTINESS" &
+
+    # Store PID and result file
+    PIDS+=($!)
+    RESULT_FILES+=("$RESULT_FILE")
+
+    echo "Started Model ${MODEL_INDEX} with PID ${PIDS[$i]}"
+done
+
+# Wait for all benchmarks to complete
+echo "Waiting for all benchmarks to complete..."
+EXIT_CODES=()
+
+for i in "${!PIDS[@]}"; do
+    wait ${PIDS[$i]}
+    EXIT_CODE=$?
+    EXIT_CODES+=($EXIT_CODE)
+    echo "Model $((i + 1)) benchmark exit code: $EXIT_CODE"
+done
+
+echo "All benchmarks completed!"
+echo "Results saved to:"
+for result_file in "${RESULT_FILES[@]}"; do
+    echo "  - $result_file"
+done
+
+# Summary of exit codes
+echo "Exit code summary:"
+for i in "${!EXIT_CODES[@]}"; do
+    echo "  Model $((i + 1)): ${EXIT_CODES[$i]}"
+done
\ No newline at end of file

From 49a285745024ca67f5a77a081adcaa274afacbeb Mon Sep 17 00:00:00 2001
From: cui36 <starlingcui.110@gmail.com>
Date: Sun, 2 Nov 2025 17:53:36 +0000
Subject: [PATCH 3/5] Remove .claude and ignore it

---
 .claude/settings.local.json | 10 ----------
 1 file changed, 10 deletions(-)
 delete mode 100644 .claude/settings.local.json

diff --git a/.claude/settings.local.json b/.claude/settings.local.json
deleted file mode 100644
index f126d354..00000000
--- a/.claude/settings.local.json
+++ /dev/null
@@ -1,10 +0,0 @@
-{
-  "permissions": {
-    "allow": [
-      "Bash(python:*)",
-      "Read(//root/.cache/vllm/**)"
-    ],
-    "deny": [],
-    "ask": []
-  }
-}

From 9725bfc27608ad954f3abf7dc6197a9701a76563 Mon Sep 17 00:00:00 2001
From: cui36 <starlingcui.110@gmail.com>
Date: Sun, 2 Nov 2025 18:26:41 +0000
Subject: [PATCH 4/5] try with llama3-8B

---
 benchmarks/bench_latency_benefit/bench-config.yaml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/benchmarks/bench_latency_benefit/bench-config.yaml b/benchmarks/bench_latency_benefit/bench-config.yaml
index ef46ef4f..4f917392 100644
--- a/benchmarks/bench_latency_benefit/bench-config.yaml
+++ b/benchmarks/bench_latency_benefit/bench-config.yaml
@@ -22,7 +22,7 @@ launch_delay_seconds: 30  # Delay between launching each instance
 
 instances: # instances configuration
   - name: instance1
-    model: Qwen/Qwen2-7B
+    model: meta-llama/Llama-3.1-8B-Instruct
     engine: vllm
     using_venv: true
     venv_path:  ../engine_integration/vllm-v0.9.2/.venv
@@ -44,7 +44,7 @@ instances: # instances configuration
       - "--max-num-batched-tokens=8192"
 
   - name: instance2
-    model: Qwen/Qwen2-7B
+    model: meta-llama/Llama-3.1-8B-Instruct
     engine: vllm
     using_venv: true
     venv_path:  ../engine_integration/vllm-v0.9.2/.venv
@@ -66,7 +66,7 @@ instances: # instances configuration
       - "--max-num-batched-tokens=8192"
 
   - name: instance3
-    model: Qwen/Qwen2-7B
+    model: meta-llama/Llama-3.1-8B-Instruct
     engine: vllm
     using_venv: true
     venv_path:  ../engine_integration/vllm-v0.9.2/.venv

From 39cb7c21377416d15afb21d92ca92ffa528fbd13 Mon Sep 17 00:00:00 2001
From: cui36 <starlingcui.110@gmail.com>
Date: Tue, 4 Nov 2025 04:00:56 +0000
Subject: [PATCH 5/5] adjust config to the issue

---
 .../bench_latency_benefit/bench-config.yaml       | 15 +++++++++------
 benchmarks/bench_latency_benefit/run.sh           |  9 ++++-----
 .../run_benchmark_fixed_rate.sh                   |  6 +++---
 3 files changed, 16 insertions(+), 14 deletions(-)

diff --git a/benchmarks/bench_latency_benefit/bench-config.yaml b/benchmarks/bench_latency_benefit/bench-config.yaml
index 4f917392..dc3ae7ed 100644
--- a/benchmarks/bench_latency_benefit/bench-config.yaml
+++ b/benchmarks/bench_latency_benefit/bench-config.yaml
@@ -22,7 +22,8 @@ launch_delay_seconds: 30  # Delay between launching each instance
 
 instances: # instances configuration
   - name: instance1
-    model: meta-llama/Llama-3.1-8B-Instruct
+    # model: meta-llama/Llama-3.1-8B-Instruct
+    model: Qwen/Qwen2-7B-Instruct
     engine: vllm
     using_venv: true
     venv_path:  ../engine_integration/vllm-v0.9.2/.venv
@@ -40,11 +41,12 @@ instances: # instances configuration
       - "--port=12346"
       - "--gpu-memory-utilization=0.31"
       - "--enable-sleep-mode"
-      - "--max-model-len=62000"
+      # - "--max-model-len=62000"
       - "--max-num-batched-tokens=8192"
 
   - name: instance2
-    model: meta-llama/Llama-3.1-8B-Instruct
+    # model: meta-llama/Llama-3.1-8B-Instruct
+    model: Qwen/Qwen2-7B-Instruct
     engine: vllm
     using_venv: true
     venv_path:  ../engine_integration/vllm-v0.9.2/.venv
@@ -62,11 +64,12 @@ instances: # instances configuration
       - "--port=30000"
       - "--gpu-memory-utilization=0.31"
       - "--enable-sleep-mode"
-      - "--max-model-len=62000"
+      # - "--max-model-len=62000"
       - "--max-num-batched-tokens=8192"
 
   - name: instance3
-    model: meta-llama/Llama-3.1-8B-Instruct
+    # model: meta-llama/Llama-3.1-8B-Instruct
+    model: Qwen/Qwen2-7B-Instruct
     engine: vllm
     using_venv: true
     venv_path:  ../engine_integration/vllm-v0.9.2/.venv
@@ -84,5 +87,5 @@ instances: # instances configuration
       - "--port=40000"
       - "--gpu-memory-utilization=0.31"
       - "--enable-sleep-mode"
-      - "--max-model-len=62000"
+      # - "--max-model-len=62000"
       - "--max-num-batched-tokens=8192"
\ No newline at end of file
diff --git a/benchmarks/bench_latency_benefit/run.sh b/benchmarks/bench_latency_benefit/run.sh
index f078e4a6..3c5bfe95 100755
--- a/benchmarks/bench_latency_benefit/run.sh
+++ b/benchmarks/bench_latency_benefit/run.sh
@@ -1,10 +1,9 @@
 # Configuration
-# Adjust num_prompts as needed (leave empty to use default calculation)
-num_prompts=2000
-prompt_len=4096  # Default prompt length
+num_prompts=300000
+prompt_len=512  # Default prompt length
 
-for max_rps in 1; do
-    for completion_len in 5; do
+for max_rps in 50; do
+    for completion_len in 1; do
 # for max_rps in 1 2 3 4 5 6 7 8 9 10 15 20 25 30 40; do
 #     for completion_len in 64 128 256; do
         ./run_benchmark_fixed_rate.sh $max_rps $completion_len $num_prompts "" "" "" $prompt_len
diff --git a/benchmarks/bench_latency_benefit/run_benchmark_fixed_rate.sh b/benchmarks/bench_latency_benefit/run_benchmark_fixed_rate.sh
index a8faced2..296453fc 100755
--- a/benchmarks/bench_latency_benefit/run_benchmark_fixed_rate.sh
+++ b/benchmarks/bench_latency_benefit/run_benchmark_fixed_rate.sh
@@ -42,9 +42,9 @@ mkdir -p results results/metrics
 
 # Define models and their configurations
 MODELS=(
-    "meta-llama/Llama-3.1-8B-Instruct:12346"
-    "meta-llama/Llama-3.1-8B-Instruct:30000"
-    "meta-llama/Llama-3.1-8B-Instruct:40000"
+    "Qwen/Qwen2-7B-Instruct:12346"
+    "Qwen/Qwen2-7B-Instruct:30000"
+    "Qwen/Qwen2-7B-Instruct:40000"
 )
 NUM_MODELS=${#MODELS[@]}