3 changes: 2 additions & 1 deletion .cd/server/server_output.env
@@ -6,7 +6,6 @@ MAX_MODEL_LEN
 TOTAL_GPU_MEM
 MODEL_DTYPE
 QUANT_DTYPE
-fsdpa
 BLOCK_SIZE
 VLLM_PROMPT_BS_BUCKET_MIN
 VLLM_PROMPT_BS_BUCKET_STEP
@@ -57,3 +56,5 @@ MAX_NUM_SEQS
 VLLM_PROMPT_SEQ_BUCKET_MAX
 VLLM_CONTIGUOUS_PA
 VLLM_DEFRAG
+ASYNC_SCHEDULING
+VLLM_WEIGHT_LOAD_FORCE_SYNC
1 change: 1 addition & 0 deletions .cd/server/server_user.env
@@ -10,3 +10,4 @@ MAX_NUM_SEQS
 TENSOR_PARALLEL_SIZE
 VLLM_EXPONENTIAL_BUCKETING
 GPU_MEM_UTILIZATION
+ASYNC_SCHEDULING
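
For orientation, a minimal usage sketch, not part of this diff (assumption: variables listed in server_user.env are read from the caller's environment when the server template is rendered, and any value greater than 0 enables the feature; the value below is illustrative only):

export ASYNC_SCHEDULING=1   # assumed example; makes the template append --async_scheduling
# Leave it unset or set it to 0 to keep the flag off.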
38 changes: 19 additions & 19 deletions .cd/server/settings_vllm.csv
@@ -1,19 +1,19 @@
MODEL,TENSOR_PARALLEL_SIZE,MAX_MODEL_LEN,TOTAL_GPU_MEM,UNAVAILABLE_MEM_ABS,MODEL_MEM_FROM_CONFIG,MODEL_DTYPE,QUANT_DTYPE,MODEL_MEM,PROFILER_MEM_OVERHEAD,APPROX_MEM_PER_GRAPH_MB,fsdpa,GPU_FREE_MEM_TARGET,BLOCK_SIZE,VLLM_PROMPT_BS_BUCKET_MIN,VLLM_PROMPT_BS_BUCKET_STEP,VLLM_DECODE_BS_BUCKET_MIN,VLLM_DECODE_BS_BUCKET_STEP,VLLM_PROMPT_SEQ_BUCKET_MIN,VLLM_PROMPT_SEQ_BUCKET_STEP,VLLM_DECODE_BLOCK_BUCKET_MIN,VLLM_DECODE_BLOCK_BUCKET_STEP,MAX_NUM_PREFILL_SEQS,NUM_HIDDEN_LAYERS,HIDDEN_SIZE,NUM_KEY_VALUE_HEADS,NUM_ATTENTION_HEADS,CACHE_DTYPE_BYTES,LIMIT_MODEL_LEN,PT_HPU_LAZY_MODE,VLLM_DELAYED_SAMPLING,VLLM_SKIP_WARMUP,EXPERIMENTAL_WEIGHT_SHARING,VLLM_EXPONENTIAL_BUCKETING,MAX_NUM_BATCHED_TOKENS,VLLM_CONTIGUOUS_PA,VLLM_DEFRAG
meta-llama/Llama-3.1-8B-Instruct,1,4352,128,2,16060522496,2,2,14.95752716,5.5,10,1,1,128,1,32,1,32,128,256,128,256,1,32,4096,8,32,2,131072,1,TRUE,FALSE,0,FALSE,2048,true,true
meta-llama/Llama-3.1-70B-Instruct,4,4352,512,2,1.41107E+11,2,2,131.4165192,5.5,20,1,1,128,1,32,1,32,128,256,128,256,1,80,8192,8,64,2,131072,1,TRUE,FALSE,0,FALSE,2048,true,true
meta-llama/Llama-3.3-70B-Instruct,4,4352,512,2,1.41107E+11,2,2,131.4165192,5.5,20,1,1,128,1,32,1,32,128,256,128,256,1,80,8192,8,64,2,131072,1,TRUE,FALSE,0,FALSE,2048,true,true
meta-llama/Llama-3.2-1B-Instruct,1,4352,128,2,2471645608,2,2,2.301899351,5.5,10,1,1,128,1,32,1,32,128,256,128,256,1,16,2048,8,32,2,131072,1,TRUE,FALSE,0,FALSE,2048,true,true
meta-llama/Llama-3.2-3B-Instruct,1,4352,128,2,6425499648,2,2,5.984212875,5.5,10,1,1,128,1,32,1,32,128,256,128,256,1,28,3072,8,24,2,131072,1,TRUE,FALSE,0,FALSE,2048,true,true
mistralai/Mixtral-8x7B-Instruct-v0.1,2,4352,256,2,93405585408,2,2,86.99073029,5.5,10,1,1,128,1,32,1,32,128,256,128,256,1,32,4096,8,32,2,32768,1,TRUE,FALSE,0,FALSE,2048,false,false
mistralai/Mixtral-8x22B-Instruct-v0.1,4,4352,512,2,2.8126E+11,2,2,261.9439201,5.5,10,1,1,128,1,32,1,32,128,256,128,256,1,56,6144,8,48,2,65536,1,TRUE,FALSE,0,FALSE,2048,false,false
mistralai/Mistral-7B-Instruct-v0.2,1,4352,128,2,14483464192,2,2,13.48877716,5.5,10,1,1,128,1,32,1,32,128,256,128,256,1,32,4096,8,32,2,32768,1,TRUE,FALSE,0,FALSE,2048,false,false
meta-llama/Llama-3.1-405B-Instruct,8,4352,1024,2,8.11707E+11,2,2,755.9608459,5.5,10,1,1,128,1,32,1,32,128,256,128,256,1,126,16384,8,128,2,131072,1,TRUE,FALSE,0,FALSE,2048,false,false
Qwen/Qwen2.5-14B-Instruct,1,4352,128,2,29540067328,2,2,27.51133156,5.5,10,0,3,128,1,32,1,32,128,256,128,256,1,48,5120,8,40,2,32768,1,TRUE,FALSE,0,FALSE,2048,false,false
deepseek-ai/DeepSeek-R1-Distill-Llama-70B,4,4352,512,2,1.41107E+11,2,2,131.4165192,5.5,20,1,1,128,1,32,1,32,128,256,128,256,1,80,8192,8,64,2,131072,1,TRUE,FALSE,0,FALSE,2048,true,true
Qwen/Qwen2.5-32B-Instruct,1,4352,128,2,65527752704,2,2,61.02747536,5.5,10,1,1,128,1,32,1,32,128,256,128,256,1,64,5120,8,40,2,32768,1,TRUE,FALSE,0,FALSE,2048,false,false
Qwen/Qwen2.5-72B-Instruct,4,4352,512,2,1.45412E+11,2,2,135.4258575,5.5,10,0,3,128,1,32,1,32,128,256,128,256,1,80,8192,8,64,2,32768,1,TRUE,FALSE,0,FALSE,2048,false,false
Qwen/Qwen2.5-7B-Instruct,1,4352,128,2,15231233024,2,2,14.18519115,5.5,10,0,3,128,1,32,1,32,128,256,128,256,1,28,3584,4,28,2,32768,1,TRUE,FALSE,0,FALSE,2048,false,false
Qwen/Qwen2.5-32B-Instruct,1,4352,128,2,65527752704,2,2,61.02747536,5.5,10,0,3,128,1,32,1,32,128,256,128,256,1,64,5120,8,40,2,32768,1,TRUE,FALSE,0,FALSE,2048,false,false
ibm-granite/granite-8b-code-instruct-4k,1,4096,128,2,21474836480,2,2,20.00000000,5.5,10,0,3,128,1,32,1,32,128,256,128,256,1,36,4096,8,32,2,32768,1,TRUE,FALSE,0,FALSE,2048,true,true
ibm-granite/granite-20b-code-instruct-8k,1,4352,128,2,53687091200,2,2,48.00000000,5.5,10,0,3,128,1,32,1,32,128,256,128,256,1,52,6144,1,48,2,65536,1,TRUE,FALSE,0,FALSE,2048,true,true
Qwen/Qwen2.5-VL-7B-Instruct,1,8448,128,2,15231233024,2,2,14.18519115,5.5,10,0,3,128,1,32,1,32,128,256,128,256,1,28,3584,4,28,2,32768,1,TRUE,FALSE,0,FALSE,2048,false,false
MODEL,TENSOR_PARALLEL_SIZE,MAX_MODEL_LEN,TOTAL_GPU_MEM,UNAVAILABLE_MEM_ABS,MODEL_MEM_FROM_CONFIG,MODEL_DTYPE,QUANT_DTYPE,MODEL_MEM,PROFILER_MEM_OVERHEAD,APPROX_MEM_PER_GRAPH_MB,fsdpa,GPU_FREE_MEM_TARGET,BLOCK_SIZE,VLLM_PROMPT_BS_BUCKET_MIN,VLLM_PROMPT_BS_BUCKET_STEP,VLLM_DECODE_BS_BUCKET_MIN,VLLM_DECODE_BS_BUCKET_STEP,VLLM_PROMPT_SEQ_BUCKET_MIN,VLLM_PROMPT_SEQ_BUCKET_STEP,VLLM_DECODE_BLOCK_BUCKET_MIN,VLLM_DECODE_BLOCK_BUCKET_STEP,MAX_NUM_PREFILL_SEQS,NUM_HIDDEN_LAYERS,HIDDEN_SIZE,NUM_KEY_VALUE_HEADS,NUM_ATTENTION_HEADS,CACHE_DTYPE_BYTES,LIMIT_MODEL_LEN,PT_HPU_LAZY_MODE,VLLM_DELAYED_SAMPLING,VLLM_SKIP_WARMUP,EXPERIMENTAL_WEIGHT_SHARING,VLLM_EXPONENTIAL_BUCKETING,MAX_NUM_BATCHED_TOKENS,VLLM_CONTIGUOUS_PA,VLLM_DEFRAG,ASYNC_SCHEDULING,VLLM_WEIGHT_LOAD_FORCE_SYNC
meta-llama/Llama-3.1-8B-Instruct,1,4352,128,2,16060522496,2,2,14.95752716,5.5,10,1,1,128,1,32,1,32,128,256,128,256,1,32,4096,8,32,2,131072,1,TRUE,FALSE,0,FALSE,2048,true,true,1,0
meta-llama/Llama-3.1-70B-Instruct,4,4352,512,2,1.41107E+11,2,2,131.4165192,5.5,20,1,1,128,1,32,1,32,128,256,128,256,1,80,8192,8,64,2,131072,1,TRUE,FALSE,0,FALSE,2048,true,true,1,0
meta-llama/Llama-3.3-70B-Instruct,4,4352,512,2,1.41107E+11,2,2,131.4165192,5.5,20,1,1,128,1,32,1,32,128,256,128,256,1,80,8192,8,64,2,131072,1,TRUE,FALSE,0,FALSE,2048,true,true,1,0
meta-llama/Llama-3.2-1B-Instruct,1,4352,128,2,2471645608,2,2,2.301899351,5.5,10,1,1,128,1,32,1,32,128,256,128,256,1,16,2048,8,32,2,131072,1,TRUE,FALSE,0,FALSE,2048,true,true,1,0
meta-llama/Llama-3.2-3B-Instruct,1,4352,128,2,6425499648,2,2,5.984212875,5.5,10,1,1,128,1,32,1,32,128,256,128,256,1,28,3072,8,24,2,131072,1,TRUE,FALSE,0,FALSE,2048,true,true,1,0
mistralai/Mixtral-8x7B-Instruct-v0.1,2,4352,256,2,93405585408,2,2,86.99073029,5.5,10,1,1,128,1,32,1,32,128,256,128,256,1,32,4096,8,32,2,32768,1,TRUE,FALSE,0,FALSE,2048,false,false,1,0
mistralai/Mixtral-8x22B-Instruct-v0.1,4,4352,512,2,2.8126E+11,2,2,261.9439201,5.5,10,1,1,128,1,32,1,32,128,256,128,256,1,56,6144,8,48,2,65536,1,TRUE,FALSE,0,FALSE,2048,false,false,1,0
mistralai/Mistral-7B-Instruct-v0.2,1,4352,128,2,14483464192,2,2,13.48877716,5.5,10,1,1,128,1,32,1,32,128,256,128,256,1,32,4096,8,32,2,32768,1,TRUE,FALSE,0,FALSE,2048,false,false,1,0
meta-llama/Llama-3.1-405B-Instruct,8,4352,1024,2,8.11707E+11,2,2,755.9608459,5.5,20,1,1,128,1,32,1,32,128,256,128,256,1,126,16384,8,128,2,131072,1,TRUE,FALSE,0,FALSE,2048,true,true,1,1
Qwen/Qwen2.5-14B-Instruct,1,4352,128,2,29540067328,2,2,27.51133156,5.5,10,0,3,128,1,32,1,32,128,256,128,256,1,48,5120,8,40,2,32768,1,TRUE,FALSE,0,FALSE,2048,false,false,1,0
deepseek-ai/DeepSeek-R1-Distill-Llama-70B,4,4352,512,2,1.41107E+11,2,2,131.4165192,5.5,20,1,1,128,1,32,1,32,128,256,128,256,1,80,8192,8,64,2,131072,1,TRUE,FALSE,0,FALSE,2048,true,true,1,0
Qwen/Qwen2.5-32B-Instruct,1,4352,128,2,65527752704,2,2,61.02747536,5.5,10,1,1,128,1,32,1,32,128,256,128,256,1,64,5120,8,40,2,32768,1,TRUE,FALSE,0,FALSE,2048,false,false,1,0
Qwen/Qwen2.5-72B-Instruct,4,4352,512,2,1.45412E+11,2,2,135.4258575,5.5,10,0,3,128,1,32,1,32,128,256,128,256,1,80,8192,8,64,2,32768,1,TRUE,FALSE,0,FALSE,2048,false,false,1,0
Qwen/Qwen2.5-7B-Instruct,1,4352,128,2,15231233024,2,2,14.18519115,5.5,10,0,3,128,1,32,1,32,128,256,128,256,1,28,3584,4,28,2,32768,1,TRUE,FALSE,0,FALSE,2048,false,false,1,0
Qwen/Qwen2.5-32B-Instruct,1,4352,128,2,65527752704,2,2,61.02747536,5.5,10,0,3,128,1,32,1,32,128,256,128,256,1,64,5120,8,40,2,32768,1,TRUE,FALSE,0,FALSE,2048,false,false,1,0
ibm-granite/granite-8b-code-instruct-4k,1,4096,128,2,21474836480,2,2,20.00000000,5.5,10,0,3,128,1,32,1,32,128,256,128,256,1,36,4096,8,32,2,32768,1,TRUE,FALSE,0,FALSE,2048,true,true,1,0
ibm-granite/granite-20b-code-instruct-8k,1,4352,128,2,53687091200,2,2,48.00000000,5.5,10,0,3,128,1,32,1,32,128,256,128,256,1,52,6144,1,48,2,65536,1,TRUE,FALSE,0,FALSE,2048,true,true,1,0
Qwen/Qwen2.5-VL-7B-Instruct,1,8448,128,2,15231233024,2,2,14.18519115,5.5,10,0,3,128,1,32,1,32,128,256,128,256,1,28,3584,4,28,2,32768,1,TRUE,FALSE,0,FALSE,2048,false,false,1,0
7 changes: 5 additions & 2 deletions .cd/templates/template_vllm_server.sh
@@ -2,6 +2,10 @@
 
 #@VARS
 
+if [ $ASYNC_SCHEDULING -gt 0 ]; then # Checks if using async scheduling
+    EXTRA_ARGS+=" --async_scheduling"
+fi
+
 ## Start server
 vllm serve $MODEL \
 --block-size $BLOCK_SIZE \
@@ -11,8 +15,7 @@ vllm serve $MODEL \
 --max-model-len $MAX_MODEL_LEN \
 --gpu-memory-utilization $GPU_MEM_UTILIZATION \
 --max-num-seqs $MAX_NUM_SEQS \
---async_scheduling \
 --generation-config vllm \
 --max_num_batched_tokens $MAX_NUM_BATCHED_TOKENS \
---disable-log-requests \
+--disable-log-requests ${EXTRA_ARGS} \
 2>&1 | tee -a logs/vllm_server.log
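
Taken together, the change means --async_scheduling is no longer passed unconditionally; it is appended via EXTRA_ARGS only when ASYNC_SCHEDULING is greater than 0. Below is a hedged sketch of what the rendered script is expected to look like after this PR; the model name and values are illustrative assumptions, not taken from the diff:

#!/bin/bash
# Illustrative values; in the .cd flow these come from server_user.env / settings_vllm.csv.
MODEL=meta-llama/Llama-3.1-8B-Instruct
MAX_NUM_SEQS=128
ASYNC_SCHEDULING=1

EXTRA_ARGS=""
if [ "${ASYNC_SCHEDULING:-0}" -gt 0 ]; then   # defaulting to 0 here is an editorial safeguard
    EXTRA_ARGS+=" --async_scheduling"
fi

vllm serve $MODEL \
    --max-num-seqs $MAX_NUM_SEQS \
    --disable-log-requests ${EXTRA_ARGS} \
    2>&1 | tee -a logs/vllm_server.log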