From 5277da0a7ad4038ac47f5700ff462dface0836d6 Mon Sep 17 00:00:00 2001 From: Neelesh Gokhale Date: Wed, 1 Oct 2025 03:40:41 +0000 Subject: [PATCH] Fix Llama 405B docker loading Signed-off-by: Neelesh Gokhale --- .cd/server/server_output.env | 3 ++- .cd/server/server_user.env | 1 + .cd/server/settings_vllm.csv | 38 +++++++++++++-------------- .cd/templates/template_vllm_server.sh | 7 +++-- 4 files changed, 27 insertions(+), 22 deletions(-) diff --git a/.cd/server/server_output.env b/.cd/server/server_output.env index a4b9efab..dccdef0a 100644 --- a/.cd/server/server_output.env +++ b/.cd/server/server_output.env @@ -6,7 +6,6 @@ MAX_MODEL_LEN TOTAL_GPU_MEM MODEL_DTYPE QUANT_DTYPE -fsdpa BLOCK_SIZE VLLM_PROMPT_BS_BUCKET_MIN VLLM_PROMPT_BS_BUCKET_STEP @@ -57,3 +56,5 @@ MAX_NUM_SEQS VLLM_PROMPT_SEQ_BUCKET_MAX VLLM_CONTIGUOUS_PA VLLM_DEFRAG +ASYNC_SCHEDULING +VLLM_WEIGHT_LOAD_FORCE_SYNC diff --git a/.cd/server/server_user.env b/.cd/server/server_user.env index 8d166427..3dd52ba0 100644 --- a/.cd/server/server_user.env +++ b/.cd/server/server_user.env @@ -10,3 +10,4 @@ MAX_NUM_SEQS TENSOR_PARALLEL_SIZE VLLM_EXPONENTIAL_BUCKETING GPU_MEM_UTILIZATION +ASYNC_SCHEDULING diff --git a/.cd/server/settings_vllm.csv b/.cd/server/settings_vllm.csv index d6d52438..bb677487 100644 --- a/.cd/server/settings_vllm.csv +++ b/.cd/server/settings_vllm.csv @@ -1,19 +1,19 @@ 
-MODEL,TENSOR_PARALLEL_SIZE,MAX_MODEL_LEN,TOTAL_GPU_MEM,UNAVAILABLE_MEM_ABS,MODEL_MEM_FROM_CONFIG,MODEL_DTYPE,QUANT_DTYPE,MODEL_MEM,PROFILER_MEM_OVERHEAD,APPROX_MEM_PER_GRAPH_MB,fsdpa,GPU_FREE_MEM_TARGET,BLOCK_SIZE,VLLM_PROMPT_BS_BUCKET_MIN,VLLM_PROMPT_BS_BUCKET_STEP,VLLM_DECODE_BS_BUCKET_MIN,VLLM_DECODE_BS_BUCKET_STEP,VLLM_PROMPT_SEQ_BUCKET_MIN,VLLM_PROMPT_SEQ_BUCKET_STEP,VLLM_DECODE_BLOCK_BUCKET_MIN,VLLM_DECODE_BLOCK_BUCKET_STEP,MAX_NUM_PREFILL_SEQS,NUM_HIDDEN_LAYERS,HIDDEN_SIZE,NUM_KEY_VALUE_HEADS,NUM_ATTENTION_HEADS,CACHE_DTYPE_BYTES,LIMIT_MODEL_LEN,PT_HPU_LAZY_MODE,VLLM_DELAYED_SAMPLING,VLLM_SKIP_WARMUP,EXPERIMENTAL_WEIGHT_SHARING,VLLM_EXPONENTIAL_BUCKETING,MAX_NUM_BATCHED_TOKENS,VLLM_CONTIGUOUS_PA,VLLM_DEFRAG -meta-llama/Llama-3.1-8B-Instruct,1,4352,128,2,16060522496,2,2,14.95752716,5.5,10,1,1,128,1,32,1,32,128,256,128,256,1,32,4096,8,32,2,131072,1,TRUE,FALSE,0,FALSE,2048,true,true -meta-llama/Llama-3.1-70B-Instruct,4,4352,512,2,1.41107E+11,2,2,131.4165192,5.5,20,1,1,128,1,32,1,32,128,256,128,256,1,80,8192,8,64,2,131072,1,TRUE,FALSE,0,FALSE,2048,true,true -meta-llama/Llama-3.3-70B-Instruct,4,4352,512,2,1.41107E+11,2,2,131.4165192,5.5,20,1,1,128,1,32,1,32,128,256,128,256,1,80,8192,8,64,2,131072,1,TRUE,FALSE,0,FALSE,2048,true,true -meta-llama/Llama-3.2-1B-Instruct,1,4352,128,2,2471645608,2,2,2.301899351,5.5,10,1,1,128,1,32,1,32,128,256,128,256,1,16,2048,8,32,2,131072,1,TRUE,FALSE,0,FALSE,2048,true,true -meta-llama/Llama-3.2-3B-Instruct,1,4352,128,2,6425499648,2,2,5.984212875,5.5,10,1,1,128,1,32,1,32,128,256,128,256,1,28,3072,8,24,2,131072,1,TRUE,FALSE,0,FALSE,2048,true,true -mistralai/Mixtral-8x7B-Instruct-v0.1,2,4352,256,2,93405585408,2,2,86.99073029,5.5,10,1,1,128,1,32,1,32,128,256,128,256,1,32,4096,8,32,2,32768,1,TRUE,FALSE,0,FALSE,2048,false,false -mistralai/Mixtral-8x22B-Instruct-v0.1,4,4352,512,2,2.8126E+11,2,2,261.9439201,5.5,10,1,1,128,1,32,1,32,128,256,128,256,1,56,6144,8,48,2,65536,1,TRUE,FALSE,0,FALSE,2048,false,false 
-mistralai/Mistral-7B-Instruct-v0.2,1,4352,128,2,14483464192,2,2,13.48877716,5.5,10,1,1,128,1,32,1,32,128,256,128,256,1,32,4096,8,32,2,32768,1,TRUE,FALSE,0,FALSE,2048,false,false -meta-llama/Llama-3.1-405B-Instruct,8,4352,1024,2,8.11707E+11,2,2,755.9608459,5.5,10,1,1,128,1,32,1,32,128,256,128,256,1,126,16384,8,128,2,131072,1,TRUE,FALSE,0,FALSE,2048,false,false -Qwen/Qwen2.5-14B-Instruct,1,4352,128,2,29540067328,2,2,27.51133156,5.5,10,0,3,128,1,32,1,32,128,256,128,256,1,48,5120,8,40,2,32768,1,TRUE,FALSE,0,FALSE,2048,false,false -deepseek-ai/DeepSeek-R1-Distill-Llama-70B,4,4352,512,2,1.41107E+11,2,2,131.4165192,5.5,20,1,1,128,1,32,1,32,128,256,128,256,1,80,8192,8,64,2,131072,1,TRUE,FALSE,0,FALSE,2048,true,true -Qwen/Qwen2.5-32B-Instruct,1,4352,128,2,65527752704,2,2,61.02747536,5.5,10,1,1,128,1,32,1,32,128,256,128,256,1,64,5120,8,40,2,32768,1,TRUE,FALSE,0,FALSE,2048,false,false -Qwen/Qwen2.5-72B-Instruct,4,4352,512,2,1.45412E+11,2,2,135.4258575,5.5,10,0,3,128,1,32,1,32,128,256,128,256,1,80,8192,8,64,2,32768,1,TRUE,FALSE,0,FALSE,2048,false,false -Qwen/Qwen2.5-7B-Instruct,1,4352,128,2,15231233024,2,2,14.18519115,5.5,10,0,3,128,1,32,1,32,128,256,128,256,1,28,3584,4,28,2,32768,1,TRUE,FALSE,0,FALSE,2048,false,false -Qwen/Qwen2.5-32B-Instruct,1,4352,128,2,65527752704,2,2,61.02747536,5.5,10,0,3,128,1,32,1,32,128,256,128,256,1,64,5120,8,40,2,32768,1,TRUE,FALSE,0,FALSE,2048,false,false -ibm-granite/granite-8b-code-instruct-4k,1,4096,128,2,21474836480,2,2,20.00000000,5.5,10,0,3,128,1,32,1,32,128,256,128,256,1,36,4096,8,32,2,32768,1,TRUE,FALSE,0,FALSE,2048,true,true -ibm-granite/granite-20b-code-instruct-8k,1,4352,128,2,53687091200,2,2,48.00000000,5.5,10,0,3,128,1,32,1,32,128,256,128,256,1,52,6144,1,48,2,65536,1,TRUE,FALSE,0,FALSE,2048,true,true -Qwen/Qwen2.5-VL-7B-Instruct,1,8448,128,2,15231233024,2,2,14.18519115,5.5,10,0,3,128,1,32,1,32,128,256,128,256,1,28,3584,4,28,2,32768,1,TRUE,FALSE,0,FALSE,2048,false,false 
+MODEL,TENSOR_PARALLEL_SIZE,MAX_MODEL_LEN,TOTAL_GPU_MEM,UNAVAILABLE_MEM_ABS,MODEL_MEM_FROM_CONFIG,MODEL_DTYPE,QUANT_DTYPE,MODEL_MEM,PROFILER_MEM_OVERHEAD,APPROX_MEM_PER_GRAPH_MB,fsdpa,GPU_FREE_MEM_TARGET,BLOCK_SIZE,VLLM_PROMPT_BS_BUCKET_MIN,VLLM_PROMPT_BS_BUCKET_STEP,VLLM_DECODE_BS_BUCKET_MIN,VLLM_DECODE_BS_BUCKET_STEP,VLLM_PROMPT_SEQ_BUCKET_MIN,VLLM_PROMPT_SEQ_BUCKET_STEP,VLLM_DECODE_BLOCK_BUCKET_MIN,VLLM_DECODE_BLOCK_BUCKET_STEP,MAX_NUM_PREFILL_SEQS,NUM_HIDDEN_LAYERS,HIDDEN_SIZE,NUM_KEY_VALUE_HEADS,NUM_ATTENTION_HEADS,CACHE_DTYPE_BYTES,LIMIT_MODEL_LEN,PT_HPU_LAZY_MODE,VLLM_DELAYED_SAMPLING,VLLM_SKIP_WARMUP,EXPERIMENTAL_WEIGHT_SHARING,VLLM_EXPONENTIAL_BUCKETING,MAX_NUM_BATCHED_TOKENS,VLLM_CONTIGUOUS_PA,VLLM_DEFRAG,ASYNC_SCHEDULING,VLLM_WEIGHT_LOAD_FORCE_SYNC +meta-llama/Llama-3.1-8B-Instruct,1,4352,128,2,16060522496,2,2,14.95752716,5.5,10,1,1,128,1,32,1,32,128,256,128,256,1,32,4096,8,32,2,131072,1,TRUE,FALSE,0,FALSE,2048,true,true,1,0 +meta-llama/Llama-3.1-70B-Instruct,4,4352,512,2,1.41107E+11,2,2,131.4165192,5.5,20,1,1,128,1,32,1,32,128,256,128,256,1,80,8192,8,64,2,131072,1,TRUE,FALSE,0,FALSE,2048,true,true,1,0 +meta-llama/Llama-3.3-70B-Instruct,4,4352,512,2,1.41107E+11,2,2,131.4165192,5.5,20,1,1,128,1,32,1,32,128,256,128,256,1,80,8192,8,64,2,131072,1,TRUE,FALSE,0,FALSE,2048,true,true,1,0 +meta-llama/Llama-3.2-1B-Instruct,1,4352,128,2,2471645608,2,2,2.301899351,5.5,10,1,1,128,1,32,1,32,128,256,128,256,1,16,2048,8,32,2,131072,1,TRUE,FALSE,0,FALSE,2048,true,true,1,0 +meta-llama/Llama-3.2-3B-Instruct,1,4352,128,2,6425499648,2,2,5.984212875,5.5,10,1,1,128,1,32,1,32,128,256,128,256,1,28,3072,8,24,2,131072,1,TRUE,FALSE,0,FALSE,2048,true,true,1,0 +mistralai/Mixtral-8x7B-Instruct-v0.1,2,4352,256,2,93405585408,2,2,86.99073029,5.5,10,1,1,128,1,32,1,32,128,256,128,256,1,32,4096,8,32,2,32768,1,TRUE,FALSE,0,FALSE,2048,false,false,1,0 
+mistralai/Mixtral-8x22B-Instruct-v0.1,4,4352,512,2,2.8126E+11,2,2,261.9439201,5.5,10,1,1,128,1,32,1,32,128,256,128,256,1,56,6144,8,48,2,65536,1,TRUE,FALSE,0,FALSE,2048,false,false,1,0 +mistralai/Mistral-7B-Instruct-v0.2,1,4352,128,2,14483464192,2,2,13.48877716,5.5,10,1,1,128,1,32,1,32,128,256,128,256,1,32,4096,8,32,2,32768,1,TRUE,FALSE,0,FALSE,2048,false,false,1,0 +meta-llama/Llama-3.1-405B-Instruct,8,4352,1024,2,8.11707E+11,2,2,755.9608459,5.5,20,1,1,128,1,32,1,32,128,256,128,256,1,126,16384,8,128,2,131072,1,TRUE,FALSE,0,FALSE,2048,true,true,1,1 +Qwen/Qwen2.5-14B-Instruct,1,4352,128,2,29540067328,2,2,27.51133156,5.5,10,0,3,128,1,32,1,32,128,256,128,256,1,48,5120,8,40,2,32768,1,TRUE,FALSE,0,FALSE,2048,false,false,1,0 +deepseek-ai/DeepSeek-R1-Distill-Llama-70B,4,4352,512,2,1.41107E+11,2,2,131.4165192,5.5,20,1,1,128,1,32,1,32,128,256,128,256,1,80,8192,8,64,2,131072,1,TRUE,FALSE,0,FALSE,2048,true,true,1,0 +Qwen/Qwen2.5-32B-Instruct,1,4352,128,2,65527752704,2,2,61.02747536,5.5,10,1,1,128,1,32,1,32,128,256,128,256,1,64,5120,8,40,2,32768,1,TRUE,FALSE,0,FALSE,2048,false,false,1,0 +Qwen/Qwen2.5-72B-Instruct,4,4352,512,2,1.45412E+11,2,2,135.4258575,5.5,10,0,3,128,1,32,1,32,128,256,128,256,1,80,8192,8,64,2,32768,1,TRUE,FALSE,0,FALSE,2048,false,false,1,0 +Qwen/Qwen2.5-7B-Instruct,1,4352,128,2,15231233024,2,2,14.18519115,5.5,10,0,3,128,1,32,1,32,128,256,128,256,1,28,3584,4,28,2,32768,1,TRUE,FALSE,0,FALSE,2048,false,false,1,0 +Qwen/Qwen2.5-32B-Instruct,1,4352,128,2,65527752704,2,2,61.02747536,5.5,10,0,3,128,1,32,1,32,128,256,128,256,1,64,5120,8,40,2,32768,1,TRUE,FALSE,0,FALSE,2048,false,false,1,0 +ibm-granite/granite-8b-code-instruct-4k,1,4096,128,2,21474836480,2,2,20.00000000,5.5,10,0,3,128,1,32,1,32,128,256,128,256,1,36,4096,8,32,2,32768,1,TRUE,FALSE,0,FALSE,2048,true,true,1,0 +ibm-granite/granite-20b-code-instruct-8k,1,4352,128,2,53687091200,2,2,48.00000000,5.5,10,0,3,128,1,32,1,32,128,256,128,256,1,52,6144,1,48,2,65536,1,TRUE,FALSE,0,FALSE,2048,true,true,1,0 
+Qwen/Qwen2.5-VL-7B-Instruct,1,8448,128,2,15231233024,2,2,14.18519115,5.5,10,0,3,128,1,32,1,32,128,256,128,256,1,28,3584,4,28,2,32768,1,TRUE,FALSE,0,FALSE,2048,false,false,1,0 diff --git a/.cd/templates/template_vllm_server.sh b/.cd/templates/template_vllm_server.sh index 503d5ded..b6db4e8c 100644 --- a/.cd/templates/template_vllm_server.sh +++ b/.cd/templates/template_vllm_server.sh @@ -2,6 +2,10 @@ #@VARS +if [ "${ASYNC_SCHEDULING:-0}" -gt 0 ]; then # Checks if using async scheduling + EXTRA_ARGS+=" --async_scheduling" +fi + ## Start server vllm serve $MODEL \ --block-size $BLOCK_SIZE \ @@ -11,8 +15,7 @@ vllm serve $MODEL \ --max-model-len $MAX_MODEL_LEN \ --gpu-memory-utilization $GPU_MEM_UTILIZATION \ --max-num-seqs $MAX_NUM_SEQS \ - --async_scheduling \ --generation-config vllm \ --max_num_batched_tokens $MAX_NUM_BATCHED_TOKENS \ - --disable-log-requests \ + --disable-log-requests ${EXTRA_ARGS} \ 2>&1 | tee -a logs/vllm_server.log