@@ -38,16 +38,12 @@ wait_for_server() {
 launch_chunked_prefill() {
   model="meta-llama/Meta-Llama-3.1-8B-Instruct"
   # chunked prefill
-  CUDA_VISIBLE_DEVICES=0 python3 \
-    -m vllm.entrypoints.openai.api_server \
-    --model $model \
+  CUDA_VISIBLE_DEVICES=0 vllm serve $model \
     --port 8100 \
     --max-model-len 10000 \
     --enable-chunked-prefill \
     --gpu-memory-utilization 0.6 &
-  CUDA_VISIBLE_DEVICES=1 python3 \
-    -m vllm.entrypoints.openai.api_server \
-    --model $model \
+  CUDA_VISIBLE_DEVICES=1 vllm serve $model \
     --port 8200 \
     --max-model-len 10000 \
     --enable-chunked-prefill \
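
Both hunks make the same mechanical swap: vllm serve is the CLI entry point for vLLM's OpenAI-compatible server and takes the model as a positional argument, so the three removed lines collapse into one. Roughly, with the remaining flags elided:

# Old module-style launch (removed above):
python3 -m vllm.entrypoints.openai.api_server --model $model --port 8100 ...
# New CLI launch (added above); all other flags carry over unchanged:
vllm serve $model --port 8100 ...
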
@@ -62,18 +58,14 @@ launch_chunked_prefill() {
 launch_disagg_prefill() {
   model="meta-llama/Meta-Llama-3.1-8B-Instruct"
   # disagg prefill
-  CUDA_VISIBLE_DEVICES=0 python3 \
-    -m vllm.entrypoints.openai.api_server \
-    --model $model \
+  CUDA_VISIBLE_DEVICES=0 vllm serve $model \
     --port 8100 \
     --max-model-len 10000 \
     --gpu-memory-utilization 0.6 \
     --kv-transfer-config \
     '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &

-  CUDA_VISIBLE_DEVICES=1 python3 \
-    -m vllm.entrypoints.openai.api_server \
-    --model $model \
+  CUDA_VISIBLE_DEVICES=1 vllm serve $model \
     --port 8200 \
     --max-model-len 10000 \
     --gpu-memory-utilization 0.6 \
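
Each launch function backgrounds its two servers with &, so the script relies on the wait_for_server helper (visible in the first hunk's header context) before sending traffic. A minimal smoke test, assuming wait_for_server takes a port number as in vLLM's disagg benchmark scripts, and noting that the full benchmark normally fronts the two instances with a proxy:

# Hedged sketch, not part of this diff: block until both instances answer,
# then send one test completion straight to the instance on port 8100.
wait_for_server 8100
wait_for_server 8200
curl -s http://localhost:8100/v1/completions \
  -H "Content-Type: application/json" \
  -d '{"model": "meta-llama/Meta-Llama-3.1-8B-Instruct", "prompt": "San Francisco is a", "max_tokens": 16}'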