@@ -38,16 +38,12 @@ wait_for_server() {
3838launch_chunked_prefill () {
3939	model="meta-llama/Meta-Llama-3.1-8B-Instruct"
4040	# chunked prefill
41- CUDA_VISIBLE_DEVICES=0 python3 \
42- -m vllm.entrypoints.openai.api_server \
43- --model $model \
41+ CUDA_VISIBLE_DEVICES=0 vllm serve $model \
4442 --port 8100 \
4543 --max-model-len 10000 \
4644 --enable-chunked-prefill \
4745 --gpu-memory-utilization 0.6 &
48- CUDA_VISIBLE_DEVICES=1 python3 \
49- -m vllm.entrypoints.openai.api_server \
50- --model $model \
46+ CUDA_VISIBLE_DEVICES=1 vllm serve $model \
5147 --port 8200 \
5248 --max-model-len 10000 \
5349 --enable-chunked-prefill \
@@ -62,18 +58,14 @@ launch_chunked_prefill() {
6258launch_disagg_prefill () {
6359	model="meta-llama/Meta-Llama-3.1-8B-Instruct"
6460 # disagg prefill
65- CUDA_VISIBLE_DEVICES=0 python3 \
66- -m vllm.entrypoints.openai.api_server \
67- --model $model \
61+ CUDA_VISIBLE_DEVICES=0 vllm serve $model \
6862 --port 8100 \
6963 --max-model-len 10000 \
7064 --gpu-memory-utilization 0.6 \
7165 --kv-transfer-config \
7266 ' {"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
7367
74- CUDA_VISIBLE_DEVICES=1 python3 \
75- -m vllm.entrypoints.openai.api_server \
76- --model $model \
68+ CUDA_VISIBLE_DEVICES=1 vllm serve $model \
7769 --port 8200 \
7870 --max-model-len 10000 \
7971 --gpu-memory-utilization 0.6 \