@@ -38,16 +38,12 @@ wait_for_server() {
 launch_chunked_prefill() {
     model="meta-llama/Meta-Llama-3.1-8B-Instruct"
     # chunked prefill
-    CUDA_VISIBLE_DEVICES=0 python3 \
-        -m vllm.entrypoints.openai.api_server \
-        --model $model \
+    CUDA_VISIBLE_DEVICES=0 vllm serve $model \
         --port 8100 \
         --max-model-len 10000 \
         --enable-chunked-prefill \
         --gpu-memory-utilization 0.6 &
-    CUDA_VISIBLE_DEVICES=1 python3 \
-        -m vllm.entrypoints.openai.api_server \
-        --model $model \
+    CUDA_VISIBLE_DEVICES=1 vllm serve $model \
         --port 8200 \
         --max-model-len 10000 \
         --enable-chunked-prefill \
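The edit above replaces the module-style launch with the vllm CLI. Both forms start the same OpenAI-compatible API server; vllm serve simply takes the model as a positional argument instead of --model. A minimal side-by-side, reusing the model and port from the hunk above:

    # old form: invoke the API server module directly
    python3 -m vllm.entrypoints.openai.api_server \
        --model meta-llama/Meta-Llama-3.1-8B-Instruct --port 8100

    # new form: the vllm CLI entry point wrapping the same server
    vllm serve meta-llama/Meta-Llama-3.1-8B-Instruct --port 8100
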
@@ -62,18 +58,14 @@ launch_chunked_prefill() {
 launch_disagg_prefill() {
     model="meta-llama/Meta-Llama-3.1-8B-Instruct"
     # disagg prefill
-    CUDA_VISIBLE_DEVICES=0 python3 \
-        -m vllm.entrypoints.openai.api_server \
-        --model $model \
+    CUDA_VISIBLE_DEVICES=0 vllm serve $model \
         --port 8100 \
         --max-model-len 10000 \
         --gpu-memory-utilization 0.6 \
         --kv-transfer-config \
         '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &

-    CUDA_VISIBLE_DEVICES=1 python3 \
-        -m vllm.entrypoints.openai.api_server \
-        --model $model \
+    CUDA_VISIBLE_DEVICES=1 vllm serve $model \
         --port 8200 \
         --max-model-len 10000 \
         --gpu-memory-utilization 0.6 \
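
In the --kv-transfer-config JSON above, the prefill server on port 8100 registers as the KV producer, rank 0 of a transfer group of size 2. The decode server on port 8200 continues past the end of this hunk; assuming the usual producer/consumer pairing mirrored from the producer line shown (not taken from this diff), its config would look roughly like:

    # sketch only: consumer side inferred from the producer config above
    --kv-transfer-config \
    '{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2,"kv_buffer_size":5e9}' &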