Commit f9b0545

[PD Disaggregation] [Refine] Refine splitwise deployment (#5151)

* Refine splitwise deployment
* up

1 parent 2d1dade

15 files changed: +364 additions, -485 deletions
examples/splitwise/README.md
Lines changed: 36 additions & 0 deletions (new file)

# Run the Examples on NVIDIA CUDA GPU

## Prepare the Environment

Refer to [NVIDIA CUDA GPU Installation](https://paddlepaddle.github.io/FastDeploy/get_started/installation/nvidia_gpu/) to pull the docker image, for example:

```
docker pull ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/fastdeploy-cuda-12.6:2.3.0
```

In the docker container, [NVIDIA MLNX_OFED](https://network.nvidia.com/products/infiniband-drivers/linux/mlnx_ofed/) and [Redis](https://redis.io/) are pre-installed.
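How the container is started is left to the reader; a minimal sketch, assuming the NVIDIA Container Toolkit is installed on the host (the `--network host` and `--shm-size` flags are illustrative and may need adjusting, especially for RDMA setups):

```bash
# Illustrative only: flags depend on your host; RDMA may additionally
# require device mounts or --privileged.
docker run --gpus all --network host --shm-size 64g -it \
    ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/fastdeploy-cuda-12.6:2.3.0 /bin/bash
```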

## Build and install FastDeploy

```
git clone https://github.com/PaddlePaddle/FastDeploy
cd FastDeploy

export ENABLE_FD_RDMA=1

# Argument 1: Whether to build wheel package (1 for yes, 0 for compile only)
# Argument 2: Python interpreter path
# Argument 3: Whether to compile CPU inference operators
# Argument 4: Target GPU architectures
bash build.sh 1 python false [80,90]
```
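After the build completes, a quick import check confirms the wheel is usable; the example scripts rely on the same module, since they launch everything via `python -m fastdeploy...`:

```bash
# Sanity check: the fastdeploy module must be importable for
# `python -m fastdeploy.entrypoints.openai.api_server` etc. to work.
python -c "import fastdeploy"
```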

## Run the Examples

Run the shell scripts in this directory, e.g. `bash start_v0_tp1.sh` or `bash start_v1_tp1.sh`.

Note that there are two methods for splitwise deployment; the sketch after this list shows where each sends requests:

* v0: using splitwise_scheduler or dp_scheduler, in which requests are scheduled by the engine.
* v1: using a router, in which requests are scheduled by the router.
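The practical difference is where clients send requests. A minimal sketch, with ports taken from the example scripts (the decode server listens on 52500, the v1 router on 52700):

```bash
# v0: the engine schedules, so the client talks to a server directly
curl -X POST "http://0.0.0.0:52500/v1/chat/completions" \
    -H "Content-Type: application/json" \
    -d '{"messages": [{"role": "user", "content": "hello"}], "max_tokens": 20, "stream": false}'

# v1: the router schedules, so the client talks to the router
curl -X POST "http://0.0.0.0:52700/v1/chat/completions" \
    -H "Content-Type: application/json" \
    -d '{"messages": [{"role": "user", "content": "hello"}], "max_tokens": 20, "stream": false}'
```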

# Run the Examples on Kunlunxin XPU

Coming soon...

examples/splitwise/start_mixed.sh
Lines changed: 14 additions & 20 deletions

```diff
@@ -3,41 +3,35 @@ set -e
 
 # Test mixed server + router
 
-wait_for_health() {
-    local server_port=$1
-    while true; do
-        status_code=$(curl -s -o /dev/null -w "%{http_code}" "http://0.0.0.0:${server_port}/health" || echo "000")
-        if [ "$status_code" -eq 200 ]; then
-            break
-        else
-            echo "Service not ready. Retrying in 2s..."
-            sleep 2
-        fi
-    done
-}
-
 # prepare environment
-MODEL_NAME="PaddlePaddle/ERNIE-4.5-0.3B-Paddle"
-
+export MODEL_NAME="PaddlePaddle/ERNIE-4.5-0.3B-Paddle"
 export FD_DEBUG=1
-export ENABLE_V1_KVCACHE_SCHEDULER=0
-export KVCACHE_GDRCOPY_FLUSH_ENABLE=1
 
 unset http_proxy && unset https_proxy
 rm -rf log_*
+source ./utils.sh
 
 S1_PORT=52400
 S2_PORT=52500
 ROUTER_PORT=52600
 
+ports=(
+    $S1_PORT $((S1_PORT + 1)) $((S1_PORT + 2)) $((S1_PORT + 3))
+    $S2_PORT $((S2_PORT + 1)) $((S2_PORT + 2)) $((S2_PORT + 3))
+    $ROUTER_PORT
+)
+check_ports "${ports[@]}" || {
+    echo "❌ Some ports are in use. Please release them."
+    exit 1
+}
+
 # start router
 export FD_LOG_DIR="log_router"
 mkdir -p ${FD_LOG_DIR}
 
 nohup python -m fastdeploy.router.launch \
     --port ${ROUTER_PORT} \
     2>&1 >${FD_LOG_DIR}/nohup &
-sleep 1
 
 # start modelserver 0
 export CUDA_VISIBLE_DEVICES=0
@@ -53,7 +47,6 @@ nohup python -m fastdeploy.entrypoints.openai.api_server \
     --max-model-len 32768 \
     --router "0.0.0.0:${ROUTER_PORT}" \
     2>&1 >${FD_LOG_DIR}/nohup &
-sleep 1
 
 wait_for_health ${S1_PORT}
 
@@ -76,12 +69,13 @@ wait_for_health ${S2_PORT}
 
 # send request
 sleep 10  # make sure server is registered to router
+echo "send request..."
 curl -X POST "http://0.0.0.0:${ROUTER_PORT}/v1/chat/completions" \
     -H "Content-Type: application/json" \
     -d '{
         "messages": [
             {"role": "user", "content": "hello"}
         ],
         "max_tokens": 20,
-        "stream": true
+        "stream": false
     }'
```
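The dropped `wait_for_health` helper and the new `check_ports` helper now live in the shared `utils.sh`, which is not part of this diff. A plausible sketch of that file, reusing the removed `wait_for_health` verbatim and assuming a `check_ports` built on `ss` (the actual implementation may differ):

```bash
# Hypothetical utils.sh sketch; the real file is not shown in this commit.
wait_for_health() {
    local server_port=$1
    while true; do
        status_code=$(curl -s -o /dev/null -w "%{http_code}" "http://0.0.0.0:${server_port}/health" || echo "000")
        if [ "$status_code" -eq 200 ]; then
            break
        else
            echo "Service not ready. Retrying in 2s..."
            sleep 2
        fi
    done
}

# Assumed implementation: fail if any of the given TCP ports is listening.
check_ports() {
    local busy=0
    for port in "$@"; do
        if ss -ltn | grep -q ":${port} "; then
            echo "Port ${port} is already in use."
            busy=1
        fi
    done
    return ${busy}
}
```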

examples/splitwise/start_v0_tp1.sh
Lines changed: 15 additions & 17 deletions

```diff
@@ -6,22 +6,8 @@ set -e
 # v0: using splitwise_scheduler or dp_scheduler
 # v1: using local_scheduler + router
 
-wait_for_health() {
-    local server_port=$1
-    while true; do
-        status_code=$(curl -s -o /dev/null -w "%{http_code}" "http://0.0.0.0:${server_port}/health" || echo "000")
-        if [ "$status_code" -eq 200 ]; then
-            break
-        else
-            echo "Service not ready. Retrying in 2s..."
-            sleep 2
-        fi
-    done
-}
-
 # prepare environment
-MODEL_NAME="PaddlePaddle/ERNIE-4.5-0.3B-Paddle"
-
+export MODEL_NAME="PaddlePaddle/ERNIE-4.5-0.3B-Paddle"
 export FD_DEBUG=1
 export ENABLE_V1_KVCACHE_SCHEDULER=1
 export KVCACHE_GDRCOPY_FLUSH_ENABLE=1
@@ -37,10 +23,21 @@ fi
 
 unset http_proxy && unset https_proxy
 rm -rf log_*
+source ./utils.sh
 
 P_PORT=52400
 D_PORT=52500
-REDIS_PORT=56388
+REDIS_PORT="${REDIS_PORT:-56388}"
+
+ports=(
+    $P_PORT $((P_PORT + 1)) $((P_PORT + 2)) $((P_PORT + 3)) $((P_PORT + 4)) $((P_PORT + 5))
+    $D_PORT $((D_PORT + 1)) $((D_PORT + 2)) $((D_PORT + 3)) $((D_PORT + 4)) $((D_PORT + 5))
+    $REDIS_PORT
+)
+check_ports "${ports[@]}" || {
+    echo "❌ Some ports are in use. Please release them."
+    exit 1
+}
 
 # start redis
 if ! redis-cli -p ${REDIS_PORT} ping &>/dev/null; then
@@ -104,12 +101,13 @@ wait_for_health ${D_PORT}
 
 # send request
 sleep 10  # make sure server is registered to router
+echo "send request..."
 curl -X POST "http://0.0.0.0:${D_PORT}/v1/chat/completions" \
     -H "Content-Type: application/json" \
     -d '{
         "messages": [
             {"role": "user", "content": "hello"}
         ],
         "max_tokens": 20,
-        "stream": true
+        "stream": false
     }'
```
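Since `REDIS_PORT` now only falls back to 56388 when unset, an already-running Redis can be reused by overriding the variable at invocation time:

```bash
# Point the example at an existing Redis instead of the default 56388
REDIS_PORT=6379 bash start_v0_tp1.sh
```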

examples/splitwise/start_v0_tp2.sh
Lines changed: 0 additions & 111 deletions
This file was deleted.

examples/splitwise/start_v1_tp1.sh
Lines changed: 16 additions & 19 deletions

```diff
@@ -6,22 +6,8 @@ set -e
 # v0: using splitwise_scheduler or dp_scheduler
 # v1: using local_scheduler + router
 
-wait_for_health() {
-    local server_port=$1
-    while true; do
-        status_code=$(curl -s -o /dev/null -w "%{http_code}" "http://0.0.0.0:${server_port}/health" || echo "000")
-        if [ "$status_code" -eq 200 ]; then
-            break
-        else
-            echo "Service not ready. Retrying in 2s..."
-            sleep 2
-        fi
-    done
-}
-
 # prepare environment
-MODEL_NAME="PaddlePaddle/ERNIE-4.5-0.3B-Paddle"
-
+export MODEL_NAME="PaddlePaddle/ERNIE-4.5-0.3B-Paddle"
 export FD_DEBUG=1
 export ENABLE_V1_KVCACHE_SCHEDULER=1
 export KVCACHE_GDRCOPY_FLUSH_ENABLE=1
@@ -37,10 +23,21 @@ fi
 
 unset http_proxy && unset https_proxy
 rm -rf log_*
+source ./utils.sh
 
 P_PORT=52400
 D_PORT=52500
-ROUTER_PORT=52600
+ROUTER_PORT=52700
+
+ports=(
+    $P_PORT $((P_PORT + 1)) $((P_PORT + 2)) $((P_PORT + 3)) $((P_PORT + 4)) $((P_PORT + 5))
+    $D_PORT $((D_PORT + 1)) $((D_PORT + 2)) $((D_PORT + 3)) $((D_PORT + 4)) $((D_PORT + 5))
+    $ROUTER_PORT
+)
+check_ports "${ports[@]}" || {
+    echo "❌ Some ports are in use. Please release them."
+    exit 1
+}
 
 # start router
 export FD_LOG_DIR="log_router"
@@ -50,7 +47,6 @@ nohup python -m fastdeploy.router.launch \
     --port ${ROUTER_PORT} \
     --splitwise \
     2>&1 >${FD_LOG_DIR}/nohup &
-sleep 1
 
 # start prefill
 export CUDA_VISIBLE_DEVICES=0
@@ -96,12 +92,13 @@ wait_for_health ${D_PORT}
 
 # send request
 sleep 10  # make sure server is registered to router
+echo "send request..."
 curl -X POST "http://0.0.0.0:${ROUTER_PORT}/v1/chat/completions" \
     -H "Content-Type: application/json" \
     -d '{
         "messages": [
             {"role": "user", "content": "hello"}
         ],
-        "max_tokens": 20,
-        "stream": true
+        "max_tokens": 100,
+        "stream": false
     }'
```
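With `FD_DEBUG=1` set and each component's stdout redirected to `${FD_LOG_DIR}/nohup` (log directories all match `log_*`, which the scripts clear on start), the whole deployment can be watched while an example runs:

```bash
# Follow every component's redirected output at once; directories such as
# log_router are created by the scripts, and all of them match log_*.
tail -f log_*/nohup
```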
