Commit f9b0545

[PD Disaggregation] [Refine] Refine splitwise deployment (#5151)

* Refine splitwise deployment
* up

1 parent 2d1dade

15 files changed: +364 additions, -485 deletions
examples/splitwise/README.md
Lines changed: 36 additions & 0 deletions (new file)

# Run the Examples on NVIDIA CUDA GPU

## Prepare the Environment

Refer to [NVIDIA CUDA GPU Installation](https://paddlepaddle.github.io/FastDeploy/get_started/installation/nvidia_gpu/) to pull the docker image, for example:

```
docker pull ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/fastdeploy-cuda-12.6:2.3.0
```

In the docker container, [NVIDIA MLNX_OFED](https://network.nvidia.com/products/infiniband-drivers/linux/mlnx_ofed/) and [Redis](https://redis.io/) are pre-installed.
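How the container is started is left to the reader; a minimal sketch, assuming the NVIDIA Container Toolkit is installed on the host (the `--network host` and `--shm-size` flags are illustrative and may need adjusting, especially for RDMA setups):

```bash
# Illustrative only: flags depend on your host; RDMA may additionally
# require device mounts or --privileged.
docker run --gpus all --network host --shm-size 64g -it \
    ccr-2vdh3abv-pub.cnc.bj.baidubce.com/paddlepaddle/fastdeploy-cuda-12.6:2.3.0 /bin/bash
```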

## Build and install FastDeploy

```
git clone https://github.com/PaddlePaddle/FastDeploy
cd FastDeploy

export ENABLE_FD_RDMA=1

# Argument 1: Whether to build wheel package (1 for yes, 0 for compile only)
# Argument 2: Python interpreter path
# Argument 3: Whether to compile CPU inference operators
# Argument 4: Target GPU architectures
bash build.sh 1 python false [80,90]
```
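After the build completes, a quick import check confirms the wheel is usable; the example scripts rely on the same module, since they launch everything via `python -m fastdeploy...`:

```bash
# Sanity check: the fastdeploy module must be importable for
# `python -m fastdeploy.entrypoints.openai.api_server` etc. to work.
python -c "import fastdeploy"
```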

## Run the Examples

Run the shell scripts in this directory, e.g. `bash start_v0_tp1.sh` or `bash start_v1_tp1.sh`.

Note that there are two methods for splitwise deployment; the sketch after this list shows where each sends requests:

* v0: using splitwise_scheduler or dp_scheduler, in which requests are scheduled by the engine.
* v1: using a router, in which requests are scheduled by the router.
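The practical difference is where clients send requests. A minimal sketch, with ports taken from the example scripts (the decode server listens on 52500, the v1 router on 52700):

```bash
# v0: the engine schedules, so the client talks to a server directly
curl -X POST "http://0.0.0.0:52500/v1/chat/completions" \
    -H "Content-Type: application/json" \
    -d '{"messages": [{"role": "user", "content": "hello"}], "max_tokens": 20, "stream": false}'

# v1: the router schedules, so the client talks to the router
curl -X POST "http://0.0.0.0:52700/v1/chat/completions" \
    -H "Content-Type: application/json" \
    -d '{"messages": [{"role": "user", "content": "hello"}], "max_tokens": 20, "stream": false}'
```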

# Run the Examples on Kunlunxin XPU

Coming soon...

examples/splitwise/start_mixed.sh
Lines changed: 14 additions & 20 deletions

```diff
@@ -3,41 +3,35 @@ set -e
 
 # Test mixed server + router
 
-wait_for_health() {
-    local server_port=$1
-    while true; do
-        status_code=$(curl -s -o /dev/null -w "%{http_code}" "http://0.0.0.0:${server_port}/health" || echo "000")
-        if [ "$status_code" -eq 200 ]; then
-            break
-        else
-            echo "Service not ready. Retrying in 2s..."
-            sleep 2
-        fi
-    done
-}
-
 # prepare environment
-MODEL_NAME="PaddlePaddle/ERNIE-4.5-0.3B-Paddle"
-
+export MODEL_NAME="PaddlePaddle/ERNIE-4.5-0.3B-Paddle"
 export FD_DEBUG=1
-export ENABLE_V1_KVCACHE_SCHEDULER=0
-export KVCACHE_GDRCOPY_FLUSH_ENABLE=1
 
 unset http_proxy && unset https_proxy
 rm -rf log_*
+source ./utils.sh
 
 S1_PORT=52400
 S2_PORT=52500
 ROUTER_PORT=52600
 
+ports=(
+    $S1_PORT $((S1_PORT + 1)) $((S1_PORT + 2)) $((S1_PORT + 3))
+    $S2_PORT $((S2_PORT + 1)) $((S2_PORT + 2)) $((S2_PORT + 3))
+    $ROUTER_PORT
+)
+check_ports "${ports[@]}" || {
+    echo "❌ Some ports are in use. Please release them."
+    exit 1
+}
+
 # start router
 export FD_LOG_DIR="log_router"
 mkdir -p ${FD_LOG_DIR}
 
 nohup python -m fastdeploy.router.launch \
     --port ${ROUTER_PORT} \
     2>&1 >${FD_LOG_DIR}/nohup &
-sleep 1
 
 # start modelserver 0
 export CUDA_VISIBLE_DEVICES=0
@@ -53,7 +47,6 @@ nohup python -m fastdeploy.entrypoints.openai.api_server \
     --max-model-len 32768 \
     --router "0.0.0.0:${ROUTER_PORT}" \
     2>&1 >${FD_LOG_DIR}/nohup &
-sleep 1
 
 wait_for_health ${S1_PORT}
 
@@ -76,12 +69,13 @@ wait_for_health ${S2_PORT}
 
 # send request
 sleep 10  # make sure server is registered to router
+echo "send request..."
 curl -X POST "http://0.0.0.0:${ROUTER_PORT}/v1/chat/completions" \
     -H "Content-Type: application/json" \
     -d '{
         "messages": [
             {"role": "user", "content": "hello"}
         ],
         "max_tokens": 20,
-        "stream": true
+        "stream": false
     }'
```
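The dropped `wait_for_health` helper and the new `check_ports` helper now live in the shared `utils.sh`, which is not part of this diff. A plausible sketch of that file, reusing the removed `wait_for_health` verbatim and assuming a `check_ports` built on `ss` (the actual implementation may differ):

```bash
# Hypothetical utils.sh sketch; the real file is not shown in this commit.
wait_for_health() {
    local server_port=$1
    while true; do
        status_code=$(curl -s -o /dev/null -w "%{http_code}" "http://0.0.0.0:${server_port}/health" || echo "000")
        if [ "$status_code" -eq 200 ]; then
            break
        else
            echo "Service not ready. Retrying in 2s..."
            sleep 2
        fi
    done
}

# Assumed implementation: fail if any of the given TCP ports is listening.
check_ports() {
    local busy=0
    for port in "$@"; do
        if ss -ltn | grep -q ":${port} "; then
            echo "Port ${port} is already in use."
            busy=1
        fi
    done
    return ${busy}
}
```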

examples/splitwise/start_v0_tp1.sh
Lines changed: 15 additions & 17 deletions

```diff
@@ -6,22 +6,8 @@ set -e
 # v0: using splitwise_scheduler or dp_scheduler
 # v1: using local_scheduler + router
 
-wait_for_health() {
-    local server_port=$1
-    while true; do
-        status_code=$(curl -s -o /dev/null -w "%{http_code}" "http://0.0.0.0:${server_port}/health" || echo "000")
-        if [ "$status_code" -eq 200 ]; then
-            break
-        else
-            echo "Service not ready. Retrying in 2s..."
-            sleep 2
-        fi
-    done
-}
-
 # prepare environment
-MODEL_NAME="PaddlePaddle/ERNIE-4.5-0.3B-Paddle"
-
+export MODEL_NAME="PaddlePaddle/ERNIE-4.5-0.3B-Paddle"
 export FD_DEBUG=1
 export ENABLE_V1_KVCACHE_SCHEDULER=1
 export KVCACHE_GDRCOPY_FLUSH_ENABLE=1
@@ -37,10 +23,21 @@ fi
 
 unset http_proxy && unset https_proxy
 rm -rf log_*
+source ./utils.sh
 
 P_PORT=52400
 D_PORT=52500
-REDIS_PORT=56388
+REDIS_PORT="${REDIS_PORT:-56388}"
+
+ports=(
+    $P_PORT $((P_PORT + 1)) $((P_PORT + 2)) $((P_PORT + 3)) $((P_PORT + 4)) $((P_PORT + 5))
+    $D_PORT $((D_PORT + 1)) $((D_PORT + 2)) $((D_PORT + 3)) $((D_PORT + 4)) $((D_PORT + 5))
+    $REDIS_PORT
+)
+check_ports "${ports[@]}" || {
+    echo "❌ Some ports are in use. Please release them."
+    exit 1
+}
 
 # start redis
 if ! redis-cli -p ${REDIS_PORT} ping &>/dev/null; then
@@ -104,12 +101,13 @@ wait_for_health ${D_PORT}
 
 # send request
 sleep 10  # make sure server is registered to router
+echo "send request..."
 curl -X POST "http://0.0.0.0:${D_PORT}/v1/chat/completions" \
     -H "Content-Type: application/json" \
     -d '{
         "messages": [
             {"role": "user", "content": "hello"}
         ],
         "max_tokens": 20,
-        "stream": true
+        "stream": false
     }'
```
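Since `REDIS_PORT` now only falls back to 56388 when unset, an already-running Redis can be reused by overriding the variable at invocation time:

```bash
# Point the example at an existing Redis instead of the default 56388
REDIS_PORT=6379 bash start_v0_tp1.sh
```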

examples/splitwise/start_v0_tp2.sh
Lines changed: 0 additions & 111 deletions
This file was deleted.

examples/splitwise/start_v1_tp1.sh
Lines changed: 16 additions & 19 deletions

```diff
@@ -6,22 +6,8 @@ set -e
 # v0: using splitwise_scheduler or dp_scheduler
 # v1: using local_scheduler + router
 
-wait_for_health() {
-    local server_port=$1
-    while true; do
-        status_code=$(curl -s -o /dev/null -w "%{http_code}" "http://0.0.0.0:${server_port}/health" || echo "000")
-        if [ "$status_code" -eq 200 ]; then
-            break
-        else
-            echo "Service not ready. Retrying in 2s..."
-            sleep 2
-        fi
-    done
-}
-
 # prepare environment
-MODEL_NAME="PaddlePaddle/ERNIE-4.5-0.3B-Paddle"
-
+export MODEL_NAME="PaddlePaddle/ERNIE-4.5-0.3B-Paddle"
 export FD_DEBUG=1
 export ENABLE_V1_KVCACHE_SCHEDULER=1
 export KVCACHE_GDRCOPY_FLUSH_ENABLE=1
@@ -37,10 +23,21 @@ fi
 
 unset http_proxy && unset https_proxy
 rm -rf log_*
+source ./utils.sh
 
 P_PORT=52400
 D_PORT=52500
-ROUTER_PORT=52600
+ROUTER_PORT=52700
+
+ports=(
+    $P_PORT $((P_PORT + 1)) $((P_PORT + 2)) $((P_PORT + 3)) $((P_PORT + 4)) $((P_PORT + 5))
+    $D_PORT $((D_PORT + 1)) $((D_PORT + 2)) $((D_PORT + 3)) $((D_PORT + 4)) $((D_PORT + 5))
+    $ROUTER_PORT
+)
+check_ports "${ports[@]}" || {
+    echo "❌ Some ports are in use. Please release them."
+    exit 1
+}
 
 # start router
 export FD_LOG_DIR="log_router"
@@ -50,7 +47,6 @@ nohup python -m fastdeploy.router.launch \
     --port ${ROUTER_PORT} \
     --splitwise \
     2>&1 >${FD_LOG_DIR}/nohup &
-sleep 1
 
 # start prefill
 export CUDA_VISIBLE_DEVICES=0
@@ -96,12 +92,13 @@ wait_for_health ${D_PORT}
 
 # send request
 sleep 10  # make sure server is registered to router
+echo "send request..."
 curl -X POST "http://0.0.0.0:${ROUTER_PORT}/v1/chat/completions" \
     -H "Content-Type: application/json" \
     -d '{
         "messages": [
             {"role": "user", "content": "hello"}
         ],
-        "max_tokens": 20,
-        "stream": true
+        "max_tokens": 100,
+        "stream": false
     }'
```
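With `FD_DEBUG=1` set and each component's stdout redirected to `${FD_LOG_DIR}/nohup` (log directories all match `log_*`, which the scripts clear on start), the whole deployment can be watched while an example runs:

```bash
# Follow every component's redirected output at once; directories such as
# log_router are created by the scripts, and all of them match log_*.
tail -f log_*/nohup
```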
