From 72a6cd7609393620a3140bdd0f14e4da4eef2d68 Mon Sep 17 00:00:00 2001
From: Daniel Huang
Date: Thu, 11 Dec 2025 17:14:49 -0800
Subject: [PATCH 1/2] Add ucx test

Signed-off-by: Daniel Huang
---
Notes (text between the "---" line and the diffstat is not part of the
commit message and is skipped when the patch is applied):

  * tests/full_tests/ci_gsm8k_tests.sh gains
    run_pd_disaggregate_nixl_ucx_test. It clones the
    intel_gaudi_gdr_enabling_0 branch of intel-staging/ucx into the current
    directory, runs ucx/setup_nixl_ucx.sh, removes the clone, changes into
    ${VLLM_GAUDI_PREFIX}/tests/unit_tests and runs run_accuracy_test.sh with
    DECODER_TP_SIZE=1, NIXL_BUFFER_DEVICE=hpu and VLLM_NIXL_BACKEND=UCX.
    The cd is not wrapped in a subshell, so the caller's working directory
    stays changed afterwards.
  * tests/unit_tests/run_accuracy_test.sh introduces UCX_TLS: "tcp" unless
    VLLM_NIXL_BACKEND is UCX and NIXL_BUFFER_DEVICE is hpu, in which case it
    is "gaudi_gdr,ib,rc,ud". Both the RANK=0 and the RANK=1 "vllm serve"
    command strings now use $UCX_TLS in place of the literal tcp.
  * GIT_ROOT is taken from "git rev-parse --show-toplevel" instead of the
    hard-coded /home/vllm-nixl/vllm, so the script has to be started from
    inside a git work tree.
  * NOTE(review): the leading indentation of the hunk lines below was
    reconstructed (4 spaces per level assumed) - confirm against the target
    files, or apply with "git apply --ignore-whitespace".

 tests/full_tests/ci_gsm8k_tests.sh    |  9 +++++++++
 tests/unit_tests/run_accuracy_test.sh | 11 +++++++----
 2 files changed, 16 insertions(+), 4 deletions(-)

diff --git a/tests/full_tests/ci_gsm8k_tests.sh b/tests/full_tests/ci_gsm8k_tests.sh
index 6cfd5fa09..43984fc77 100644
--- a/tests/full_tests/ci_gsm8k_tests.sh
+++ b/tests/full_tests/ci_gsm8k_tests.sh
@@ -313,6 +313,15 @@ run_pd_disaggregate_nixl_libfabric_test() {
     echo "✅ PD disaggregate through NIXL libfabric."
 }
 
+run_pd_disaggregate_nixl_ucx_test() {
+    echo "➡️ Testing PD disaggregate through NIXL UCX."
+    git clone https://github.com/intel-staging/ucx.git -b intel_gaudi_gdr_enabling_0
+    bash ucx/setup_nixl_ucx.sh
+    rm -rf ucx
+    cd ${VLLM_GAUDI_PREFIX}/tests/unit_tests; DECODER_TP_SIZE=1 NIXL_BUFFER_DEVICE=hpu VLLM_NIXL_BACKEND=UCX bash run_accuracy_test.sh
+    echo "✅ PD disaggregate through NIXL UCX."
+}
+
 # sleep mode
 run_sleep_mode_test() {
     echo "Testing basic model with sleep mode / wake up functionality"
diff --git a/tests/unit_tests/run_accuracy_test.sh b/tests/unit_tests/run_accuracy_test.sh
index 3b877d09b..abda9dfd5 100755
--- a/tests/unit_tests/run_accuracy_test.sh
+++ b/tests/unit_tests/run_accuracy_test.sh
@@ -29,8 +29,12 @@ export PT_HPU_LAZY_MODE=1
 
 NIXL_BUFFER_DEVICE=${NIXL_BUFFER_DEVICE:-"cpu"}
 VLLM_NIXL_BACKEND=${VLLM_NIXL_BACKEND:-"UCX"}
+UCX_TLS="tcp"
 if [ "$VLLM_NIXL_BACKEND" == "UCX" ]; then
     export VLLM_NIXL_DEVICE_TO_DEVICE=false
+    if [ "$NIXL_BUFFER_DEVICE" == "hpu" ]; then
+        UCX_TLS="gaudi_gdr,ib,rc,ud"
+    fi
 else
     export VLLM_NIXL_DEVICE_TO_DEVICE=true
 fi
@@ -42,8 +46,7 @@ PREFILLER_TP_SIZE=${PREFILLER_TP_SIZE:-1}
 DECODER_TP_SIZE=${DECODER_TP_SIZE:-2}
 
 # Find the git repository root directory
-#GIT_ROOT=$(git rev-parse --show-toplevel)
-GIT_ROOT="/home/vllm-nixl/vllm"
+GIT_ROOT=$(git rev-parse --show-toplevel)
 
 #SMI_BIN=$(which nvidia-smi || which rocm-smi)
 
@@ -116,7 +119,7 @@ run_tests_for_model() {
     echo "Starting prefill instance $i on GPU $GPU_ID, port $PORT"
 
     # Build the command with or without model-specific args
-    BASE_CMD="RANK=0 UCX_TLS=tcp VLLM_NIXL_SIDE_CHANNEL_PORT=$SIDE_CHANNEL_PORT vllm serve $model_name \
+    BASE_CMD="RANK=0 UCX_TLS=$UCX_TLS VLLM_NIXL_SIDE_CHANNEL_PORT=$SIDE_CHANNEL_PORT vllm serve $model_name \
     --port $PORT \
     --enforce-eager \
     --max_num_batched_tokens 8192 \
@@ -149,7 +152,7 @@ run_tests_for_model() {
     echo "Starting decode instance $i on GPU $GPU_ID, port $PORT"
 
     # Build the command with or without model-specific args
-    BASE_CMD="RANK=1 UCX_TLS=tcp VLLM_NIXL_SIDE_CHANNEL_PORT=$SIDE_CHANNEL_PORT vllm serve $model_name \
+    BASE_CMD="RANK=1 UCX_TLS=$UCX_TLS VLLM_NIXL_SIDE_CHANNEL_PORT=$SIDE_CHANNEL_PORT vllm serve $model_name \
     --port $PORT \
     --enforce-eager \
     --max_num_batched_tokens 8192 \

From 46c9b86deaa8e3ba93076482e45f054b922d9633 Mon Sep 17 00:00:00 2001
From: Daniel Huang
Date: Mon, 5 Jan 2026 16:02:55 -0800
Subject: [PATCH 2/2] Add install script

Signed-off-by: Daniel Huang
---
Notes (text between the "---" line and the diffstat is not part of the
commit message and is skipped when the patch is applied):

  * run_pd_disaggregate_nixl_ucx_test now calls
    tools/install_nixl_gaudi_gdr.sh and starts run_accuracy_test.sh through
    its full path; it no longer changes directory first.
    NOTE(review): run_accuracy_test.sh takes GIT_ROOT from
    "git rev-parse --show-toplevel", which depends on the caller's working
    directory - confirm the CI job runs this function from inside the
    intended repository.
  * tools/install_nixl_gaudi_gdr.sh runs every build step as its own
    statement instead of chaining with "&&". Under "set -e" a failing
    non-final command of an "a && b" list does not stop the script, so a
    failed "cd", "make" or "ninja" used to fall through to the next stage.
    "make install-strip" now uses a bounded -j 8 and the package install
    uses apt-get like the preceding update.
  * NOTE(review): the post-image blob id on the "index 000000000.." line of
    the new file predates this revision of the script; regenerate the patch
    with git format-patch if the id matters (e.g. for "git am --3way").
  * NOTE(review): indentation of the ci_gsm8k_tests.sh hunk was
    reconstructed (4 spaces assumed) - confirm against the target file.

 tests/full_tests/ci_gsm8k_tests.sh |  6 +--
 tools/install_nixl_gaudi_gdr.sh    | 83 ++++++++++++++++++++++++++++++
 2 files changed, 85 insertions(+), 4 deletions(-)
 create mode 100755 tools/install_nixl_gaudi_gdr.sh

diff --git a/tests/full_tests/ci_gsm8k_tests.sh b/tests/full_tests/ci_gsm8k_tests.sh
index 43984fc77..59be10c82 100644
--- a/tests/full_tests/ci_gsm8k_tests.sh
+++ b/tests/full_tests/ci_gsm8k_tests.sh
@@ -315,10 +315,8 @@ run_pd_disaggregate_nixl_libfabric_test() {
 
 run_pd_disaggregate_nixl_ucx_test() {
     echo "➡️ Testing PD disaggregate through NIXL UCX."
-    git clone https://github.com/intel-staging/ucx.git -b intel_gaudi_gdr_enabling_0
-    bash ucx/setup_nixl_ucx.sh
-    rm -rf ucx
-    cd ${VLLM_GAUDI_PREFIX}/tests/unit_tests; DECODER_TP_SIZE=1 NIXL_BUFFER_DEVICE=hpu VLLM_NIXL_BACKEND=UCX bash run_accuracy_test.sh
+    bash "${VLLM_GAUDI_PREFIX}/tools/install_nixl_gaudi_gdr.sh"
+    DECODER_TP_SIZE=1 NIXL_BUFFER_DEVICE=hpu VLLM_NIXL_BACKEND=UCX bash "${VLLM_GAUDI_PREFIX}/tests/unit_tests/run_accuracy_test.sh"
     echo "✅ PD disaggregate through NIXL UCX."
 }
 
diff --git a/tools/install_nixl_gaudi_gdr.sh b/tools/install_nixl_gaudi_gdr.sh
new file mode 100755
index 000000000..5beeb1613
--- /dev/null
+++ b/tools/install_nixl_gaudi_gdr.sh
@@ -0,0 +1,83 @@
+#!/bin/bash
+#
+# Build and install UCX and NIXL from source for the detected accelerator.
+#
+# The device is picked by probing for nvidia-smi (cuda), then hl-smi (hpu);
+# the script aborts when neither command is found. UCX is checked out at
+# UCX_COMMIT and installed under UCX_INSTALL_DIR; NIXL is cloned at
+# NIXL_BRANCH, built with meson/ninja against that prefix, then pip-installed.
+#
+# Environment overrides (defaults in parentheses):
+#   UCX_DIR          UCX source checkout  (/tmp/ucx_source)
+#   NIXL_DIR         NIXL source checkout (/tmp/nixl_source)
+#   UCX_INSTALL_DIR  UCX install prefix   (/tmp/ucx_install)
+
+# errexit does not fire for a failing non-final command of an "a && b" list,
+# so each build step below is its own statement.
+set -euo pipefail
+
+UCX_DIR=${UCX_DIR:-"/tmp/ucx_source"}
+NIXL_DIR=${NIXL_DIR:-"/tmp/nixl_source"}
+UCX_INSTALL_DIR=${UCX_INSTALL_DIR:-"/tmp/ucx_install"}
+
+UCX_REPO_URL="https://github.com/openucx/ucx.git"
+UCX_COMMIT="1df7b045d36c1e84f2fe9f251de83fb9103fc80e"
+NIXL_REPO_URL="https://github.com/ai-dynamo/nixl.git"
+NIXL_BRANCH="0.7.0"
+
+# Device specific configuration
+if command -v nvidia-smi >/dev/null 2>&1; then
+    DEVICE="cuda"
+elif command -v hl-smi >/dev/null 2>&1; then
+    DEVICE="hpu"
+else
+    echo "Unknown device, aborting install."
+    exit 1
+fi
+
+echo "UCX_DIR: $UCX_DIR"
+echo "NIXL_DIR: $NIXL_DIR"
+
+echo "Installing prerequisites"
+apt-get update
+apt-get install -y build-essential cmake libibverbs1 libibverbs-dev librdmacm1 librdmacm-dev rdma-core \
+    pkg-config meson ninja-build autoconf libtool libcjson-dev libaio-dev pybind11-dev
+
+echo "Installing UCX ($UCX_COMMIT) to $UCX_INSTALL_DIR"
+ucx_root=$(dirname "$UCX_DIR")
+mkdir -p "$ucx_root"
+[[ -d "$UCX_DIR" ]] || git clone "$UCX_REPO_URL" "$UCX_DIR"
+cd "$UCX_DIR"
+git checkout "$UCX_COMMIT"
+./autogen.sh
+if [ "$DEVICE" == "hpu" ]; then
+    ./configure --prefix="$UCX_INSTALL_DIR" --with-mlx5=no --with-gaudi=yes --enable-examples --enable-mt
+else
+    ./configure --prefix="$UCX_INSTALL_DIR" --with-mlx5=no --with-gaudi=no --enable-examples --enable-mt --with-cuda=/usr/local/cuda
+fi
+make -j 8
+# Bounded job count: a bare "make -j" puts no limit on parallel jobs.
+make -j 8 install-strip
+ldconfig
+
+echo "Installing NIXL ($NIXL_BRANCH) to $NIXL_DIR"
+nixl_root=$(dirname "$NIXL_DIR")
+mkdir -p "$nixl_root"
+[[ -d "$NIXL_DIR" ]] || git clone -b "$NIXL_BRANCH" "$NIXL_REPO_URL" "$NIXL_DIR"
+cd "$NIXL_DIR"
+meson setup --reconfigure build -Ducx_path="$UCX_INSTALL_DIR" -Dinstall_headers=true -Ddisable_gds_backend=false
+# Point the default of the ucx_path option in meson_options.txt at UCX_INSTALL_DIR as well.
+sed -i "s|\(option('ucx_path', type: 'string', value: \)'[^']*|\1'$UCX_INSTALL_DIR|" "$NIXL_DIR/meson_options.txt"
+cd build
+ninja
+ninja install
+
+pip install "$NIXL_DIR"
+
+echo "Completed nixl install"
+echo ""
+echo "Set these env vars after installing: "
+echo 'export UCX_MEMTYPE_CACHE=0'
+echo 'export LD_LIBRARY_PATH="/opt/nvidia/nvda_nixl/lib/x86_64-linux-gnu:${LD_LIBRARY_PATH}"'
+echo 'export LD_LIBRARY_PATH="${UCX_INSTALL_DIR}/lib:${LD_LIBRARY_PATH}"'
+echo ' e.g. export LD_LIBRARY_PATH="/tmp/ucx_install/lib:${LD_LIBRARY_PATH}"'