13 changes: 7 additions & 6 deletions examples/nixl/run_accuracy_test.sh
@@ -31,7 +31,7 @@ DECODER_TP_SIZE=${DECODER_TP_SIZE:-1}
#GIT_ROOT=$(git rev-parse --show-toplevel)
GIT_ROOT="/home/vllm-nixl/vllm"

#SMI_BIN=$(which nvidia-smi || which rocm-smi)
SMI_BIN=$(which nvidia-smi || which rocm-smi || which hl-smi)

# Trap the SIGINT signal (triggered by Ctrl+C)
trap 'kill $(jobs -pr)' SIGINT SIGTERM EXIT
@@ -67,6 +67,8 @@ get_model_args() {
get_num_gpus() {
if [[ "$SMI_BIN" == *"nvidia"* ]]; then
echo "$($SMI_BIN --query-gpu=name --format=csv,noheader | wc -l)"
elif [[ "$SMI_BIN" == *"hl"* ]]; then
echo "$($SMI_BIN -Q index,name -f csv | tail -n +2 | wc -l)"
else
echo "$($SMI_BIN -l | grep GPU | wc -l)"
fi
@@ -91,8 +93,7 @@ run_tests_for_model() {
# Start prefill instances
for i in $(seq 0 $((NUM_PREFILL_INSTANCES-1))); do
# Calculate GPU ID - we'll distribute across available GPUs
#GPU_ID=$((i % $(get_num_gpus)))
GPU_ID=2
GPU_ID=$((i % $(get_num_gpus)))

# Calculate port number (base port + instance number)
PORT=$((8700 + i))
@@ -102,7 +103,7 @@
echo "Starting prefill instance $i on GPU $GPU_ID, port $PORT"

# Build the command with or without model-specific args
BASE_CMD="RANK=0 UCX_TLS=tcp VLLM_NIXL_SIDE_CHANNEL_PORT=$SIDE_CHANNEL_PORT vllm serve $model_name \
BASE_CMD="HABANA_VISIBLE_MODULES='0,1,2,3' RANK=$GPU_ID UCX_TLS=tcp VLLM_NIXL_SIDE_CHANNEL_PORT=$SIDE_CHANNEL_PORT vllm serve $model_name \
--port $PORT \
--enforce-eager \
--max_num_batched_tokens 8192 \
@@ -126,7 +127,7 @@
# Start decode instances
for i in $(seq 0 $((NUM_DECODE_INSTANCES-1))); do
# Calculate GPU ID - we'll distribute across available GPUs, starting from after prefill GPUs
#GPU_ID=$(((i + NUM_PREFILL_INSTANCES) % $(get_num_gpus)))
GPU_ID=$(((i + NUM_PREFILL_INSTANCES) % $(get_num_gpus)))
# Calculate port number (base port + instance number)
PORT=$((8800 + i))
# Calculate side channel port
@@ -135,7 +136,7 @@
echo "Starting decode instance $i on GPU $GPU_ID, port $PORT"

# Build the command with or without model-specific args
BASE_CMD="RANK=1 UCX_TLS=tcp VLLM_NIXL_SIDE_CHANNEL_PORT=$SIDE_CHANNEL_PORT vllm serve $model_name \
BASE_CMD="HABANA_VISIBLE_MODULES='4,5,6,7' RANK=$GPU_ID UCX_TLS=tcp VLLM_NIXL_SIDE_CHANNEL_PORT=$SIDE_CHANNEL_PORT vllm serve $model_name \
--port $PORT \
--enforce-eager \
--max_num_batched_tokens 8192 \
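
Note: the same accelerator-count helper is extended in every script touched by this PR. As a minimal standalone sketch (assuming one of nvidia-smi, rocm-smi, or hl-smi is on PATH, and that hl-smi's CSV output has a single header row, as the diff implies), the full function reads:

SMI_BIN=$(which nvidia-smi || which rocm-smi || which hl-smi)

get_num_gpus() {
    if [[ "$SMI_BIN" == *"nvidia"* ]]; then
        # nvidia-smi: one CSV row per GPU, no header.
        echo "$($SMI_BIN --query-gpu=name --format=csv,noheader | wc -l)"
    elif [[ "$SMI_BIN" == *"hl"* ]]; then
        # hl-smi: skip the CSV header row before counting Gaudi modules.
        echo "$($SMI_BIN -Q index,name -f csv | tail -n +2 | wc -l)"
    else
        # rocm-smi: count the GPU rows in the listing output.
        echo "$($SMI_BIN -l | grep GPU | wc -l)"
    fi
}

# Instances are then spread round-robin over the detected devices, e.g. for prefill instance i:
GPU_ID=$((i % $(get_num_gpus)))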
13 changes: 7 additions & 6 deletions examples/nixl/run_benchmark_profile.sh
@@ -34,7 +34,7 @@ DECODER_TP_SIZE=${DECODER_TP_SIZE:-1}
#GIT_ROOT=$(git rev-parse --show-toplevel)
GIT_ROOT="/home/vllm-nixl/vllm"

#SMI_BIN=$(which nvidia-smi || which rocm-smi)
SMI_BIN=$(which nvidia-smi || which rocm-smi || which hl-smi)

# Trap the SIGINT signal (triggered by Ctrl+C)
trap 'kill $(jobs -pr)' SIGINT SIGTERM EXIT
@@ -70,6 +70,8 @@ get_model_args() {
get_num_gpus() {
if [[ "$SMI_BIN" == *"nvidia"* ]]; then
echo "$($SMI_BIN --query-gpu=name --format=csv,noheader | wc -l)"
elif [[ "$SMI_BIN" == *"hl"* ]]; then
echo "$($SMI_BIN -Q index,name -f csv | tail -n +2 | wc -l)"
else
echo "$($SMI_BIN -l | grep GPU | wc -l)"
fi
@@ -94,8 +96,7 @@ run_tests_for_model() {
# Start prefill instances
for i in $(seq 0 $((NUM_PREFILL_INSTANCES-1))); do
# Calculate GPU ID - we'll distribute across available GPUs
#GPU_ID=$((i % $(get_num_gpus)))
GPU_ID=2
GPU_ID=$((i % $(get_num_gpus)))

# Calculate port number (base port + instance number)
PORT=$((8300))
@@ -105,7 +106,7 @@
echo "Starting prefill instance $i on GPU $GPU_ID, port $PORT"

# Build the command with or without model-specific args
BASE_CMD="RANK=0 UCX_TLS=rc,ud,ib VLLM_NIXL_SIDE_CHANNEL_PORT=$SIDE_CHANNEL_PORT vllm serve $model_name \
BASE_CMD="HABANA_VISIBLE_MODULES='0,1,2,3' RANK=$GPU_ID UCX_TLS=rc,ud,ib VLLM_NIXL_SIDE_CHANNEL_PORT=$SIDE_CHANNEL_PORT vllm serve $model_name \
--port $PORT \
--long_prefill_token_threshold 8192 \
--max_num_batched_tokens 8192 \
@@ -130,7 +131,7 @@
# Start decode instances
for i in $(seq 0 $((NUM_DECODE_INSTANCES-1))); do
# Calculate GPU ID - we'll distribute across available GPUs, starting from after prefill GPUs
#GPU_ID=$(((i + NUM_PREFILL_INSTANCES) % $(get_num_gpus)))
GPU_ID=$(((i + NUM_PREFILL_INSTANCES) % $(get_num_gpus)))
# Calculate port number (base port + instance number)
PORT=$((8400))
# Calculate side channel port
@@ -139,7 +140,7 @@
echo "Starting decode instance $i on GPU $GPU_ID, port $PORT"

# Build the command with or without model-specific args
BASE_CMD="RANK=1 UCX_TLS=rc,ud,ib VLLM_NIXL_SIDE_CHANNEL_PORT=$SIDE_CHANNEL_PORT vllm serve $model_name \
BASE_CMD="HABANA_VISIBLE_MODULES='4,5,6,7' RANK=$GPU_ID UCX_TLS=rc,ud,ib VLLM_NIXL_SIDE_CHANNEL_PORT=$SIDE_CHANNEL_PORT vllm serve $model_name \
--port $PORT \
--gpu-memory-utilization 0.3 \
--tensor-parallel-size $DECODER_TP_SIZE \
13 changes: 7 additions & 6 deletions examples/nixl/run_benchmark_test.sh
@@ -31,7 +31,7 @@ DECODER_TP_SIZE=${DECODER_TP_SIZE:-1}
#GIT_ROOT=$(git rev-parse --show-toplevel)
GIT_ROOT="/home/vllm-nixl/vllm"

#SMI_BIN=$(which nvidia-smi || which rocm-smi)
SMI_BIN=$(which nvidia-smi || which rocm-smi || which hl-smi)

# Trap the SIGINT signal (triggered by Ctrl+C)
trap 'kill $(jobs -pr)' SIGINT SIGTERM EXIT
@@ -67,6 +67,8 @@ get_model_args() {
get_num_gpus() {
if [[ "$SMI_BIN" == *"nvidia"* ]]; then
echo "$($SMI_BIN --query-gpu=name --format=csv,noheader | wc -l)"
elif [[ "$SMI_BIN" == *"hl"* ]]; then
echo "$($SMI_BIN -Q index,name -f csv | tail -n +2 | wc -l)"
else
echo "$($SMI_BIN -l | grep GPU | wc -l)"
fi
@@ -91,8 +93,7 @@ run_tests_for_model() {
# Start prefill instances
for i in $(seq 0 $((NUM_PREFILL_INSTANCES-1))); do
# Calculate GPU ID - we'll distribute across available GPUs
#GPU_ID=$((i % $(get_num_gpus)))
GPU_ID=2
GPU_ID=$((i % $(get_num_gpus)))

# Calculate port number (base port + instance number)
PORT=$((8300 + i))
@@ -102,7 +103,7 @@
echo "Starting prefill instance $i on GPU $GPU_ID, port $PORT"

# Build the command with or without model-specific args
BASE_CMD="RANK=0 UCX_TLS=rc,ud,ib VLLM_NIXL_SIDE_CHANNEL_PORT=$SIDE_CHANNEL_PORT vllm serve $model_name \
BASE_CMD="HABANA_VISIBLE_MODULES='0,1,2,3' RANK=$GPU_ID UCX_TLS=rc,ud,ib VLLM_NIXL_SIDE_CHANNEL_PORT=$SIDE_CHANNEL_PORT vllm serve $model_name \
--port $PORT \
--long_prefill_token_threshold 8192 \
--max_num_batched_tokens 8192 \
@@ -127,7 +128,7 @@
# Start decode instances
for i in $(seq 0 $((NUM_DECODE_INSTANCES-1))); do
# Calculate GPU ID - we'll distribute across available GPUs, starting from after prefill GPUs
#GPU_ID=$(((i + NUM_PREFILL_INSTANCES) % $(get_num_gpus)))
GPU_ID=$(((i + NUM_PREFILL_INSTANCES) % $(get_num_gpus)))
# Calculate port number (base port + instance number)
PORT=$((8400 + i))
# Calculate side channel port
@@ -136,7 +137,7 @@
echo "Starting decode instance $i on GPU $GPU_ID, port $PORT"

# Build the command with or without model-specific args
BASE_CMD="RANK=1 UCX_TLS=rc,ud,ib VLLM_NIXL_SIDE_CHANNEL_PORT=$SIDE_CHANNEL_PORT vllm serve $model_name \
BASE_CMD="HABANA_VISIBLE_MODULES='4,5,6,7' RANK=$GPU_ID UCX_TLS=rc,ud,ib VLLM_NIXL_SIDE_CHANNEL_PORT=$SIDE_CHANNEL_PORT vllm serve $model_name \
--port $PORT \
--gpu-memory-utilization 0.3 \
--tensor-parallel-size $DECODER_TP_SIZE \
14 changes: 7 additions & 7 deletions examples/nixl/run_hpu_disagg_accuracy_test.sh
@@ -22,7 +22,7 @@ DECODER_TP_SIZE=${DECODER_TP_SIZE:-1}
#GIT_ROOT=$(git rev-parse --show-toplevel)
GIT_ROOT="/home/vllm-nixl/vllm"

#SMI_BIN=$(which nvidia-smi || which rocm-smi)
SMI_BIN=$(which nvidia-smi || which rocm-smi || which hl-smi)

# Trap the SIGINT signal (triggered by Ctrl+C)
trap 'kill $(jobs -pr)' SIGINT SIGTERM EXIT
@@ -58,6 +58,8 @@ get_model_args() {
get_num_gpus() {
if [[ "$SMI_BIN" == *"nvidia"* ]]; then
echo "$($SMI_BIN --query-gpu=name --format=csv,noheader | wc -l)"
elif [[ "$SMI_BIN" == *"hl"* ]]; then
echo "$($SMI_BIN -Q index,name -f csv | tail -n +2 | wc -l)"
else
echo "$($SMI_BIN -l | grep GPU | wc -l)"
fi
@@ -82,8 +84,7 @@ run_tests_for_model() {
# Start prefill instances
for i in $(seq 0 $((NUM_PREFILL_INSTANCES-1))); do
# Calculate GPU ID - we'll distribute across available GPUs
#GPU_ID=$((i % $(get_num_gpus)))
GPU_ID=2
GPU_ID=$((i % $(get_num_gpus)))

# Calculate port number (base port + instance number)
PORT=$((8300 + i))
@@ -93,7 +94,7 @@
echo "Starting prefill instance $i on GPU $GPU_ID, port $PORT"

# Build the command with or without model-specific args
BASE_CMD="CUDA_VISIBLE_DEVICES=$GPU_ID VLLM_NIXL_SIDE_CHANNEL_PORT=$SIDE_CHANNEL_PORT vllm serve $model_name \
BASE_CMD="HABANA_VISIBLE_MODULES='0,1,2,3' RANK=$GPU_ID VLLM_NIXL_SIDE_CHANNEL_PORT=$SIDE_CHANNEL_PORT vllm serve $model_name \
--port $PORT \
--enforce-eager \
--disable-log-requests \
@@ -117,8 +118,7 @@ run_tests_for_model() {
# Start decode instances
for i in $(seq 0 $((NUM_DECODE_INSTANCES-1))); do
# Calculate GPU ID - we'll distribute across available GPUs, starting from after prefill GPUs
#GPU_ID=$(((i + NUM_PREFILL_INSTANCES) % $(get_num_gpus)))
GPU_ID=6
GPU_ID=$(((i + NUM_PREFILL_INSTANCES) % $(get_num_gpus)))
# Calculate port number (base port + instance number)
PORT=$((8400 + i))
# Calculate side channel port
@@ -127,7 +127,7 @@
echo "Starting decode instance $i on GPU $GPU_ID, port $PORT"

# Build the command with or without model-specific args
BASE_CMD="CUDA_VISIBLE_DEVICES=$GPU_ID VLLM_NIXL_SIDE_CHANNEL_PORT=$SIDE_CHANNEL_PORT vllm serve $model_name \
BASE_CMD="HABANA_VISIBLE_MODULES='4,5,6,7' RANK=$GPU_ID VLLM_NIXL_SIDE_CHANNEL_PORT=$SIDE_CHANNEL_PORT vllm serve $model_name \
--port $PORT \
--enforce-eager \
--disable-log-requests \
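
Note: across the HPU variants of these scripts, the disaggregated setup splits an eight-module Gaudi node between the two roles: prefill servers are pinned to HABANA_VISIBLE_MODULES='0,1,2,3' and decode servers to '4,5,6,7', with RANK derived from the computed GPU_ID. A condensed, hypothetical launch pair with one instance per role (model name, ports, and side-channel ports are placeholders; the remaining flags used in the scripts are omitted for brevity):

MODEL=meta-llama/Llama-3.2-3B-Instruct   # placeholder model

# Prefill instance on the first four modules.
HABANA_VISIBLE_MODULES='0,1,2,3' RANK=0 VLLM_NIXL_SIDE_CHANNEL_PORT=5600 \
    vllm serve "$MODEL" --port 8300 --enforce-eager &

# Decode instance on the remaining four modules.
HABANA_VISIBLE_MODULES='4,5,6,7' RANK=1 VLLM_NIXL_SIDE_CHANNEL_PORT=5601 \
    vllm serve "$MODEL" --port 8400 --enforce-eager &

wait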
13 changes: 7 additions & 6 deletions tests/unit_tests/run_accuracy_test.sh
@@ -49,7 +49,7 @@ DECODER_TP_SIZE=${DECODER_TP_SIZE:-2}
# Find the git repository root directory
GIT_ROOT=$(git rev-parse --show-toplevel)

#SMI_BIN=$(which nvidia-smi || which rocm-smi)
SMI_BIN=$(which nvidia-smi || which rocm-smi || which hl-smi)

# Trap the SIGINT signal (triggered by Ctrl+C)
trap 'kill $(jobs -pr)' SIGINT SIGTERM EXIT
@@ -85,6 +85,8 @@ get_model_args() {
get_num_gpus() {
if [[ "$SMI_BIN" == *"nvidia"* ]]; then
echo "$($SMI_BIN --query-gpu=name --format=csv,noheader | wc -l)"
elif [[ "$SMI_BIN" == *"hl"* ]]; then
echo "$($SMI_BIN -Q index,name -f csv | tail -n +2 | wc -l)"
else
echo "$($SMI_BIN -l | grep GPU | wc -l)"
fi
@@ -109,8 +111,7 @@ run_tests_for_model() {
# Start prefill instances
for i in $(seq 0 $((NUM_PREFILL_INSTANCES-1))); do
# Calculate GPU ID - we'll distribute across available GPUs
#GPU_ID=$((i % $(get_num_gpus)))
GPU_ID=2
GPU_ID=$((i % $(get_num_gpus)))

# Calculate port number (base port + instance number)
PORT=$((8700 + i))
@@ -120,7 +121,7 @@
echo "Starting prefill instance $i on GPU $GPU_ID, port $PORT"

# Build the command with or without model-specific args
BASE_CMD="RANK=0 UCX_TLS=$UCX_TLS VLLM_NIXL_SIDE_CHANNEL_PORT=$SIDE_CHANNEL_PORT vllm serve $model_name \
BASE_CMD="HABANA_VISIBLE_MODULES='0,1,2,3' RANK=$GPU_ID UCX_TLS=$UCX_TLS VLLM_NIXL_SIDE_CHANNEL_PORT=$SIDE_CHANNEL_PORT vllm serve $model_name \
--port $PORT \
--enforce-eager \
--max_num_batched_tokens 8192 \
@@ -144,7 +145,7 @@
# Start decode instances
for i in $(seq 0 $((NUM_DECODE_INSTANCES-1))); do
# Calculate GPU ID - we'll distribute across available GPUs, starting from after prefill GPUs
#GPU_ID=$(((i + NUM_PREFILL_INSTANCES) % $(get_num_gpus)))
GPU_ID=$(((i + NUM_PREFILL_INSTANCES) % $(get_num_gpus)))
# Calculate port number (base port + instance number)
PORT=$((8800 + i))
# Calculate side channel port
@@ -153,7 +154,7 @@
echo "Starting decode instance $i on GPU $GPU_ID, port $PORT"

# Build the command with or without model-specific args
BASE_CMD="RANK=1 UCX_TLS=$UCX_TLS VLLM_NIXL_SIDE_CHANNEL_PORT=$SIDE_CHANNEL_PORT vllm serve $model_name \
BASE_CMD="HABANA_VISIBLE_MODULES='4,5,6,7' RANK=$GPU_ID UCX_TLS=$UCX_TLS VLLM_NIXL_SIDE_CHANNEL_PORT=$SIDE_CHANNEL_PORT vllm serve $model_name \
--port $PORT \
--enforce-eager \
--max_num_batched_tokens 8192 \
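
Note: like the example scripts, this test script is driven by environment variables rather than flags (NUM_PREFILL_INSTANCES, NUM_DECODE_INSTANCES, DECODER_TP_SIZE, and UCX_TLS are the ones visible in this diff). A hypothetical invocation overriding the Gaudi-relevant knobs, assuming the script is run directly from inside the repository checkout:

NUM_PREFILL_INSTANCES=1 NUM_DECODE_INSTANCES=1 \
DECODER_TP_SIZE=2 UCX_TLS=tcp \
bash tests/unit_tests/run_accuracy_test.sh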
57 changes: 57 additions & 0 deletions vllm_gaudi/v1/worker/hpu_worker.py
@@ -94,6 +94,60 @@ def __init__(
        self.kv_cache_sleeping = False
        self.kv_cache_config = None

        # Select available Habana modules before initializing the device.
        self._configure_habana_visible_modules()

    def _configure_habana_visible_modules(self):
        import pyhlml
        pyhlml.hlmlInit()
        try:
            available_module_ids = []
            device_count = torch.hpu.device_count()
            if device_count < 1:
                raise RuntimeError("No Habana devices found.")
            for i in range(device_count):
                try:
                    device = pyhlml.hlmlDeviceGetHandleByIndex(i)
                    utility = pyhlml.hlmlDeviceGetUtilizationRates(device)
                    if utility.aip == 0 and utility.memory == 0:
                        module_id = pyhlml.hlmlDeviceGetModuleID(device)
                        available_module_ids.append(module_id)
                except Exception:
                    continue
            if len(available_module_ids) < 1:
                raise RuntimeError("No available Habana modules found. All modules are currently in use.")
            env_visible_modules = os.getenv("HABANA_VISIBLE_MODULES")
            if env_visible_modules is None:
                if len(available_module_ids) < self.parallel_config.world_size:
                    raise RuntimeError(
                        f"Not enough available modules for world_size={self.parallel_config.world_size}.")
                available_modules_str = ",".join(map(str, sorted(available_module_ids)))
                logger.info("HABANA_VISIBLE_MODULES is not set, using all available modules: %s", available_modules_str)
                os.environ["HABANA_VISIBLE_MODULES"] = available_modules_str
            else:
                if not all(c.isdigit() for c in env_visible_modules.split(",")):
                    raise RuntimeError(f"Invalid HABANA_VISIBLE_MODULES={env_visible_modules}. "
                                       "It should be a comma-separated list of integers.")
                env_module_ids = list(map(int, env_visible_modules.split(",")))
                if any(module_id < 0 or module_id >= device_count for module_id in env_module_ids):
                    raise RuntimeError(f"Invalid HABANA_VISIBLE_MODULES={env_visible_modules}. "
                                       f"Module IDs should be between 0 and {device_count - 1}.")
                if any(env_module_id not in available_module_ids for env_module_id in env_module_ids):
                    logger.warning("Some modules requested in HABANA_VISIBLE_MODULES=%s are not available.", env_visible_modules)
                    selected_modules = [x for x in env_module_ids if x in available_module_ids]
                    if len(selected_modules) < self.parallel_config.world_size:
                        raise RuntimeError(
                            f"Not enough available modules for world_size={self.parallel_config.world_size}. "
                            "Set HABANA_VISIBLE_MODULES to include more available modules and try again.")
                    else:
                        selected_modules_str = ",".join(map(str, sorted(selected_modules)))
                        os.environ["HABANA_VISIBLE_MODULES"] = selected_modules_str
                        logger.warning("Using selected available modules: %s", selected_modules_str)
        except Exception as e:
            raise e
        finally:
            pyhlml.hlmlShutdown()

    def init_profiler(self):
        """Initialize the profiler."""
        if envs.VLLM_TORCH_PROFILER_DIR:
@@ -134,7 +188,10 @@ def stop_profile(self):
        self.profiler.stop()

    def init_device(self):
        # Set the device for this worker.
        self.device = torch.device("hpu")
        torch.hpu.set_device(self.local_rank)

        # Initialize the distributed environment.
        init_worker_distributed_environment(self.vllm_config, self.rank, self.distributed_init_method, self.local_rank)
        # Set random seed.
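
Note: the worker's new module-selection logic can be exercised outside vLLM with a short probe script. This is a sketch only, built from the pyhlml calls used in the diff (hlmlInit, hlmlDeviceGetHandleByIndex, hlmlDeviceGetUtilizationRates, hlmlDeviceGetModuleID, hlmlShutdown); it assumes a Gaudi PyTorch build where torch.hpu is usable, and that zero AIP and memory utilization marks a module as free, as the worker code implies.

import pyhlml
import torch


def list_free_module_ids() -> list[int]:
    """Return module IDs of Gaudi devices that currently report no utilization."""
    pyhlml.hlmlInit()
    try:
        free_ids = []
        for i in range(torch.hpu.device_count()):
            try:
                device = pyhlml.hlmlDeviceGetHandleByIndex(i)
                util = pyhlml.hlmlDeviceGetUtilizationRates(device)
                # Mirror the worker's "free" heuristic: no compute (AIP) and no memory activity.
                if util.aip == 0 and util.memory == 0:
                    free_ids.append(pyhlml.hlmlDeviceGetModuleID(device))
            except Exception:
                continue  # skip devices that cannot be queried
        return sorted(free_ids)
    finally:
        pyhlml.hlmlShutdown()


if __name__ == "__main__":
    ids = list_free_module_ids()
    print("HABANA_VISIBLE_MODULES=" + ",".join(map(str, ids)))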