diff --git a/examples/nixl/run_accuracy_test.sh b/examples/nixl/run_accuracy_test.sh index ee999fd45..983be2b26 100755 --- a/examples/nixl/run_accuracy_test.sh +++ b/examples/nixl/run_accuracy_test.sh @@ -31,7 +31,7 @@ DECODER_TP_SIZE=${DECODER_TP_SIZE:-1} #GIT_ROOT=$(git rev-parse --show-toplevel) GIT_ROOT="/home/vllm-nixl/vllm" -#SMI_BIN=$(which nvidia-smi || which rocm-smi) +SMI_BIN=$(which nvidia-smi || which rocm-smi || which hl-smi) # Trap the SIGINT signal (triggered by Ctrl+C) trap 'kill $(jobs -pr)' SIGINT SIGTERM EXIT @@ -67,6 +67,8 @@ get_model_args() { get_num_gpus() { if [[ "$SMI_BIN" == *"nvidia"* ]]; then echo "$($SMI_BIN --query-gpu=name --format=csv,noheader | wc -l)" + elif [[ "$SMI_BIN" == *"hl"* ]]; then + echo "$($SMI_BIN -Q index,name -f csv | tail -n +2 | wc -l)" else echo "$($SMI_BIN -l | grep GPU | wc -l)" fi @@ -91,8 +93,7 @@ run_tests_for_model() { # Start prefill instances for i in $(seq 0 $((NUM_PREFILL_INSTANCES-1))); do # Calculate GPU ID - we'll distribute across available GPUs - #GPU_ID=$((i % $(get_num_gpus))) - GPU_ID=2 + GPU_ID=$((i % $(get_num_gpus))) # Calculate port number (base port + instance number) PORT=$((8700 + i)) @@ -102,7 +103,7 @@ run_tests_for_model() { echo "Starting prefill instance $i on GPU $GPU_ID, port $PORT" # Build the command with or without model-specific args - BASE_CMD="RANK=0 UCX_TLS=tcp VLLM_NIXL_SIDE_CHANNEL_PORT=$SIDE_CHANNEL_PORT vllm serve $model_name \ + BASE_CMD="HABANA_VISIBLE_MODULES='0,1,2,3' RANK=$GPU_ID UCX_TLS=tcp VLLM_NIXL_SIDE_CHANNEL_PORT=$SIDE_CHANNEL_PORT vllm serve $model_name \ --port $PORT \ --enforce-eager \ --max_num_batched_tokens 8192 \ @@ -126,7 +127,7 @@ run_tests_for_model() { # Start decode instances for i in $(seq 0 $((NUM_DECODE_INSTANCES-1))); do # Calculate GPU ID - we'll distribute across available GPUs, starting from after prefill GPUs - #GPU_ID=$(((i + NUM_PREFILL_INSTANCES) % $(get_num_gpus))) + GPU_ID=$(((i + NUM_PREFILL_INSTANCES) % $(get_num_gpus))) # Calculate 
port number (base port + instance number) PORT=$((8800 + i)) # Calculate side channel port @@ -135,7 +136,7 @@ run_tests_for_model() { echo "Starting decode instance $i on GPU $GPU_ID, port $PORT" # Build the command with or without model-specific args - BASE_CMD="RANK=1 UCX_TLS=tcp VLLM_NIXL_SIDE_CHANNEL_PORT=$SIDE_CHANNEL_PORT vllm serve $model_name \ + BASE_CMD="HABANA_VISIBLE_MODULES='4,5,6,7' RANK=$GPU_ID UCX_TLS=tcp VLLM_NIXL_SIDE_CHANNEL_PORT=$SIDE_CHANNEL_PORT vllm serve $model_name \ --port $PORT \ --enforce-eager \ --max_num_batched_tokens 8192 \ diff --git a/examples/nixl/run_benchmark_profile.sh b/examples/nixl/run_benchmark_profile.sh index 171d4b73b..8e79a1419 100644 --- a/examples/nixl/run_benchmark_profile.sh +++ b/examples/nixl/run_benchmark_profile.sh @@ -34,7 +34,7 @@ DECODER_TP_SIZE=${DECODER_TP_SIZE:-1} #GIT_ROOT=$(git rev-parse --show-toplevel) GIT_ROOT="/home/vllm-nixl/vllm" -#SMI_BIN=$(which nvidia-smi || which rocm-smi) +SMI_BIN=$(which nvidia-smi || which rocm-smi || which hl-smi) # Trap the SIGINT signal (triggered by Ctrl+C) trap 'kill $(jobs -pr)' SIGINT SIGTERM EXIT @@ -70,6 +70,8 @@ get_model_args() { get_num_gpus() { if [[ "$SMI_BIN" == *"nvidia"* ]]; then echo "$($SMI_BIN --query-gpu=name --format=csv,noheader | wc -l)" + elif [[ "$SMI_BIN" == *"hl"* ]]; then + echo "$($SMI_BIN -Q index,name -f csv | tail -n +2 | wc -l)" else echo "$($SMI_BIN -l | grep GPU | wc -l)" fi @@ -94,8 +96,7 @@ run_tests_for_model() { # Start prefill instances for i in $(seq 0 $((NUM_PREFILL_INSTANCES-1))); do # Calculate GPU ID - we'll distribute across available GPUs - #GPU_ID=$((i % $(get_num_gpus))) - GPU_ID=2 + GPU_ID=$((i % $(get_num_gpus))) # Calculate port number (base port + instance number) PORT=$((8300)) @@ -105,7 +106,7 @@ run_tests_for_model() { echo "Starting prefill instance $i on GPU $GPU_ID, port $PORT" # Build the command with or without model-specific args - BASE_CMD="RANK=0 UCX_TLS=rc,ud,ib VLLM_NIXL_SIDE_CHANNEL_PORT=$SIDE_CHANNEL_PORT 
vllm serve $model_name \ + BASE_CMD="HABANA_VISIBLE_MODULES='0,1,2,3' RANK=$GPU_ID UCX_TLS=rc,ud,ib VLLM_NIXL_SIDE_CHANNEL_PORT=$SIDE_CHANNEL_PORT vllm serve $model_name \ --port $PORT \ --long_prefill_token_threshold 8192 \ --max_num_batched_tokens 8192 \ @@ -130,7 +131,7 @@ run_tests_for_model() { # Start decode instances for i in $(seq 0 $((NUM_DECODE_INSTANCES-1))); do # Calculate GPU ID - we'll distribute across available GPUs, starting from after prefill GPUs - #GPU_ID=$(((i + NUM_PREFILL_INSTANCES) % $(get_num_gpus))) + GPU_ID=$(((i + NUM_PREFILL_INSTANCES) % $(get_num_gpus))) # Calculate port number (base port + instance number) PORT=$((8400)) # Calculate side channel port @@ -139,7 +140,7 @@ run_tests_for_model() { echo "Starting decode instance $i on GPU $GPU_ID, port $PORT" # Build the command with or without model-specific args - BASE_CMD="RANK=1 UCX_TLS=rc,ud,ib VLLM_NIXL_SIDE_CHANNEL_PORT=$SIDE_CHANNEL_PORT vllm serve $model_name \ + BASE_CMD="HABANA_VISIBLE_MODULES='4,5,6,7' RANK=$GPU_ID UCX_TLS=rc,ud,ib VLLM_NIXL_SIDE_CHANNEL_PORT=$SIDE_CHANNEL_PORT vllm serve $model_name \ --port $PORT \ --gpu-memory-utilization 0.3 \ --tensor-parallel-size $DECODER_TP_SIZE \ diff --git a/examples/nixl/run_benchmark_test.sh b/examples/nixl/run_benchmark_test.sh index 766e09ae4..97b826998 100755 --- a/examples/nixl/run_benchmark_test.sh +++ b/examples/nixl/run_benchmark_test.sh @@ -31,7 +31,7 @@ DECODER_TP_SIZE=${DECODER_TP_SIZE:-1} #GIT_ROOT=$(git rev-parse --show-toplevel) GIT_ROOT="/home/vllm-nixl/vllm" -#SMI_BIN=$(which nvidia-smi || which rocm-smi) +SMI_BIN=$(which nvidia-smi || which rocm-smi || which hl-smi) # Trap the SIGINT signal (triggered by Ctrl+C) trap 'kill $(jobs -pr)' SIGINT SIGTERM EXIT @@ -67,6 +67,8 @@ get_model_args() { get_num_gpus() { if [[ "$SMI_BIN" == *"nvidia"* ]]; then echo "$($SMI_BIN --query-gpu=name --format=csv,noheader | wc -l)" + elif [[ "$SMI_BIN" == *"hl"* ]]; then + echo "$($SMI_BIN -Q index,name -f csv | tail -n +2 | wc -l)" 
else echo "$($SMI_BIN -l | grep GPU | wc -l)" fi @@ -91,8 +93,7 @@ run_tests_for_model() { # Start prefill instances for i in $(seq 0 $((NUM_PREFILL_INSTANCES-1))); do # Calculate GPU ID - we'll distribute across available GPUs - #GPU_ID=$((i % $(get_num_gpus))) - GPU_ID=2 + GPU_ID=$((i % $(get_num_gpus))) # Calculate port number (base port + instance number) PORT=$((8300 + i)) @@ -102,7 +103,7 @@ run_tests_for_model() { echo "Starting prefill instance $i on GPU $GPU_ID, port $PORT" # Build the command with or without model-specific args - BASE_CMD="RANK=0 UCX_TLS=rc,ud,ib VLLM_NIXL_SIDE_CHANNEL_PORT=$SIDE_CHANNEL_PORT vllm serve $model_name \ + BASE_CMD="HABANA_VISIBLE_MODULES='0,1,2,3' RANK=$GPU_ID UCX_TLS=rc,ud,ib VLLM_NIXL_SIDE_CHANNEL_PORT=$SIDE_CHANNEL_PORT vllm serve $model_name \ --port $PORT \ --long_prefill_token_threshold 8192 \ --max_num_batched_tokens 8192 \ @@ -127,7 +128,7 @@ run_tests_for_model() { # Start decode instances for i in $(seq 0 $((NUM_DECODE_INSTANCES-1))); do # Calculate GPU ID - we'll distribute across available GPUs, starting from after prefill GPUs - #GPU_ID=$(((i + NUM_PREFILL_INSTANCES) % $(get_num_gpus))) + GPU_ID=$(((i + NUM_PREFILL_INSTANCES) % $(get_num_gpus))) # Calculate port number (base port + instance number) PORT=$((8400 + i)) # Calculate side channel port @@ -136,7 +137,7 @@ run_tests_for_model() { echo "Starting decode instance $i on GPU $GPU_ID, port $PORT" # Build the command with or without model-specific args - BASE_CMD="RANK=1 UCX_TLS=rc,ud,ib VLLM_NIXL_SIDE_CHANNEL_PORT=$SIDE_CHANNEL_PORT vllm serve $model_name \ + BASE_CMD="HABANA_VISIBLE_MODULES='4,5,6,7' RANK=$GPU_ID UCX_TLS=rc,ud,ib VLLM_NIXL_SIDE_CHANNEL_PORT=$SIDE_CHANNEL_PORT vllm serve $model_name \ --port $PORT \ --gpu-memory-utilization 0.3 \ --tensor-parallel-size $DECODER_TP_SIZE \ diff --git a/examples/nixl/run_hpu_disagg_accuracy_test.sh b/examples/nixl/run_hpu_disagg_accuracy_test.sh index cac822cdb..bde3295c4 100755 --- 
a/examples/nixl/run_hpu_disagg_accuracy_test.sh +++ b/examples/nixl/run_hpu_disagg_accuracy_test.sh @@ -22,7 +22,7 @@ DECODER_TP_SIZE=${DECODER_TP_SIZE:-1} #GIT_ROOT=$(git rev-parse --show-toplevel) GIT_ROOT="/home/vllm-nixl/vllm" -#SMI_BIN=$(which nvidia-smi || which rocm-smi) +SMI_BIN=$(which nvidia-smi || which rocm-smi || which hl-smi) # Trap the SIGINT signal (triggered by Ctrl+C) trap 'kill $(jobs -pr)' SIGINT SIGTERM EXIT @@ -58,6 +58,8 @@ get_model_args() { get_num_gpus() { if [[ "$SMI_BIN" == *"nvidia"* ]]; then echo "$($SMI_BIN --query-gpu=name --format=csv,noheader | wc -l)" + elif [[ "$SMI_BIN" == *"hl"* ]]; then + echo "$($SMI_BIN -Q index,name -f csv | tail -n +2 | wc -l)" else echo "$($SMI_BIN -l | grep GPU | wc -l)" fi @@ -82,8 +84,7 @@ run_tests_for_model() { # Start prefill instances for i in $(seq 0 $((NUM_PREFILL_INSTANCES-1))); do # Calculate GPU ID - we'll distribute across available GPUs - #GPU_ID=$((i % $(get_num_gpus))) - GPU_ID=2 + GPU_ID=$((i % $(get_num_gpus))) # Calculate port number (base port + instance number) PORT=$((8300 + i)) @@ -93,7 +94,7 @@ run_tests_for_model() { echo "Starting prefill instance $i on GPU $GPU_ID, port $PORT" # Build the command with or without model-specific args - BASE_CMD="CUDA_VISIBLE_DEVICES=$GPU_ID VLLM_NIXL_SIDE_CHANNEL_PORT=$SIDE_CHANNEL_PORT vllm serve $model_name \ + BASE_CMD="HABANA_VISIBLE_MODULES='0,1,2,3' RANK=$GPU_ID VLLM_NIXL_SIDE_CHANNEL_PORT=$SIDE_CHANNEL_PORT vllm serve $model_name \ --port $PORT \ --enforce-eager \ --disable-log-requests \ @@ -117,8 +118,7 @@ run_tests_for_model() { # Start decode instances for i in $(seq 0 $((NUM_DECODE_INSTANCES-1))); do # Calculate GPU ID - we'll distribute across available GPUs, starting from after prefill GPUs - #GPU_ID=$(((i + NUM_PREFILL_INSTANCES) % $(get_num_gpus))) - GPU_ID=6 + GPU_ID=$(((i + NUM_PREFILL_INSTANCES) % $(get_num_gpus))) # Calculate port number (base port + instance number) PORT=$((8400 + i)) # Calculate side channel port @@ -127,7 
+127,7 @@ run_tests_for_model() { echo "Starting decode instance $i on GPU $GPU_ID, port $PORT" # Build the command with or without model-specific args - BASE_CMD="CUDA_VISIBLE_DEVICES=$GPU_ID VLLM_NIXL_SIDE_CHANNEL_PORT=$SIDE_CHANNEL_PORT vllm serve $model_name \ + BASE_CMD="HABANA_VISIBLE_MODULES='4,5,6,7' RANK=$GPU_ID VLLM_NIXL_SIDE_CHANNEL_PORT=$SIDE_CHANNEL_PORT vllm serve $model_name \ --port $PORT \ --enforce-eager \ --disable-log-requests \ diff --git a/tests/unit_tests/run_accuracy_test.sh b/tests/unit_tests/run_accuracy_test.sh index 7ab10554b..76220ffb4 100755 --- a/tests/unit_tests/run_accuracy_test.sh +++ b/tests/unit_tests/run_accuracy_test.sh @@ -49,7 +49,7 @@ DECODER_TP_SIZE=${DECODER_TP_SIZE:-2} # Find the git repository root directory GIT_ROOT=$(git rev-parse --show-toplevel) -#SMI_BIN=$(which nvidia-smi || which rocm-smi) +SMI_BIN=$(which nvidia-smi || which rocm-smi || which hl-smi) # Trap the SIGINT signal (triggered by Ctrl+C) trap 'kill $(jobs -pr)' SIGINT SIGTERM EXIT @@ -85,6 +85,8 @@ get_model_args() { get_num_gpus() { if [[ "$SMI_BIN" == *"nvidia"* ]]; then echo "$($SMI_BIN --query-gpu=name --format=csv,noheader | wc -l)" + elif [[ "$SMI_BIN" == *"hl"* ]]; then + echo "$($SMI_BIN -Q index,name -f csv | tail -n +2 | wc -l)" else echo "$($SMI_BIN -l | grep GPU | wc -l)" fi @@ -109,8 +111,7 @@ run_tests_for_model() { # Start prefill instances for i in $(seq 0 $((NUM_PREFILL_INSTANCES-1))); do # Calculate GPU ID - we'll distribute across available GPUs - #GPU_ID=$((i % $(get_num_gpus))) - GPU_ID=2 + GPU_ID=$((i % $(get_num_gpus))) # Calculate port number (base port + instance number) PORT=$((8700 + i)) @@ -120,7 +121,7 @@ run_tests_for_model() { echo "Starting prefill instance $i on GPU $GPU_ID, port $PORT" # Build the command with or without model-specific args - BASE_CMD="RANK=0 UCX_TLS=$UCX_TLS VLLM_NIXL_SIDE_CHANNEL_PORT=$SIDE_CHANNEL_PORT vllm serve $model_name \ + BASE_CMD="HABANA_VISIBLE_MODULES='0,1,2,3' RANK=$GPU_ID UCX_TLS=$UCX_TLS 
VLLM_NIXL_SIDE_CHANNEL_PORT=$SIDE_CHANNEL_PORT vllm serve $model_name \ --port $PORT \ --enforce-eager \ --max_num_batched_tokens 8192 \ @@ -144,7 +145,7 @@ run_tests_for_model() { # Start decode instances for i in $(seq 0 $((NUM_DECODE_INSTANCES-1))); do # Calculate GPU ID - we'll distribute across available GPUs, starting from after prefill GPUs - #GPU_ID=$(((i + NUM_PREFILL_INSTANCES) % $(get_num_gpus))) + GPU_ID=$(((i + NUM_PREFILL_INSTANCES) % $(get_num_gpus))) # Calculate port number (base port + instance number) PORT=$((8800 + i)) # Calculate side channel port @@ -153,7 +154,7 @@ run_tests_for_model() { echo "Starting decode instance $i on GPU $GPU_ID, port $PORT" # Build the command with or without model-specific args - BASE_CMD="RANK=1 UCX_TLS=$UCX_TLS VLLM_NIXL_SIDE_CHANNEL_PORT=$SIDE_CHANNEL_PORT vllm serve $model_name \ + BASE_CMD="HABANA_VISIBLE_MODULES='4,5,6,7' RANK=$GPU_ID UCX_TLS=$UCX_TLS VLLM_NIXL_SIDE_CHANNEL_PORT=$SIDE_CHANNEL_PORT vllm serve $model_name \ --port $PORT \ --enforce-eager \ --max_num_batched_tokens 8192 \
diff --git a/vllm_gaudi/v1/worker/hpu_worker.py b/vllm_gaudi/v1/worker/hpu_worker.py
index ae5705e54..0810f2773 100644
--- a/vllm_gaudi/v1/worker/hpu_worker.py
+++ b/vllm_gaudi/v1/worker/hpu_worker.py
@@ -94,6 +94,70 @@ def __init__(
         self.kv_cache_sleeping = False
         self.kv_cache_config = None
 
+        # Select available Habana modules before initializing the device.
+        self._configure_habana_visible_modules()
+
+    def _configure_habana_visible_modules(self):
+        """Restrict ``HABANA_VISIBLE_MODULES`` to Habana modules that are currently idle.
+
+        A module is idle when hlml reports zero AIP and zero memory utilization. If the
+        variable is unset it is set to every idle module; otherwise it is narrowed to the
+        requested modules that are idle (duplicates and surrounding spaces are ignored).
+
+        Raises:
+            RuntimeError: if no device or idle module is found, the variable is malformed or
+                out of range, or fewer idle modules remain than ``parallel_config.world_size``.
+        """
+        import pyhlml
+        world_size = self.parallel_config.world_size
+        pyhlml.hlmlInit()
+        try:
+            # NOTE(review): hlml indices are assumed to span torch.hpu.device_count() - confirm.
+            device_count = torch.hpu.device_count()
+            if device_count < 1:
+                raise RuntimeError("No Habana devices found.")
+            available_module_ids = set()
+            for i in range(device_count):
+                try:
+                    device = pyhlml.hlmlDeviceGetHandleByIndex(i)
+                    utility = pyhlml.hlmlDeviceGetUtilizationRates(device)
+                    if utility.aip == 0 and utility.memory == 0:
+                        available_module_ids.add(pyhlml.hlmlDeviceGetModuleID(device))
+                except Exception as e:
+                    # Best effort: a device that cannot be queried is treated as unavailable.
+                    logger.debug("Skipping Habana device %d: %s", i, e)
+            if not available_module_ids:
+                raise RuntimeError("No available Habana modules found. All modules are currently in use.")
+            env_visible_modules = os.getenv("HABANA_VISIBLE_MODULES")
+            if env_visible_modules is None:
+                if len(available_module_ids) < world_size:
+                    raise RuntimeError(f"Not enough available modules for world_size={world_size}.")
+                available_modules_str = ",".join(map(str, sorted(available_module_ids)))
+                logger.info("HABANA_VISIBLE_MODULES is not set, using all available modules: %s", available_modules_str)
+                os.environ["HABANA_VISIBLE_MODULES"] = available_modules_str
+                return
+            tokens = [token.strip() for token in env_visible_modules.split(",")]
+            # isascii() keeps int() from seeing Unicode digits that isdigit() alone accepts.
+            if not all(token.isascii() and token.isdigit() for token in tokens):
+                raise RuntimeError(f"Invalid HABANA_VISIBLE_MODULES={env_visible_modules}. "
+                                   "It should be a comma-separated list of integers.")
+            env_module_ids = sorted({int(token) for token in tokens})
+            if env_module_ids[-1] >= device_count:
+                raise RuntimeError(f"Invalid HABANA_VISIBLE_MODULES={env_visible_modules}. "
+                                   f"Module IDs should be between 0 and {device_count - 1}.")
+            selected_modules = [x for x in env_module_ids if x in available_module_ids]
+            if len(selected_modules) < len(env_module_ids):
+                logger.warning("Some device for HABANA_VISIBLE_MODULES=%s are not available.", env_visible_modules)
+            if len(selected_modules) < world_size:
+                raise RuntimeError(
+                    f"Not enough available modules for world_size={world_size}. "
+                    "Set HABANA_VISIBLE_MODULES to include more available modules and try again.")
+            selected_modules_str = ",".join(map(str, selected_modules))
+            os.environ["HABANA_VISIBLE_MODULES"] = selected_modules_str
+            logger.info("Using selected available modules: %s", selected_modules_str)
+        finally:
+            pyhlml.hlmlShutdown()
+
     def init_profiler(self):
         """Initialize the profiler."""
         if envs.VLLM_TORCH_PROFILER_DIR:
@@ -134,7 +198,10 @@ def stop_profile(self):
             self.profiler.stop()
 
     def init_device(self):
+        # Set the device for this worker.
         self.device = torch.device("hpu")
+        torch.hpu.set_device(self.local_rank)
+        # Initialize the distributed environment.
         init_worker_distributed_environment(self.vllm_config, self.rank, self.distributed_init_method, self.local_rank)
         # Set random seed.