diff --git a/.cd/docker-compose.yml b/.cd/docker-compose.yml
index 292f031af..abef40215 100644
--- a/.cd/docker-compose.yml
+++ b/.cd/docker-compose.yml
@@ -10,20 +10,33 @@ services:
       - HF_TOKEN
       - HABANA_VISIBLE_DEVICES=${HABANA_VISIBLE_DEVICES:-all}
       - PYTHONUNBUFFERED=1
+      - DRY_RUN
+      - ${HOST_PORT:-8000}:8000
+      - http_proxy
+      - https_proxy
+      - no_proxy
     env_file:
       - ./server/server_user.env
     volumes:
       - /mnt/hf_cache:/mnt/hf_cache
+      - ${PWD}/logs:/root/scripts/logs
+      - ${PWD}:/local
     ports:
-      - "8000:8000"
+      - ${HOST_PORT}:8000
     cap_add:
       - SYS_NICE
     ipc: host
     runtime: habana
-    restart: unless-stopped
+    restart: on-failure
     command: ["server", "--config-file", "${VLLM_SERVER_CONFIG_FILE}", "--config-name", "${VLLM_SERVER_CONFIG_NAME}"]
     healthcheck:
-      test: ["CMD", "sh", "-c", "[ -f logs/vllm_server.log ] && grep -q 'Application startup complete' logs/vllm_server.log"]
+      test:
+        [
+          "CMD",
+          "sh",
+          "-c",
+          "if [ \"$DRY_RUN\" = \"1\" ]; then exit 0; else [ -f logs/vllm_server.log ] && grep -q 'Application startup complete' logs/vllm_server.log; fi"
+        ]
       interval: 10s
       timeout: 2s
       retries: 500
@@ -38,10 +51,16 @@
         condition: service_healthy
     environment:
       - MODEL
-      - HF_TOKEN=${HF_TOKEN}
+      - HF_TOKEN
      - PYTHONUNBUFFERED=1
+      - DRY_RUN
+      - ${HOST_PORT:-8000}:8000
+      - http_proxy
+      - https_proxy
+      - no_proxy
     env_file:
       - ./benchmark/benchmark_user.env
     volumes:
-      - /tmp/logs:/root/scripts/logs
+      - ${PWD}/logs:/root/scripts/logs
+      - ${PWD}:/local
     command: ["benchmark", "--config-file", "${VLLM_BENCHMARK_CONFIG_FILE}", "--config-name", "${VLLM_BENCHMARK_CONFIG_NAME}"]
diff --git a/.cd/entrypoints/script_generator.py b/.cd/entrypoints/script_generator.py
index 684d1e9c7..569b68658 100644
--- a/.cd/entrypoints/script_generator.py
+++ b/.cd/entrypoints/script_generator.py
@@ -1,15 +1,48 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import os
+import shutil
+import sys
+from pathlib import Path
+
+
+def shutil_copy(source_file, destination_dir):
+    try:
+        src_path = Path(source_file)
+        dst_dir_path = Path(destination_dir)
+
+        dst_path = dst_dir_path / src_path.name
+
+        # Ensure the destination directory exists
+        dst_dir_path.mkdir(parents=True, exist_ok=True)
+
+        shutil.copy(src_path, dst_path)
+        print(f"[Info] File '{source_file}' saved at '{dst_path}'")
+
+    except FileNotFoundError:
+        print(f"Error: The source file '{source_file}' was not found.")
+    except PermissionError:
+        print(f"Error: Permission denied. Cannot access '{source_file}' or write to '{destination_dir}'.")
+    except shutil.SameFileError:
+        print("Error: Source and destination files are the same.")
+    except Exception as e:
+        print(f"An unexpected error occurred: {e}")
 
 
 class ScriptGenerator:
 
-    def __init__(self, template_script_path, output_script_path, variables, log_dir="logs", varlist_conf_path=None):
+    def __init__(self,
+                 template_script_path,
+                 output_script_path,
+                 variables,
+                 log_dir="logs",
+                 dry_run_dir="/local/",
+                 varlist_conf_path=None):
         self.template_script_path = template_script_path
         self.varlist_conf_path = varlist_conf_path
         self.output_script_path = output_script_path
         self.variables = variables
         self.log_dir = log_dir
+        self.dry_run_dir = dry_run_dir
         self.log_file = os.path.join(self.log_dir,
                                      f"{os.path.splitext(os.path.basename(self.output_script_path))[0]}.log")
@@ -56,5 +89,14 @@ def create_and_run(self):
 
         # Run the generated script and redirect output to log file
         print(f"Starting script, logging to {self.log_file}")
-        os.makedirs(self.log_dir, exist_ok=True)
-        os.execvp("bash", ["bash", self.output_script_path])
+        try:
+            os.makedirs(self.log_dir, exist_ok=True)
+        except Exception:
+            print(f"Error: could not create {self.log_dir}.")
+
+        if os.environ.get("DRY_RUN") == '1':
+            shutil_copy(self.output_script_path, self.dry_run_dir)
+            print(f"[INFO] This is a dry run to save the command line file {self.output_script_path}.")
+            sys.exit(0)
+        else:
+            os.execvp("bash", ["bash", self.output_script_path])
diff --git a/.cd/server/server_output.env b/.cd/server/server_output.env
index 1d288a25f..4532f53f7 100644
--- a/.cd/server/server_output.env
+++ b/.cd/server/server_output.env
@@ -1,11 +1,7 @@
 MODEL
 DTYPE
-DEVICE_NAME
 TENSOR_PARALLEL_SIZE
 MAX_MODEL_LEN
-TOTAL_GPU_MEM
-MODEL_DTYPE
-QUANT_DTYPE
 BLOCK_SIZE
 VLLM_PROMPT_BS_BUCKET_MIN
 VLLM_PROMPT_BS_BUCKET_STEP
@@ -17,40 +13,14 @@ VLLM_PROMPT_SEQ_BUCKET_STEP
 VLLM_PROMPT_CTX_BUCKET_STEP
 VLLM_DECODE_BLOCK_BUCKET_MIN
 VLLM_DECODE_BLOCK_BUCKET_STEP
-NUM_HIDDEN_LAYERS
-HIDDEN_SIZE
-NUM_KEY_VALUE_HEADS
-NUM_ATTENTION_HEADS
-CACHE_DTYPE_BYTES
-LIMIT_MODEL_LEN
 PT_HPU_LAZY_MODE
 VLLM_SKIP_WARMUP
 VLLM_EXPONENTIAL_BUCKETING
 MAX_NUM_BATCHED_TOKENS
 PT_HPU_ENABLE_LAZY_COLLECTIVES
-DEVICE_HPU_MEM
-MODEL_MEM_IN_GB
-USABLE_MEM
 GPU_MEM_UTILIZATION
-KV_CACHE_PER_SEQ
-EST_MAX_NUM_SEQS
-EST_HPU_BLOCKS
-DECODE_BS_RAMP_GRAPHS
-DECODE_BS_STEP_GRAPHS
-DECODE_BLOCK_RAMP_GRAPHS
-DECODE_BLOCK_STEP_GRAPHS
-NUM_DECODE_GRAPHS
-PROMPT_BS_RAMP_GRAPHS
-PROMPT_BS_STEP_GRAPHS
-PROMPT_SEQ_RAMP_GRAPHS
-PROMPT_SEQ_STEP_GRAPHS
-EST_NUM_PROMPT_GRAPHS
-EST_GRAPH_PROMPT_RATIO
 VLLM_GRAPH_PROMPT_RATIO
-DECODE_GRAPH_TARGET_GB
-EST_GRAPH_RESERVE_MEM
 VLLM_GRAPH_RESERVED_MEM
-KV_CACHE_MEM
 MAX_NUM_SEQS
 VLLM_CONTIGUOUS_PA
 VLLM_DEFRAG
diff --git a/docs/getting_started/quickstart/quickstart_configuration.md b/docs/getting_started/quickstart/quickstart_configuration.md
index 77cf4d593..4561df821 100644
--- a/docs/getting_started/quickstart/quickstart_configuration.md
+++ b/docs/getting_started/quickstart/quickstart_configuration.md
@@ -139,6 +139,71 @@ docker run -it --rm \
 
 This method provides full flexibility over how the vLLM server is executed within the container.
 
+## Dry Run to Create the vLLM Server and Client Command Line
+
+Set the environment variable **DRY_RUN=1**.
+Setting DRY_RUN to 1 creates a copy of the vllm-server.sh or vllm-benchmark.sh command line file on the host machine, without launching the server or the client.
+
+Example - Docker Compose
+
+```bash
+MODEL="Qwen/Qwen2.5-14B-Instruct" \
+HF_TOKEN="" \
+DOCKER_IMAGE="vault.habana.ai/gaudi-docker/{{ VERSION }}/ubuntu24.04/habanalabs/vllm-installer-{{ PT_VERSION }}:latest" \
+TENSOR_PARALLEL_SIZE=1 \
+MAX_MODEL_LEN=2048 \
+DRY_RUN=1 \
+docker compose up
+```
+
+Example - Docker Run
+
+```bash
+docker run -it --rm \
+  -e MODEL=$MODEL \
+  -e HF_TOKEN=$HF_TOKEN \
+  -e http_proxy=$http_proxy \
+  -e https_proxy=$https_proxy \
+  -e no_proxy=$no_proxy \
+  --cap-add=sys_nice \
+  --ipc=host \
+  --runtime=habana \
+  -e HABANA_VISIBLE_DEVICES=all \
+  -p 8000:8000 \
+  -e DRY_RUN=1 \
+  -v ${PWD}:/local \
+  --name vllm-server \
+
+```
+
+!!! note
+    When launching the vLLM server with the Docker Run command for a dry run, make sure to mount the current working directory as `-v ${PWD}:/local`.
+
+## Save vLLM Server and Client Log Files
+
+If the vLLM server is launched with the Docker Compose command, the log files are saved under `vllm-gaudi/.cd/logs/` by default.
+
+If the vLLM server is launched with the Docker Run command, you can save the log files by creating a directory named `logs` and mounting it as `-v ${PWD}/logs:/root/scripts/logs`.
+
+## Create Multiple vLLM Services Using Docker Compose
+
+Set the environment variables **HOST_PORT** and **COMPOSE_PROJECT_NAME**.
+Example:
+
+```bash
+MODEL="Qwen/Qwen2.5-14B-Instruct" \
+HF_TOKEN="" \
+DOCKER_IMAGE="vault.habana.ai/gaudi-docker/{{ VERSION }}/ubuntu24.04/habanalabs/vllm-installer-{{ PT_VERSION }}:latest" \
+TENSOR_PARALLEL_SIZE=1 \
+MAX_MODEL_LEN=2048 \
+HOST_PORT=9000 \
+COMPOSE_PROJECT_NAME=serv1 \
+docker compose up
+```
+
+!!! note
+    The default values, when these variables are not set, are `HOST_PORT=8000` and `COMPOSE_PROJECT_NAME=cd`.
+
 ## Pinning CPU Cores for Memory Access Coherence
 
 To improve memory-access coherence and release CPUs to other CPU-only workloads, such as vLLM serving with Llama3 8B, you can pin CPU cores based on different CPU Non-Uniform Memory Access (NUMA) nodes using the automatically generated `docker-compose.override.yml` file. The following procedure explains the process.
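For reference, the log-mount workflow described in the new "Save vLLM Server and Client Log Files" section can be sketched as a Docker Run invocation. This is a minimal illustration only: `<vllm-gaudi-image>` is a placeholder (not a value from this patch), MODEL and HF_TOKEN are assumed to be exported already, and the `/root/scripts/logs` target matches the mount used by the compose file above.

```bash
# Minimal sketch: persist the container's log directory on the host.
# <vllm-gaudi-image> is a placeholder image reference; substitute the image
# used elsewhere in the quickstart.
mkdir -p "${PWD}/logs"
docker run -it --rm \
  -e MODEL=$MODEL \
  -e HF_TOKEN=$HF_TOKEN \
  -e HABANA_VISIBLE_DEVICES=all \
  --cap-add=sys_nice \
  --ipc=host \
  --runtime=habana \
  -p 8000:8000 \
  -v ${PWD}/logs:/root/scripts/logs \
  --name vllm-server \
  <vllm-gaudi-image>
```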