# Dryrun implementation for generating command line file #723
The script-generator module gains a `shutil_copy` helper and a `dry_run_dir` parameter on `ScriptGenerator` (hunk `@@ -1,15 +1,49 @@`):

```python
# SPDX-License-Identifier: Apache-2.0
import os
import shutil
import sys
import time
from pathlib import Path


def shutil_copy(source_file, destination_dir):
    try:
        src_path = Path(source_file)
        dst_dir_path = Path(destination_dir)

        dst_path = dst_dir_path / src_path.name

        # Ensure the destination directory exists
        dst_dir_path.mkdir(parents=True, exist_ok=True)

        shutil.copy(src_path, dst_path)
        print(f"[Info] File '{source_file}' saved at '{dst_path}'")

    except FileNotFoundError:
        print(f"Error: The source file '{source_file}' was not found.")
    except PermissionError:
        print(f"Error: Permission denied. Cannot access '{source_file}' or write to '{destination_dir}'.")
    except shutil.SameFileError:
        print("Error: Source and destination files are the same.")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")


class ScriptGenerator:

    def __init__(self,
                 template_script_path,
                 output_script_path,
                 variables,
                 log_dir="logs",
                 dry_run_dir="/local/",
                 varlist_conf_path=None):
        self.template_script_path = template_script_path
        self.varlist_conf_path = varlist_conf_path
        self.output_script_path = output_script_path
        self.variables = variables
        self.log_dir = log_dir
        self.dry_run_dir = dry_run_dir
        self.log_file = os.path.join(
            self.log_dir,
            f"{os.path.splitext(os.path.basename(self.output_script_path))[0]}.log")
```
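As a quick sanity check, the copy helper can be exercised on its own. A minimal sketch using throwaway files (the file name and contents here are illustrative, not from the PR):

```python
import tempfile
from pathlib import Path

# Simulate the dry-run copy: a generated script lands in a host-mounted dir.
with tempfile.TemporaryDirectory() as src_dir, tempfile.TemporaryDirectory() as host_dir:
    script = Path(src_dir) / "vllm-server.sh"
    script.write_text("#!/bin/bash\necho hello\n")
    shutil_copy(str(script), host_dir)  # prints "[Info] File ... saved at ..."
    assert (Path(host_dir) / "vllm-server.sh").exists()
```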
In `create_and_run` (hunk `@@ -56,5 +90,21 @@`), setting `DRY_RUN=1` now copies the generated script to `dry_run_dir` and keeps the container alive until interrupted, instead of exec'ing the script with bash:

```python
    def create_and_run(self):
        ...
        # Run the generated script and redirect output to log file
        print(f"Starting script, logging to {self.log_file}")
        try:
            os.makedirs(self.log_dir, exist_ok=True)
        except Exception:
            print(f"Error: could not create {self.log_dir}.")

        if os.environ.get("DRY_RUN") == '1':
            # Dry run: save the command line file instead of executing it
            shutil_copy(self.output_script_path, self.dry_run_dir)

            print(f"[INFO] This is a dry run to save the command line file {self.output_script_path}.")
            try:
                while True:
                    print("[INFO] Press Ctrl+C to exit.")
                    time.sleep(60)
            except KeyboardInterrupt:
                print("Exiting the DRY_RUN execution.")
                sys.exit(0)
        else:
            os.execvp("bash", ["bash", self.output_script_path])
```
The user guide is extended to document the feature (hunk `@@ -137,6 +137,70 @@`):

This method provides full flexibility over how the vLLM server is executed within the container.
## Dry Run to create vLLM server and client command line

Set the environment variable **DRY_RUN=1**.
When `DRY_RUN` is set to `1`, a copy of the `vllm-server.sh` or `vllm-benchmark.sh` command line file is created on the host machine, without launching the server or the client.

Example - Docker Compose
```bash
MODEL="Qwen/Qwen2.5-14B-Instruct" \
HF_TOKEN="<your huggingface token>" \
DOCKER_IMAGE="vault.habana.ai/gaudi-docker/{{ VERSION }}/ubuntu24.04/habanalabs/vllm-installer-{{ PT_VERSION }}:latest" \
TENSOR_PARALLEL_SIZE=1 \
MAX_MODEL_LEN=2048 \
DRY_RUN=1 \
docker compose up
```
Example - Docker Run

```bash
docker run -it --rm \
-e MODEL=$MODEL \
-e HF_TOKEN=$HF_TOKEN \
-e DRY_RUN=1 \
-e http_proxy=$http_proxy \
-e https_proxy=$https_proxy \
-e no_proxy=$no_proxy \
--cap-add=sys_nice \
--ipc=host \
--runtime=habana \
-e HABANA_VISIBLE_DEVICES=all \
-p 8000:8000 \
-v ${PWD}:/local \
--name vllm-server \
<docker image name>
```

> **Contributor:** DRY_RUN env is missing.
>
> **Author:** Updated the command line.
!!! note
    While launching the vLLM server using the Docker Run command for a dry run, make sure to mount the present working directory as `-v ${PWD}:/local`.
## To save vLLM server and client log files

If the vLLM server is launched using the Docker Compose command, the log files are saved at `vllm-gaudi/.cd/logs/` by default.

If the vLLM server is launched using the Docker Run command, you can save the log files by creating a directory named `logs` and mounting it as `-v ${PWD}/logs:/root/scripts/logs`.
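For instance, reusing the flags from the Docker Run example above, the mount would look like this (a sketch; every flag other than the `logs` mount is taken unchanged from the earlier example):

```bash
mkdir -p logs
docker run -it --rm \
-e MODEL=$MODEL \
-e HF_TOKEN=$HF_TOKEN \
--cap-add=sys_nice \
--ipc=host \
--runtime=habana \
-e HABANA_VISIBLE_DEVICES=all \
-p 8000:8000 \
-v ${PWD}/logs:/root/scripts/logs \
--name vllm-server \
<docker image name>
```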
## To create multiple vLLM services using Docker Compose

Set environment variables **HOST_PORT** and **COMPOSE_PROJECT_NAME**.

Example
```bash
MODEL="Qwen/Qwen2.5-14B-Instruct" \
HF_TOKEN="<your huggingface token>" \
DOCKER_IMAGE="vault.habana.ai/gaudi-docker/{{ VERSION }}/ubuntu24.04/habanalabs/vllm-installer-{{ PT_VERSION }}:latest" \
TENSOR_PARALLEL_SIZE=1 \
MAX_MODEL_LEN=2048 \
HOST_PORT=9000 \
COMPOSE_PROJECT_NAME=serv1 \
docker compose up
```
!!! note
    The default values, when these variables are not set, are `HOST_PORT=8000` and `COMPOSE_PROJECT_NAME=cd`.
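Each service is then reachable on its own host port. Assuming the server exposes vLLM's standard OpenAI-compatible API (an assumption, not stated in this PR), a quick check might look like:

```bash
# Query the service started with HOST_PORT=9000
curl http://localhost:9000/v1/models
```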
## Pinning CPU Cores for Memory Access Coherence

To improve memory-access coherence and release CPUs to other CPU-only workloads, such as vLLM serving with Llama3 8B, you can pin CPU cores based on different CPU Non-Uniform Memory Access (NUMA) nodes using the automatically generated `docker-compose.override.yml` file. The following procedure explains the process.
> **Reviewer:** If we change this to "on-failure", we may not need the dry-run Ctrl+C code.
>
> **Author:** The restart condition "on-failure" is working; tested with a bad-model-name failure. The dry run does not need Ctrl+C anymore.
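For reference, the restart behavior under discussion can also be reproduced with plain Docker (the thread concerns the Compose file's restart condition; this one-liner is only an illustrative equivalent):

```bash
# Retry the container only when it exits with a non-zero status
docker run -d --restart on-failure --name vllm-server <docker image name>
```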