diff --git a/.cd/Dockerfile.rhel.tenc.pytorch.vllm b/.cd/Dockerfile.rhel.tenc.pytorch.vllm index 2794dda34..d3eb34c50 100644 --- a/.cd/Dockerfile.rhel.tenc.pytorch.vllm +++ b/.cd/Dockerfile.rhel.tenc.pytorch.vllm @@ -13,8 +13,9 @@ ARG TORCH_TYPE_SUFFIX FROM ${DOCKER_URL}/${VERSION}/${BASE_NAME}/${REPO_TYPE}/pytorch-${TORCH_TYPE_SUFFIX}installer-${PT_VERSION}:${REVISION} # Parameterize commit/branch for vllm-fork checkout -ARG VLLM_GAUDI_COMMIT=v0.10.1 -ARG VLLM_PROJECT_COMMIT=v0.10.1 +ARG VLLM_GAUDI_COMMIT=main +# leave empty to use last-good-commit-for-vllm-gaudi +ARG VLLM_PROJECT_COMMIT= ARG BASE_NAME ENV BASE_NAME=${BASE_NAME} @@ -39,23 +40,36 @@ ENV VLLM_PATH=/workspace/vllm-project ENV VLLM_PATH2=/workspace/vllm-gaudi # Clone the vllm-project repository and install inside the container -RUN mkdir -p $VLLM_PATH && \ +# --- START: COMBINED RUN COMMAND --- +RUN \ + # Clone vllm-gaudi and get the commit hash for the vllm-project/vllm + set -e && \ + mkdir -p $VLLM_PATH2 && \ + git clone https://github.com/vllm-project/vllm-gaudi.git $VLLM_PATH2 && \ + cd $VLLM_PATH2 && \ + if [ -z "${VLLM_PROJECT_COMMIT}" ]; then \ + VLLM_PROJECT_COMMIT=$(git show "origin/vllm/last-good-commit-for-vllm-gaudi:VLLM_STABLE_COMMIT" 2>/dev/null) && \ + echo "Found vLLM commit hash: ${VLLM_PROJECT_COMMIT}"; \ + else \ + echo "Using vLLM commit : ${VLLM_PROJECT_COMMIT}"; \ + fi && \ + mkdir -p $VLLM_PATH && \ + # Clone vllm-project/vllm and use configured or last good commit hash git clone https://github.com/vllm-project/vllm.git $VLLM_PATH && \ cd $VLLM_PATH && \ git remote add upstream https://github.com/vllm-project/vllm.git && \ git fetch upstream --tags || true && \ git checkout ${VLLM_PROJECT_COMMIT} && \ - bash -c "pip install -r <(sed '/^[torch]/d' requirements/build.txt)" && \ - VLLM_TARGET_DEVICE=empty pip install --no-build-isolation -e . - -# Clone the vllm-gaudi repository and install inside the container -RUN mkdir -p $VLLM_PATH2 && \ - git clone https://github.com/vllm-project/vllm-gaudi.git $VLLM_PATH2 && \ + # Install vllm-project/vllm + bash -c "pip install -r <(sed '/^torch/d' requirements/build.txt)" && \ + VLLM_TARGET_DEVICE=empty pip install --no-build-isolation . && \ + # Install vllm-gaudi plugin cd $VLLM_PATH2 && \ git checkout ${VLLM_GAUDI_COMMIT} && \ - VLLM_TARGET_DEVICE=hpu && pip install -v -e $VLLM_PATH2 + VLLM_TARGET_DEVICE=hpu pip install -v . 
--no-build-isolation +# --- END: COMBINED RUN COMMAND --- - # to be enabled later PWolsza +# to be enabled later PWolsza # RUN pip3 install -v -e $VLLM_PATH/tests/vllm_test_utils # Install additional Python packages @@ -70,4 +84,4 @@ COPY benchmark /root/scripts/benchmark/ WORKDIR /root/scripts # Set entrypoint script -ENTRYPOINT ["python3", "-m", "entrypoints.entrypoint_main"] \ No newline at end of file +ENTRYPOINT ["python3", "-m", "entrypoints.entrypoint_main"] diff --git a/.cd/Dockerfile.ubuntu.pytorch.vllm b/.cd/Dockerfile.ubuntu.pytorch.vllm index 81574d688..ea8d8d4d6 100644 --- a/.cd/Dockerfile.ubuntu.pytorch.vllm +++ b/.cd/Dockerfile.ubuntu.pytorch.vllm @@ -4,7 +4,7 @@ # Parameterize base image components ARG DOCKER_URL=vault.habana.ai/gaudi-docker ARG VERSION=1.22.0 -ARG BASE_NAME=ubuntu22.04 +ARG BASE_NAME=ubuntu24.04 ARG PT_VERSION=2.7.1 ARG REVISION=latest ARG REPO_TYPE=habanalabs @@ -12,9 +12,9 @@ ARG REPO_TYPE=habanalabs FROM ${DOCKER_URL}/${VERSION}/${BASE_NAME}/${REPO_TYPE}/pytorch-installer-${PT_VERSION}:${REVISION} # Parameterize commit/branch for vllm-project & vllm-gaudi checkout -ARG VLLM_GAUDI_COMMIT=v0.10.2_next -ARG VLLM_PROJECT_COMMIT=v0.10.2 - +ARG VLLM_GAUDI_COMMIT=main +# leave empty to use last-good-commit-for-vllm-gaudi +ARG VLLM_PROJECT_COMMIT= ENV OMPI_MCA_btl_vader_single_copy_mechanism=none RUN apt update && \ @@ -30,24 +30,34 @@ RUN echo "dash dash/sh boolean false" | debconf-set-selections && \ ENV ENV=~/.profile # Clone the vllm-project repository and install inside the container - -RUN mkdir -p $VLLM_PATH && \ +# --- START: COMBINED RUN COMMAND --- +RUN \ + # Clone vllm-gaudi and get the commit hash for the vllm-project/vllm + set -e && \ + mkdir -p $VLLM_PATH2 && \ + git clone https://github.com/vllm-project/vllm-gaudi.git $VLLM_PATH2 && \ + cd $VLLM_PATH2 && \ + if [ -z "${VLLM_PROJECT_COMMIT}" ]; then \ + VLLM_PROJECT_COMMIT=$(git show "origin/vllm/last-good-commit-for-vllm-gaudi:VLLM_STABLE_COMMIT" 2>/dev/null) && \ + echo "Found vLLM commit hash: ${VLLM_PROJECT_COMMIT}"; \ + else \ + echo "Using vLLM commit : ${VLLM_PROJECT_COMMIT}"; \ + fi && \ + mkdir -p $VLLM_PATH && \ + # Clone vllm-project/vllm and use configured or last good commit hash git clone https://github.com/vllm-project/vllm.git $VLLM_PATH && \ cd $VLLM_PATH && \ git remote add upstream https://github.com/vllm-project/vllm.git && \ git fetch upstream --tags || true && \ git checkout ${VLLM_PROJECT_COMMIT} && \ - bash -c "pip install -r <(sed '/^[torch]/d' requirements/build.txt)" && \ - VLLM_TARGET_DEVICE=empty pip install --no-build-isolation . - -# Clone the vllm-gaudi repository and install inside the container - -RUN mkdir -p $VLLM_PATH2 && \ - git clone https://github.com/vllm-project/vllm-gaudi.git $VLLM_PATH2 && \ + # Install vllm-project/vllm + bash -c "pip install -r <(sed '/^torch/d' requirements/build.txt)" && \ + VLLM_TARGET_DEVICE=empty pip install --no-build-isolation . && \ + # Install vllm-gaudi plugin cd $VLLM_PATH2 && \ -# Comment: enable if vllm-gaudi release version is used otherwise main - git checkout ${VLLM_GAUDI_COMMIT} && \ - VLLM_TARGET_DEVICE=hpu && pip install -v $VLLM_PATH2 --no-build-isolation + git checkout ${VLLM_GAUDI_COMMIT} && \ + VLLM_TARGET_DEVICE=hpu pip install -v . 
--no-build-isolation +# --- END: COMBINED RUN COMMAND --- # Install additional Python packages RUN pip install datasets && \ diff --git a/.cd/Dockerfile.ubuntu.pytorch.vllm.nixl.latest b/.cd/Dockerfile.ubuntu.pytorch.vllm.nixl.latest index 68feedf0b..b7c589827 100644 --- a/.cd/Dockerfile.ubuntu.pytorch.vllm.nixl.latest +++ b/.cd/Dockerfile.ubuntu.pytorch.vllm.nixl.latest @@ -45,7 +45,7 @@ RUN \ git remote add upstream https://github.com/vllm-project/vllm.git && \ git fetch upstream --tags || true && \ git checkout ${VLLM_COMMIT_HASH} && \ - pip install -r <(sed '/^[torch]/d' requirements/build.txt) && \ + pip install -r <(sed '/^torch/d' requirements/build.txt) && \ VLLM_TARGET_DEVICE=empty pip install --no-build-isolation . && \ \ # Install vllm-gaudi diff --git a/.cd/benchmark/benchmark_defaults.yaml b/.cd/benchmark/benchmark_defaults.yaml index cc2b65b10..0147e628b 100644 --- a/.cd/benchmark/benchmark_defaults.yaml +++ b/.cd/benchmark/benchmark_defaults.yaml @@ -29,12 +29,10 @@ model_text: model_vision: MODELS: - - meta-llama/Llama-3.2-11B-Vision-Instruct - - meta-llama/Llama-3.2-90B-Vision-Instruct - Qwen/Qwen2.5-VL-7B-Instruct DATASET: lmarena-ai/vision-arena-bench-v0.1 DATASET_NAME: hf BACKEND: openai-chat ENDPOINT: /v1/chat/completions CONCURRENT_REQ: 64 - NUM_PROMPTS: 500 \ No newline at end of file + NUM_PROMPTS: 500 diff --git a/.cd/benchmark/benchmark_scenarios_vision.yaml b/.cd/benchmark/benchmark_scenarios_vision.yaml index b9e438cf5..8e00db022 100644 --- a/.cd/benchmark/benchmark_scenarios_vision.yaml +++ b/.cd/benchmark/benchmark_scenarios_vision.yaml @@ -1,8 +1,2 @@ -llama32-11B-Vision-Instruct: - MODEL: meta-llama/Llama-3.2-11B-Vision-Instruct - -llama32-90B-Vision-Instruct: - MODEL: meta-llama/Llama-3.2-90B-Vision-Instruct - qwen2.5-vl-7b-instruct: MODEL: Qwen/Qwen2.5-VL-7B-Instruct diff --git a/.cd/entrypoints/entrypoint_main.py b/.cd/entrypoints/entrypoint_main.py index c107414a7..babfce32a 100644 --- a/.cd/entrypoints/entrypoint_main.py +++ b/.cd/entrypoints/entrypoint_main.py @@ -190,6 +190,7 @@ def run(self): output_script_path="vllm_server.sh", variables=variables, log_dir="logs", + varlist_conf_path="server/server_output.env", ).create_and_run() elif self.mode == "benchmark": print("[INFO] Starting container in benchmark mode.") diff --git a/.cd/entrypoints/script_generator.py b/.cd/entrypoints/script_generator.py index e48062d22..684d1e9c7 100644 --- a/.cd/entrypoints/script_generator.py +++ b/.cd/entrypoints/script_generator.py @@ -4,8 +4,9 @@ class ScriptGenerator: - def __init__(self, template_script_path, output_script_path, variables, log_dir="logs"): + def __init__(self, template_script_path, output_script_path, variables, log_dir="logs", varlist_conf_path=None): self.template_script_path = template_script_path + self.varlist_conf_path = varlist_conf_path self.output_script_path = output_script_path self.variables = variables self.log_dir = log_dir @@ -19,7 +20,16 @@ def generate_script(self, vars_dict): """ with open(self.template_script_path) as f: template = f.read() - export_lines = "\n".join([f"export {k}={v}" for k, v in vars_dict.items()]) + # Create our output list + if self.varlist_conf_path: + output_dict = {} + with open(self.varlist_conf_path) as var_file: + for line in var_file: + param = line.strip() + output_dict[param] = vars_dict[param] + export_lines = "\n".join([f"export {k}={v}" for k, v in output_dict.items()]) + else: + export_lines = "\n".join([f"export {k}={v}" for k, v in vars_dict.items()]) script_content = 
template.replace("#@VARS", export_lines) with open(self.output_script_path, 'w') as f: f.write(script_content) diff --git a/.cd/server/server_output.env b/.cd/server/server_output.env new file mode 100644 index 000000000..dccdef0ae --- /dev/null +++ b/.cd/server/server_output.env @@ -0,0 +1,60 @@ +MODEL +DTYPE +DEVICE_NAME +TENSOR_PARALLEL_SIZE +MAX_MODEL_LEN +TOTAL_GPU_MEM +MODEL_DTYPE +QUANT_DTYPE +BLOCK_SIZE +VLLM_PROMPT_BS_BUCKET_MIN +VLLM_PROMPT_BS_BUCKET_STEP +VLLM_DECODE_BS_BUCKET_MIN +VLLM_DECODE_BS_BUCKET_STEP +VLLM_PROMPT_SEQ_BUCKET_MIN +VLLM_PROMPT_SEQ_BUCKET_STEP +VLLM_DECODE_BLOCK_BUCKET_MIN +VLLM_DECODE_BLOCK_BUCKET_STEP +MAX_NUM_PREFILL_SEQS +NUM_HIDDEN_LAYERS +HIDDEN_SIZE +NUM_KEY_VALUE_HEADS +NUM_ATTENTION_HEADS +CACHE_DTYPE_BYTES +LIMIT_MODEL_LEN +PT_HPU_LAZY_MODE +VLLM_DELAYED_SAMPLING +VLLM_SKIP_WARMUP +EXPERIMENTAL_WEIGHT_SHARING +VLLM_EXPONENTIAL_BUCKETING +MAX_NUM_BATCHED_TOKENS +PT_HPU_ENABLE_LAZY_COLLECTIVES +DEVICE_HPU_MEM +MODEL_MEM_IN_GB +USABLE_MEM +GPU_MEM_UTILIZATION +KV_CACHE_PER_SEQ +EST_MAX_NUM_SEQS +EST_HPU_BLOCKS +DECODE_BS_RAMP_GRAPHS +DECODE_BS_STEP_GRAPHS +DECODE_BLOCK_RAMP_GRAPHS +DECODE_BLOCK_STEP_GRAPHS +NUM_DECODE_GRAPHS +PROMPT_BS_RAMP_GRAPHS +PROMPT_BS_STEP_GRAPHS +PROMPT_SEQ_RAMP_GRAPHS +PROMPT_SEQ_STEP_GRAPHS +EST_NUM_PROMPT_GRAPHS +EST_GRAPH_PROMPT_RATIO +VLLM_GRAPH_PROMPT_RATIO +DECODE_GRAPH_TARGET_GB +EST_GRAPH_RESERVE_MEM +VLLM_GRAPH_RESERVED_MEM +KV_CACHE_MEM +MAX_NUM_SEQS +VLLM_PROMPT_SEQ_BUCKET_MAX +VLLM_CONTIGUOUS_PA +VLLM_DEFRAG +ASYNC_SCHEDULING +VLLM_WEIGHT_LOAD_FORCE_SYNC diff --git a/.cd/server/server_user.env b/.cd/server/server_user.env index 8d1664272..3dd52ba00 100644 --- a/.cd/server/server_user.env +++ b/.cd/server/server_user.env @@ -10,3 +10,4 @@ MAX_NUM_SEQS TENSOR_PARALLEL_SIZE VLLM_EXPONENTIAL_BUCKETING GPU_MEM_UTILIZATION +ASYNC_SCHEDULING diff --git a/.cd/server/settings_vllm.csv b/.cd/server/settings_vllm.csv index b616d0e49..00d2e6b47 100644 --- a/.cd/server/settings_vllm.csv +++ b/.cd/server/settings_vllm.csv @@ -1,21 +1,19 @@ -MODEL,TENSOR_PARALLEL_SIZE,MAX_MODEL_LEN,TOTAL_GPU_MEM,UNAVAILABLE_MEM_ABS,MODEL_MEM_FROM_CONFIG,MODEL_DTYPE,QUANT_DTYPE,MODEL_MEM,PROFILER_MEM_OVERHEAD,APPROX_MEM_PER_GRAPH_MB,fsdpa,GPU_FREE_MEM_TARGET,BLOCK_SIZE,VLLM_PROMPT_BS_BUCKET_MIN,VLLM_PROMPT_BS_BUCKET_STEP,VLLM_DECODE_BS_BUCKET_MIN,VLLM_DECODE_BS_BUCKET_STEP,VLLM_PROMPT_SEQ_BUCKET_MIN,VLLM_PROMPT_SEQ_BUCKET_STEP,VLLM_DECODE_BLOCK_BUCKET_MIN,VLLM_DECODE_BLOCK_BUCKET_STEP,MAX_NUM_PREFILL_SEQS,NUM_HIDDEN_LAYERS,HIDDEN_SIZE,NUM_KEY_VALUE_HEADS,NUM_ATTENTION_HEADS,CACHE_DTYPE_BYTES,LIMIT_MODEL_LEN,PT_HPU_LAZY_MODE,VLLM_DELAYED_SAMPLING,VLLM_SKIP_WARMUP,EXPERIMENTAL_WEIGHT_SHARING,VLLM_EXPONENTIAL_BUCKETING,MAX_NUM_BATCHED_TOKENS -meta-llama/Llama-3.1-8B-Instruct,1,4352,128,2,16060522496,2,2,14.95752716,5.5,10,1,1,128,1,32,1,32,128,256,128,256,1,32,4096,8,32,2,131072,1,TRUE,FALSE,0,FALSE,2048 -meta-llama/Llama-3.1-70B-Instruct,4,4352,512,2,1.41107E+11,2,2,131.4165192,5.5,20,1,1,128,1,32,1,32,128,256,128,256,1,80,8192,8,64,2,131072,1,TRUE,FALSE,0,FALSE,2048 -meta-llama/Llama-3.3-70B-Instruct,4,4352,512,2,1.41107E+11,2,2,131.4165192,5.5,20,1,1,128,1,32,1,32,128,256,128,256,1,80,8192,8,64,2,131072,1,TRUE,FALSE,0,FALSE,2048 -meta-llama/Llama-3.2-1B-Instruct,1,4352,128,2,2471645608,2,2,2.301899351,5.5,10,1,1,128,1,32,1,32,128,256,128,256,1,16,2048,8,32,2,131072,1,TRUE,FALSE,0,FALSE,2048 
-meta-llama/Llama-3.2-3B-Instruct,1,4352,128,2,6425499648,2,2,5.984212875,5.5,10,1,1,128,1,32,1,32,128,256,128,256,1,28,3072,8,24,2,131072,1,TRUE,FALSE,0,FALSE,2048 -mistralai/Mixtral-8x7B-Instruct-v0.1,2,4352,256,2,93405585408,2,2,86.99073029,5.5,10,1,1,128,1,32,1,32,128,256,128,256,1,32,4096,8,32,2,32768,1,TRUE,FALSE,0,FALSE,2048 -mistralai/Mixtral-8x22B-Instruct-v0.1,4,4352,512,2,2.8126E+11,2,2,261.9439201,5.5,10,1,1,128,1,32,1,32,128,256,128,256,1,56,6144,8,48,2,65536,1,TRUE,FALSE,0,FALSE,2048 -mistralai/Mistral-7B-Instruct-v0.2,1,4352,128,2,14483464192,2,2,13.48877716,5.5,10,1,1,128,1,32,1,32,128,256,128,256,1,32,4096,8,32,2,32768,1,TRUE,FALSE,0,FALSE,2048 -meta-llama/Llama-3.1-405B-Instruct,8,4352,1024,2,8.11707E+11,2,2,755.9608459,5.5,10,1,1,128,1,32,1,32,128,256,128,256,1,126,16384,8,128,2,131072,1,TRUE,FALSE,0,FALSE,2048 -Qwen/Qwen2.5-14B-Instruct,1,4352,128,2,29540067328,2,2,27.51133156,5.5,10,0,3,128,1,32,1,32,128,256,128,256,1,48,5120,8,40,2,32768,1,TRUE,FALSE,0,FALSE,2048 -deepseek-ai/DeepSeek-R1-Distill-Llama-70B,4,4352,512,2,1.41107E+11,2,2,131.4165192,5.5,20,1,1,128,1,32,1,32,128,256,128,256,1,80,8192,8,64,2,131072,1,TRUE,FALSE,0,FALSE,2048 -Qwen/Qwen2.5-32B-Instruct,1,4352,128,2,65527752704,2,2,61.02747536,5.5,10,1,1,128,1,32,1,32,128,256,128,256,1,64,5120,8,40,2,32768,1,TRUE,FALSE,0,FALSE,2048 -Qwen/Qwen2.5-72B-Instruct,4,4352,512,2,1.45412E+11,2,2,135.4258575,5.5,10,0,3,128,1,32,1,32,128,256,128,256,1,80,8192,8,64,2,32768,1,TRUE,FALSE,0,FALSE,2048 -Qwen/Qwen2.5-7B-Instruct,1,4352,128,2,15231233024,2,2,14.18519115,5.5,10,0,3,128,1,32,1,32,128,256,128,256,1,28,3584,4,28,2,32768,1,TRUE,FALSE,0,FALSE,2048 -Qwen/Qwen2.5-32B-Instruct,1,4352,128,2,65527752704,2,2,61.02747536,5.5,10,0,3,128,1,32,1,32,128,256,128,256,1,64,5120,8,40,2,32768,1,TRUE,FALSE,0,FALSE,2048 -meta-llama/Llama-3.2-11B-Vision-Instruct,1,8448,128,2,21340441670,2,2,19.87483507,5.5,10,0,3,128,1,32,1,32,128,256,128,256,1,40,4096,8,32,2,131072,1,TRUE,FALSE,0,FALSE,2048 -meta-llama/Llama-3.2-90B-Vision-Instruct,4,8448,512,2,177186710646,2,2,165.0179835,5.5,10,0,3,128,1,32,1,32,128,256,128,256,1,100,8192,8,64,2,131072,1,TRUE,FALSE,0,FALSE,2048 -ibm-granite/granite-8b-code-instruct-4k,1,2048,128,2,21474836480,2,2,20.00000000,5.5,10,0,3,128,1,32,1,32,128,256,128,256,1,64,5120,8,40,2,32768,1,TRUE,FALSE,0,FALSE,2048 -ibm-granite/granite-20b-code-instruct-8k,1,2048,128,2,53687091200,2,2,48.00000000,5.5,10,0,3,128,1,32,1,32,128,256,128,256,1,80,8192,16,80,2,65536,1,TRUE,FALSE,0,FALSE,2048 -Qwen/Qwen2.5-VL-7B-Instruct,1,8448,128,2,15231233024,2,2,14.18519115,5.5,10,0,3,128,1,32,1,32,128,256,128,256,1,28,3584,4,28,2,32768,1,TRUE,FALSE,0,FALSE,2048 +MODEL,TENSOR_PARALLEL_SIZE,MAX_MODEL_LEN,TOTAL_GPU_MEM,UNAVAILABLE_MEM_ABS,MODEL_MEM_FROM_CONFIG,MODEL_DTYPE,QUANT_DTYPE,MODEL_MEM,PROFILER_MEM_OVERHEAD,APPROX_MEM_PER_GRAPH_MB,fsdpa,GPU_FREE_MEM_TARGET,BLOCK_SIZE,VLLM_PROMPT_BS_BUCKET_MIN,VLLM_PROMPT_BS_BUCKET_STEP,VLLM_DECODE_BS_BUCKET_MIN,VLLM_DECODE_BS_BUCKET_STEP,VLLM_PROMPT_SEQ_BUCKET_MIN,VLLM_PROMPT_SEQ_BUCKET_STEP,VLLM_DECODE_BLOCK_BUCKET_MIN,VLLM_DECODE_BLOCK_BUCKET_STEP,MAX_NUM_PREFILL_SEQS,NUM_HIDDEN_LAYERS,HIDDEN_SIZE,NUM_KEY_VALUE_HEADS,NUM_ATTENTION_HEADS,CACHE_DTYPE_BYTES,LIMIT_MODEL_LEN,PT_HPU_LAZY_MODE,VLLM_DELAYED_SAMPLING,VLLM_SKIP_WARMUP,EXPERIMENTAL_WEIGHT_SHARING,VLLM_EXPONENTIAL_BUCKETING,MAX_NUM_BATCHED_TOKENS,VLLM_CONTIGUOUS_PA,VLLM_DEFRAG,ASYNC_SCHEDULING,VLLM_WEIGHT_LOAD_FORCE_SYNC 
+meta-llama/Llama-3.1-8B-Instruct,1,4352,128,2,16060522496,2,2,14.95752716,5.5,10,1,1,128,1,32,1,32,128,256,128,256,1,32,4096,8,32,2,131072,1,TRUE,FALSE,0,FALSE,2048,true,true,0,0 +meta-llama/Llama-3.1-70B-Instruct,4,4352,512,2,1.41107E+11,2,2,131.4165192,5.5,20,1,1,128,1,32,1,32,128,256,128,256,1,80,8192,8,64,2,131072,1,TRUE,FALSE,0,FALSE,2048,true,true,0,0 +meta-llama/Llama-3.3-70B-Instruct,4,4352,512,2,1.41107E+11,2,2,131.4165192,5.5,20,1,1,128,1,32,1,32,128,256,128,256,1,80,8192,8,64,2,131072,1,TRUE,FALSE,0,FALSE,2048,true,true,0,0 +meta-llama/Llama-3.2-1B-Instruct,1,4352,128,2,2471645608,2,2,2.301899351,5.5,10,1,1,128,1,32,1,32,128,256,128,256,1,16,2048,8,32,2,131072,1,TRUE,FALSE,0,FALSE,2048,true,true,0,0 +meta-llama/Llama-3.2-3B-Instruct,1,4352,128,2,6425499648,2,2,5.984212875,5.5,10,1,1,128,1,32,1,32,128,256,128,256,1,28,3072,8,24,2,131072,1,TRUE,FALSE,0,FALSE,2048,true,true,0,0 +mistralai/Mixtral-8x7B-Instruct-v0.1,2,4352,256,2,93405585408,2,2,86.99073029,5.5,10,1,1,128,1,32,1,32,128,256,128,256,1,32,4096,8,32,2,32768,1,TRUE,FALSE,0,FALSE,2048,false,false,0,0 +mistralai/Mixtral-8x22B-Instruct-v0.1,4,4352,512,2,2.8126E+11,2,2,261.9439201,5.5,10,1,1,128,1,32,1,32,128,256,128,256,1,56,6144,8,48,2,65536,1,TRUE,FALSE,0,FALSE,2048,false,false,0,0 +mistralai/Mistral-7B-Instruct-v0.2,1,4352,128,2,14483464192,2,2,13.48877716,5.5,10,1,1,128,1,32,1,32,128,256,128,256,1,32,4096,8,32,2,32768,1,TRUE,FALSE,0,FALSE,2048,false,false,0,0 +meta-llama/Llama-3.1-405B-Instruct,8,4352,1024,2,8.11707E+11,2,2,755.9608459,5.5,20,1,1,128,1,32,1,32,128,256,128,256,1,126,16384,8,128,2,131072,1,TRUE,FALSE,0,FALSE,2048,true,true,0,1 +Qwen/Qwen2.5-14B-Instruct,1,4352,128,2,29540067328,2,2,27.51133156,5.5,10,0,3,128,1,32,1,32,128,256,128,256,1,48,5120,8,40,2,32768,1,TRUE,FALSE,0,FALSE,2048,false,false,0,0 +deepseek-ai/DeepSeek-R1-Distill-Llama-70B,4,4352,512,2,1.41107E+11,2,2,131.4165192,5.5,20,1,1,128,1,32,1,32,128,256,128,256,1,80,8192,8,64,2,131072,1,TRUE,FALSE,0,FALSE,2048,true,true,0,0 +Qwen/Qwen2.5-32B-Instruct,1,4352,128,2,65527752704,2,2,61.02747536,5.5,10,1,1,128,1,32,1,32,128,256,128,256,1,64,5120,8,40,2,32768,1,TRUE,FALSE,0,FALSE,2048,false,false,0,0 +Qwen/Qwen2.5-72B-Instruct,4,4352,512,2,1.45412E+11,2,2,135.4258575,5.5,10,0,3,128,1,32,1,32,128,256,128,256,1,80,8192,8,64,2,32768,1,TRUE,FALSE,0,FALSE,2048,false,false,0,0 +Qwen/Qwen2.5-7B-Instruct,1,4352,128,2,15231233024,2,2,14.18519115,5.5,10,0,3,128,1,32,1,32,128,256,128,256,1,28,3584,4,28,2,32768,1,TRUE,FALSE,0,FALSE,2048,false,false,0,0 +Qwen/Qwen2.5-32B-Instruct,1,4352,128,2,65527752704,2,2,61.02747536,5.5,10,0,3,128,1,32,1,32,128,256,128,256,1,64,5120,8,40,2,32768,1,TRUE,FALSE,0,FALSE,2048,false,false,0,0 +ibm-granite/granite-8b-code-instruct-4k,1,4096,128,2,21474836480,2,2,20.00000000,5.5,10,0,3,128,1,32,1,32,128,256,128,256,1,36,4096,8,32,2,32768,1,TRUE,FALSE,0,FALSE,2048,true,true,0,0 +ibm-granite/granite-20b-code-instruct-8k,1,4352,128,2,53687091200,2,2,48.00000000,5.5,10,0,3,128,1,32,1,32,128,256,128,256,1,52,6144,1,48,2,65536,1,TRUE,FALSE,0,FALSE,2048,true,true,0,0 +Qwen/Qwen2.5-VL-7B-Instruct,1,8448,128,2,15231233024,2,2,14.18519115,5.5,10,0,3,128,1,32,1,32,128,256,128,256,1,28,3584,4,28,2,32768,1,TRUE,FALSE,0,FALSE,2048,false,false,0,0 diff --git a/.cd/server/vllm_autocalc_rules.py b/.cd/server/vllm_autocalc_rules.py index 5841b670a..30290edb8 100644 --- a/.cd/server/vllm_autocalc_rules.py +++ b/.cd/server/vllm_autocalc_rules.py @@ -82,8 +82,12 @@ def calc_DECODE_BLOCK_STEP_GRAPHS(ctx): def calc_NUM_DECODE_GRAPHS(ctx): # 3d update - 
return ((ctx['DECODE_BS_RAMP_GRAPHS'] + ctx['DECODE_BS_STEP_GRAPHS']) * - (ctx['DECODE_BLOCK_RAMP_GRAPHS'] + ctx['DECODE_BLOCK_STEP_GRAPHS'])) / 2 + decode_graphs = ((ctx['DECODE_BS_RAMP_GRAPHS'] + ctx['DECODE_BS_STEP_GRAPHS']) * + (ctx['DECODE_BLOCK_RAMP_GRAPHS'] + ctx['DECODE_BLOCK_STEP_GRAPHS'])) + if ctx['VLLM_CONTIGUOUS_PA']: + return decode_graphs + else: + return decode_graphs / 2 def calc_PROMPT_BS_RAMP_GRAPHS(ctx): diff --git a/.cd/templates/template_vllm_benchmark.sh b/.cd/templates/template_vllm_benchmark.sh index 3af3e3f3d..19b23d215 100644 --- a/.cd/templates/template_vllm_benchmark.sh +++ b/.cd/templates/template_vllm_benchmark.sh @@ -3,7 +3,7 @@ #@VARS # Wait for vLLM server to be ready -until curl -s http://localhost:8000${ENDPOINT} > /dev/null; do +until curl -s http://localhost:8000/v1/models > /dev/null; do echo "Waiting for vLLM server to be ready..." sleep 15 done @@ -35,4 +35,4 @@ vllm bench serve \ --metric-percentiles 90 \ --ignore-eos \ --trust-remote-code \ -2>&1 | tee -a logs/perftest_inp${INPUT_TOK}_out${OUTPUT_TOK}_user${CONCURRENT_REQ}.log \ No newline at end of file +2>&1 | tee -a logs/perftest_inp${INPUT_TOK}_out${OUTPUT_TOK}_user${CONCURRENT_REQ}.log diff --git a/.cd/templates/template_vllm_server.sh b/.cd/templates/template_vllm_server.sh index c28cd3ed5..b6db4e8cd 100644 --- a/.cd/templates/template_vllm_server.sh +++ b/.cd/templates/template_vllm_server.sh @@ -2,6 +2,10 @@ #@VARS +if [ $ASYNC_SCHEDULING -gt 0 ]; then # Checks if using async scheduling + EXTRA_ARGS+=" --async_scheduling" +fi + ## Start server vllm serve $MODEL \ --block-size $BLOCK_SIZE \ @@ -11,5 +15,7 @@ vllm serve $MODEL \ --max-model-len $MAX_MODEL_LEN \ --gpu-memory-utilization $GPU_MEM_UTILIZATION \ --max-num-seqs $MAX_NUM_SEQS \ - --disable-log-requests \ + --generation-config vllm \ + --max_num_batched_tokens $MAX_NUM_BATCHED_TOKENS \ + --disable-log-requests ${EXTRA_ARGS} \ 2>&1 | tee -a logs/vllm_server.log diff --git a/.cd/tests/test_vllm_autocalc_rules.py b/.cd/tests/test_vllm_autocalc_rules.py index d14a07bc6..17a504e16 100644 --- a/.cd/tests/test_vllm_autocalc_rules.py +++ b/.cd/tests/test_vllm_autocalc_rules.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +import pytest import math import server.vllm_autocalc_rules as rules @@ -110,14 +111,16 @@ def test_calc_DECODE_BLOCK_STEP_GRAPHS(): assert rules.calc_DECODE_BLOCK_STEP_GRAPHS(ctx) == expected -def test_calc_NUM_DECODE_GRAPHS(): +@pytest.mark.parametrize("cpa", ["true", "false"]) +def test_calc_NUM_DECODE_GRAPHS(cpa): ctx = { 'DECODE_BS_RAMP_GRAPHS': 2, 'DECODE_BS_STEP_GRAPHS': 3, 'DECODE_BLOCK_RAMP_GRAPHS': 4, - 'DECODE_BLOCK_STEP_GRAPHS': 5 + 'DECODE_BLOCK_STEP_GRAPHS': 5, + 'VLLM_CONTIGUOUS_PA': cpa } - expected = ((2 + 3) * (4 + 5)) / 2 + expected = (2 + 3) * (4 + 5) if cpa else (2 + 3) * (4 + 5) / 2 assert rules.calc_NUM_DECODE_GRAPHS(ctx) == expected diff --git a/.github/actionlint.yaml b/.github/actionlint.yaml index 12c9134b7..e17f4f422 100644 --- a/.github/actionlint.yaml +++ b/.github/actionlint.yaml @@ -2,3 +2,4 @@ self-hosted-runner: labels: - ucb-vllm-cicd-g2 - hourly-ci + - pr-ci diff --git a/.github/workflows/create-release-branch.yaml b/.github/workflows/create-release-branch.yaml index 9941b11c8..cd40773ef 100644 --- a/.github/workflows/create-release-branch.yaml +++ b/.github/workflows/create-release-branch.yaml @@ -120,9 +120,27 @@ jobs: echo "branch_name=pre_${{ github.event.inputs.branch_name }}" >> "$GITHUB_OUTPUT" fi - setup_and_build: - runs-on: hourly-ci + # --- NEW JOB --- + # This job 
runs after prep, picks one 'hourly-ci' runner, + # and outputs its name so all other test jobs can target it. + discover_runner: + name: "Discover Self-Hosted Runner" needs: [prepare-release-branch] + runs-on: hourly-ci + outputs: + runner_name: ${{ steps.get_name.outputs.name }} + steps: + - name: Get runner name + id: get_name + run: | + echo "This workflow will run on: ${{ runner.name }}" + echo "name=${{ runner.name }}" >> "$GITHUB_OUTPUT" + + setup_and_build: + # --- UPDATED: Add discover_runner dependency --- + needs: [prepare-release-branch, discover_runner] + # --- UPDATED: Run on the specific node --- + runs-on: ${{ needs.discover_runner.outputs.runner_name }} steps: - name: "Checkout the release branch" uses: actions/checkout@v4 @@ -168,8 +186,10 @@ jobs: echo "Docker image built successfully." run_unit_tests: - needs: [prepare-release-branch, setup_and_build] - runs-on: hourly-ci + # --- UPDATED: Add discover_runner dependency --- + needs: [prepare-release-branch, setup_and_build, discover_runner] + # --- UPDATED: Run on the specific node --- + runs-on: ${{ needs.discover_runner.outputs.runner_name }} steps: - name: Run pytest in tests/unit_tests run: | @@ -191,8 +211,10 @@ jobs: echo "Test script exited with code: $EXITCODE" discover_tests: - runs-on: hourly-ci - needs: [prepare-release-branch] + # --- UPDATED: Add discover_runner dependency --- + needs: [prepare-release-branch, discover_runner] + # --- UPDATED: Run on the specific node --- + runs-on: ${{ needs.discover_runner.outputs.runner_name }} outputs: matrix: ${{ steps.set-matrix.outputs.matrix }} steps: @@ -204,9 +226,9 @@ jobs: id: set-matrix run: | TEST_FUNCTIONS=$( grep '^run_' ./tests/full_tests/ci_gsm8k_tests.sh | \ - awk '{print $1}' | \ - sed 's/()//' | \ - jq -R . | jq -s -c . ) + awk '{print $1}' | \ + sed 's/()//' | \ + jq -R . | jq -s -c . 
) echo "Discovered test matrix: $TEST_FUNCTIONS" if [ "$TEST_FUNCTIONS" = "[]" ]; then @@ -216,8 +238,10 @@ jobs: echo "matrix=$TEST_FUNCTIONS" >> "$GITHUB_OUTPUT" e2e: - needs: [prepare-release-branch, setup_and_build, discover_tests] - runs-on: hourly-ci + # --- UPDATED: Add discover_runner dependency --- + needs: [prepare-release-branch, setup_and_build, discover_tests, discover_runner] + # --- UPDATED: Run on the specific node --- + runs-on: ${{ needs.discover_runner.outputs.runner_name }} strategy: fail-fast: false matrix: @@ -245,8 +269,10 @@ jobs: echo "Test script exited with code: $EXITCODE" run_data_parallel_test: - needs: [prepare-release-branch, setup_and_build] - runs-on: hourly-ci + # --- UPDATED: Add discover_runner dependency --- + needs: [prepare-release-branch, setup_and_build, discover_runner] + # --- UPDATED: Run on the specific node --- + runs-on: ${{ needs.discover_runner.outputs.runner_name }} steps: - name: Run Data Parallel test run: | @@ -271,8 +297,10 @@ jobs: echo "Test script exited with code: $EXITCODE" run_pd_disaggregate_test: - needs: [prepare-release-branch, setup_and_build] - runs-on: hourly-ci + # --- UPDATED: Add discover_runner dependency --- + needs: [prepare-release-branch, setup_and_build, discover_runner] + # --- UPDATED: Run on the specific node --- + runs-on: ${{ needs.discover_runner.outputs.runner_name }} steps: - name: Run PD disaggregate test run: | @@ -298,8 +326,10 @@ jobs: echo "Test script exited with code: $EXITCODE" run_hpu_perf_tests: - needs: [prepare-release-branch, setup_and_build] - runs-on: hourly-ci + # --- UPDATED: Add discover_runner dependency --- + needs: [prepare-release-branch, setup_and_build, discover_runner] + # --- UPDATED: Run on the specific node --- + runs-on: ${{ needs.discover_runner.outputs.runner_name }} steps: - name: Run Sharegpt performance tests with warmup run: | @@ -324,6 +354,8 @@ jobs: summarize_and_notify: name: "Summarize Test Results and Notify" runs-on: ubuntu-latest + # --- This job runs on ubuntu-latest, so no runner change is needed --- + # It will correctly wait for all the test jobs to finish if: needs.prepare-release-branch.result == 'success' needs: - prepare-release-branch diff --git a/.github/workflows/hourly-ci.yaml b/.github/workflows/hourly-ci.yaml index 2131cc8f2..4805c5de2 100644 --- a/.github/workflows/hourly-ci.yaml +++ b/.github/workflows/hourly-ci.yaml @@ -13,13 +13,29 @@ on: workflow_dispatch: {} jobs: - # JOB 1: Sets up the environment and builds the Docker image needed for all tests. + # JOB 1: (NEW) Discovers an available runner and locks it for all subsequent jobs. + discover_runner: + runs-on: hourly-ci # Picks any available runner from the 'hourly-ci' pool + outputs: + runner_name: ${{ steps.get_name.outputs.name }} + steps: + - name: Get runner name + id: get_name + # This command gets the unique name of the runner (e.g., "my-runner-123") + # and saves it as an output variable + run: | + echo "This workflow will run on: ${{ runner.name }}" + echo "name=${{ runner.name }}" >> "$GITHUB_OUTPUT" + + # JOB 2: (UPDATED) Sets up the environment and builds the Docker image. 
setup_and_build: if: | github.event_name == 'workflow_dispatch' || github.ref == 'refs/heads/main' - runs-on: hourly-ci - needs: discover_tests + # <-- UPDATED: Now needs 'discover_tests' AND 'discover_runner' + needs: [discover_tests, discover_runner] + # <-- UPDATED: Runs on the specific runner from the discover_runner job + runs-on: ${{ needs.discover_runner.outputs.runner_name }} permissions: contents: read # Required to checkout code and read history outputs: @@ -103,9 +119,12 @@ jobs: EOF echo "Docker image built successfully." + # JOB 3: (UPDATED) run_unit_tests: - needs: setup_and_build - runs-on: hourly-ci + # <-- UPDATED: Now needs 'setup_and_build' AND 'discover_runner' + needs: [setup_and_build, discover_runner] + # <-- UPDATED: Runs on the specific runner + runs-on: ${{ needs.discover_runner.outputs.runner_name }} steps: - name: Run pytest in tests/unit_tests run: | @@ -126,8 +145,12 @@ jobs: EXITCODE=$? echo "Test script exited with code: $EXITCODE" + # JOB 4: (UPDATED) discover_tests: - runs-on: hourly-ci + # <-- UPDATED: Now needs 'discover_runner' + needs: discover_runner + # <-- UPDATED: Runs on the specific runner + runs-on: ${{ needs.discover_runner.outputs.runner_name }} outputs: matrix: ${{ steps.set-matrix.outputs.matrix }} steps: @@ -140,9 +163,9 @@ jobs: # naming convention, excluding the main 'run_all_tests' function itself. # The final list is formatted into a JSON array required for the matrix strategy. TEST_FUNCTIONS=$( grep '^run_' ./tests/full_tests/ci_gsm8k_tests.sh | \ - awk '{print $1}' | \ - sed 's/()//' | \ - jq -R . | jq -s -c . ) + awk '{print $1}' | \ + sed 's/()//' | \ + jq -R . | jq -s -c . ) echo "Discovered test matrix: $TEST_FUNCTIONS" # Fail the job if no tests were found. @@ -152,9 +175,12 @@ jobs: fi echo "matrix=$TEST_FUNCTIONS" >> "$GITHUB_OUTPUT" + # JOB 5: (UPDATED) e2e: - needs: [setup_and_build, discover_tests] - runs-on: hourly-ci + # <-- UPDATED: Now needs 'setup_and_build', 'discover_tests', AND 'discover_runner' + needs: [setup_and_build, discover_tests, discover_runner] + # <-- UPDATED: Runs on the specific runner + runs-on: ${{ needs.discover_runner.outputs.runner_name }} strategy: fail-fast: false matrix: @@ -183,9 +209,12 @@ jobs: EXITCODE=$? echo "Test script exited with code: $EXITCODE" + # JOB 6: (UPDATED) run_data_parallel_test: - needs: setup_and_build - runs-on: hourly-ci + # <-- UPDATED: Now needs 'setup_and_build' AND 'discover_runner' + needs: [setup_and_build, discover_runner] + # <-- UPDATED: Runs on the specific runner + runs-on: ${{ needs.discover_runner.outputs.runner_name }} steps: - name: Run Data Parallel test run: | @@ -209,9 +238,12 @@ jobs: EXITCODE=$? echo "Test script exited with code: $EXITCODE" + # JOB 7: (UPDATED) run_pd_disaggregate_test: - needs: setup_and_build - runs-on: hourly-ci + # <-- UPDATED: Now needs 'setup_and_build' AND 'discover_runner' + needs: [setup_and_build, discover_runner] + # <-- UPDATED: Runs on the specific runner + runs-on: ${{ needs.discover_runner.outputs.runner_name }} steps: - name: Run PD disaggregate test run: | @@ -236,9 +268,12 @@ jobs: EXITCODE=$? 
echo "Test script exited with code: $EXITCODE" + # JOB 8: (UPDATED) store_last_stable_vllm_commit: - needs: [setup_and_build, run_unit_tests, e2e, run_data_parallel_test, run_pd_disaggregate_test] - runs-on: hourly-ci + # <-- UPDATED: Now needs all test jobs AND 'discover_runner' + needs: [setup_and_build, run_unit_tests, e2e, run_data_parallel_test, run_pd_disaggregate_test, discover_runner] + # <-- UPDATED: Runs on the specific runner + runs-on: ${{ needs.discover_runner.outputs.runner_name }} permissions: contents: write # Permission is required to push a commit steps: diff --git a/.github/workflows/pre-merge.yaml b/.github/workflows/pre-merge.yaml index 7968fefe4..59707351b 100644 --- a/.github/workflows/pre-merge.yaml +++ b/.github/workflows/pre-merge.yaml @@ -22,7 +22,6 @@ concurrency: jobs: gatekeeper: runs-on: ubuntu-latest - if: github.event.action == 'opened' || github.event.before != github.event.after permissions: # Required to read the status of checks and PR details checks: read @@ -107,8 +106,26 @@ jobs: echo "Failing this job to prevent the main CI from running." exit 1 + # --- NEW JOB --- + # This job runs first on the self-hosted pool, picks a runner, + # and outputs its name so all other jobs can target it. + discover_runner: + needs: gatekeeper + runs-on: pr-ci + outputs: + runner_name: ${{ steps.get_name.outputs.name }} + steps: + - name: Get runner name + id: get_name + run: | + echo "This workflow will run on: ${{ runner.name }}" + echo "name=${{ runner.name }}" >> "$GITHUB_OUTPUT" + discover_tests: - runs-on: ucb-vllm-cicd-g2 + # --- UPDATED: Add discover_runner dependency --- + needs: discover_runner + # --- UPDATED: Run on the specific node --- + runs-on: ${{ needs.discover_runner.outputs.runner_name }} outputs: matrix: ${{ steps.set-matrix.outputs.matrix }} steps: @@ -124,9 +141,9 @@ jobs: # naming convention, excluding the main 'run_all_tests' function itself. # The final list is formatted into a JSON array required for the matrix strategy. TEST_FUNCTIONS=$( grep '^run_' ./tests/full_tests/ci_gsm8k_tests.sh | \ - awk '{print $1}' | \ - sed 's/()//' | \ - jq -R . | jq -s -c . ) + awk '{print $1}' | \ + sed 's/()//' | \ + jq -R . | jq -s -c . ) echo "Discovered test matrix: $TEST_FUNCTIONS" # Fail the job if no tests were found. @@ -137,7 +154,7 @@ jobs: echo "matrix=$TEST_FUNCTIONS" >> "$GITHUB_OUTPUT" pre-commit: - # This job now runs in parallel with the build job + # This job runs in parallel with the build job needs: gatekeeper runs-on: ubuntu-latest steps: @@ -165,8 +182,10 @@ jobs: pre_merge_hpu_test_build: if: > !contains(github.event.pull_request.labels.*.name, 'skip-gaudi-tests') - runs-on: ucb-vllm-cicd-g2 - needs: [pre-commit, discover_tests] + # --- UPDATED: Add discover_runner dependency --- + needs: [pre-commit, discover_tests, discover_runner] + # --- UPDATED: Run on the specific node --- + runs-on: ${{ needs.discover_runner.outputs.runner_name }} permissions: contents: read # Required to checkout code and read history outputs: @@ -318,8 +337,10 @@ jobs: echo "Docker image built successfully." hpu_unit_tests: - runs-on: ucb-vllm-cicd-g2 - needs: pre_merge_hpu_test_build + # --- UPDATED: Add discover_runner dependency --- + needs: [pre_merge_hpu_test_build, discover_runner] + # --- UPDATED: Run on the specific node --- + runs-on: ${{ needs.discover_runner.outputs.runner_name }} steps: - name: Run pytest in tests/unit_tests run: | @@ -340,10 +361,10 @@ jobs: EXITCODE=$? 
echo "Test script exited with code: $EXITCODE" hpu_pd_tests: - runs-on: ucb-vllm-cicd-g2 - # This is a final job that runs after the build and unit tests - # Unit tests are supposed to fail fast if anything goes wrong, removing the need for this job to run. - needs: [pre_merge_hpu_test_build, hpu_unit_tests] + # --- UPDATED: Add discover_runner dependency --- + needs: [pre_merge_hpu_test_build, hpu_unit_tests, discover_runner] + # --- UPDATED: Run on the specific node --- + runs-on: ${{ needs.discover_runner.outputs.runner_name }} steps: - name: Run test scripts run: | @@ -369,10 +390,10 @@ jobs: EXITCODE=$? echo "Test script exited with code: $EXITCODE" hpu_perf_tests: - runs-on: ucb-vllm-cicd-g2 - # This is a final job that runs after the build and unit tests - # Unit tests are supposed to fail fast if anything goes wrong, removing the need for this job to run. - needs: [pre_merge_hpu_test_build, hpu_unit_tests] + # --- UPDATED: Add discover_runner dependency --- + needs: [pre_merge_hpu_test_build, hpu_unit_tests, discover_runner] + # --- UPDATED: Run on the specific node --- + runs-on: ${{ needs.discover_runner.outputs.runner_name }} steps: - name: Run test scripts run: | @@ -393,10 +414,10 @@ jobs: EXITCODE=$? echo "Test script exited with code: $EXITCODE" hpu_dp_tests: - runs-on: ucb-vllm-cicd-g2 - # This is a final job that runs after the build and unit tests - # Unit tests are supposed to fail fast if anything goes wrong, removing the need for this job to run. - needs: [pre_merge_hpu_test_build, hpu_unit_tests] + # --- UPDATED: Add discover_runner dependency --- + needs: [pre_merge_hpu_test_build, hpu_unit_tests, discover_runner] + # --- UPDATED: Run on the specific node --- + runs-on: ${{ needs.discover_runner.outputs.runner_name }} steps: - name: Run test scripts run: | @@ -420,8 +441,10 @@ jobs: EXITCODE=$? echo "Test script exited with code: $EXITCODE" e2e: - runs-on: ucb-vllm-cicd-g2 - needs: [pre_merge_hpu_test_build, hpu_unit_tests, discover_tests] + # --- UPDATED: Add discover_runner dependency --- + needs: [pre_merge_hpu_test_build, hpu_unit_tests, discover_tests, discover_runner] + # --- UPDATED: Run on the specific node --- + runs-on: ${{ needs.discover_runner.outputs.runner_name }} strategy: fail-fast: false matrix: @@ -451,8 +474,10 @@ jobs: echo "Test script exited with code: $EXITCODE" pre_merge_hpu_test: - runs-on: ucb-vllm-cicd-g2 - needs: [hpu_unit_tests, e2e, hpu_perf_tests] + # --- UPDATED: Add discover_runner dependency --- + needs: [hpu_unit_tests, e2e, hpu_perf_tests, discover_runner] + # --- UPDATED: Run on the specific node --- + runs-on: ${{ needs.discover_runner.outputs.runner_name }} # This job is required to pass for pre-merge CI. By itself it does nothing, and will only pass if all jobs specified in "needs" list pass. 
steps: - name: Succeeded if all previous jobs passed @@ -476,4 +501,3 @@ jobs: ### ✅ CI Passed All checks passed successfully against the following vllm commit: **`${{ needs.pre_merge_hpu_test_build.outputs.target_commit }}`** - diff --git a/.jenkins/vision/run-tests.sh b/.jenkins/vision/run-tests.sh index aed0f9f50..fd0d358f3 100644 --- a/.jenkins/vision/run-tests.sh +++ b/.jenkins/vision/run-tests.sh @@ -42,13 +42,12 @@ do export PT_HPU_ENABLE_LAZY_COLLECTIVES=true export VLLM_SKIP_WARMUP=true export TQDM_BAR_FORMAT="{desc}: {percentage:3.0f}% {bar:10} | {n_fmt}/{total_fmt} [{elapsed}<{remaining}]" - RANDOM_SUFFIX=$(tr -dc A-Za-z0-9 [!IMPORTANT] -> This is an early developer preview of the vLLM Gaudi Plugin and is not yet intended for general use. For a more stable experience, consider using the [HabanaAI/vllm-fork](https://github.com/HabanaAI/vllm-fork) or the in-tree Gaudi implementation available in [vllm-project/vllm](https://github.com/vllm-project/vllm). - -# Welcome to vLLM x Intel Gaudi -

- vLLM
+ vLLM x Intel-Gaudi
+Welcome to vLLM x Intel Gaudi
+| Documentation | Intel® Gaudi® Documentation | Optimizing Training Platform Guide |

+ +--- +*Latest News* 🔥 + +- [2025/06] We introduced an early developer preview of the vLLM Gaudi Plugin. It is not yet intended for general use. For a more stable experience, consider using the [HabanaAI/vllm-fork](https://github.com/HabanaAI/vllm-fork) or the in-tree Gaudi implementation available in [vllm-project/vllm](https://github.com/vllm-project/vllm). + +--- + +## About vLLM Gaudi plugin (vllm-gaudi) integrates Intel Gaudi accelerators with vLLM to optimize large language model inference. This plugin follows the [[RFC]: Hardware pluggable](https://github.com/vllm-project/vllm/issues/11162) and [[RFC]: Enhancing vLLM Plugin Architecture](https://github.com/vllm-project/vllm/issues/19161) principles, providing a modular interface for Intel Gaudi hardware. -Learn more: - -📚 [Intel Gaudi Documentation](https://docs.habana.ai/en/v1.21.1/index.html) -🚀 [vLLM Plugin System Overview](https://docs.vllm.ai/en/latest/design/plugin_system.html) +Learn more: 🚀 [vLLM Plugin System Overview](https://docs.vllm.ai/en/latest/design/plugin_system.html) ## Getting Started +0. Preparation of the Setup + + To set up the execution environment, please follow the instructions in the [Gaudi Installation Guide](https://docs.habana.ai/en/latest/Installation_Guide/index.html). + To achieve the best performance on HPU, please follow the methods outlined in the + [Optimizing Training Platform Guide](https://docs.habana.ai/en/latest/PyTorch/Model_Optimization_PyTorch/Optimization_in_Training_Platform.html). + 1. Get Last good commit on vllm NOTE: vllm-gaudi is always follow latest vllm commit, however, vllm upstream API update may crash vllm-gaudi, this commit saved is verified with vllm-gaudi @@ -37,7 +51,7 @@ Learn more: git clone https://github.com/vllm-project/vllm cd vllm git checkout $VLLM_COMMIT_HASH - pip install -r <(sed '/^[torch]/d' requirements/build.txt) + pip install -r <(sed '/^torch/d' requirements/build.txt) VLLM_TARGET_DEVICE=empty pip install --no-build-isolation -e . cd .. ``` @@ -50,46 +64,17 @@ Learn more: cd .. ``` -4. (Optional) Install nixl: + - ```bash - cd vllm-gaudi - python install_nixl.sh - cd .. - ``` +4. To explore all installation methods, such as NixL, follow the [link](https://vllm-gaudi.readthedocs.io/en/latest/getting_started/installation.html) +## Contributing + +We welcome and value any contributions and collaborations. + +## Contact Us - -## Install with Docker file - -```bash -docker build -t ubuntu.pytorch.vllm.nixl.latest \ - -f .cd/Dockerfile.ubuntu.pytorch.vllm.nixl.latest github.com/vllm-project/vllm-gaudi -docker run -it --rm --runtime=habana \ - --name=ubuntu.pytorch.vllm.nixl.latest \ - --network=host \ - -e HABANA_VISIBLE_DEVICES=all \ - vllm-gaudi-for-llmd /bin/bash -``` - -### Full installation from source (vLLM and vLLM-Gaudi): - -```bash -# Fetch last good commit on vllm -git clone https://github.com/vllm-project/vllm-gaudi -cd vllm-gaudi -export VLLM_COMMIT_HASH=$(git show "origin/vllm/last-good-commit-for-vllm-gaudi:VLLM_STABLE_COMMIT" 2>/dev/null) - -# Build vLLM from source for empty platform, reusing existing torch installation -git clone https://github.com/vllm-project/vllm -cd vllm -git checkout $VLLM_COMMIT_HASH -pip install -r <(sed '/^[torch]/d' requirements/build.txt) -VLLM_TARGET_DEVICE=empty pip install --no-build-isolation -e . -cd .. - -# Build vLLM-Gaudi from source -cd vllm-gaudi -pip install -e . 
- -# Build nixl -python install_nixl.sh -``` + +- For technical questions and feature requests, please use GitHub [Issues](https://github.com/vllm-project/vllm-gaudi/issues) +- For discussing with fellow users, please use the [vLLM Forum](https://discuss.vllm.ai) +- For coordinating contributions and development, please use [Slack](https://slack.vllm.ai) +- For security disclosures, please use GitHub's [Security Advisories](https://github.com/vllm-project/vllm/security/advisories) feature + diff --git a/calibration/README.md b/calibration/README.md new file mode 100644 index 000000000..de0416031 --- /dev/null +++ b/calibration/README.md @@ -0,0 +1,202 @@ +# FP8 Calibration Procedure + +Running inference via [vLLM](https://github.com/vllm-project/vllm) on HPU with FP8 precision is achieved using the [Intel® Neural Compressor (INC)](https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Quantization/Inference_Using_FP8.html#inference-using-fp8) package. This approach requires a model calibration procedure to generate measurements, quantization files, and configurations first. To simplify this process, we've provided the `calibrate_model.sh` script. It requires the following arguments: + +- `-m`, i.e., **model stub or path:** Path to your model (if stored locally) or the model ID from the Hugging Face Hub. + - `-d`, i.e., **path to the source dataset:** Path to your dataset in pickle format (".pkl"). +- `-o`, i.e., **output path:** Path to the directory where the generated measurements, etc., will be stored. + +There are also optional arguments, and you can read about them by executing the script with the `-h` option. + +The calibration procedure works with any dataset that contains the following fields: `system_prompt` and `question`. These fields are used to prepare a calibration dataset with prompts formatted specifically for your model. We recommend using the public dataset used by MLCommons in the Llama2-70b inference submission: https://github.com/mlcommons/inference/tree/master/language/llama2-70b#preprocessed. + +> [!TIP] +> For the [DeepSeek-R1](https://huggingface.co/collections/deepseek-ai/deepseek-r1-678e1e131c0169c0bc89728d) series models, which contain 256 experts, it’s important to provide a diverse and +> sufficiently large sample set to ensure that all experts are properly activated during calibration. +> Through our experiments, we found that using [NeelNanda/pile-10k](https://huggingface.co/datasets/NeelNanda/pile-10k) and selecting 512 samples with at least 1024 tokens each yields good calibration coverage. + +## Options and Usage + +To run the ```calibrate_model.sh``` script, follow the steps below: + +1. Build and install the latest [vllm-plugin](https://vllm-gaudi.readthedocs.io/en/latest/getting_started/installation.html). +2. Go to the ```calibration``` subdirectory: + +```bash +cd calibration +pip install -r requirements.txt +``` + +3. Download the dataset. +> [!NOTE] +> For [DeepSeek-R1](https://huggingface.co/collections/deepseek-ai/deepseek-r1-678e1e131c0169c0bc89728d) series models, it is recommended to use `NeelNanda/pile-10k` as the dataset. + +4. Run the ```calibrate_model.sh``` script. Refer to the script options and run examples below. The script generates the ```maxabs_quant_g3.json``` file, which is used for FP8 inference. 
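Because a malformed source dataset usually only surfaces as a failure partway through calibration, it can be worth sanity-checking the `.pkl` before launching the script. Below is a minimal sketch, assuming the pickle holds a pandas DataFrame (as the MLCommons preprocessed dataset does); the helper name and command-line usage are illustrative and not part of this repository:

```python
# Illustrative pre-flight check for a calibration .pkl; not part of calibrate_model.sh.
# Assumes the pickle stores a pandas DataFrame with one row per sample.
import sys

import pandas as pd

REQUIRED_FIELDS = {"system_prompt", "question"}


def check_calibration_dataset(path: str) -> None:
    df = pd.read_pickle(path)
    missing = REQUIRED_FIELDS - set(df.columns)
    if missing:
        sys.exit(f"{path} is missing required fields: {sorted(missing)}")
    print(f"{path}: {len(df)} samples, required fields present")


if __name__ == "__main__":
    check_calibration_dataset(sys.argv[1])
```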
+ +### Here are some examples of how to use the script: + +```bash +./calibrate_model.sh -m /path/to/local/llama3.1/Meta-Llama-3.1-405B-Instruct/ -d dataset-processed.pkl -o /path/to/measurements/vllm-benchmarks/inc -b 128 -t 8 -l 4096 +# OR +./calibrate_model.sh -m facebook/opt-125m -d dataset-processed.pkl -o inc/ +# OR Calibrate DeepSeek models with dataset NeelNanda/pile-10k +PT_HPU_LAZY_MODE=1 ./calibrate_model.sh -m deepseek-ai/DeepSeek-R1 -d NeelNanda/pile-10k -o inc/ -t 8 +``` + +> [!WARNING] +> Measurements are device-dependent, so you can't use scales collected on Gaudi3 on Gaudi2 accelerators. This behavior can cause accuracy issues. + +> [!TIP] +> If you get the following error, ensure you set a valid tensor parallelism value, e.g. `-t 8`: +> +> ``` +> RuntimeError: [Rank:0] FATAL ERROR :: MODULE:PT_DEVMEM Allocation failed for size::939524096 (896)MB +> ``` + +# Run inference with FP8 models + +Inference with FP8 precision models using vLLM is described in the [Documentation](https://vllm-gaudi.readthedocs.io/en/latest/configuration/model_calibration.html). + +# Multi-node FP8 Calibration + +The following section details the procedure for calibrating models that do not fit into a single Gaudi node. For illustration, we use the Llama 3.1 405B model running in Tensor Parallelism (TP) 16 mode, spanning two Gaudi2 nodes.
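For a rough sense of why the 405B example cannot stay on a single node, the estimate below is illustrative only: it assumes bf16 weights (2 bytes per parameter) and 96 GB of HBM per Gaudi2 card, and it ignores KV cache and HPU graph memory entirely:

```python
# Back-of-the-envelope estimate (illustrative): why Llama 3.1 405B spans two Gaudi2 nodes.
params = 405e9
weights_gb = params * 2 / 1e9        # ~810 GB of bf16 weights
node_gb = 8 * 96                     # 768 GB of HBM on a single 8-card Gaudi2 node
two_nodes_gb = 2 * node_gb           # 1536 GB across two nodes (TP=16)

print(f"weights ~{weights_gb:.0f} GB, one node {node_gb} GB, two nodes {two_nodes_gb} GB")
# The weights alone already exceed one node before any KV cache or graphs are counted,
# which is why this example runs with tensor parallelism 16 across two nodes.
```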
+ +> [!NOTE] +> Following steps are to be executed within a [Gaudi Pytorch container](https://docs.habana.ai/en/latest/Installation_Guide/Additional_Installation/Docker_Installation.html#use-intel-gaudi-containers) + +## Step 1: Pre-requisites + +- Install latest [vllm-plugin](https://vllm-gaudi.readthedocs.io/en/latest/getting_started/installation.html) +- Ensure that all nodes in the multi-node setup are connected to an NFS mount (Network File System). +- Create workspace directory on NFS, clone the calibration scripts repo and create an empty file `quant_config_buffer.json`. + +```bash +mkdir /my_workspace && cd /my_workspace +cd /calibration +touch quant_config_buffer.json +``` + +- Check if all Gaudi NIC ports are up
+Note : Following commands should be run on the host and NOT inside the container.
+ +```bash +cd /opt/habanalabs/qual/gaudi2/bin +./manage_network_ifs.sh --status +# All the ports should be in 'up' state. Try flipping the state +./manage_network_ifs.sh --down +./manage_network_ifs.sh --up +# Give it a minute for the NIC's to flip and check the status again +``` + +- Set following envs at all nodes: + +```bash +# Check the network interface for outbound/inbound comms. Command 'ip a' or 'ifconfig' should list all the interfaces +export GLOO_SOCKET_IFNAME=eth0 +export HCCL_SOCKET_IFNAME=eth0 +export QUANT_CONFIG="/quant_config_buffer.json" +``` + +### Step 2: Start a Ray cluster to accommodate the required TP size. + +```bash +# Start Ray on head node +ray start --head --port=6379 + +# Add worker nodes to the Ray cluster +ray start --address=':6379' + +# Check if the cluster has required number of HPU's +ray status +``` + +#### Step 3: Run model calibration script + +```bash +./calibrate_model.sh -m meta-llama/Llama-3.1-405B-Instruct -d /open_orca_gpt4_tokenized_llama.calibration_1000.pkl -o /fp8_output -l 4096 -t 16 -b 128 +``` + +Running the above command will create calibration measurement files in the specified output directory, organized into model-specific subdirectories. + +> [!NOTE] +> The current calibration procedure works correctly only when the multi-node configuration has more than 8 cards. + +#### Step 4: (Optional) Measurement unification + +This is an optional step and is used to reduce the target tensor parallelism level by unifying the measurement scales. For example, you can perform FP8 calibration on the Llama 3.1 405B model using 2x Gaudi2 nodes with Tensor Parallelism (TP) set to 16, and then use the unification script to reduce the TP to 8. This can be achieved in two ways: +1. Add `-r` optional parameter to `calibration_model.sh` script, e.g. + +```bash +./calibrate_model.sh -m meta-llama/Llama-3.1-405B-Instruct -d /open_orca_gpt4_tokenized_llama.calibration_1000.pkl -o /fp8_output -l 4096 -t 16 -b 128 -r 8 +``` + +1. If calibration has already been performed, use the following command to convert existing scales: + +```bash +python3 step-5-unify_measurements.py -r 8 -m /fp8_output/llama-3.1-405b-instruct/g2/ -o /fp8_output/llama-3.1-405b-instruct/g2/ +``` + +- `-r`, i.e. **rank number** of unified measurements. +- `-m`, i.e. **calibration output path** containing the measurement files. +- `-o`, i.e. **unification output directory** where unification output will be written. +- `-u`, i.e. unify original measurement results based on **expert parallelism** rules. + +> [!TIP] +> It is a good practice to store unification results in the source directory. This allows you to run the vLLM server with FP8 precision and different TP values without modifying the directory specified in the `QUANT_CONFIG` environment variable. 
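For intuition about what the `-r` option does, unification merges groups of source ranks into each target rank. The sketch below is an assumption made for illustration (consecutive grouping by the reduction factor implied by the "factor of the original rank number" constraint), not the actual logic of `step-5-unify_measurements.py`:

```python
# Illustrative only: one plausible grouping of source ranks into target ranks
# when -r reduces the measurement TP degree by an integer factor.
def rank_groups(src_tp: int, dst_tp: int) -> dict[int, list[int]]:
    assert src_tp % dst_tp == 0, "-r must be a factor of the original rank count"
    group = src_tp // dst_tp
    return {dst: list(range(dst * group, (dst + 1) * group)) for dst in range(dst_tp)}


print(rank_groups(16, 8))  # {0: [0, 1], 1: [2, 3], ..., 7: [14, 15]}
print(rank_groups(16, 2))  # {0: [0, ..., 7], 1: [8, ..., 15]}
```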
+ +Below examples in case you want to convert scales from TP=16 to TP=4 and 2: +- conversion of scales TP=16 -> TP=4: + +```bash +python3 step-5-unify_measurements.py -r 4 -m /fp8_output/llama-3.1-405b-instruct/g2/ -o /fp8_output/llama-3.1-405b-instruct/g2/ +``` + +- conversion of scales TP=16 -> TP=2: + +```bash +python3 step-5-unify_measurements.py -r 2 -m /fp8_output/llama-3.1-405b-instruct/g2/ -o /fp8_output/llama-3.1-405b-instruct/g2/ +``` + +In case the model contains MoE layers and is calibrated with expert parallelism, `-u` is required for unification: + +```bash +python3 step-5-unify_measurements.py -r 4 -m /fp8_output/model_name/g2 -o /fp8_output/model_name/g2 -u +``` + +#### Step 5: Serving the FP8 quantized model + +```bash +export QUANT_CONFIG='/fp8_output/llama-3.1-405b-instruct/maxabs_quant_g2.json' +vllm serve meta-llama/Llama-3.1-405B-Instruct --quantization inc --kv-cache-dtype fp8_inc --weights-load-device cpu --tensor-parallel-size 8 --max-model-len 2048 +``` + +> [!NOTE] +> Detailed information about serving with vLLM (including multi-node serving) you can find in [Documentation](https://vllm-gaudi.readthedocs.io/en/latest/configuration/model_calibration.html). + +#### Advanced Usage for MoE Models + +For models with Mixture of Experts (MoE), like Deepseek-R1, you may want to run calibration once and use the results for different expert parallelism and data parallelism scenarios (e.g., 8, 16, or 32 cards). To do this: + +1. Unify all measurement files onto a single card (TP1). +2. (Optional) Postprocess the unified measurement for better performance. +3. Expand the unified results to the number of expert-parallel cards you need. The `step-6-expand-measurements.py` splits expert measurements across the target number of cards, while other values are reused. + +The diagram below shows an example where calibration is done on 2 cards and deployment is on 4 cards. + +![unify-and-expand](./unify-and-expand.png) + +Here is a real example that calibrates Deepseek-R1 on 8 cards and deploys on 16 or 32 cards: + +```bash +# Unify measurements: TP8 -> TP1 +python step-5-unify_measurements.py -m /path/to/measurements/deepseek-r1/g3/ -r 1 -o /path/to/measurements/deepseek-r1/g3-unified-tp1/ -u -s + +# (Optional) Postprocess unified TP1 +python step-3-postprocess-measure.py -m /path/to/measurements/deepseek-r1/g3-unified-tp1/ -o /path/to/measurements/deepseek-r1/g3-unified-tp1-post/ -d + +# Expand to EP16TP1 +python step-6-expand-measurements.py -m /path/to/measurements/deepseek-r1/g3-unified-tp1-post/ -o /path/to/measurements/deepseek-r1/g3-unified-tp1-post-expand-ep16 -w 16 + +# Expand to EP32TP1 +python step-6-expand-measurements.py -m /path/to/measurements/deepseek-r1/g3-unified-tp1-post/ -o /path/to/measurements/deepseek-r1/g3-unified-tp1-post-expand-ep32 -w 32 +``` diff --git a/calibration/calibrate_model.sh b/calibration/calibrate_model.sh new file mode 100755 index 000000000..8d3df4e66 --- /dev/null +++ b/calibration/calibrate_model.sh @@ -0,0 +1,285 @@ +#!/bin/bash +############################################################################### +# Copyright (C) 2024 Habana Labs, Ltd. 
an Intel Company +############################################################################### + +set -e +cd "$(dirname "$0")" + +ALLOWED_DEVICES=("g2" "g3") + +usage() { + echo + echo "Calibrate given MODEL_PATH for FP8 inference" + echo + echo "usage: ${0} " + echo + echo " -m - [required] huggingface stub or local directory of the MODEL_PATH" + echo " -d - [required] path to source dataset (details in README)" + echo " -o - [required] path to output directory for fp8 measurements" + echo " -b - batch size to run the measurements at (default: 32)" + echo " -l - limit number of samples in calibration dataset" + echo " -t - tensor parallel size to run at (default: 1); NOTE: if t > 8 then we need a multi-node setup" + echo " -r - rank of unified measurements, it should be smaller than original rank number and should be a factor of the original rank number" + echo " -u - unify measurement results based on expert parallelism rules (default: False), expert parallelism unification rule is unique, card 1 expert measurement will be extended to card 0 if unified to x from 2x cards number" + echo " -e - set this flag to enable enforce_eager execution" + echo +} + +cleanup_tmp() { + if [[ $(pwd) == *vllm-gaudi/calibration ]]; then + echo "Clearing temporary directory" + rm -rf nc_workspace + rm -rf inc_tmp + else + echo "Skipping temporary directory removal" + fi +} + +create_measure_config() { + mkdir -p $1/$2/$3 + + model_name_lower=$(echo "$2" | tr '[:upper:]' '[:lower:]') + + if [[ $model_name_lower =~ ^mixtral ]]; then + tmp_config="{\"method\": \"HOOKS\",\"mode\": \"MEASURE\",\"observer\": \"maxabs\",\"allowlist\": {\"types\": [], \"names\": []},\"blocklist\": {\"types\": [], \"names\": [\"self_attn\", \"lm_head\"]},\"quantize_weight\": false,\"dump_stats_path\": \"$1/$2/$3/inc_output\"}" + elif [[ $model_name_lower =~ ^deepseek ]]; then + tmp_config="{\"method\": \"HOOKS\",\"mode\": \"MEASURE\",\"observer\": \"maxabs\",\"allowlist\": {\"types\": [], \"names\": []},\"blocklist\": {\"types\": [], \"names\": [\"lm_head\", \"mlp\\\.gate\\\b\"]},\"quantize_weight\": false,\"dump_stats_path\": \"$1/$2/$3/inc_output\"}" + else + tmp_config="{\"method\": \"HOOKS\",\"mode\": \"MEASURE\",\"observer\": \"maxabs\",\"allowlist\": {\"types\": [], \"names\": []},\"blocklist\": {\"types\": [], \"names\": []},\"quantize_weight\": false,\"dump_stats_path\": \"$1/$2/$3/inc_output\"}" + fi + echo "$tmp_config" > $1/$2/maxabs_measure_$3.json +} + +create_quant_config() { + mkdir -p $1/$2/$3 + + model_name_lower=$(echo "$2" | tr '[:upper:]' '[:lower:]') + + #note(kwisniewski98): mixtral models has attention masked to not cause regression in accuracy + if [[ $model_name_lower =~ ^mixtral ]]; then + if [[ $PT_HPU_LAZY_MODE == 0 ]]; then + tmp_config="{\"mode\": \"QUANTIZE\",\"observer\": \"maxabs\",\"scale_method\": \"maxabs_hw\", \"scale_format\": \"CONST\",\"allowlist\": {\"types\": [],\"names\": []},\"blocklist\": {\"types\": [],\"names\": [\"self_attn\", \"lm_head\"]},\"dump_stats_path\": \"$1/$2/$3/inc_output\"}" + else + tmp_config="{\"mode\": \"QUANTIZE\",\"observer\": \"maxabs\",\"scale_method\": \"maxabs_hw\",\"allowlist\": {\"types\": [],\"names\": []},\"blocklist\": {\"types\": [],\"names\": [\"self_attn\", \"lm_head\"]},\"dump_stats_path\": \"$1/$2/$3/inc_output\"}" + fi + elif [[ $model_name_lower =~ ^deepseek ]]; then + tmp_config="{\"mode\": \"QUANTIZE\",\"observer\": \"maxabs\",\"scale_method\": \"maxabs_hw\", \"scale_format\": \"scalar\", \"allowlist\": {\"types\": [],\"names\": 
[]},\"blocklist\": {\"types\": [],\"names\": [\"lm_head\", \"mlp\\\.gate\\\b\"]},\"dump_stats_path\": \"$1/$2/$3/inc_output\"}" + else + tmp_config="{\"mode\": \"QUANTIZE\",\"observer\": \"maxabs\",\"scale_method\": \"maxabs_hw\",\"allowlist\": {\"types\": [],\"names\": []},\"blocklist\": {\"types\": [],\"names\": []},\"dump_stats_path\": \"$1/$2/$3/inc_output\"}" + fi + echo "$tmp_config" > $1/$2/maxabs_quant_$3.json +} + +extract_last_folder_name() { + local path="$1" + + path="${path%/}" + last_folder="$(basename "$path")" + last_folder="${last_folder,,}" + + echo "$last_folder" +} + +cleanup_tmp + +EXTRA_FLAGS_STEP_1="" +EXTRA_FLAGS_STEP_2="" +EXTRA_FLAGS_STEP_3="" +EXTRA_FLAGS_STEP_4="" +BATCH_SIZE=32 +TP_SIZE=1 +MULTI_NODE_SETUP=false + +USE_EP="" +ENFORCE_EAGER=false + +while getopts "m:b:l:t:d:h:o:r:u:e" OPT; do + case ${OPT} in + m ) + MODEL_PATH="$OPTARG" + ;; + d ) + DATASET_PATH_OR_NAME="$OPTARG" + ;; + b ) + BATCH_SIZE="$OPTARG" + ;; + o ) + FP8_DIR=$(realpath "$OPTARG") + ;; + l ) + LIMIT="$OPTARG" + ;; + t ) + TP_SIZE="$OPTARG" + ;; + r ) + RANK="$OPTARG" + ;; + u ) + USE_EP="--use_expert_paral" + ;; + e ) + ENFORCE_EAGER=true + ;; + h ) + usage + ;; + \? ) + usage + exit 1 + ;; + esac +done + +if [[ -z "$MODEL_PATH" && -z "$FP8_DIR" && -z "$DATASET_PATH_OR_NAME" ]]; then + echo "Model stub, source dataset path and output path for fp8 measurements must be provided." + usage + exit 1 +fi + +# Store the provided MODEL_PATH name in a variable +MODEL_NAME=$(extract_last_folder_name "$MODEL_PATH") +model_name_lower=$(echo "$MODEL_NAME" | tr '[:upper:]' '[:lower:]') + +echo "Step 0 - detecting used device type [g2, g3]" +DEVICE_TYPE=$(python3 step-0-detect-device.py) || (echo "Detecting device process failed" && exit 1) +DEVICE_TYPE="g$DEVICE_TYPE" +echo "Detected device type: $DEVICE_TYPE" +echo "Step 0 done" + +# Check if the provided device type is valid +if [[ ! " ${ALLOWED_DEVICES[*]} " =~ " $DEVICE_TYPE " ]]; then + echo "Invalid device type: $DEVICE_TYPE. Allowed devices: ${ALLOWED_DEVICES[*]}" + exit 1 +fi + +if [[ $TP_SIZE -gt 8 ]]; then + MULTI_NODE_SETUP=true +fi + +if $MULTI_NODE_SETUP; then + RAY_AVAILABLE_RESOURCES=$(python3 -c 'import ray; ray.init(); print(int(ray.available_resources()["HPU"]))') + if [[ $RAY_AVAILABLE_RESOURCES -lt $TP_SIZE ]]; then + echo "Required TP size : $TP_SIZE" + echo "Available HPU's : $RAY_AVAILABLE_RESOURCES " + echo "!! Exiting since not enough HPU resources available. You can run 'ray status' to see available resources" + echo "Refer https://vllm-gaudi.readthedocs.io/en/latest/configuration/multi_node.html for multi-node runs" + exit 1 + fi + + if [[ ! -e $QUANT_CONFIG ]]; then + echo " !! Exiting. Invalid QUANT_CONFIG env" + echo " Multi-node calibration requires QUANT_CONFIG to point to an empty buffer.json file. 
Refer https://vllm-gaudi.readthedocs.io/en/latest/configuration/multi_node.html" + exit 1 + fi +fi + +create_measure_config $FP8_DIR $MODEL_NAME $DEVICE_TYPE +create_quant_config $FP8_DIR $MODEL_NAME $DEVICE_TYPE + +if [[ $TP_SIZE > 1 ]]; then + export PT_HPU_ENABLE_LAZY_COLLECTIVES=true +fi + +if [[ $MODEL_PATH_NAME == llama.*2.* ]]; then + EXTRA_FLAGS_STEP_1+="--chat-template template/llama-2-chat.jinja " +elif [[ "$MODEL_PATH" == *"Mixtral-8x7B"* ]]; then + EXTRA_FLAGS_STEP_1+="--chat-template template/mistral_mixtral.jinja " +fi + +if [[ -n $LIMIT ]]; then + EXTRA_FLAGS_STEP_1+="--max-dataset-samples $LIMIT " +fi + +SKIP_STEP_1=false +if [[ $DATASET_PATH_OR_NAME == *.pkl ]]; then + SKIP_STEP_1=false +else + echo "DATASET_PATH_OR_NAME is not a .pkl file, will prepare calibration dataset based on it." + SKIP_STEP_1=true +fi + + +if [[ "$model_name_lower" == *"deepseek"* ]]; then + EXTRA_FLAGS_STEP_2+="--block-quant --expert-parallel " + EXTRA_ENVS_STEP_2="VLLM_HPU_FORCE_CHANNEL_FP8=0" + EXTRA_FLAGS_STEP_3+="--deepseek " + EXTRA_ENVS_STEP_4="VLLM_HPU_FORCE_CHANNEL_FP8=0" + EXTRA_FLAGS_STEP_4+="--block-quant --expert-parallel " +fi + +# Skip step 1 if the DATASET_PATH_OR_NAME is a .pkl file +if $SKIP_STEP_1; then + EXTRA_FLAGS_STEP_2+="--max-dataset-samples 512 --batch-size 1 --max-tokens 32 " + EXTRA_FLAGS_STEP_2+="--auto-process-dataset --sample-len 1024 --max-model-len 2048 " + EXTRA_FLAGS_STEP_2+="--dataset ${DATASET_PATH_OR_NAME} " +fi + +if [[ -z "$VLLM_USE_V1" || $VLLM_USE_V1 != "1" ]]; then + EXTRA_FLAGS_STEP_2+="--max-num-prefill-seqs 1 " + EXTRA_FLAGS_STEP_4+="--max-num-prefill-seqs 1 " +fi + +if $MULTI_NODE_SETUP; then + cat $FP8_DIR/$MODEL_NAME/maxabs_measure_$DEVICE_TYPE.json > $QUANT_CONFIG + sleep 2 +else + export QUANT_CONFIG=$FP8_DIR/$MODEL_NAME/maxabs_measure_$DEVICE_TYPE.json +fi + +if $ENFORCE_EAGER; then + EXTRA_FLAGS_STEP_2+="--enforce-eager " + EXTRA_FLAGS_STEP_4+="--enforce-eager " +fi + +if $SKIP_STEP_1; then + echo "Skipping step 1 - prepare calibration dataset with dataset ${DATASET_PATH_OR_NAME}" +else + echo "" + echo "1/4 Preparing calibration dataset" + python3 step-1-prepare-calibration-dataset.py -m $MODEL_PATH -d $DATASET_PATH_OR_NAME -o $MODEL_NAME $EXTRA_FLAGS_STEP_1 || (echo "Error in step 1" && exit 1) + echo "Step 1/4 done" +fi + +echo "" +echo "2/4 Measuring scales" +if $MULTI_NODE_SETUP; then + env $EXTRA_ENVS_STEP_2 python3 step-2-measure-scales.py -m $MODEL_PATH --tensor-parallel-size $TP_SIZE -d $MODEL_NAME-calibration-dataset.pkl --batch-size $BATCH_SIZE --distributed-executor-backend ray $EXTRA_FLAGS_STEP_2 || (echo "Error in step 2" && exit 1) +else + env $EXTRA_ENVS_STEP_2 python3 step-2-measure-scales.py -m $MODEL_PATH --tensor-parallel-size $TP_SIZE -d $MODEL_NAME-calibration-dataset.pkl --batch-size $BATCH_SIZE $EXTRA_FLAGS_STEP_2 || (echo "Error in step 2" && exit 1) +fi +echo "Step 2/4 done" + +echo "" +echo "3/4 Postprocessing scales" +python3 step-3-postprocess-measure.py -m $FP8_DIR/$MODEL_NAME/$DEVICE_TYPE/ -o inc_tmp/$MODEL_NAME/$DEVICE_TYPE/ $EXTRA_FLAGS_STEP_3 || (echo "Error in step 3" && exit 1) +cp inc_tmp/$MODEL_NAME/$DEVICE_TYPE/* $FP8_DIR/$MODEL_NAME/$DEVICE_TYPE/ +echo "Step 3/4 done" + + +if $MULTI_NODE_SETUP; then + cat $FP8_DIR/$MODEL_NAME/maxabs_quant_$DEVICE_TYPE.json > $QUANT_CONFIG + sleep 2 +else + export QUANT_CONFIG=$FP8_DIR/$MODEL_NAME/maxabs_quant_$DEVICE_TYPE.json +fi + +echo "" +echo "4/4 Quantize scales" +if $MULTI_NODE_SETUP; then + env $EXTRA_ENVS_STEP_4 python3 step-4-quantize-scales.py --model $MODEL_PATH 
--tensor-parallel-size $TP_SIZE --distributed-executor-backend ray $EXTRA_FLAGS_STEP_4 || (echo "Error in step 4" && exit 1) +else + env $EXTRA_ENVS_STEP_4 python3 step-4-quantize-scales.py --model $MODEL_PATH --tensor-parallel-size $TP_SIZE $EXTRA_FLAGS_STEP_4 || (echo "Error in step 4" && exit 1) +fi + +if [[ -n $RANK ]]; then + echo "" + echo "5/5 Unify scales" + QUANT_DIR=$FP8_DIR/$MODEL_NAME/$DEVICE_TYPE/ + python3 step-5-unify_measurements.py -r $RANK -m $QUANT_DIR -o $QUANT_DIR $USE_EP || (echo "Error in step 5" && exit 1) + echo "Step 5/5 done" +fi +cleanup_tmp +echo "Calibration process done" diff --git a/calibration/requirements.txt b/calibration/requirements.txt new file mode 100644 index 000000000..a0f34c7b1 --- /dev/null +++ b/calibration/requirements.txt @@ -0,0 +1,3 @@ +datasets +transformers +numpy \ No newline at end of file diff --git a/calibration/step-0-detect-device.py b/calibration/step-0-detect-device.py new file mode 100644 index 000000000..12c6af08b --- /dev/null +++ b/calibration/step-0-detect-device.py @@ -0,0 +1,12 @@ +############################################################################### +# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company +############################################################################### +import habana_frameworks.torch.hpu as hthpu + + +def detect_hpu(): + return hthpu.get_device_name()[-1] + + +if __name__ == "__main__": + print(detect_hpu()) diff --git a/calibration/step-1-prepare-calibration-dataset.py b/calibration/step-1-prepare-calibration-dataset.py new file mode 100755 index 000000000..374c086a6 --- /dev/null +++ b/calibration/step-1-prepare-calibration-dataset.py @@ -0,0 +1,93 @@ +############################################################################### +# Copyright (C) 2024 Habana Labs, Ltd. 
an Intel Company +############################################################################### +import argparse +import os + +import pandas as pd +import transformers + +os.environ["EXPERIMENTAL_WEIGHT_SHARING"] = "0" +os.environ["VLLM_SKIP_WARMUP"] = "true" + + +def get_ds(args): + print(f"Loading source dataset: {args.dataset}") + ds = pd.read_pickle(args.dataset) + + if args.max_dataset_samples: + ds = ds.sample(frac=1, random_state=42) + ds = ds.head(args.max_dataset_samples) + + return ds + + +def load_chat_template(chat_template_path: str) -> str: + + with open(chat_template_path) as f: + return f.read() + + +def main(args): + + calibration_ds = get_ds(args) + try: + tokenizer = transformers.AutoTokenizer.from_pretrained( + args.model, + model_max_length=args.max_model_length, + padding_side="left", + use_fast=False, + ) + except (OSError, ValueError, RuntimeError, ImportError): + tokenizer = transformers.AutoTokenizer.from_pretrained( + args.model, + model_max_length=args.max_model_length, + padding_side="left", + use_fast=True, + ) + + chat_template = load_chat_template(args.chat_template) if args.chat_template else None + + print("Creating calibration dataset...") + inputs = [] + for _, row in calibration_ds.iterrows(): + question = row["question"] + system_prompt = row["system_prompt"] + if "mixtral" in args.model or "Mixtral" in args.model: + tmp_conversation = [{"role": "user", "content": question}, {"role": "assistant", "content": system_prompt}] + else: + tmp_conversation = [{"role": "system", "content": system_prompt}, {"role": "user", "content": question}] + try: + tmp_input = tokenizer.apply_chat_template(tmp_conversation, + chat_template=chat_template, + tokenize=False, + truncation=True) + except ValueError: + # Case when given model don't need any chat-template and can process raw string without any system tokens, + # e.g. facebook/opt-125m + tmp_input = f"{system_prompt}. {question}" + inputs.append(tmp_input) + + calibration_ds['input'] = inputs + + print("Saving calibration dataset...") + calibration_ds.to_pickle(f"{args.output_name}-calibration-dataset.pkl") + print("Done.") + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser(description="Create a calibration dataset for a model.") + parser.add_argument("-d", "--dataset", type=str, required=True) + parser.add_argument("-m", "--model", type=str, required=True) + parser.add_argument("-o", "--output_name", type=str, required=True) + parser.add_argument("--max-model-length", type=int, default=1024) + parser.add_argument("--max-dataset-samples", type=int, default=0) + parser.add_argument("--chat-template", + type=str, + default="", + help="If not provided, the default chat-template from the model will be used.") + + args = parser.parse_args() + + main(args) diff --git a/calibration/step-2-measure-scales.py b/calibration/step-2-measure-scales.py new file mode 100755 index 000000000..22790106e --- /dev/null +++ b/calibration/step-2-measure-scales.py @@ -0,0 +1,197 @@ +############################################################################### +# Copyright (C) 2024 Habana Labs, Ltd. 
an Intel Company +############################################################################### +import vllm +import torch +import pandas as pd +import time +import argparse +import os + +os.environ["PT_HPU_WEIGHT_SHARING"] = "0" +os.environ["VLLM_SKIP_WARMUP"] = "true" + + +def get_ds(args): + print(f"Loading dataset: {args.dataset}") + ds = pd.read_pickle(args.dataset) + + if args.max_dataset_samples: + ds = ds.head(args.max_dataset_samples) + + return ds + + +def get_dataset(args): + + def reset_seed(seed=42): + import torch + import random + import numpy as np + + torch.manual_seed(seed) + np.random.seed(seed) + random.seed(seed) + + def get_prompt_token_ids(model_path, prompts, max_length=1024): + from transformers import AutoTokenizer + + tokenizer = AutoTokenizer.from_pretrained(model_path) + prompt_token_ids = [] + for prompt in prompts: + tokens = tokenizer( + prompt, + return_tensors="pt", + truncation=True, + max_length=max_length, + ) + if len(tokens.input_ids[0]) < max_length: + continue + prompt_token_ids.append([x.item() for x in tokens.input_ids[0]]) + return prompt_token_ids + + def get_prompts( + model_name, + dataset_name="NeelNanda/pile-10k", + num_samples=512, + least_tokens=1024, + ): + print(f"Loading {num_samples} samples with at least {least_tokens} tokens " + f"from {dataset_name} for model {model_name}...") + from datasets import load_dataset + from tqdm import tqdm + import transformers + + seed = 42 + + reset_seed(seed) + + dataset = load_dataset(dataset_name, split="train") + dataset = dataset.shuffle(seed=seed) + + tokenizer = transformers.AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) + num_sample = 0 + samples_lst = [] + for data in tqdm(dataset): + prompt = data["text"] + tokens = tokenizer(prompt, return_tensors="pt") + if len(tokens.input_ids[0]) < least_tokens: + continue + num_sample += 1 + samples_lst.append(prompt) + if num_sample >= num_samples: + break + return samples_lst + + least_tokens = args.sample_len + num_samples = args.max_dataset_samples + try: + prompts = get_prompts( + args.model, + dataset_name=args.dataset, + num_samples=num_samples, + least_tokens=least_tokens, + ) + except (OSError, ValueError, RuntimeError, ImportError, ConnectionError, FileNotFoundError) as e: + import sys + sys.exit(f"Failed to load prompts from dataset {args.dataset}. 
Error: {e}") + prompt_token_ids = get_prompt_token_ids(args.model, prompts, least_tokens) + print(f"Got {len(prompts)} prompts, length of first prompt: {len(prompt_token_ids[0])}.") + gt = None + return prompts, prompt_token_ids, gt + + +def generate_responses(llm, input_batch, args, sampling_params=None, prompt_token_ids=None): + if prompt_token_ids: + input_batch = [{"prompt_token_ids": p} for p in prompt_token_ids] + responses = llm.generate(input_batch, sampling_params, use_tqdm=True) + + total_input_tokens = 0 + total_generated_tokens = 0 + + for response in responses: + if args.verbose: + print(f"Prompt: {response.prompt};\nAnswer: {response.outputs[0].text}\n") + total_input_tokens += len(response.prompt_token_ids) + total_generated_tokens += len(response.outputs[0].token_ids) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("-d", "--dataset", type=str, required=True) + parser.add_argument("-m", "--model", type=str, required=True) + parser.add_argument("--batch-size", type=int, default=32) + parser.add_argument("--tensor-parallel-size", type=int, default=1) + parser.add_argument("--max-dataset-samples", type=int, default=0) + parser.add_argument("--max-num-prefill-seqs", type=int, default=None) + parser.add_argument("--block-quant", action="store_true", default=False) + parser.add_argument("--expert-parallel", action="store_true", default=False) + parser.add_argument( + "--auto-process-dataset", + action="store_true", + default=False, + help="Automatically generate a calibration dataset based on the provided dataset name.", + ) + parser.add_argument("--enforce-eager", action="store_true", default=False) + parser.add_argument("--max-model-len", type=int, default=2048) + parser.add_argument("--max-tokens", type=int, default=1024, help="Maximum number of tokens to generate.") + parser.add_argument("--sample-len", type=int, default=1024, help="Minimum number of tokens in each sample.") + parser.add_argument("-v", "--verbose", action="store_true") + parser.add_argument( + "--distributed-executor-backend", + choices=["mp", "ray"], + default="mp", + help= + "For single node calibration use the default multiprocessing backend. 
" \ + "For multi-node calibration use ray backend" + ) + + args = parser.parse_args() + if not args.auto_process_dataset: + calibration_ds = get_ds(args) + llm = vllm.LLM( + model=args.model, + dtype=torch.bfloat16, + enforce_eager=args.enforce_eager, + quantization="fp8" if args.block_quant else "inc", + max_num_seqs=args.batch_size, + tensor_parallel_size=args.tensor_parallel_size, + max_model_len=args.max_model_len, + trust_remote_code=True, + distributed_executor_backend=args.distributed_executor_backend, + enable_expert_parallel=args.expert_parallel, + ) + + sampling_params = vllm.SamplingParams(temperature=0.0, top_p=1, max_tokens=args.max_tokens) + + if not args.auto_process_dataset: + input_batch = [] + dataset_len = len(calibration_ds) + batch_num = dataset_len // args.batch_size if dataset_len % args.batch_size == 0 else (dataset_len // + args.batch_size) + 1 + batch_done = 0 + for i, (_, row) in enumerate(calibration_ds.iterrows()): + input_batch.append(row["input"]) + if i and i % args.batch_size == 0: + t_start = time.perf_counter() + generate_responses(llm, input_batch, args) + t_end = time.perf_counter() + batch_done += 1 + print(f"Batch finished: {i}/{calibration_ds.shape[0]} samples done; ETA: " + f"{int((t_end - t_start) * (batch_num - batch_done) // 60)} min") + input_batch = [] + generate_responses(llm, input_batch, args) + print(f"Last batch finished: {i + 1}/{calibration_ds.shape[0]} samples done") + else: + prompts, prompt_token_ids, gt = get_dataset(args) + generate_responses( + llm=llm, + input_batch=None, + args=args, + sampling_params=sampling_params, + prompt_token_ids=prompt_token_ids, + ) + + # Skip shutdown when VLLM_USE_V1 is set to "1" + if not os.environ.get("VLLM_USE_V1") or os.environ.get("VLLM_USE_V1") != "1": + llm.llm_engine.model_executor.shutdown() diff --git a/calibration/step-3-postprocess-measure.py b/calibration/step-3-postprocess-measure.py new file mode 100755 index 000000000..2d51cb698 --- /dev/null +++ b/calibration/step-3-postprocess-measure.py @@ -0,0 +1,122 @@ +############################################################################### +# Copyright (C) 2024 Habana Labs, Ltd. 
an Intel Company +############################################################################### +import argparse +import json +import os +import sys + +import numpy as np + + +def fix_cache_inputs(json_data, args): + layer_indexes = set([int(key.split('.')[2]) for key in json_data['Nodes'] if key.startswith('model.layers.')]) + for layer_index in range(len(layer_indexes)): + matmul_av_input = None + v_cache_input = None + matmul_qk_input = None + k_cache_input = None + + attn_name = "attn" + k_cache_name = "k_cache" + v_cache_name = "v_cache" + if args.deepseek: + attn_name = "mla_attn.mla_attn" + k_cache_name = "latent_cache_k" + + matmul_av_key = f'model.layers.{layer_index}.self_attn.{attn_name}.impl.matmul_av' + v_cache_key = f'model.layers.{layer_index}.self_attn.{attn_name}.impl.{v_cache_name}' + matmul_qk_key = f'model.layers.{layer_index}.self_attn.{attn_name}.impl.matmul_qk' + k_cache_key = f'model.layers.{layer_index}.self_attn.{attn_name}.impl.{k_cache_name}' + + matmul_av_input = json_data['Nodes'].get(matmul_av_key, {}).get('inputs', [None, None])[1] + v_cache_input = json_data['Nodes'].get(v_cache_key, {}).get('inputs', [None])[0] + matmul_qk_input = json_data['Nodes'].get(matmul_qk_key, {}).get('inputs', [None, None])[1] + k_cache_input = json_data['Nodes'].get(k_cache_key, {}).get('inputs', [None])[0] + + if matmul_av_input != v_cache_input: + if args.deepseek: + # For deepseek, there is one tensor for k_cache and v_cache + json_data['Nodes'][matmul_av_key]['inputs'][1] = k_cache_input + else: + json_data['Nodes'][matmul_av_key]['inputs'][1] = v_cache_input + if matmul_qk_input != k_cache_input: + json_data['Nodes'][matmul_qk_key]['inputs'][1] = k_cache_input + + return json_data + + +def parse_args(args): + parser = argparse.ArgumentParser(description="Run the measurements parser", + formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser.add_argument("-m", + "--measurements", + type=str, + help="full path to the directory of the measurements that should be fixed") + parser.add_argument( + "-o", + "--out", + type=str, + default=os.getcwd(), + help="path to the directory where the fixed measurements will be written", + ) + parser.add_argument( + "-d", + "--deepseek", + action="store_true", + help="if handle deepseek models, please set this flag", + ) + return parser.parse_args(args) + + +def main(args): + args = parse_args(args) + output_path = args.out + if not os.path.exists(output_path): + os.makedirs(output_path) + measurements_path = args.measurements + measurements_paths = os.listdir(measurements_path) + measurements_paths_ranges = [ + measurement_path for measurement_path in measurements_paths if measurement_path.endswith(".json") + and 'MAXABS_HW' not in measurement_path and "mod_list" not in measurement_path + ] + measurements_paths_scales = [ + measurement_path for measurement_path in measurements_paths + if measurement_path.endswith(".json") and 'MAXABS_HW' in measurement_path and "mod_list" not in measurement_path + ] + print(measurements_paths_ranges) + print(measurements_paths_scales) + for measurement in measurements_paths_ranges + measurements_paths_scales: + fixed_json_path = os.path.join(output_path, f"{measurement.split(os.sep)[-1]}") + with open(fixed_json_path, "w") as fixed_json_file, \ + open(os.path.join(measurements_path, measurement)) as json_file: + data_to_fix = json.load(json_file) + fixed_data = fix_cache_inputs(data_to_fix, args) + json.dump(fixed_data, fixed_json_file) + print("") + print("measurement=", measurement, flush=True) + 
print("measurements_paths_scales=", measurements_paths_scales, flush=True) + if measurement in measurements_paths_ranges + measurements_paths_scales: + global_rank = fixed_data["GlobalRank"] + local_rank = fixed_data["LocalRank"] + mode = fixed_data["Mode"] + nodes = fixed_data["Nodes"] + layers = {} + fixed_npz_path = fixed_json_path.replace(".json", ".npz") + for layer, dlayer in nodes.items(): + layers[layer] = {} + layers[layer]["inputs"] = [np.array(x) for x in dlayer["inputs"]] + if dlayer.get("outputs") is not None: + layers[layer]["outputs"] = [np.array(x) for x in dlayer["outputs"]] + if dlayer.get("params") is not None and dlayer["params"].get("weight") is not None: + layers[layer]["params"] = {} + layers[layer]["params"]["weight"] = np.array(dlayer["params"]["weight"]) + df = {"GlobalRank": global_rank, "LocalRank": local_rank, "Mode": mode, "Nodes": layers} + with open(fixed_npz_path, "w"): + np.savez(fixed_npz_path, df) + + print("finished fix_measurements script") + + +if __name__ == "__main__": + main(sys.argv[1:]) diff --git a/calibration/step-4-quantize-scales.py b/calibration/step-4-quantize-scales.py new file mode 100755 index 000000000..a13061498 --- /dev/null +++ b/calibration/step-4-quantize-scales.py @@ -0,0 +1,47 @@ +############################################################################### +# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company +############################################################################### +import vllm +import torch +import argparse +import os + +os.environ["PT_HPU_WEIGHT_SHARING"] = "0" +os.environ["VLLM_SKIP_WARMUP"] = "true" + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + parser.add_argument("--model", type=str, required=True) + parser.add_argument("--tensor-parallel-size", type=int, default=1) + parser.add_argument("--block-quant", action="store_true", default=False) + parser.add_argument("--enforce-eager", action="store_true", default=False) + parser.add_argument("--expert-parallel", action="store_true", default=False) + parser.add_argument("--max-num-prefill-seqs", type=int, default=None) + parser.add_argument( + "--distributed-executor-backend", + choices=["mp", "ray"], + default="mp", + help= + "For single node calibration use the default multiprocessing backend. " \ + "For multi-node calibration use ray backend" + ) + + args = parser.parse_args() + + llm = vllm.LLM( + model=args.model, + tensor_parallel_size=args.tensor_parallel_size, + enforce_eager=args.enforce_eager, + dtype=torch.bfloat16, + quantization="fp8" if args.block_quant else "inc", + kv_cache_dtype="fp8_inc", + max_model_len=128, + trust_remote_code=True, + distributed_executor_backend=args.distributed_executor_backend, + enable_expert_parallel=args.expert_parallel, + ) + + # Skip shutdown when VLLM_USE_V1 is set to "1" + if not os.environ.get("VLLM_USE_V1") or os.environ.get("VLLM_USE_V1") != "1": + llm.llm_engine.model_executor.shutdown() diff --git a/calibration/step-5-unify_measurements.py b/calibration/step-5-unify_measurements.py new file mode 100755 index 000000000..5d283ea00 --- /dev/null +++ b/calibration/step-5-unify_measurements.py @@ -0,0 +1,317 @@ +############################################################################### +# Copyright (C) 2024 Habana Labs, Ltd. 
an Intel Company +############################################################################### +import argparse +import glob +import json +import os +import re +import sys + +import numpy as np + + +def find_measurement_path(measurement, measurements_dir_path, scales, group_size): + measurment_card = "_" + measurement + "_" + str(group_size) + for measurment_file in os.listdir(measurements_dir_path): + filename = os.fsdecode(measurment_file) + if (not filename.endswith(".json") or "_mod_list" in filename or measurment_card not in filename): + continue + if scales: + if "MAXABS" in filename: + return os.path.join(measurements_dir_path, measurment_file) + else: + if "MAXABS" not in filename: + return os.path.join(measurements_dir_path, measurment_file) + + +def is_fused_moe_op(node_name): + return ("moe" in node_name.lower() and ".w13_list" not in node_name and ".w2_list" not in node_name) + + +def is_moe_experts(node_name): + return ("moe" in node_name.lower() and (".w13_list" in node_name or ".w2_list" in node_name)) + + +def get_expert_id(node_name): + parts = node_name.split(".") + assert parts[-1].isdigit() + expert_id = int(parts[-1]) + return expert_id + + +def get_expert_prefix(node_name): + parts = node_name.split(".") + assert parts[-1].isdigit() + prefix = ".".join(parts[:-1]) + return prefix + + +def get_local_expert_num(data): + expert_id = -1 + for mod_name in data: + if is_moe_experts(mod_name): + idx = get_expert_id(mod_name) + expert_id = max(expert_id, idx) + return expert_id + 1 + + +def unify_measurements(measurement_group, + measurements_dir_path, + output_path, + groups_size, + groups_num, + group_index, + scales=False, + use_ep=False): + measurements_paths = [] + group_name = "" + + # save all the jsons paths in the given measurement group + for measurement in measurement_group: + measurement_path = find_measurement_path(measurement, measurements_dir_path, scales, groups_size) + if measurement_path is not None: + measurements_paths.append(measurement_path) + group_name += measurement + + if len(measurements_paths) == 0: + print("Error: invalid measurement paths. 
No *.json files or no " + "*mod_list.json files.") + return + + # save all the jsons content in the given measurement group + measurements_jsons = [] + for measurement_path in measurements_paths: + with open(measurement_path) as f: + js = json.load(f) + measurements_jsons.append(js["Nodes"]) + # create a name for the unified json that will be created for this measurement group + base_path = find_measurement_path(measurement_group[0], measurements_dir_path, scales, groups_size) + old_suffix = "_" + measurement_group[0] + "_" + str(groups_size) + new_suffix = "_" + str(group_index) + "_" + str(groups_num) + unified_json_name = base_path.split("/")[-1].replace(old_suffix, new_suffix) + unified_json_path = os.path.join(output_path, unified_json_name) + + # open a unified json file + with open(measurements_paths[0]) as origin, open(unified_json_path, "w") as copy: + copy.write(origin.read()) + with open(unified_json_path) as json_file: + unified_json = json.load(json_file) + unified_json["LocalRank"] = group_index if groups_num != 1 else -1 + + moe_experts_data = {} + # expert_num is original local_expert_num, it is used only when use_ep is True + expert_num = get_local_expert_num(unified_json["Nodes"]) if use_ep else -1 + + # iterate all unified json nodes + for node_name, node_values in unified_json["Nodes"].items(): + max_inputs = node_values["inputs"] + max_outputs = None + if node_values.get("outputs") is not None: + max_outputs = node_values["outputs"] + max_weight = None + if node_values.get("params") is not None and node_values["params"].get("weight") is not None: + max_weight = node_values["params"]["weight"] + + # iterate over all the measurment group and take the maximum for each tensor and its channel + if scales: + for idx, measurement_json in enumerate(measurements_jsons): + # for experts of moe, append results in all measurements + if use_ep and is_moe_experts(node_name): + if node_name not in moe_experts_data: + moe_experts_data[node_name] = node_values + else: + prefix, local_expert_id = get_expert_prefix(node_name), get_expert_id(node_name) + # take original total_rank=8, total_expert_num=128, + # local_expert_num=16 and expert string.MoeOp.w13_list.11 on rank 3 + # if target total_rank=4, then new local_expert_num=32, + # new expert is string.MoeOp.w13_list.27(16*1+11) on rank 1 + new_node_name = ".".join((prefix, str(expert_num * idx + local_expert_id))) + assert new_node_name not in moe_experts_data + moe_experts_data[new_node_name] = measurement_json[node_name] + continue + + # for moe op, keep max of the first, retain rest from other measurements + if use_ep and is_fused_moe_op(node_name) and idx > 0: + # input 0 of moe is hidden_states, we should get the max value + # across ranks during unification + # input 1 ~ local_expert_num is the intermidiate_amax of each + # expert, we should extend them during unification + max_inputs[0] = max(measurement_json[node_name]["inputs"][0], max_inputs[0]) + max_inputs.extend(measurement_json[node_name]["inputs"][1:]) + else: + for i in range(0, len(max_inputs)): + max_inputs[i] = max(measurement_json[node_name]["inputs"][i], max_inputs[i]) + if max_outputs is not None: + max_outputs = max(measurement_json[node_name]["outputs"], max_outputs) + if max_weight is not None: + max_weight = max(measurement_json[node_name]["params"]["weight"], max_weight) + else: + for idx, measurement_json in enumerate(measurements_jsons): + # for experts of moe, append results in all measurements + if use_ep and is_moe_experts(node_name): + if node_name not 
in moe_experts_data: + moe_experts_data[node_name] = node_values + else: + prefix, local_expert_id = get_expert_prefix(node_name), get_expert_id(node_name) + new_node_name = ".".join((prefix, str(expert_num * idx + local_expert_id))) + assert new_node_name not in moe_experts_data + moe_experts_data[new_node_name] = measurement_json[node_name] + continue + + for i in range(0, len(max_inputs)): + for j in range(0, len(max_inputs[i])): + max_inputs[i][j][0] = max(measurement_json[node_name]["inputs"][i][j][0], max_inputs[i][j][0]) + if max_outputs is not None: + if use_ep and is_fused_moe_op(node_name) and idx > 0: + max_outputs[0][0] = max(measurement_json[node_name]["outputs"][0][0], max_outputs[0][0]) + max_outputs.extend(measurement_json[node_name]["outputs"][1:]) + else: + for i in range(0, len(max_outputs)): + max_outputs[i][0] = max(measurement_json[node_name]["outputs"][i][0], max_outputs[i][0]) + if max_weight is not None: + for i in range(0, len(max_weight)): + max_weight[i][0] = max(measurement_json[node_name]["params"]["weight"][i][0], max_weight[i][0]) + + # update the maximum in the unified json + if scales: + for i in range(0, len(max_inputs)): + unified_json["Nodes"][node_name]["inputs"][i] = max_inputs[i] + if max_outputs is not None: + unified_json["Nodes"][node_name]["outputs"] = max_outputs + if max_weight is not None: + unified_json["Nodes"][node_name]["params"]["weight"] = max_weight + else: + for i in range(0, len(max_inputs)): + for j in range(0, len(max_inputs[i])): + unified_json["Nodes"][node_name]["inputs"][i][j][0] = max_inputs[i][j][0] + if max_outputs is not None: + for i in range(0, len(max_outputs)): + unified_json["Nodes"][node_name]["outputs"][i][0] = max_outputs[i][0] + if max_weight is not None: + for i in range(0, len(max_weight)): + unified_json["Nodes"][node_name]["params"]["weight"][i][0] = max_weight[i][0] + if use_ep: + unified_json["Nodes"].update(moe_experts_data) + global_rank = None + local_rank = group_index if groups_num != 1 else -1 + mode = "" + layers = {} + with open(unified_json_path, "w") as json_file: + json.dump(unified_json, json_file, indent=4) + mode = unified_json["Mode"] + nodes = unified_json["Nodes"] + + # create unified npz file from the unified json + unified_npz_path = os.path.join(output_path, unified_json_name.replace(".json", ".npz")) + for layer, dlayer in nodes.items(): + layers[layer] = {} + layers[layer]["inputs"] = [np.array(x) for x in dlayer["inputs"]] + if dlayer.get("outputs") is not None: + layers[layer]["outputs"] = [np.array(x) for x in dlayer["outputs"]] + if dlayer.get("params") is not None and dlayer["params"].get("weight") is not None: + layers[layer]["params"] = {} + layers[layer]["params"]["weight"] = np.array(dlayer["params"]["weight"]) + df = {"GlobalRank": global_rank, "LocalRank": local_rank, "Mode": mode, "Nodes": layers} + with open(unified_npz_path, "w"): + np.savez(unified_npz_path, df) + + +def parse_args(args): + parser = argparse.ArgumentParser(description="Run the measurements parser", + formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser.add_argument("-m", + "--measurements", + type=str, + help="path to the directory of the measurements that will be " + "unified") + parser.add_argument("-r", "--rank", type=int, help="rank of unified measurements") + parser.add_argument( + "-o", + "--out", + type=str, + default=os.getcwd(), + help="path to the directory where the unified measurements will be " + "written", + ) + parser.add_argument( + "-u", + "--use_expert_paral", + 
action="store_true", + help="unify original measurement results based on expert parallelism " + "rules", + ) + parser.add_argument( + "-s", + "--skip_unify_scales", + action="store_true", + help="skip the scale unification step.", + ) + return parser.parse_args(args) + + +def prepare_group_list(measurements_path, rank): + measure_files = glob.glob(os.path.join(measurements_path, "*_mod_list.json")) + if len(measure_files) > 0: + # take original rank=8 as an example, target file name: string_0_8_mod_list.json + matched = re.match(r"^(\w+)_(\d+)_(\d+)_(\w+)_(\w+)\.json$", os.path.basename(measure_files[0])) + if matched: + total_rank = int(matched.group(3)) + assert (rank < total_rank) and (total_rank % rank) == 0, ( + f"Original total_rank {total_rank} should be larger than your " + f"target rank {rank} and be divisible by it") + group_size = total_rank // rank + group_list = [[str(i * group_size + j) for j in range(group_size)] for i in range(rank)] + print("Card grouping list >> {}".format(group_list)) + return group_list + else: + raise ValueError("Unrecognized file name!") + else: + raise ValueError("*_mod_list.json doesn't exist in {}".format(measurements_path)) + + +def main(args): + args = parse_args(args) + output_path = args.out + if not os.path.exists(output_path): + os.mkdir(output_path) + measurements_path = args.measurements + groups = prepare_group_list(measurements_path, args.rank) + + num_jsons_drange = 0 + num_jsons_scales = 0 + for path in os.listdir(measurements_path): + if path.endswith(".json"): + if "MAXABS" in path: + num_jsons_scales += 1 + elif "mod_list" not in path: + num_jsons_drange += 1 + assert (os.path.isdir(measurements_path) and (num_jsons_drange % len(groups)) == 0 + and (num_jsons_scales % len(groups)) == 0) + + for group_index, group in enumerate(groups): + unify_measurements(group, + measurements_path, + output_path, + num_jsons_drange, + len(groups), + group_index, + scales=False, + use_ep=args.use_expert_paral) + if not args.skip_unify_scales: + unify_measurements( + group, + measurements_path, + output_path, + num_jsons_scales, + len(groups), + group_index, + scales=True, + use_ep=args.use_expert_paral, + ) + + print("finished measurement unifier script") + + +if __name__ == "__main__": + main(sys.argv[1:]) diff --git a/calibration/step-6-expand-measurements.py b/calibration/step-6-expand-measurements.py new file mode 100644 index 000000000..c6dabb5da --- /dev/null +++ b/calibration/step-6-expand-measurements.py @@ -0,0 +1,213 @@ +############################################################################### +# Copyright (C) 2025 Habana Labs, Ltd. 
an Intel Company +############################################################################### +import argparse +import json +import os +import sys + +import numpy as np +import logging + +# from loguru import logger +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +def find_measurement_path(measurement, measurements_dir_path, group_size): + measurement_card = "_" + measurement + "_" + str(group_size) + for measurement_file in os.listdir(measurements_dir_path): + filename = os.fsdecode(measurement_file) + if (not filename.endswith(".json") or "_mod_list" in filename or measurement_card not in filename): + continue + if "MAXABS" not in filename: + return os.path.join(measurements_dir_path, measurement_file) + + +def is_fused_moe_op(node_name): + return ("moe" in node_name.lower() and ".w13_list" not in node_name and ".w2_list" not in node_name) + + +def is_moe_experts(node_name): + # model.layers.3.mlp.experts.moe_op.w13_list.0 + return ("moe" in node_name.lower() and (".w13_list" in node_name or ".w2_list" in node_name)) + + +def get_expert_id(node_name): + parts = node_name.split(".") + assert parts[-1].isdigit() + expert_id = int(parts[-1]) + return expert_id + + +def get_expert_prefix(node_name): + parts = node_name.split(".") + assert parts[-1].isdigit() + prefix = ".".join(parts[:-1]) + return prefix + + +def get_local_expert_num(data): + expert_id = -1 + for mod_name in data: + if is_moe_experts(mod_name): + idx = get_expert_id(mod_name) + expert_id = max(expert_id, idx) + return expert_id + 1 + + +def expand_measurements( + measurements_dir_path, + output_path, + local_rank, + world_size, +): + measurement_group = ["0"] + # save all the jsons paths in the given measurement group + groups_size = 1 + unified_measurement_index = "0" + measurement_path = find_measurement_path(unified_measurement_index, measurements_dir_path, groups_size) + measurements_jsons = [] + + with open(measurement_path) as f: + js = json.load(f) + measurements_jsons.append(js["Nodes"]) + # New json file name + new_json_name = (find_measurement_path(measurement_group[0], measurements_dir_path, + groups_size).split("/")[-1].replace( + "_" + measurement_group[0] + "_" + str(groups_size), + "_" + str(local_rank) + "_" + str(world_size), + )) + logger.info( + "Generating new json file: %s with local_rank %d and world_size %d", + new_json_name, + local_rank, + world_size, + ) + new_json_path = os.path.join(output_path, new_json_name) + + # Create a replica of the measurement json file + with open(measurement_path) as origin, open(new_json_path, "w") as copy: + copy.write(origin.read()) + with open(new_json_path) as json_file: + new_json = json.load(json_file) + new_json["LocalRank"] = local_rank + + expert_num = get_local_expert_num(new_json["Nodes"]) + total_experts = expert_num + + # Iterate all nodes + for node_name, node_values in new_json["Nodes"].items(): + max_outputs = None + if node_values.get("outputs") is not None: + max_outputs = node_values["outputs"] + + # iterate over all the measurements and update the fused moe op with selected experts data + + for idx, measurement_json in enumerate(measurements_jsons): + if is_fused_moe_op(node_name): + node_res = measurement_json[node_name]["outputs"] + node_res_output = node_res[0] + node_res_experts_intermediate_amax = node_res[1:] + num_intermediate_amax = len(node_res_experts_intermediate_amax) + assert num_intermediate_amax == total_experts, ( + f"the number of intermediate amax should be {total_experts}, but got 
{num_intermediate_amax}") + ep_size = world_size + ep_rank = local_rank + num_local_experts = total_experts // ep_size + expert_start_index = ep_rank * num_local_experts + expert_end_index = expert_start_index + num_local_experts + node_intermediate_amax = node_res_experts_intermediate_amax[expert_start_index:expert_end_index] + assert len(node_intermediate_amax) == num_local_experts, ( + f"len(node_intermediate_amax) should be {num_local_experts}, but got {len(node_intermediate_amax)}") + max_outputs = [node_res_output, *node_intermediate_amax] + logger.debug( + "Selecting %d outputs for %s " + "ep_rank %d with expert_start_index %d and expert_end_index %d", + len(max_outputs), + node_name, + ep_rank, + expert_start_index, + expert_end_index, + ) + + if max_outputs is not None and is_fused_moe_op(node_name): + new_json["Nodes"][node_name]["outputs"] = max_outputs + + global_rank = None + local_rank = local_rank + mode = "" + layers = {} + with open(new_json_path, "w") as json_file: + json.dump(new_json, json_file, indent=4) + mode = new_json["Mode"] + nodes = new_json["Nodes"] + + # create unified npz file from the new json + unified_npz_path = os.path.join(output_path, new_json_name.replace(".json", ".npz")) + for layer, dlayer in nodes.items(): + layers[layer] = {} + layers[layer]["inputs"] = [np.array(x) for x in dlayer["inputs"]] + if dlayer.get("outputs") is not None: + layers[layer]["outputs"] = [np.array(x) for x in dlayer["outputs"]] + if (dlayer.get("params") is not None and dlayer["params"].get("weight") is not None): + layers[layer]["params"] = {} + layers[layer]["params"]["weight"] = np.array(dlayer["params"]["weight"]) + df = { + "GlobalRank": global_rank, + "LocalRank": local_rank, + "Mode": mode, + "Nodes": layers, + } + with open(unified_npz_path, "w"): + np.savez(unified_npz_path, df) + + +def parse_args(args): + parser = argparse.ArgumentParser( + description="Run the measurements parser", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + ) + parser.add_argument( + "-m", + "--measurements", + type=str, + help="path to the directory of the measurements that have been unified", + ) + parser.add_argument( + "-o", + "--out", + type=str, + default=os.getcwd(), + help="path to the directory where the expand measurements will be written", + ) + parser.add_argument( + "-w", + "--target_world_size", + type=int, + help="The target number of ranks to expand the measurements to.", + ) + return parser.parse_args(args) + + +def main(args): + args = parse_args(args) + output_path = args.out + if not os.path.exists(output_path): + os.mkdir(output_path) + measurements_path = args.measurements + + target_world_size = args.target_world_size + for ep_rank in range(target_world_size): + expand_measurements( + measurements_dir_path=measurements_path, + output_path=output_path, + local_rank=ep_rank, + world_size=target_world_size, + ) + + logger.info("finished expanding measurements for %d ranks", target_world_size) + + +if __name__ == "__main__": + main(sys.argv[1:]) diff --git a/calibration/template/llama-2-chat.jinja b/calibration/template/llama-2-chat.jinja new file mode 100644 index 000000000..78c348f0f --- /dev/null +++ b/calibration/template/llama-2-chat.jinja @@ -0,0 +1,24 @@ +{% if messages[0]['role'] == 'system' %} + {% set system_message = '<>\n' + messages[0]['content'] | trim + '\n<>\n\n' %} + {% set messages = messages[1:] %} +{% else %} + {% set system_message = '' %} +{% endif %} + +{% for message in messages %} + {% if (message['role'] == 'user') != (loop.index0 % 2 
== 0) %}
+ {{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}
+ {% endif %}
+
+ {% if loop.index0 == 0 %}
+ {% set content = system_message + message['content'] %}
+ {% else %}
+ {% set content = message['content'] %}
+ {% endif %}
+
+ {% if message['role'] == 'user' %}
+ {{ bos_token + '[INST] ' + content | trim + ' [/INST]' }}
+ {% elif message['role'] == 'assistant' %}
+ {{ ' ' + content | trim + ' ' + eos_token }}
+ {% endif %}
+{% endfor %}
diff --git a/calibration/template/mistral_mixtral.jinja b/calibration/template/mistral_mixtral.jinja
new file mode 100644
index 000000000..3fd756364
--- /dev/null
+++ b/calibration/template/mistral_mixtral.jinja
@@ -0,0 +1,19 @@
+{% if messages[0]['role'] == 'system' %}
+ {% set system_message = messages[0]['content'] | trim + '\n\n' %}
+ {% set messages = messages[1:] %}
+{% else %}
+ {% set system_message = '' %}
+{% endif %}
+
+{{ bos_token + system_message}}
+{% for message in messages %}
+ {% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}
+ {{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}
+ {% endif %}
+
+ {% if message['role'] == 'user' %}
+ {{ '[INST] ' + message['content'] | trim + ' [/INST]' }}
+ {% elif message['role'] == 'assistant' %}
+ {{ ' ' + message['content'] | trim + eos_token }}
+ {% endif %}
+{% endfor %}
diff --git a/calibration/unify-and-expand.png b/calibration/unify-and-expand.png
new file mode 100644
index 000000000..3c113fbf8
Binary files /dev/null and b/calibration/unify-and-expand.png differ
diff --git a/calibration/vlm-calibration/README.md b/calibration/vlm-calibration/README.md
new file mode 100644
index 000000000..c83cd2189
--- /dev/null
+++ b/calibration/vlm-calibration/README.md
@@ -0,0 +1,19 @@
+# FP8 Calibration Procedure for VLM models
+
+The calibration procedure for VLM models differs slightly from the LLM flow and has been adapted accordingly. To simplify the process, we've provided the `calibrate_model.sh` script. It requires the following arguments:
+
+- `-m`, i.e., **model stub or path:** Path to your model (if stored locally) or the model ID from the Hugging Face Hub.
+- `-d`, i.e., **dir to the source dataset:** Path to a Hugging Face cache directory that stores the calibration dataset, which is currently hard-coded to MMMU. The script validates this path, so make sure it meets one of the following conditions (see the sketch after this list): (1) the directory contains both the raw and the processed MMMU dataset, i.e. `${your_dataset_dir}/hub` holds the raw dataset and `${your_dataset_dir}/datasets` holds the processed one; (2) the directory `${your_dataset_dir}` contains the processed dataset; (3) if no dataset path is provided, the dataset is downloaded directly from Hugging Face.
+- `-o`, i.e., **output path:** Path to the directory where the generated measurements, etc., will be stored.
+- `-t`, i.e., **tensor parallel size:** Tensor parallel size to run at. 
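+
+As a quick sanity check, the snippet below is a small sketch of how a local `-d` directory can be verified against the layouts listed above; it simply mirrors the directory checks performed in `calibrate_model.sh`, and the `DATASET_PATH` value is a placeholder for your own cache directory:
+
+```bash
+DATASET_PATH=/path/to/your_dataset_dir   # placeholder, replace with your local cache dir
+
+# Layout (1): raw + processed MMMU dataset under a Hugging Face cache-style directory
+ls -d "$DATASET_PATH/hub/datasets--MMMU--MMMU" "$DATASET_PATH/datasets/MMMU___mmmu"
+
+# Layout (2): processed MMMU dataset only
+ls -d "$DATASET_PATH/MMMU___mmmu"
+```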
+ +Here are some examples of how to use the script: + +```bash +cd vlm-calibration +./calibrate_model.sh \ + -m $MODEL_PATH \ + -o $INC_OUTPUT_PATH \ + -t $TP_SIZE \ + -d $DATASET_PATH +``` diff --git a/calibration/vlm-calibration/calibrate_model.sh b/calibration/vlm-calibration/calibrate_model.sh new file mode 100755 index 000000000..9a35d3fd5 --- /dev/null +++ b/calibration/vlm-calibration/calibrate_model.sh @@ -0,0 +1,219 @@ +#!/bin/bash +############################################################################### +# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company +############################################################################### + +set -e +cd "$(dirname "$0")" + +ALLOWED_DEVICES=("g2" "g3") + +usage() { + echo + echo "Calibrate given MODEL_PATH for FP8 inference" + echo + echo "usage: ${0} " + echo + echo " -m - [required] huggingface stub or local directory of the MODEL_PATH" + echo " -d - [optional] path to source dataset (details in README). If not provided, the dataset will be downloaded from HuggingFace." + echo " -o - [required] path to output directory for fp8 measurements" + echo " -b - batch size to run the measurements at (default: 32)" + echo " -l - limit number of samples in calibration dataset" + echo " -t - tensor parallel size to run at (default: 1); NOTE: if t > 8 then we need a multi-node setup" + echo " -g - groups of cards we want to unify. Card indices seperated by commas and groups seperated by double dash '--', e.g. 0,1--2,3--4,5--6,7 card 0 measurement will be unified with card 1 measurement and so on." + echo " -e - Turn on or off eager mode, default: off" + echo +} + +cleanup_tmp() { + if [[ $(pwd) == *vlm-calibration ]]; then + echo "Clearing temporary directory" + mkdir -p inc_tmp nc_workspace + rm -rf nc_workspace + rm -rf inc_tmp + else + echo "Skipping temporary directory removal" + fi +} + +create_measure_config() { + mkdir -p $1/$2/$3 + + model_name_lower=$(echo "$2" | tr '[:upper:]' '[:lower:]') + + tmp_config="{\"method\": \"HOOKS\",\"mode\": \"MEASURE\",\"observer\": \"maxabs\",\"allowlist\": {\"types\": [], \"names\": []},\"blocklist\": {\"types\": [], \"names\": [\"lm_head\"]},\"quantize_weight\": false,\"dump_stats_path\": \"$1/$2/$3/inc_output\"}" + + echo "$tmp_config" > $1/$2/maxabs_measure_$3.json +} + +create_quant_config() { + mkdir -p $1/$2/$3 + + model_name_lower=$(echo "$2" | tr '[:upper:]' '[:lower:]') + + tmp_config="{\"mode\": \"QUANTIZE\",\"observer\": \"maxabs\",\"scale_method\": \"maxabs_hw\",\"allowlist\": {\"types\": [],\"names\": []},\"blocklist\": {\"types\": [],\"names\": [\"lm_head\"]},\"dump_stats_path\": \"$1/$2/$3/inc_output\"}" + + echo "$tmp_config" > $1/$2/maxabs_quant_$3.json +} + +extract_last_folder_name() { + local path="$1" + + path="${path%/}" + last_folder="$(basename "$path")" + last_folder="${last_folder,,}" + + echo "$last_folder" +} + +cleanup_tmp + +# jump to the script directory +cd "$(dirname "$0")" +echo "downloading requirements..." +pip install -r requirements.txt + +EXTRA_FLAGS="" +BATCH_SIZE=32 +TP_SIZE=1 +eager_mode="off" +while getopts "m:b:l:t:d:h:o:g:e:" OPT; do + case ${OPT} in + m ) + MODEL_PATH="$OPTARG" + ;; + d ) + DATASET_PATH="$OPTARG" + ;; + b ) + BATCH_SIZE="$OPTARG" + ;; + o ) + FP8_DIR=$(realpath "$OPTARG") + ;; + l ) + LIMIT="$OPTARG" + ;; + t ) + TP_SIZE="$OPTARG" + ;; + g ) + CARD_GROUPS="$OPTARG" + ;; + h ) + usage + ;; + e ) + eager_mode="$OPTARG" + ;; + \? 
) + usage + exit 1 + ;; + esac +done + +if [[ -z "$MODEL_PATH" && -z "$FP8_DIR" ]]; then + echo "Model stub and output path for fp8 measurements must be provided." + usage + exit 1 +fi + +if [[ -z "$DATASET_PATH" ]]; then + echo "Local calibration dataset path not provided. Will download it from HuggingFace." +else + echo "Using local calibration dataset path: $DATASET_PATH" + if [[ -d "$DATASET_PATH/hub/datasets--MMMU--MMMU" && -d "$DATASET_PATH/datasets/MMMU___mmmu" ]]; then + export HF_HOME="/root/.cache/huggingface" + echo "copying local calibration dataset $DATASET_PATH to $HF_HOME" + mkdir -p $HF_HOME "$HF_HOME/hub" "$HF_HOME/datasets" + cp -rf "$DATASET_PATH/hub/datasets--MMMU--MMMU" "$HF_HOME/hub" + cp -rf "$DATASET_PATH/datasets/MMMU___mmmu" "$HF_HOME/datasets" + elif [[ -d "$DATASET_PATH/MMMU___mmmu" ]]; then + export HF_DATASETS_CACHE="/root/.cache/huggingface/datasets" + echo "copying local calibration dataset $DATASET_PATH to $HF_DATASETS_CACHE" + mkdir -p $HF_DATASETS_CACHE + cp -rf "$DATASET_PATH/MMMU___mmmu" $HF_DATASETS_CACHE + else + echo "Your provided dataset path doesn't contain MMMU dataset. Please refer to README for details." + exit 1 + fi +fi + + +if [[ $eager_mode == "on" ]]; then + EXTRA_FLAGS+="--enforce-eager " +fi + +# Store the provided MODEL_PATH name in a variable +MODEL_NAME=$(extract_last_folder_name "$MODEL_PATH") + +echo "" +echo "Step 1/3 - detecting used device type [g2, g3]" +DEVICE_TYPE=$(python3 ../step-0-detect-device.py) || (echo "Detecting device process failed" && exit 1) +DEVICE_TYPE="g$DEVICE_TYPE" +echo "Detected device type: $DEVICE_TYPE" +echo "Step 1 done" + +# Check if the provided device type is valid +if [[ ! " ${ALLOWED_DEVICES[*]} " =~ " $DEVICE_TYPE " ]]; then + echo "Invalid device type: $DEVICE_TYPE. 
Allowed devices: ${ALLOWED_DEVICES[*]}" + exit 1 +fi + + +create_measure_config $FP8_DIR $MODEL_NAME $DEVICE_TYPE +create_quant_config $FP8_DIR $MODEL_NAME $DEVICE_TYPE + +if [[ $TP_SIZE > 1 ]]; then + export PT_HPU_ENABLE_LAZY_COLLECTIVES=true +fi +export VLLM_SKIP_WARMUP=true +max_model_len=8192 + + +echo "" +echo "2/3 Measuring scales" +export QUANT_CONFIG=$FP8_DIR/$MODEL_NAME/maxabs_measure_$DEVICE_TYPE.json +# quantization='None' +# kv_cache_dtype='auto' +quantization='inc' +kv_cache_dtype='auto' + +python3 vision_lm_eval.py \ + --max-model-len $max_model_len \ + --model-path $MODEL_PATH \ + --quantization $quantization \ + --kv-cache-dtype $kv_cache_dtype \ + --tensor-parallel-size $TP_SIZE \ + $EXTRA_FLAGS +echo "Step 2/3 done" + + +echo "" +echo "3/3 Quantize scales" +export QUANT_CONFIG=$FP8_DIR/$MODEL_NAME/maxabs_quant_$DEVICE_TYPE.json +quantization='inc' +kv_cache_dtype='fp8_inc' + +python3 vision_lm_eval.py \ + --max-model-len $max_model_len \ + --model-path $MODEL_PATH \ + --quantization $quantization \ + --kv-cache-dtype $kv_cache_dtype \ + --tensor-parallel-size $TP_SIZE \ + $EXTRA_FLAGS + +echo "Step 3/3 done" + + + +if [[ -n $CARD_GROUPS ]]; then + echo "" + echo "Unify scales" + QUANT_DIR=$FP8_DIR/$MODEL_NAME/$DEVICE_TYPE/ + python3 ../step-5-unify_measurements.py -g "$CARD_GROUPS" -m $QUANT_DIR -o $QUANT_DIR || (echo "Error in step 5" && exit 1) + echo "Unify scales done" +fi +cleanup_tmp +echo "Calibration process done" \ No newline at end of file diff --git a/calibration/vlm-calibration/requirements.txt b/calibration/vlm-calibration/requirements.txt new file mode 100644 index 000000000..b0d742b52 --- /dev/null +++ b/calibration/vlm-calibration/requirements.txt @@ -0,0 +1,2 @@ +lm_eval +datasets diff --git a/calibration/vlm-calibration/vision_lm_eval.py b/calibration/vlm-calibration/vision_lm_eval.py new file mode 100644 index 000000000..8feba9d81 --- /dev/null +++ b/calibration/vlm-calibration/vision_lm_eval.py @@ -0,0 +1,96 @@ +# SPDX-License-Identifier: Apache-2.0 +""" +This example shows how to use vLLM for running offline inference with +multi-image input on vision language models for text generation, +using the chat template defined by the model. 
+""" + +from vllm.utils import FlexibleArgumentParser +from vllm.engine.arg_utils import AsyncEngineArgs + +from lm_eval import tasks, evaluator +from lm_eval.models.vllm_vlms import VLLM_VLM + +IMAGE_LIMIT = 1 + + +def run_generate(): + config_template_bf16 = { + "model_name": "REPLACE_ME", + "lm_eval_kwargs": { + "batch_size": "auto" + }, + "vllm_kwargs": { + "pretrained": "REPLACE_ME", + "max_num_seqs": 128, + "max_model_len": 2048, + "dtype": "bfloat16", + "data_parallel_size": 1, + "tensor_parallel_size": args.tensor_parallel_size, + "disable_log_stats": False, + }, + } + config_template_fp8 = { + **config_template_bf16, "vllm_kwargs": { + **config_template_bf16["vllm_kwargs"], + "quantization": args.quantization, + "kv_cache_dtype": args.kv_cache_dtype, + "weights_load_device": args.weights_load_device, + } + } + config_template_vision_fp8 = { + **config_template_fp8, + "lm_eval_kwargs": { + **config_template_fp8["lm_eval_kwargs"], + "max_images": IMAGE_LIMIT, + }, + "vllm_kwargs": { + **config_template_fp8["vllm_kwargs"], + "max_num_seqs": 32, + "use_padding_aware_scheduling": True, + "max_num_prefill_seqs": 1, # TODO: remove when higher prefill batch size will be supported + "disable_log_stats": True, # TODO: investigate error when running with log stats + }, + } + lm_instance_cfg = { + **config_template_vision_fp8, + "model_name": "Meta-Llama-3.2-11B-Vision-Instruct", + "lm_eval_kwargs": { + **config_template_vision_fp8["lm_eval_kwargs"], + "batch_size": 8, + }, + "vllm_kwargs": { + **config_template_vision_fp8["vllm_kwargs"], + "pretrained": args.model_path, + "enforce_eager": args.enforce_eager, + "max_model_len": args.max_model_len, + }, + } + lm = VLLM_VLM(**lm_instance_cfg["vllm_kwargs"], **lm_instance_cfg["lm_eval_kwargs"]) + + task_name = "mmmu_val" + task_manager = tasks.TaskManager(include_path="./meta-configs") + task_dict = tasks.get_task_dict(task_name, task_manager) + eval_kwargs = { + "limit": 1, + "fewshot_as_multiturn": True, + "apply_chat_template": True, + } + + results = evaluator.evaluate(lm=lm, task_dict=task_dict, **eval_kwargs) + return results + + +def main(args): + run_generate() + + +if __name__ == "__main__": + parser = FlexibleArgumentParser(description='Demo on using vLLM for offline inference with ' + 'vision language models that support multi-image input for text ' + 'generation') + parser.add_argument('--model-path', '-p', type=str, default="", help='Huggingface model path') + parser = AsyncEngineArgs.add_cli_args(parser) + + args = parser.parse_args() + main(args) diff --git a/docs/.nav.yml b/docs/.nav.yml index 42ace218e..510922d6a 100644 --- a/docs/.nav.yml +++ b/docs/.nav.yml @@ -15,7 +15,12 @@ nav: - user_guide/* - Configuration: - Summary: configuration/README.md - - configuration/* + - configuration/env_vars.md + - configuration/long_context.md + - configuration/model_calibration.md + - configuration/optimization.md + - configuration/pipeline_parallelism.md + #- configuration/* - Models: - models/validated_models.md - Features: diff --git a/docs/configuration/env_vars.md b/docs/configuration/env_vars.md index 8451c7a6d..7751c2e5f 100644 --- a/docs/configuration/env_vars.md +++ b/docs/configuration/env_vars.md @@ -16,6 +16,9 @@ - `VLLM_GRAPH_RESERVED_MEM`: percentage of memory dedicated to HPUGraph capture. The default is `0.1`. - `VLLM_EXPONENTIAL_BUCKETING`: if `true`, enables exponential bucket spacing instead of linear. The default is `true`. + +**Experimental Knobs:** + - `VLLM_SKIP_WARMUP`: if `true`, warmup is skipped. 
The default is `false`. !!! note @@ -23,7 +26,10 @@ !!! tip When a deployed workload does not utilize the full context that a model can handle, it is good practice to limit the maximum values upfront based on the input and output token lengths that will be generated after serving the vLLM server. -

**Example:**

Let's assume that we want to deploy text generation model Qwen2.5-1.5B, which has a defined `max_position_embeddings` of 131072 (our `max_model_len`). At the same time, we know that our workload pattern will not use the full context length because we expect a maximum input token size of 1K and predict generating a maximum of 2K tokens as output. In this case, starting the vLLM server to be ready for the full context length is unnecessary. Instead, we should limit it upfront to achieve faster service preparation and decrease warmup time. The recommended values in this example should be: + + **Example:** + + Let's assume that we want to deploy text generation model Qwen2.5-1.5B, which has a defined `max_position_embeddings` of 131072 (our `max_model_len`). At the same time, we know that our workload pattern will not use the full context length because we expect a maximum input token size of 1K and predict generating a maximum of 2K tokens as output. In this case, starting the vLLM server to be ready for the full context length is unnecessary. Instead, we should limit it upfront to achieve faster service preparation and decrease warmup time. The recommended values in this example should be: > - `--max_model_len`: `3072` - the sum of input and output sequences (1+2)*1024. > - `VLLM_PROMPT_SEQ_BUCKET_MAX`: `1024` - the maximum input token size that we expect to handle. @@ -48,11 +54,11 @@ Additionally, there are HPU PyTorch Bridge environment variables impacting vLLM - Default values: - Prompt: - batch size min (`VLLM_PROMPT_BS_BUCKET_MIN`): `1` - - batch size step (`VLLM_PROMPT_BS_BUCKET_STEP`): `32` + - batch size step (`VLLM_PROMPT_BS_BUCKET_STEP`): `1` - batch size max (`VLLM_PROMPT_BS_BUCKET_MAX`): `max_num_prefill_seqs` - - sequence length min (`VLLM_PROMPT_SEQ_BUCKET_MIN`): `block_size` - - sequence length step (`VLLM_PROMPT_SEQ_BUCKET_STEP`): `block_size` - - sequence length max (`VLLM_PROMPT_SEQ_BUCKET_MAX`): `max_model_len` + - query length min (`VLLM_PROMPT_SEQ_BUCKET_MIN`): `block_size` + - query length step (`VLLM_PROMPT_SEQ_BUCKET_STEP`): `block_size` + - query length max (`VLLM_PROMPT_SEQ_BUCKET_MAX`): `max_num_batched_tokens` - sequence ctx min (`VLLM_PROMPT_CTX_BUCKET_MIN`): `0` - sequence ctx step (`VLLM_PROMPT_CTX_BUCKET_STEP`): `1` - sequence ctx max (`VLLM_PROMPT_CTX_BUCKET_MAX`): `(max_model_len - block_size) // block_size` @@ -62,4 +68,4 @@ Additionally, there are HPU PyTorch Bridge environment variables impacting vLLM - batch size max (`VLLM_DECODE_BS_BUCKET_MAX`): `max_num_seqs` - block size min (`VLLM_DECODE_BLOCK_BUCKET_MIN`): `block_size` - block size step (`VLLM_DECODE_BLOCK_BUCKET_STEP`): `block_size` - - block size max (`VLLM_DECODE_BLOCK_BUCKET_MAX`): `max_blocks` + - block size max (`VLLM_DECODE_BLOCK_BUCKET_MAX`): `max_model_len * max_num_seqs // block_size` by default or `max_blocks` for CONTIGUOUS PA diff --git a/docs/dev_guide/ci-failures.md b/docs/dev_guide/ci-failures.md index 96f897c5f..712ac98f1 100644 --- a/docs/dev_guide/ci-failures.md +++ b/docs/dev_guide/ci-failures.md @@ -1,3 +1,53 @@ # CI Failures -WIP +## CI + +For all PRs created in the vllm-gaudi repository, all CI checks are required: +- pre-commit & DCO +- HPU tests +- HPU Gaudi tests + +### Pre-commit & DCO +To install, run: + +```pre-commit install``` + +This way, all of your commits should be correctly formatted and signed off. If you need to manually sign off your commits, remember to use ```git commit -s``` to pass DCO.
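For reference, a minimal sketch of the local workflow described above (assuming `pre-commit` is installed from PyPI and the hooks are registered once per clone):

```bash
# Install the tool and register the repository's hooks (one-time per clone)
pip install pre-commit
pre-commit install

# Optionally run all hooks against the whole tree before pushing
pre-commit run --all-files

# Sign off commits so the DCO check passes
git commit -s -m "Describe the change"
```

With the hooks registered, formatting problems are caught locally instead of failing in CI.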
+ +### HPU tests +HPU tests consist of several test groups: +- pre-merge tests +- unit tests +- perf test +- feature tests +- e2e tests + +All of the above tests are mandatory. They operate in fail-fast mode, meaning that if one test fails, the remaining tests won't be triggered. + +### HPU Gaudi tests +Additional Gaudi tests are expected to pass, but they aren't mandatory. These tests run on an internal Jenkins system, so the results are internal only, and they can be run by CODEOWNERs and TESTOWNERs only. + +## Docs Pull Requests +All PRs that do not modify code, such as docstring changes or README updates, can be merged without HPU tests and Gaudi tests. Passing the pre-commit check is still required. + +## Hourly Checks and Tests +In the vllm-gaudi repository, hourly tests can be found in ```Hourly Commit Check and Tests``` under the ```Actions``` tab. This tab also allows developers to manually trigger hourly tests on a selected branch. + +If the last hourly test is failing, it means that the vllm-gaudi main branch doesn't work with the newest upstream main commit. To find the last good commit, check [last good commit](https://github.com/vllm-project/vllm-gaudi/blob/vllm/last-good-commit-for-vllm-gaudi/VLLM_STABLE_COMMIT). + +Failing hourly checks will be fixed by developers as soon as possible. + +## Troubleshooting +### Unrelated failures +Sometimes failures are unrelated to your specific code changes and are often caused by connection problems. In this case, the failed checks should be rerun. Typical errors include: +- ```Error response from daemon: No such container``` +- ```ValueError: Unsupported device: the device type is 7.``` +- ```[Device not found] Device acquire failed.``` + +### Accuracy and functionality issues +Accuracy issues can be tracked in HPU Gaudi tests with gsm8k runs. If any check fails due to accuracy (too low compared to the measured baseline) or functionality issues, the **PR can't be merged** until the issue is solved. + +### Pre-commit failures +To run the pre-commit checks manually, run: + +```pre-commit run --show-diff-on-failure --color=always --all-files --hook-stage manual``` diff --git a/docs/features/bucketing_mechanism.md b/docs/features/bucketing_mechanism.md index 6a98cdc2d..cda63a654 100644 --- a/docs/features/bucketing_mechanism.md +++ b/docs/features/bucketing_mechanism.md @@ -17,6 +17,7 @@ In dynamic inference serving scenarios, minimizing the number of graph compilati ## Bucketing Strategies Bucketing is focused on three dimensions: + - `batch size`: number of samples in batch - `query lenght`: sequence length without context tokens - `num blocks`: context length counted in blocks @@ -44,6 +45,7 @@ a `(2, 1, 512)` bucket, or the context length increases beyond 512 tokens. It wi ### Exponential Strategy - Default Exponential strategy is the default warm-up mechanism. It is based on 4 parameters: + - `min`: the smallest value - `step`: the rounding value for bucket boundaries - `max`: the largest value @@ -60,7 +62,7 @@ Example distribution is shown below: ```{.} min = 128, step = 128, max = 4096, limit = 13 ``` -![exponential bucketing distribution for 4096 max query length](../../docs/assets/graphs/exponential_bucketing_example.png) +![exponential bucketing distribution for 4096 max query length](../assets/graphs/exponential_bucketing_example.png) This strategy creates more buckets with smaller values closer to `min`. As the values increase toward `max`, the buckets become less frequent, meaning the distance between them gets larger.
This helps prioritize warming up the smaller values more precisely, while still covering the full range. @@ -73,7 +75,7 @@ Linear strategy is determined with 3 parameters only - `min`, `step` and `max`. `min` determines the lowest value of the bucket. `step` determines the interval between buckets, and `max` determines the upper bound of the bucket. Furthermore, the interval between `min` and `step` has special handling: `min` is multiplied by consecutive powers of two until the multiplier is less than or equal to `step`. We refer to this as the ramp-up phase, which is used for handling lower batch sizes with minimal wastage, while allowing for larger padding on larger batch sizes. -**Example with ramp-up** +#### Example with ramp-up ```{.} min = 2, step = 32, max = 64 @@ -82,7 +84,7 @@ min = 2, step = 32, max = 64 => buckets = ramp_up + stable => (2, 4, 8, 16, 32, 64) ``` -**Example without ramp-up** +#### Example without ramp-up ```{.} min = 128, step = 128, max = 512 @@ -94,6 +96,7 @@ min = 128, step = 128, max = 512 ### Unified Strategy Unified strategy is dedicated strategy for Unified Attention. It's buckets are determined by different dimensions: + - `query length`: number of currently processed tokens, without context tokens - `shared num blocks`: context length counted in blocks, including only blocks that are either shared between at least two block tables (different requests) or is used by at least two tokens in query - `unique num blocks`: context length counted in blocks, including only blocks that are not shared between block tables and are used only by one token @@ -114,7 +117,7 @@ Example distribution is shown below: batch size = 64, max num batched tokens = 4096 ``` -![exponential bucketing distribution for 4096 max query length](../../docs/assets/graphs/unified_bucketing_example.png) +![exponential bucketing distribution for 4096 max query length](../assets/graphs/unified_bucketing_example.png) Additionaly for context blocks, both shared and unique, `0` value will be added as well. diff --git a/docs/features/supported_features.md b/docs/features/supported_features.md index 19fb4a2ac..55747f33b 100644 --- a/docs/features/supported_features.md +++ b/docs/features/supported_features.md @@ -3,29 +3,36 @@ title: Supported Features --- [](){ #supported-features } -# Supported Features +## Supported Features + | **Feature** | **Description** | **References** | |--- |--- |--- | -| Offline batched inference | Offline inference using LLM class from vLLM Python API | [Quickstart](https://docs.vllm.ai/en/stable/getting_started/quickstart.html#offline-batched-inference)
[Example](https://docs.vllm.ai/en/stable/getting_started/examples/offline_inference.html) | -| Online inference via OpenAI-Compatible Server | Online inference using HTTP server that implements OpenAI Chat and Completions API | [Documentation](https://docs.vllm.ai/en/stable/serving/openai_compatible_server.html)
[Example](https://docs.vllm.ai/en/stable/getting_started/examples/openai_chat_completion_client.html) | +| Offline batched inference | Offline inference using LLM class from vLLM Python API | [Quickstart](https://docs.vllm.ai/en/stable/getting_started/quickstart.html#offline-batched-inference) [Example](https://docs.vllm.ai/en/stable/getting_started/examples/offline_inference.html) | +| Online inference via OpenAI-Compatible Server | Online inference using HTTP server that implements OpenAI Chat and Completions API | [Documentation](https://docs.vllm.ai/en/stable/serving/openai_compatible_server.html) [Example](https://docs.vllm.ai/en/stable/getting_started/examples/openai_chat_completion_client.html) | | HPU autodetection | HPU users do not need to specify the target platform, it will be detected automatically upon vLLM startup | N/A | | Paged KV cache with algorithms enabled for Intel Gaudi accelerators | vLLM HPU backend contains a custom Paged Attention and cache operators implementations optimized for Gaudi devices. | N/A | | Custom Intel Gaudi operator implementations | vLLM HPU backend provides optimized implementations of operators such as prefill attention, Root Mean Square Layer Normalization, Rotary Positional Encoding. | N/A | -| Tensor parallel inference (single or multi-node multi-HPU) | vLLM HPU backend supports multi-HPU inference across multiple nodes with tensor parallelism with multiprocessing or Ray and HCCL. | [Documentation](https://docs.vllm.ai/en/stable/serving/distributed_serving.html)
[Example](https://docs.ray.io/en/latest/serve/tutorials/vllm-example.html)
[HCCL reference](https://docs.habana.ai/en/latest/API_Reference_Guides/HCCL_APIs/index.html) | -| Pipeline parallel inference (single or multi-node multi-HPU) | vLLM HPU backend supports multi-HPU inference across single or multi-node with pipeline parallelism. | [Documentation](https://docs.vllm.ai/en/stable/serving/distributed_serving.html)
[Running Pipeline Parallelism](https://github.com/HabanaAI/vllm-fork/blob/habana_main/README_GAUDI.md#pipeline-parallelism) | -| Inference with HPU Graphs | vLLM HPU backend uses HPU Graphs by default for optimal performance. When HPU Graphs are enabled, execution graphs will be recorded ahead of time and replayed later during inference, significantly reducing host overheads. | [Documentation](https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_HPU_Graphs.html)
[vLLM HPU backend execution modes](https://docs.vllm.ai/en/stable/getting_started/gaudi-installation.html#execution-modes)
[Optimization guide](https://docs.vllm.ai/en/latest/getting_started/gaudi-installation.html#hpu-graph-capture) | +| Tensor parallel inference | vLLM HPU backend supports multi-HPU inference with tensor parallelism with multiprocessing. | [Documentation](https://docs.vllm.ai/en/stable/serving/distributed_serving.html) [Example](https://docs.ray.io/en/latest/serve/tutorials/vllm-example.html) [HCCL reference](https://docs.habana.ai/en/latest/API_Reference_Guides/HCCL_APIs/index.html) | +| Pipeline parallel inference | vLLM HPU backend supports multi-HPU inference with pipeline parallelism. | [Documentation](https://docs.vllm.ai/en/stable/serving/distributed_serving.html) [Running Pipeline Parallelism](https://vllm-gaudi.readthedocs.io/en/latest/configuration/pipeline_parallelism.html) | +| Inference with HPU Graphs | vLLM HPU backend uses HPU Graphs by default for optimal performance. When HPU Graphs are enabled, execution graphs will be recorded ahead of time and replayed later during inference, significantly reducing host overheads. | [Documentation](https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_HPU_Graphs.html) [vLLM HPU backend execution modes](https://docs.vllm.ai/en/stable/getting_started/gaudi-installation.html#execution-modes) [Optimization guide](https://docs.vllm.ai/en/latest/getting_started/gaudi-installation.html#hpu-graph-capture) | | Inference with torch.compile | vLLM HPU backend supports inference with `torch.compile`. | [vLLM HPU backend execution modes](https://docs.vllm.ai/en/stable/getting_started/gaudi-installation.html#execution-modes) | | INC quantization | vLLM HPU backend supports FP8 model and KV cache quantization and calibration with Intel Neural Compressor (INC). (Not fully supported with torch.compile execution mode) | [Documentation](https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_FP8.html) | | AutoAWQ quantization | vLLM HPU backend supports inference with models quantized using AutoAWQ library. | [Library](https://github.com/casper-hansen/AutoAWQ) | | AutoGPTQ quantization | vLLM HPU backend supports inference with models quantized using AutoGPTQ library. | [Library](https://github.com/AutoGPTQ/AutoGPTQ) | -| LoRA/MultiLoRA support | vLLM HPU backend includes support for LoRA and MultiLoRA on supported models. | [Documentation](https://docs.vllm.ai/en/stable/models/lora.html)
[Example](https://docs.vllm.ai/en/stable/getting_started/examples/multilora_inference.html)
[vLLM supported models](https://docs.vllm.ai/en/latest/models/supported_models.html) | -| Multi-step scheduling support | vLLM HPU backend includes multi-step scheduling support for host overhead reduction, configurable by standard `--num-scheduler-seqs` parameter. | [Feature RFC](https://github.com/vllm-project/vllm/issues/6854) | -| Automatic prefix caching | vLLM HPU backend includes automatic prefix caching (APC) support for more efficient prefills, configurable by standard `--enable-prefix-caching` parameter. | [Documentation](https://docs.vllm.ai/en/stable/automatic_prefix_caching/apc.html)
[Details](https://docs.vllm.ai/en/stable/automatic_prefix_caching/details.html) | -| Speculative decoding (functional release) | vLLM HPU backend includes experimental speculative decoding support for improving inter-token latency in some scenarios, configurable via standard `--speculative_model` and `--num_speculative_tokens` parameters. (Not fully supported with torch.compile execution mode) | [Documentation](https://docs.vllm.ai/en/stable/models/spec_decode.html)
[Example](https://docs.vllm.ai/en/stable/getting_started/examples/mlpspeculator.html) | -| Multiprocessing backend | Multiprocessing is the default distributed runtime in vLLM. The vLLM HPU backend supports it alongside Ray. | [Documentation](https://docs.vllm.ai/en/latest/serving/distributed_serving.html) | +| LoRA/MultiLoRA support | vLLM HPU backend includes support for LoRA and MultiLoRA on supported models. | [Documentation](https://docs.vllm.ai/en/stable/models/lora.html) [Example](https://docs.vllm.ai/en/stable/getting_started/examples/multilora_inference.html) [vLLM supported models](https://docs.vllm.ai/en/latest/models/supported_models.html) | +| Fully async model executor | This allows the model runner to function asynchronously when using async scheduling. This allows full overlap of the cpu operations (including prepare_inputs) and the model forward pass. This does not support speculative decoding, PP, or guided decoding. Expected speedup is 5-10% over the current async scheduling. | [Feature description](https://github.com/vllm-project/vllm/pull/23569) | +| Automatic prefix caching | vLLM HPU backend includes automatic prefix caching (APC) support for more efficient prefills, configurable by standard `--enable-prefix-caching` parameter. | [Documentation](https://docs.vllm.ai/en/stable/automatic_prefix_caching/apc.html) [Details](https://docs.vllm.ai/en/stable/automatic_prefix_caching/details.html) | +| Speculative decoding (functional release) | vLLM HPU backend includes experimental speculative decoding support for improving inter-token latency in some scenarios, configurable via standard `--speculative_model` and `--num_speculative_tokens` parameters. (Not fully supported with torch.compile execution mode) | [Documentation](https://docs.vllm.ai/en/stable/models/spec_decode.html) [Example](https://docs.vllm.ai/en/stable/getting_started/examples/mlpspeculator.html) | +| Multiprocessing backend | Multiprocessing is the default distributed runtime in vLLM. | [Documentation](https://docs.vllm.ai/en/latest/serving/distributed_serving.html) | | Multimodal | vLLM HPU backend supports the inference for multi-modal models. (Not fully supported with t.compile execution mode) | [Documentation](https://docs.vllm.ai/en/latest/serving/multimodal_inputs.html) | -| Multinode support | vLLM HPU backend supports distributed, multiple-node inference with Ray. | | -| vLLM v1 architecture (early release) | V1 architecture is now available for the HPU backend, and will gradually enable it for every use case we plan to support. | [Documentation](https://docs.vllm.ai/en/latest/serving/distributed_serving.html) | | Guided decode | vLLM HPU supports a guided decoding backend for generating structured outputs. | [Documentation](https://docs.vllm.ai/en/latest/features/structured_outputs.html) | -| Delayed Sampling (experimental) | vLLM HPU supports delayed sampling scheduling for asynchronous execution, enabled by `VLLM_DELAYED_SAMPLING=true` environment variable. | N/A | | Exponential bucketing | vLLM HPU supports exponential bucketing spacing instead of linear to automate configuration of bucketing mechanism, enabled by default. It can be disabled via `VLLM_EXPONENTIAL_BUCKETING=false` environment variable. 
| N/A | +| Data Parallel support | vLLM HPU supports Data Parallel | [Documentation](https://docs.vllm.ai/en/stable/serving/data_parallel_deployment.html) [Example](https://docs.vllm.ai/en/latest/examples/offline_inference/data_parallel.html) | + +## Coming Soon + +- Sliding window attention +- P/D disaggregate support +- In-place weight update +- MLA with Unified Attention +- Multinode support diff --git a/docs/getting_started/installation.md b/docs/getting_started/installation.md index 8125c5365..5f01cb8fa 100644 --- a/docs/getting_started/installation.md +++ b/docs/getting_started/installation.md @@ -1,36 +1,35 @@ --- title: Installation --- -[](){ #installation } + This guide provides instructions on running vLLM with Intel Gaudi devices. ## Requirements - Python 3.10 - Intel Gaudi 2 or 3 AI accelerators -- Intel Gaudi software version 1.21.0 or above +- Intel Gaudi software version 1.22.0 or above !!! note To set up the execution environment, please follow the instructions in the [Gaudi Installation Guide](https://docs.habana.ai/en/latest/Installation_Guide/index.html). To achieve the best performance on HPU, please follow the methods outlined in the [Optimizing Training Platform Guide](https://docs.habana.ai/en/latest/PyTorch/Model_Optimization_PyTorch/Optimization_in_Training_Platform.html). +## Running vLLM on Gaudi with Docker Compose + +Starting with the 1.22 release, we are introducing ready-to-run container images that bundle vLLM and Gaudi software. Please follow the [instructions](https://github.com/vllm-project/vllm-gaudi/tree/main/.cd) to quickly launch vLLM on Gaudi using a prebuilt Docker image and Docker Compose, with options for custom parameters and benchmarking. + ## Quick Start Using Dockerfile -# --8<-- [start:docker_quickstart] -Set up the container with the latest Intel Gaudi Software Suite release using the Dockerfile. -=== "Ubuntu" +## --8<-- [start:docker_quickstart] - ``` - $ docker build -f Dockerfile.hpu -t vllm-hpu-env . - $ docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --rm vllm-hpu-env - ``` +Set up the container with the latest Intel Gaudi Software Suite release using the Dockerfile. -=== "Red Hat Enterprise Linux for Use with Red Hat OpenShift AI" +=== "Ubuntu" ``` - $ docker build -f Dockerfile.hpu.ubi -t vllm-hpu-env . - $ docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --rm vllm-hpu-env + $ docker build -f .cd/Dockerfile.ubuntu.pytorch.vllm -t vllm-hpu-env . + $ docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --entrypoint='' --rm vllm-hpu-env ``` !!! tip @@ -38,11 +37,13 @@ Set up the container with the latest Intel Gaudi Software Suite release using th of [Install Driver and Software](https://docs.habana.ai/en/latest/Installation_Guide/Driver_Installation.html#install-driver-and-software) and "Configure Container Runtime" section of [Docker Installation](https://docs.habana.ai/en/latest/Installation_Guide/Installation_Methods/Docker_Installation.html#configure-container-runtime). Make sure you have ``habanalabs-container-runtime`` package installed and that ``habana`` container runtime is registered.
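As a quick sanity check before relying on the image, the registered runtime and device visibility can be verified roughly as follows (a sketch; `vllm-hpu-env` is the image tag built above):

```bash
# Confirm the habana runtime is registered with Docker
docker info | grep -i runtimes

# Confirm Gaudi devices are visible from inside a container
docker run --rm --runtime=habana -e HABANA_VISIBLE_DEVICES=all \
    --entrypoint='' vllm-hpu-env hl-smi
```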
-# --8<-- [end:docker_quickstart] + +## --8<-- [end:docker_quickstart] ## Build from Source ### Environment Verification + To verify that the Intel Gaudi software was correctly installed, run the following: $ hl-smi # verify that hl-smi is in your PATH and each Gaudi accelerator is visible @@ -59,58 +60,89 @@ Refer to the [Intel Gaudi documentation](https://docs.habana.ai/en/latest/Instal Use the following commands to run a Docker image. Make sure to update the versions below as listed in the [Support Matrix](https://docs.habana.ai/en/latest/Support_Matrix/Support_Matrix.html): - docker pull vault.habana.ai/gaudi-docker/1.21.0/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest - docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host vault.habana.ai/gaudi-docker/1.21.0/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest + docker pull vault.habana.ai/gaudi-docker/1.22.0/ubuntu22.04/habanalabs/pytorch-installer-2.7.1:latest + docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host vault.habana.ai/gaudi-docker/1.22.0/ubuntu22.04/habanalabs/pytorch-installer-2.7.1:latest ### Build and Install vLLM -Currently, multiple ways are provided which can be used to install vLLM with Intel® Gaudi®: - -=== "Stable vLLM-fork version" +=== "Step 1: Get Last good commit on vllm" - vLLM releases are being performed periodically to align with Intel® Gaudi® software releases. The stable version is released with a tag, and supports fully validated features and performance optimizations in Gaudi's [vLLM-fork](https://github.com/HabanaAI/vllm-fork). To install the stable release from [HabanaAI/vLLM-fork](https://github.com/HabanaAI/vllm-fork), run the following: + !!! note + Vllm-gaudi always follows the latest vllm commit. However, updates to the upstream vLLM + API may cause vLLM-Gaudi to crash. This saved commit has been verified with vLLM-Gaudi + on an hourly basis. - ```{.console} - git clone https://github.com/HabanaAI/vllm-fork.git - cd vllm-fork - git checkout v0.7.2+Gaudi-1.21.0 - pip install -r requirements-hpu.txt - python setup.py develop + ```bash + git clone https://github.com/vllm-project/vllm-gaudi + cd vllm-gaudi + export VLLM_COMMIT_HASH=$(git show "origin/vllm/last-good-commit-for-vllm-gaudi:VLLM_STABLE_COMMIT" 2>/dev/null) ``` -=== "Latest vLLM-fork" +=== "Step 2: Install vLLM" + + Install vLLM with `pip` or [from source](https://docs.vllm.ai/en/latest/getting_started/installation/gpu/index.html#build-wheel-from-source) + + ```bash + # Build vLLM from source for empty platform, reusing existing torch installation + git clone https://github.com/vllm-project/vllm + cd vllm + git checkout $VLLM_COMMIT_HASH + pip install -r <(sed '/^torch/d' requirements/build.txt) + VLLM_TARGET_DEVICE=empty pip install --no-build-isolation -e . + cd .. + ``` - Currently, the latest features and performance optimizations are being developed in Gaudi's [vLLM-fork](https://github.com/HabanaAI/vllm-fork) and periodically upstreamed to the vLLM main repository. 
- To install latest [HabanaAI/vLLM-fork](https://github.com/HabanaAI/vllm-fork), run the following: +=== "Step 3: Install vLLM Plugin" - ```{.console} - git clone https://github.com/HabanaAI/vllm-fork.git - cd vllm-fork - git checkout habana_main - pip install --upgrade pip - pip install -r requirements-hpu.txt - python setup.py develop + Install vLLM-Gaudi from source: + ```bash + cd vllm-gaudi + pip install -e . + cd .. ``` -=== "vLLM Upstream" +### Build and Install vLLM with nixl - If you prefer to build and install directly from the main vLLM source, where periodically we are upstreaming new features, run the following: +=== "Install vLLM Plugin with nixl" - ```{.console} - git clone https://github.com/vllm-project/vllm.git - cd vllm - pip install -r requirements-hpu.txt - python setup.py develop + ```bash + cd vllm-gaudi + python install_nixl.py + cd .. ``` -=== "[EXPERIMENTAL] vLLM Upstream + Plugin" +=== "Install vLLM Gaudi and nixl with Docker file" - You're on the bleeding edge, good luck to you: + ```bash + docker build -t ubuntu.pytorch.vllm.nixl.latest \ + -f .cd/Dockerfile.ubuntu.pytorch.vllm.nixl.latest github.com/vllm-project/vllm-gaudi + docker run -it --rm --runtime=habana \ + --name=ubuntu.pytorch.vllm.nixl.latest \ + --network=host \ + -e HABANA_VISIBLE_DEVICES=all \ + vllm-gaudi-for-llmd /bin/bash + ``` - ```{.console} - VLLM_TARGET_DEVICE=hpu pip install git+https://github.com/HabanaAI/vllm-fork.git@dev/upstream_vllm_for_plugin - pip uninstall -y triton - git clone -b plugin_poc https://github.com/HabanaAI/vllm-hpu-extension.git vllm-hpu - cd vllm-hpu +=== "Full installation from source vLLM Gaudi with nixl" + + ```bash + # Fetch last good commit on vllm + git clone https://github.com/vllm-project/vllm-gaudi + cd vllm-gaudi + export VLLM_COMMIT_HASH=$(git show "origin/vllm/last-good-commit-for-vllm-gaudi:VLLM_STABLE_COMMIT" 2>/dev/null) + + # Build vLLM from source for empty platform, reusing existing torch installation + git clone https://github.com/vllm-project/vllm + cd vllm + git checkout $VLLM_COMMIT_HASH + pip install -r <(sed '/^torch/d' requirements/build.txt) + VLLM_TARGET_DEVICE=empty pip install --no-build-isolation -e . + cd .. + + # Build vLLM-Gaudi from source + cd vllm-gaudi pip install -e . + + # Build nixl + python install_nixl.py ``` diff --git a/docs/getting_started/quickstart.md b/docs/getting_started/quickstart.md index feaf0409a..5611ce4d0 100644 --- a/docs/getting_started/quickstart.md +++ b/docs/getting_started/quickstart.md @@ -1,12 +1,13 @@ --- title: Quickstart --- -[](){ #quickstart } -This guide will help you quickly get started with vLLM to perform: +## vLLM Quick Start Guide -- [Offline batched inference][quickstart-offline] -- [Online serving using OpenAI-compatible server][quickstart-online] +This guide shows how to quickly launch vLLM on Gaudi using a prebuilt Docker +image with Docker Compose which is supported on Ubuntu only. It supports model benchmarking, custom runtime parameters, +and a selection of validated models — including the LLama, Mistral, and Qwen. +The advanced configuration is available via environment variables or YAML files. ## Requirements @@ -19,38 +20,364 @@ This guide will help you quickly get started with vLLM to perform: To achieve the best performance on HPU, please follow the methods outlined in the [Optimizing Training Platform Guide](https://docs.habana.ai/en/latest/PyTorch/Model_Optimization_PyTorch/Optimization_in_Training_Platform.html). 
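Before moving on to Docker Compose, a short host-side check along these lines can confirm the requirements above are met (a sketch; exact versions come from the linked guides):

```bash
# Gaudi driver and software stack visible on the host
hl-smi

# Docker engine and the Compose plugin available
docker --version
docker compose version
```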
-## Quick Start Using Dockerfile +## Running vLLM on Gaudi with Docker Compose ---8<-- "docs/getting_started/installation.md:docker_quickstart" +Follow the steps below to run the vLLM server or launch benchmarks on Gaudi using Docker Compose. + +### 1. Clone the vllm-gaudi repository and navigate to the appropriate directory + + git clone https://github.com/vllm-project/vllm-gaudi.git + cd vllm-gaudi/.cd/ + +This ensures you have the required files and Docker Compose configurations. + +### 2. Set the following environment variables + +| **Variable** | **Description** | +| --- |--- | +| `MODEL` | Choose a model name from the [`vllm supported models`][supported-models] list. | +| `HF_TOKEN` | Your Hugging Face token (generate one at ). | +| `DOCKER_IMAGE` | The Docker image name or URL for the vLLM Gaudi container. When using the Gaudi repository, make sure to select Docker images with the *vllm-installer* prefix in the file name. | + +### 3. Run the vLLM server using Docker Compose + + MODEL="Qwen/Qwen2.5-14B-Instruct" \ + HF_TOKEN="" \ + DOCKER_IMAGE="vault.habana.ai/gaudi-docker/|Version|/ubuntu24.04/habanalabs/vllm-installer-|PT_VERSION|:latest" \ + docker compose up + +To automatically run benchmarking for a selected model using default settings, add the `--profile benchmark` option to `docker compose up` + + MODEL="Qwen/Qwen2.5-14B-Instruct" \ + HF_TOKEN="" \ + DOCKER_IMAGE="vault.habana.ai/gaudi-docker/|Version|/ubuntu24.04/habanalabs/vllm-installer-|PT_VERSION|:latest" \ + docker compose --profile benchmark up + +This command launches the vLLM server and runs the associated benchmark suite. + +## Advanced Options + +The following steps cover optional advanced configurations for +running the vLLM server and benchmark. These allow you to fine-tune performance, +memory usage, and request handling using additional environment variables or configuration files. +For most users, the basic setup is sufficient, but advanced users may benefit from these customizations. + +=== "Run vLLM Using Docker Compose with Custom Parameters" + + To override default settings, you can provide additional environment variables when starting the server. This advanced method allows fine-tuning for performance and memory usage. + + **Environment variables** + + | **Variable** | **Description** | + |---|---| + | `PT_HPU_LAZY_MODE` | Enables Lazy execution mode, potentially improving performance by batching operations. | + | `VLLM_SKIP_WARMUP` | Skips the model warmup phase to reduce startup time (may affect initial latency). | + | `MAX_MODEL_LEN` | Sets the maximum supported sequence length for the model. | + | `MAX_NUM_SEQS` | Specifies the maximum number of sequences processed concurrently. | + | `TENSOR_PARALLEL_SIZE` | Defines the degree of tensor parallelism. | + | `VLLM_EXPONENTIAL_BUCKETING` | Enables or disables exponential bucketing for warmup strategy. | + | `VLLM_DECODE_BLOCK_BUCKET_STEP` | Configures the step size for decode block allocation, affecting memory granularity. | + | `VLLM_DECODE_BS_BUCKET_STEP` | Sets the batch size step for decode operations, impacting how decode batches are grouped. | + | `VLLM_PROMPT_BS_BUCKET_STEP` | Adjusts the batch size step for prompt processing. | + | `VLLM_PROMPT_SEQ_BUCKET_STEP` | Controls the step size for prompt sequence allocation.
| + + **Example** + + ```bash + MODEL="Qwen/Qwen2.5-14B-Instruct" \ + HF_TOKEN="" \ + DOCKER_IMAGE="vault.habana.ai/gaudi-docker/|Version|/ubuntu24.04/habanalabs/vllm-installer-|PT_VERSION|:latest" \ + TENSOR_PARALLEL_SIZE=1 \ + MAX_MODEL_LEN=2048 \ + docker compose up + ``` + +=== "Run vLLM and Benchmark with Custom Parameters" + + You can customize benchmark behavior by setting additional environment variables before running Docker Compose. + + **Benchmark parameters:** + + | **Variable** | **Description** | + |---|---| + | `INPUT_TOK` | Number of input tokens per prompt. | + | `OUTPUT_TOK` | Number of output tokens to generate per prompt. | + | `CON_REQ` | Number of concurrent requests during benchmarking. | + | `NUM_PROMPTS`| Total number of prompts to use in the benchmark. | + + **Example:** + + ```bash + MODEL="Qwen/Qwen2.5-14B-Instruct" \ + HF_TOKEN="" \ + DOCKER_IMAGE="vault.habana.ai/gaudi-docker/|Version|/ubuntu24.04/habanalabs/vllm-installer-|PT_VERSION|:latest" \ + INPUT_TOK=128 \ + OUTPUT_TOK=128 \ + CON_REQ=16 \ + NUM_PROMPTS=64 \ + docker compose --profile benchmark up + ``` + + This launches the vLLM server and runs the benchmark using your specified parameters. + +=== "Run vLLM and Benchmark with Combined Custom Parameters" + + You can launch the vLLM server and benchmark together, providing any combination of server and benchmark-specific parameters. + + **Example:** + + ```bash + MODEL="Qwen/Qwen2.5-14B-Instruct" \ + HF_TOKEN="" \ + DOCKER_IMAGE="vault.habana.ai/gaudi-docker/|Version|/ubuntu22.04/habanalabs/vllm-installer-|PT_VERSION|:latest" \ + TENSOR_PARALLEL_SIZE=1 \ + MAX_MODEL_LEN=2048 \ + INPUT_TOK=128 \ + OUTPUT_TOK=128 \ + CON_REQ=16 \ + NUM_PROMPTS=64 \ + docker compose --profile benchmark up + ``` + + This command starts the server and executes benchmarking with the provided configuration. + +=== "Run vLLM and Benchmark Using Configuration Files" + + You can also configure the server and benchmark via YAML configuration files. Set the following environment variables: + + | **Variable** | **Description** | + |---|---| + | `VLLM_SERVER_CONFIG_FILE` | Path to the server config file inside the Docker container. | + | `VLLM_SERVER_CONFIG_NAME` | Name of the server config section. | + | `VLLM_BENCHMARK_CONFIG_FILE` | Path to the benchmark config file inside the container. | + | `VLLM_BENCHMARK_CONFIG_NAME` | Name of the benchmark config section. | + + **Example** + + ```bash + HF_TOKEN= \ + VLLM_SERVER_CONFIG_FILE=server_configurations/server_text.yaml \ + VLLM_SERVER_CONFIG_NAME=llama31_8b_instruct \ + VLLM_BENCHMARK_CONFIG_FILE=benchmark_configurations/benchmark_text.yaml \ + VLLM_BENCHMARK_CONFIG_NAME=llama31_8b_instruct \ + docker compose --profile benchmark up + ``` + + !!! note + When using configuration files, you do not need to set the `MODEL` variable as the model details are included in the config files. However, the `HF_TOKEN` flag is still required. + +=== "Run vLLM Directly Using Docker" + + For maximum control, you can run the server directly using the `docker run` command, allowing full customization of Docker runtime settings. + + **Example:** + + ```bash + docker run -it --rm \ + -e MODEL=$MODEL \ + -e HF_TOKEN=$HF_TOKEN \ + -e http_proxy=$http_proxy \ + -e https_proxy=$https_proxy \ + -e no_proxy=$no_proxy \ + --cap-add=sys_nice \ + --ipc=host \ + --runtime=habana \ + -e HABANA_VISIBLE_DEVICES=all \ + -p 8000:8000 \ + --name vllm-server \ + + ``` + + This method provides full flexibility over how the vLLM server is executed within the container. 
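    For reference, a complete invocation along the lines of the command above might look as follows; the image tag below is a placeholder assumption and should point at the vLLM Gaudi image used in the earlier steps:

    ```bash
    # Placeholder values; substitute your own model, token, and image tag
    MODEL="Qwen/Qwen2.5-14B-Instruct"
    HF_TOKEN="<your_hf_token>"
    DOCKER_IMAGE="vault.habana.ai/gaudi-docker/<version>/ubuntu24.04/habanalabs/vllm-installer-<pt_version>:latest"

    docker run -it --rm \
      -e MODEL=$MODEL \
      -e HF_TOKEN=$HF_TOKEN \
      --cap-add=sys_nice \
      --ipc=host \
      --runtime=habana \
      -e HABANA_VISIBLE_DEVICES=all \
      -p 8000:8000 \
      --name vllm-server \
      $DOCKER_IMAGE
    ```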
+ +--- + +## Supported Models + +| **Model Name** | **Validated TP Size** | +|---|---| +| deepseek-ai/DeepSeek-R1-Distill-Llama-70B | 8 | +| meta-llama/Llama-3.1-70B-Instruct | 4 | +| meta-llama/Llama-3.1-405B-Instruct | 8 | +| meta-llama/Llama-3.1-8B-Instruct | 1 | +| meta-llama/Llama-3.3-70B-Instruct | 4 | +| mistralai/Mistral-7B-Instruct-v0.2 | 1 | +| mistralai/Mixtral-8x7B-Instruct-v0.1 | 2 | +| mistralai/Mixtral-8x22B-Instruct-v0.1 | 4 | +| Qwen/Qwen2.5-7B-Instruct | 1 | +| Qwen/Qwen2.5-VL-7B-Instruct | 1 | +| Qwen/Qwen2.5-14B-Instruct | 1 | +| Qwen/Qwen2.5-32B-Instruct | 1 | +| Qwen/Qwen2.5-72B-Instruct | 4 | +| ibm-granite/granite-8b-code-instruct-4k | 1 | +| ibm-granite/granite-20b-code-instruct-8k | 1 | ## Executing inference === "Offline Batched Inference" [](){ #quickstart-offline } + + Offline inference processes multiple prompts in a batch without needing a running server. This is ideal for batch jobs and testing. + ```python from vllm import LLM, SamplingParams - prompts = [ - "Hello, my name is", - "The future of AI is", - ] - sampling_params = SamplingParams(temperature=0.8, top_p=0.95) - llm = LLM(model="facebook/opt-125m") + def main(): + prompts = [ + "Hello, my name is", + "The future of AI is", + ] + sampling_params = SamplingParams(temperature=0.8, top_p=0.95) + llm = LLM(model="meta-llama/Llama-3.1-8B-Instruct") + + outputs = llm.generate(prompts, sampling_params) + + for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") - outputs = llm.generate(prompts, sampling_params) + if __name__ == "__main__": + main() - for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") ``` -=== "OpenAI Completions API" +=== "Online Inference" [](){ #quickstart-online } - WIP + + Online inference provides real-time text generation through a running vLLM server. + First, start the server: + + ```bash + python -m vllm.entrypoints.openai.api_server \ + --model meta-llama/Llama-3.1-8B-Instruct \ + --host 0.0.0.0 \ + --port 8000 + ``` + + Then query it from Python: + + ```python + import requests + + def main(): + url = "http://localhost:8000/v1/completions" + headers = {"Content-Type": "application/json"} + + payload = { + "model": "meta-llama/Llama-3.1-8B-Instruct", + "prompt": "The future of AI is", + "max_tokens": 50, + "temperature": 0.8 + } + + response = requests.post(url, headers=headers, json=payload) + result = response.json() + print(result["choices"][0]["text"])} + + if __name__ == "__main__": + main() + + ``` + +=== "OpenAI Completions API" + + [](){ #quickstart-oopenai-completions-api } + + vLLM provides an OpenAI-compatible completions API. 
+ Start the server: + + ```bash + python -m vllm.entrypoints.openai.api_server \ + --model meta-llama/Llama-3.1-8B-Instruct \ + --host 0.0.0.0 \ + --port 8000 + ``` + + Use the OpenAI Python client: + + ```python + from openai import OpenAI + + def main(): + client = OpenAI(api_key="EMPTY", base_url="http://localhost:8000/v1") + + result = client.completions.create( + model="meta-llama/Llama-3.1-8B-Instruct", + prompt="Explain quantum computing in simple terms:", + max_tokens=100, + temperature=0.7 + ) + print(result.choices[0].text) + + if __name__ == "__main__": + main() + ``` + + Or use curl: + + ```bash + curl http://localhost:8000/v1/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "meta-llama/Llama-3.1-8B-Instruct", + "prompt": "Explain quantum computing in simple terms:", + "max_tokens": 100, + "temperature": 0.7 + }' + ``` === "OpenAI Chat Completions API with vLLM" - WIP + [](){ #quickstart-oopenai-chat-completions-api } + + vLLM also supports the OpenAI chat completions API format. + Start the server: + + ```bash + python -m vllm.entrypoints.openai.api_server \ + --model meta-llama/Llama-3.1-8B-Instruct \ + --host 0.0.0.0 \ + --port 8000 + ``` + + Use the OpenAI Python client: + + ```python + from openai import OpenAI + + def main(): + client = OpenAI(api_key="EMPTY", base_url="http://localhost:8000/v1") + + chat = client.chat.completions.create( + model="meta-llama/Llama-3.1-8B-Instruct", + messages=[ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "What is the capital of France?"} + ], + max_tokens=50, + temperature=0.7 + ) + print(chat.choices[0].message.content) + + if __name__ == "__main__": + main() + ``` + + Or use curl: + + ```bash + curl http://localhost:8000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "meta-llama/Llama-3.1-8B-Instruct", + "messages": [ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "What is the capital of France?"} + ], + "max_tokens": 50, + "temperature": 0.7 + }' + ``` diff --git a/docs/models/validated_models.md b/docs/models/validated_models.md index cb8bf9242..ca4811691 100644 --- a/docs/models/validated_models.md +++ b/docs/models/validated_models.md @@ -5,6 +5,24 @@ title: Validated Models The following configurations have been validated to function with Gaudi 2 or Gaudi 3 devices with random or greedy sampling. Configurations that are not listed may or may not work. 
+| **Model** | **Tensor Parallelism [x HPU]** | **Datatype** | **Validated on** | +|:--- |:---: |:---: |:---: | +| [meta-llama/Meta-Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct) | 1 | BF16, FP8 | Gaudi 2, Gaudi 3| +| [meta-llama/Meta-Llama-3.1-70B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3.1-70B-Instruct) | 2, 4, 8 | BF16, FP8, FP16 (Gaudi 2) |Gaudi 2, Gaudi 3| +| [meta-llama/Meta-Llama-3.1-405B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3.1-405B-Instruct) | 8 | BF16, FP8 |Gaudi 3| +| [meta-llama/Meta-Llama-3.3-70B-Instruct](https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct) | 4 | BF16, FP8 | Gaudi 3| +| [meta-llama/Granite-3B-code-instruct-128k](https://huggingface.co/ibm-granite/granite-3b-code-instruct-128k) | 1 | BF16 | Gaudi 3| +| [meta-llama/Granite-8B-code-instruct-128k](https://huggingface.co/ibm-granite/granite-8b-code-instruct-128k) | 1 | BF16 | Gaudi 3| +| [meta-llama/Granite-3.1-8B-instruct](https://huggingface.co/ibm-granite/granite-3.1-8b-instruct) | 1 | BF16, FP8 | Gaudi 2, Gaudi 3| +| [meta-llama/Granite-20B-code-instruct-8k](https://huggingface.co/ibm-granite/granite-20b-code-instruct-8k) | 1 | BF16, FP8 | Gaudi 2, Gaudi 3| +| [meta-llama/Granite-34B-code-instruc-8k](https://huggingface.co/ibm-granite/granite-34b-code-instruct-8k) | 1 | BF16 | Gaudi 3| +| [mistralai/Mistral-Large-Instruct-2407](https://huggingface.co/mistralai/Mistral-Large-Instruct-2407) | 1, 4 | BF16 | Gaudi 3| +| [mistralai/Mixtral-8x7B-Instruct-v0.1](https://huggingface.co/mistralai/Mixtral-8x7B-v0.1) | 2 | FP8, BF16 |Gaudi 2, Gaudi 3| +| [meta-llama/CodeLlama-34b-Instruct-hf](https://huggingface.co/meta-llama/CodeLlama-34b-Instruct-hf) | 1 | BF16 |Gaudi 3| +| [Qwen/Qwen3-30B-A3B-Instruct](https://huggingface.co/Qwen/Qwen3-30B-A3B-Instruct-2507) | 8 | BF16 |Gaudi 3| + +Validation of following configurations is under progress. 
+ | **Model** | **Tensor Parallelism [x HPU]** | **Datatype** | **Validated on** | |:--- |:---: |:---: |:---: | | [meta-llama/Meta-Llama-3-8B](https://huggingface.co/meta-llama/Meta-Llama-3-8B) | 1, 2, 8 | BF16 | Gaudi 2, Gaudi 3| @@ -12,25 +30,12 @@ The following configurations have been validated to function with Gaudi 2 or Gau | [meta-llama/Meta-Llama-3-70B](https://huggingface.co/meta-llama/Meta-Llama-3-70B) | 8 | BF16 |Gaudi 2, Gaudi 3| | [meta-llama/Meta-Llama-3-70B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-70B-Instruct) | 8 | BF16 |Gaudi 2, Gaudi 3| | [meta-llama/Meta-Llama-3.1-8B](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B) | 1 | BF16, FP8, INT4, FP16 (Gaudi 2) | Gaudi 2, Gaudi 3| -| [meta-llama/Meta-Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct) | 1 | BF16, FP8 | Gaudi 2, Gaudi 3| | [meta-llama/Meta-Llama-3.1-70B](https://huggingface.co/meta-llama/Meta-Llama-3.1-70B) | 2, 4, 8 | BF16, FP8, INT4 |Gaudi 2, Gaudi 3| -| [meta-llama/Meta-Llama-3.1-70B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3.1-70B-Instruct) | 2, 4, 8 | BF16, FP8, FP16 (Gaudi 2) |Gaudi 2, Gaudi 3| | [meta-llama/Meta-Llama-3.1-405B](https://huggingface.co/meta-llama/Meta-Llama-3.1-405B) | 8 | BF16, FP8 |Gaudi 3| -| [meta-llama/Meta-Llama-3.1-405B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3.1-405B-Instruct) | 8 | BF16, FP8 |Gaudi 3| -| [meta-llama/Llama-3.2-11B-Vision](https://huggingface.co/meta-llama/Llama-3.2-11B-Vision) | 1 | BF16, FP8 | Gaudi 2, Gaudi 3| -| [meta-llama/Llama-3.2-90B-Vision](https://huggingface.co/meta-llama/Llama-3.2-90B-Vision) | 4, 8 (min. for Gaudi 2) | BF16, FP8 | Gaudi 2, Gaudi 3| -| [meta-llama/Llama-3.2-90B-Vision-Instruct](https://huggingface.co/meta-llama/Llama-3.2-90B-Vision-Instruct) | 4, 8 (min. for Gaudi 2) | BF16 | Gaudi 2, Gaudi 3 | | [meta-llama/Meta-Llama-3.3-70B](https://huggingface.co/meta-llama/Llama-3.3-70B) | 4 | BF16, FP8 | Gaudi 3| -| [meta-llama/Granite-3B-code-instruct-128k](https://huggingface.co/ibm-granite/granite-3b-code-instruct-128k) | 1 | BF16 | Gaudi 3| -| [meta-llama/Granite-3.0-8B-instruct](https://huggingface.co/ibm-granite/granite-3.0-8b-instruct) | 1 | BF16, FP8 | Gaudi 2, Gaudi 3| -| [meta-llama/Granite-20B-code-instruct-8k](https://huggingface.co/ibm-granite/granite-20b-code-instruct-8k) | 1 | BF16, FP8 | Gaudi 2, Gaudi 3| -| [meta-llama/Granite-34B-code-instruc-8k](https://huggingface.co/ibm-granite/granite-34b-code-instruct-8k) | 1 | BF16 | Gaudi 3| -| [mistralai/Mistral-Large-Instruct-2407](https://huggingface.co/mistralai/Mistral-Large-Instruct-2407) | 1, 4 | BF16 | Gaudi 3| | [mistralai/Mistral-7B-Instruct-v0.3](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3) | 1, 2 | BF16 | Gaudi 2| -| [mistralai/Mixtral-8x7B-Instruct-v0.1](https://huggingface.co/mistralai/Mixtral-8x7B-v0.1) | 2 | FP8, BF16 |Gaudi 2, Gaudi 3| | [llava-hf/llava-1.5-7b-hf](https://huggingface.co/llava-hf/llava-1.5-7b-hf) | 1, 8 | BF16 | Gaudi 2, Gaudi 3 | | [princeton-nlp/gemma-2-9b-it-SimPO](https://huggingface.co/princeton-nlp/gemma-2-9b-it-SimPO) | 1 | BF16 |Gaudi 2, Gaudi 3| | [Qwen/Qwen2-72B-Instruct](https://huggingface.co/Qwen/Qwen2-72B-Instruct) | 8 | BF16 |Gaudi 2| | [Qwen/Qwen2.5-72B-Instruct](https://huggingface.co/Qwen/Qwen2.5-72B-Instruct) | 8 | BF16 |Gaudi 2| -| [meta-llama/CodeLlama-34b-Instruct-hf](https://huggingface.co/meta-llama/CodeLlama-34b-Instruct-hf) | 1 | BF16 |Gaudi 3| -| [deepseek-ai/DeepSeek-R1](https://huggingface.co/deepseek-ai/DeepSeek-R1)
[quick start scripts](https://github.com/HabanaAI/vllm-fork/blob/deepseek_r1/scripts/DEEPSEEK_R1_ON_GAUDI.md) | 8 | FP8, BF16 |Gaudi 2, Gaudi 3| +| [deepseek-ai/DeepSeek-R1](https://huggingface.co/deepseek-ai/DeepSeek-R1) | 8 | FP8, BF16 |Gaudi 2, Gaudi 3| diff --git a/examples/data_parallel.py b/examples/data_parallel.py index 349145b5f..1d1eec2d4 100644 --- a/examples/data_parallel.py +++ b/examples/data_parallel.py @@ -30,7 +30,6 @@ """ import os -import sys from time import sleep import torch @@ -178,9 +177,6 @@ def start(rank): if __name__ == "__main__": args = parse_args() - print("Execution is currently disabled. Exiting. expected fix in SW-241972") - sys.exit(0) # Exits gracefully with an success code - dp_size = args.dp_size tp_size = args.tp_size node_size = args.node_size diff --git a/mkdocs.yaml b/mkdocs.yaml index 4b141430c..7b06f9b5f 100644 --- a/mkdocs.yaml +++ b/mkdocs.yaml @@ -82,6 +82,7 @@ plugins: markdown_extensions: - attr_list + - sane_lists - md_in_html - admonition - pymdownx.details @@ -109,6 +110,10 @@ markdown_extensions: # For math rendering - mdx_math: enable_dollar_delimiter: true + # For checkbox feature + - pymdownx.tasklist: + custom_checkbox: true + extra_javascript: - https://cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS_HTML diff --git a/requirements.txt b/requirements.txt index 8b8ff73f1..701716ea9 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,8 @@ # Dependencies for HPU code -ray +numexpr==2.13.1 +ray<2.49.0 pandas -numpy==1.26.4 +numpy tabulate setuptools>=77.0.3,<80.0.0 setuptools-scm>=8 diff --git a/tests/full_tests/ci_gsm8k_tests.sh b/tests/full_tests/ci_gsm8k_tests.sh index 8c0136c2d..776301d88 100644 --- a/tests/full_tests/ci_gsm8k_tests.sh +++ b/tests/full_tests/ci_gsm8k_tests.sh @@ -13,8 +13,12 @@ echo $VLLM_GAUDI_PREFIX # Gemma3 with image input run_gemma3_test() { echo "➡️ Testing gemma-3-4b-it..." - VLLM_SKIP_WARMUP=true PT_HPU_LAZY_MODE=1 VLLM_USE_V1=1 python -u "${VLLM_GAUDI_PREFIX}/tests/models/language/generation/generation_mm.py" --model-card-path "${VLLM_GAUDI_PREFIX}/tests/full_tests/model_cards/gemma-3-4b-it.yaml" - echo "✅ Test with multimodal-support with gemma-3-4b-it passed." + #VLLM_SKIP_WARMUP=true PT_HPU_LAZY_MODE=1 VLLM_USE_V1=1 python -u "${VLLM_GAUDI_PREFIX}/tests/models/language/generation/generation_mm.py" --model-card-path "${VLLM_GAUDI_PREFIX}/tests/full_tests/model_cards/gemma-3-4b-it.yaml" + #echo "✅ Test with multimodal-support with gemma-3-4b-it passed." + #echo "➡️ Testing gemma-3-27b-it..." + #VLLM_SKIP_WARMUP=true PT_HPU_LAZY_MODE=1 VLLM_USE_V1=1 python -u "${VLLM_GAUDI_PREFIX}/tests/models/language/generation/generation_mm_multi.py" --model-card-path "${VLLM_GAUDI_PREFIX}/tests/full_tests/model_cards/gemma-3-27b-it.yaml" + #echo "✅ Test with multimodal-support with multiple images gemma-3-27b-it passed." + # echo "Skipping gemma-3-4b-it due to changes from https://github.com/vllm-project/vllm/pull/26715 } # Basic model test @@ -158,6 +162,14 @@ run_gsm8k_granite_test() { echo "✅ Test with granite-8b passed." } +# GSM8K on granite-8b (unified attn) +run_gsm8k_granite_test_unified_attn() { + echo "➡️ Testing GSM8K on granite-8b with unified attention..." + VLLM_UNIFIED_ATTN=True VLLM_SKIP_WARMUP=True PT_HPU_LAZY_MODE=1 VLLM_USE_V1=1 \ + pytest -v -s "${VLLM_GAUDI_PREFIX}/tests/models/language/generation/test_common.py" --model_card_path "${VLLM_GAUDI_PREFIX}/tests/full_tests/model_cards/granite-8b.yaml" + echo "✅ Test with granite-8b unified attention passed." 
+} + # GSM8K on granite-8b with async scheduling run_gsm8k_granite_async_test() { echo "➡️ Testing GSM8K on granite-8b with async scheduling..." @@ -230,6 +242,7 @@ launch_all_tests() { run_compressed_w4a16_channelwise_test run_compressed_w4a16_moe_gidx_test run_gsm8k_granite_test + run_gsm8k_granite_test_unified_attn run_gsm8k_granite_async_test run_gsm8k_deepseek_test run_gsm8k_qwen3_30b_test diff --git a/tests/full_tests/model_cards/gemma-3-27b-it.yaml b/tests/full_tests/model_cards/gemma-3-27b-it.yaml new file mode 100644 index 000000000..5631346a2 --- /dev/null +++ b/tests/full_tests/model_cards/gemma-3-27b-it.yaml @@ -0,0 +1,26 @@ +model_name: "google/gemma-3-27b-it" +test_config: + # Single image test + - modality: image + extra_engine_args: + mm_processor_kwargs: + min_pixels: 802816 # 896x896 + max_pixels: 1003520 + fps: 1 + input_data_config: + num_prompts: 4 + media_source: default + + # Multi-image test + - modality: multi_image + extra_engine_args: + mm_processor_kwargs: + min_pixels: 802816 # 896x896 + max_pixels: 1003520 + fps: 1 + limit_mm_per_prompt: + image: 10 # Allow up to 10 images per prompt + input_data_config: + num_prompts: 2 + media_source: default # Uses default images + num_images: 6 diff --git a/tests/models/language/generation/generation_mm_multi.py b/tests/models/language/generation/generation_mm_multi.py new file mode 100644 index 000000000..7ec7ff974 --- /dev/null +++ b/tests/models/language/generation/generation_mm_multi.py @@ -0,0 +1,246 @@ +from argparse import ArgumentParser +from vllm import LLM, EngineArgs, SamplingParams +from vllm.assets.image import ImageAsset, ImageAssetName +from vllm.assets.video import VideoAsset +from vllm.multimodal.image import convert_image_mode +from dataclasses import asdict +from typing import Union, get_args +from PIL import Image +from dataclasses import dataclass +import yaml +import os +from vllm_gaudi.extension.logger import logger as init_logger + +logger = init_logger() + + +@dataclass +class PROMPT_DATA: + _questions = { + "image": [ + "What is the most prominent object in this image?", "Describe the scene in the image.", + "What is the weather like in the image?", "Write a short poem about this image." + ], + "multi_image": [ + "Compare and contrast these images. What are the similarities and differences?", + "Tell a story that connects all these images together.", + "What common themes do you see across these images?", + "Describe the progression or sequence shown in these images.", "Which image stands out the most and why?", + "What emotions or moods are conveyed by these images collectively?" 
+ ], + "video": ["Describe this video", "Which movie would you associate this video with?"] + } + + def __post_init__(self): + self._questions = self._questions + + def _load_single_image(self, source: str) -> Image.Image: + """Load a single image""" + if source == "default": + return convert_image_mode(ImageAsset("cherry_blossom").pil_image, "RGB") + else: + return convert_image_mode(Image.open(source), "RGB") + + def _load_video(self, source: str): + """Load video data""" + return VideoAsset(name="baby_reading" if source == "default" else source, num_frames=16).np_ndarrays + + def _load_multiple_images(self, source: Union[str, list[str]]) -> list[Image.Image]: + images = [] + """Load multiple images from various sources""" + if source == "default": + # Get all available ImageAsset names from the Literal type + available_assets = list(get_args(ImageAssetName)) + logger.info("Available ImageAssets: %(available_assets)s", {"available_assets": available_assets}) + + # Load up to 6 different assets (or more if needed) + target_count = 6 + loaded_count = 0 + for asset_name in available_assets: + if loaded_count >= target_count: + break + + try: + img = ImageAsset(asset_name).pil_image + converted_img = convert_image_mode(img, "RGB") + images.append(converted_img) + loaded_count += 1 + logger.info("Successfully loaded ImageAsset: %(asset_name)s (Size: %(size)s)", + dict(asset_name=asset_name, size=converted_img.size)) + except Exception as e: + logger.warning("Failed to load ImageAsset '%(asset_name)s': %(e)s", dict(asset_name=asset_name, + e=e)) + continue + + elif isinstance(source, list): + # Load from list of file paths + for img_path in source: + try: + img = Image.open(img_path) + images.append(convert_image_mode(img, "RGB")) + except Exception as e: + logger.warning("Failed to load image %(img_path)s: %(e)s", dict(img_path=img_path, e=e)) + + logger.info("Loaded %(num_images)s images for multi-image processing", {"num_images": len(images)}) + return images + + def _get_data(self, modality: str, source: str): + """Get data based on modality""" + if modality == "image": + return self._load_single_image(source) + elif modality == "multi_image": + return self._load_multiple_images(source) + elif modality == "video": + return self._load_video(source) + else: + raise ValueError(f"Unsupported modality: {modality}") + + def get_prompts(self, + model_name: str = "", + modality: str = "image", + media_source: str = "default", + num_prompts: int = 1, + num_images: int = 1, + skip_vision_data=False): + + # Handle multi-image modality + if modality == "multi_image" or modality == "image": + pholder = "" * num_images if "gemma" in model_name.lower() else "<|image_pad|>" * num_images + elif modality == "video": + pholder = "