diff --git a/.cd/Dockerfile.rhel.tenc.pytorch.vllm b/.cd/Dockerfile.rhel.tenc.pytorch.vllm index 2794dda34..d3eb34c50 100644 --- a/.cd/Dockerfile.rhel.tenc.pytorch.vllm +++ b/.cd/Dockerfile.rhel.tenc.pytorch.vllm @@ -13,8 +13,9 @@ ARG TORCH_TYPE_SUFFIX FROM ${DOCKER_URL}/${VERSION}/${BASE_NAME}/${REPO_TYPE}/pytorch-${TORCH_TYPE_SUFFIX}installer-${PT_VERSION}:${REVISION} # Parameterize commit/branch for vllm-fork checkout -ARG VLLM_GAUDI_COMMIT=v0.10.1 -ARG VLLM_PROJECT_COMMIT=v0.10.1 +ARG VLLM_GAUDI_COMMIT=main +# leave empty to use last-good-commit-for-vllm-gaudi +ARG VLLM_PROJECT_COMMIT= ARG BASE_NAME ENV BASE_NAME=${BASE_NAME} @@ -39,23 +40,36 @@ ENV VLLM_PATH=/workspace/vllm-project ENV VLLM_PATH2=/workspace/vllm-gaudi # Clone the vllm-project repository and install inside the container -RUN mkdir -p $VLLM_PATH && \ +# --- START: COMBINED RUN COMMAND --- +RUN \ + # Clone vllm-gaudi and get the commit hash for the vllm-project/vllm + set -e && \ + mkdir -p $VLLM_PATH2 && \ + git clone https://github.com/vllm-project/vllm-gaudi.git $VLLM_PATH2 && \ + cd $VLLM_PATH2 && \ + if [ -z "${VLLM_PROJECT_COMMIT}" ]; then \ + VLLM_PROJECT_COMMIT=$(git show "origin/vllm/last-good-commit-for-vllm-gaudi:VLLM_STABLE_COMMIT" 2>/dev/null) && \ + echo "Found vLLM commit hash: ${VLLM_PROJECT_COMMIT}"; \ + else \ + echo "Using vLLM commit : ${VLLM_PROJECT_COMMIT}"; \ + fi && \ + mkdir -p $VLLM_PATH && \ + # Clone vllm-project/vllm and use configured or last good commit hash git clone https://github.com/vllm-project/vllm.git $VLLM_PATH && \ cd $VLLM_PATH && \ git remote add upstream https://github.com/vllm-project/vllm.git && \ git fetch upstream --tags || true && \ git checkout ${VLLM_PROJECT_COMMIT} && \ - bash -c "pip install -r <(sed '/^[torch]/d' requirements/build.txt)" && \ - VLLM_TARGET_DEVICE=empty pip install --no-build-isolation -e . - -# Clone the vllm-gaudi repository and install inside the container -RUN mkdir -p $VLLM_PATH2 && \ - git clone https://github.com/vllm-project/vllm-gaudi.git $VLLM_PATH2 && \ + # Install vllm-project/vllm + bash -c "pip install -r <(sed '/^torch/d' requirements/build.txt)" && \ + VLLM_TARGET_DEVICE=empty pip install --no-build-isolation . && \ + # Install vllm-gaudi plugin cd $VLLM_PATH2 && \ git checkout ${VLLM_GAUDI_COMMIT} && \ - VLLM_TARGET_DEVICE=hpu && pip install -v -e $VLLM_PATH2 + VLLM_TARGET_DEVICE=hpu pip install -v . 
--no-build-isolation +# --- END: COMBINED RUN COMMAND --- - # to be enabled later PWolsza +# to be enabled later PWolsza # RUN pip3 install -v -e $VLLM_PATH/tests/vllm_test_utils # Install additional Python packages @@ -70,4 +84,4 @@ COPY benchmark /root/scripts/benchmark/ WORKDIR /root/scripts # Set entrypoint script -ENTRYPOINT ["python3", "-m", "entrypoints.entrypoint_main"] \ No newline at end of file +ENTRYPOINT ["python3", "-m", "entrypoints.entrypoint_main"] diff --git a/.cd/Dockerfile.ubuntu.pytorch.vllm b/.cd/Dockerfile.ubuntu.pytorch.vllm index 81574d688..ea8d8d4d6 100644 --- a/.cd/Dockerfile.ubuntu.pytorch.vllm +++ b/.cd/Dockerfile.ubuntu.pytorch.vllm @@ -4,7 +4,7 @@ # Parameterize base image components ARG DOCKER_URL=vault.habana.ai/gaudi-docker ARG VERSION=1.22.0 -ARG BASE_NAME=ubuntu22.04 +ARG BASE_NAME=ubuntu24.04 ARG PT_VERSION=2.7.1 ARG REVISION=latest ARG REPO_TYPE=habanalabs @@ -12,9 +12,9 @@ ARG REPO_TYPE=habanalabs FROM ${DOCKER_URL}/${VERSION}/${BASE_NAME}/${REPO_TYPE}/pytorch-installer-${PT_VERSION}:${REVISION} # Parameterize commit/branch for vllm-project & vllm-gaudi checkout -ARG VLLM_GAUDI_COMMIT=v0.10.2_next -ARG VLLM_PROJECT_COMMIT=v0.10.2 - +ARG VLLM_GAUDI_COMMIT=main +# leave empty to use last-good-commit-for-vllm-gaudi +ARG VLLM_PROJECT_COMMIT= ENV OMPI_MCA_btl_vader_single_copy_mechanism=none RUN apt update && \ @@ -30,24 +30,34 @@ RUN echo "dash dash/sh boolean false" | debconf-set-selections && \ ENV ENV=~/.profile # Clone the vllm-project repository and install inside the container - -RUN mkdir -p $VLLM_PATH && \ +# --- START: COMBINED RUN COMMAND --- +RUN \ + # Clone vllm-gaudi and get the commit hash for the vllm-project/vllm + set -e && \ + mkdir -p $VLLM_PATH2 && \ + git clone https://github.com/vllm-project/vllm-gaudi.git $VLLM_PATH2 && \ + cd $VLLM_PATH2 && \ + if [ -z "${VLLM_PROJECT_COMMIT}" ]; then \ + VLLM_PROJECT_COMMIT=$(git show "origin/vllm/last-good-commit-for-vllm-gaudi:VLLM_STABLE_COMMIT" 2>/dev/null) && \ + echo "Found vLLM commit hash: ${VLLM_PROJECT_COMMIT}"; \ + else \ + echo "Using vLLM commit : ${VLLM_PROJECT_COMMIT}"; \ + fi && \ + mkdir -p $VLLM_PATH && \ + # Clone vllm-project/vllm and use configured or last good commit hash git clone https://github.com/vllm-project/vllm.git $VLLM_PATH && \ cd $VLLM_PATH && \ git remote add upstream https://github.com/vllm-project/vllm.git && \ git fetch upstream --tags || true && \ git checkout ${VLLM_PROJECT_COMMIT} && \ - bash -c "pip install -r <(sed '/^[torch]/d' requirements/build.txt)" && \ - VLLM_TARGET_DEVICE=empty pip install --no-build-isolation . - -# Clone the vllm-gaudi repository and install inside the container - -RUN mkdir -p $VLLM_PATH2 && \ - git clone https://github.com/vllm-project/vllm-gaudi.git $VLLM_PATH2 && \ + # Install vllm-project/vllm + bash -c "pip install -r <(sed '/^torch/d' requirements/build.txt)" && \ + VLLM_TARGET_DEVICE=empty pip install --no-build-isolation . && \ + # Install vllm-gaudi plugin cd $VLLM_PATH2 && \ -# Comment: enable if vllm-gaudi release version is used otherwise main - git checkout ${VLLM_GAUDI_COMMIT} && \ - VLLM_TARGET_DEVICE=hpu && pip install -v $VLLM_PATH2 --no-build-isolation + git checkout ${VLLM_GAUDI_COMMIT} && \ + VLLM_TARGET_DEVICE=hpu pip install -v . 
--no-build-isolation +# --- END: COMBINED RUN COMMAND --- # Install additional Python packages RUN pip install datasets && \ diff --git a/.cd/Dockerfile.ubuntu.pytorch.vllm.nixl.latest b/.cd/Dockerfile.ubuntu.pytorch.vllm.nixl.latest index 68feedf0b..b7c589827 100644 --- a/.cd/Dockerfile.ubuntu.pytorch.vllm.nixl.latest +++ b/.cd/Dockerfile.ubuntu.pytorch.vllm.nixl.latest @@ -45,7 +45,7 @@ RUN \ git remote add upstream https://github.com/vllm-project/vllm.git && \ git fetch upstream --tags || true && \ git checkout ${VLLM_COMMIT_HASH} && \ - pip install -r <(sed '/^[torch]/d' requirements/build.txt) && \ + pip install -r <(sed '/^torch/d' requirements/build.txt) && \ VLLM_TARGET_DEVICE=empty pip install --no-build-isolation . && \ \ # Install vllm-gaudi diff --git a/.cd/benchmark/benchmark_defaults.yaml b/.cd/benchmark/benchmark_defaults.yaml index cc2b65b10..0147e628b 100644 --- a/.cd/benchmark/benchmark_defaults.yaml +++ b/.cd/benchmark/benchmark_defaults.yaml @@ -29,12 +29,10 @@ model_text: model_vision: MODELS: - - meta-llama/Llama-3.2-11B-Vision-Instruct - - meta-llama/Llama-3.2-90B-Vision-Instruct - Qwen/Qwen2.5-VL-7B-Instruct DATASET: lmarena-ai/vision-arena-bench-v0.1 DATASET_NAME: hf BACKEND: openai-chat ENDPOINT: /v1/chat/completions CONCURRENT_REQ: 64 - NUM_PROMPTS: 500 \ No newline at end of file + NUM_PROMPTS: 500 diff --git a/.cd/benchmark/benchmark_scenarios_vision.yaml b/.cd/benchmark/benchmark_scenarios_vision.yaml index b9e438cf5..8e00db022 100644 --- a/.cd/benchmark/benchmark_scenarios_vision.yaml +++ b/.cd/benchmark/benchmark_scenarios_vision.yaml @@ -1,8 +1,2 @@ -llama32-11B-Vision-Instruct: - MODEL: meta-llama/Llama-3.2-11B-Vision-Instruct - -llama32-90B-Vision-Instruct: - MODEL: meta-llama/Llama-3.2-90B-Vision-Instruct - qwen2.5-vl-7b-instruct: MODEL: Qwen/Qwen2.5-VL-7B-Instruct diff --git a/.cd/entrypoints/entrypoint_main.py b/.cd/entrypoints/entrypoint_main.py index c107414a7..babfce32a 100644 --- a/.cd/entrypoints/entrypoint_main.py +++ b/.cd/entrypoints/entrypoint_main.py @@ -190,6 +190,7 @@ def run(self): output_script_path="vllm_server.sh", variables=variables, log_dir="logs", + varlist_conf_path="server/server_output.env", ).create_and_run() elif self.mode == "benchmark": print("[INFO] Starting container in benchmark mode.") diff --git a/.cd/entrypoints/script_generator.py b/.cd/entrypoints/script_generator.py index e48062d22..684d1e9c7 100644 --- a/.cd/entrypoints/script_generator.py +++ b/.cd/entrypoints/script_generator.py @@ -4,8 +4,9 @@ class ScriptGenerator: - def __init__(self, template_script_path, output_script_path, variables, log_dir="logs"): + def __init__(self, template_script_path, output_script_path, variables, log_dir="logs", varlist_conf_path=None): self.template_script_path = template_script_path + self.varlist_conf_path = varlist_conf_path self.output_script_path = output_script_path self.variables = variables self.log_dir = log_dir @@ -19,7 +20,16 @@ def generate_script(self, vars_dict): """ with open(self.template_script_path) as f: template = f.read() - export_lines = "\n".join([f"export {k}={v}" for k, v in vars_dict.items()]) + # Create our output list + if self.varlist_conf_path: + output_dict = {} + with open(self.varlist_conf_path) as var_file: + for line in var_file: + param = line.strip() + output_dict[param] = vars_dict[param] + export_lines = "\n".join([f"export {k}={v}" for k, v in output_dict.items()]) + else: + export_lines = "\n".join([f"export {k}={v}" for k, v in vars_dict.items()]) script_content = 
template.replace("#@VARS", export_lines) with open(self.output_script_path, 'w') as f: f.write(script_content) diff --git a/.cd/server/server_output.env b/.cd/server/server_output.env new file mode 100644 index 000000000..dccdef0ae --- /dev/null +++ b/.cd/server/server_output.env @@ -0,0 +1,60 @@ +MODEL +DTYPE +DEVICE_NAME +TENSOR_PARALLEL_SIZE +MAX_MODEL_LEN +TOTAL_GPU_MEM +MODEL_DTYPE +QUANT_DTYPE +BLOCK_SIZE +VLLM_PROMPT_BS_BUCKET_MIN +VLLM_PROMPT_BS_BUCKET_STEP +VLLM_DECODE_BS_BUCKET_MIN +VLLM_DECODE_BS_BUCKET_STEP +VLLM_PROMPT_SEQ_BUCKET_MIN +VLLM_PROMPT_SEQ_BUCKET_STEP +VLLM_DECODE_BLOCK_BUCKET_MIN +VLLM_DECODE_BLOCK_BUCKET_STEP +MAX_NUM_PREFILL_SEQS +NUM_HIDDEN_LAYERS +HIDDEN_SIZE +NUM_KEY_VALUE_HEADS +NUM_ATTENTION_HEADS +CACHE_DTYPE_BYTES +LIMIT_MODEL_LEN +PT_HPU_LAZY_MODE +VLLM_DELAYED_SAMPLING +VLLM_SKIP_WARMUP +EXPERIMENTAL_WEIGHT_SHARING +VLLM_EXPONENTIAL_BUCKETING +MAX_NUM_BATCHED_TOKENS +PT_HPU_ENABLE_LAZY_COLLECTIVES +DEVICE_HPU_MEM +MODEL_MEM_IN_GB +USABLE_MEM +GPU_MEM_UTILIZATION +KV_CACHE_PER_SEQ +EST_MAX_NUM_SEQS +EST_HPU_BLOCKS +DECODE_BS_RAMP_GRAPHS +DECODE_BS_STEP_GRAPHS +DECODE_BLOCK_RAMP_GRAPHS +DECODE_BLOCK_STEP_GRAPHS +NUM_DECODE_GRAPHS +PROMPT_BS_RAMP_GRAPHS +PROMPT_BS_STEP_GRAPHS +PROMPT_SEQ_RAMP_GRAPHS +PROMPT_SEQ_STEP_GRAPHS +EST_NUM_PROMPT_GRAPHS +EST_GRAPH_PROMPT_RATIO +VLLM_GRAPH_PROMPT_RATIO +DECODE_GRAPH_TARGET_GB +EST_GRAPH_RESERVE_MEM +VLLM_GRAPH_RESERVED_MEM +KV_CACHE_MEM +MAX_NUM_SEQS +VLLM_PROMPT_SEQ_BUCKET_MAX +VLLM_CONTIGUOUS_PA +VLLM_DEFRAG +ASYNC_SCHEDULING +VLLM_WEIGHT_LOAD_FORCE_SYNC diff --git a/.cd/server/server_user.env b/.cd/server/server_user.env index 8d1664272..3dd52ba00 100644 --- a/.cd/server/server_user.env +++ b/.cd/server/server_user.env @@ -10,3 +10,4 @@ MAX_NUM_SEQS TENSOR_PARALLEL_SIZE VLLM_EXPONENTIAL_BUCKETING GPU_MEM_UTILIZATION +ASYNC_SCHEDULING diff --git a/.cd/server/settings_vllm.csv b/.cd/server/settings_vllm.csv index b616d0e49..00d2e6b47 100644 --- a/.cd/server/settings_vllm.csv +++ b/.cd/server/settings_vllm.csv @@ -1,21 +1,19 @@ -MODEL,TENSOR_PARALLEL_SIZE,MAX_MODEL_LEN,TOTAL_GPU_MEM,UNAVAILABLE_MEM_ABS,MODEL_MEM_FROM_CONFIG,MODEL_DTYPE,QUANT_DTYPE,MODEL_MEM,PROFILER_MEM_OVERHEAD,APPROX_MEM_PER_GRAPH_MB,fsdpa,GPU_FREE_MEM_TARGET,BLOCK_SIZE,VLLM_PROMPT_BS_BUCKET_MIN,VLLM_PROMPT_BS_BUCKET_STEP,VLLM_DECODE_BS_BUCKET_MIN,VLLM_DECODE_BS_BUCKET_STEP,VLLM_PROMPT_SEQ_BUCKET_MIN,VLLM_PROMPT_SEQ_BUCKET_STEP,VLLM_DECODE_BLOCK_BUCKET_MIN,VLLM_DECODE_BLOCK_BUCKET_STEP,MAX_NUM_PREFILL_SEQS,NUM_HIDDEN_LAYERS,HIDDEN_SIZE,NUM_KEY_VALUE_HEADS,NUM_ATTENTION_HEADS,CACHE_DTYPE_BYTES,LIMIT_MODEL_LEN,PT_HPU_LAZY_MODE,VLLM_DELAYED_SAMPLING,VLLM_SKIP_WARMUP,EXPERIMENTAL_WEIGHT_SHARING,VLLM_EXPONENTIAL_BUCKETING,MAX_NUM_BATCHED_TOKENS -meta-llama/Llama-3.1-8B-Instruct,1,4352,128,2,16060522496,2,2,14.95752716,5.5,10,1,1,128,1,32,1,32,128,256,128,256,1,32,4096,8,32,2,131072,1,TRUE,FALSE,0,FALSE,2048 -meta-llama/Llama-3.1-70B-Instruct,4,4352,512,2,1.41107E+11,2,2,131.4165192,5.5,20,1,1,128,1,32,1,32,128,256,128,256,1,80,8192,8,64,2,131072,1,TRUE,FALSE,0,FALSE,2048 -meta-llama/Llama-3.3-70B-Instruct,4,4352,512,2,1.41107E+11,2,2,131.4165192,5.5,20,1,1,128,1,32,1,32,128,256,128,256,1,80,8192,8,64,2,131072,1,TRUE,FALSE,0,FALSE,2048 -meta-llama/Llama-3.2-1B-Instruct,1,4352,128,2,2471645608,2,2,2.301899351,5.5,10,1,1,128,1,32,1,32,128,256,128,256,1,16,2048,8,32,2,131072,1,TRUE,FALSE,0,FALSE,2048 
-meta-llama/Llama-3.2-3B-Instruct,1,4352,128,2,6425499648,2,2,5.984212875,5.5,10,1,1,128,1,32,1,32,128,256,128,256,1,28,3072,8,24,2,131072,1,TRUE,FALSE,0,FALSE,2048 -mistralai/Mixtral-8x7B-Instruct-v0.1,2,4352,256,2,93405585408,2,2,86.99073029,5.5,10,1,1,128,1,32,1,32,128,256,128,256,1,32,4096,8,32,2,32768,1,TRUE,FALSE,0,FALSE,2048 -mistralai/Mixtral-8x22B-Instruct-v0.1,4,4352,512,2,2.8126E+11,2,2,261.9439201,5.5,10,1,1,128,1,32,1,32,128,256,128,256,1,56,6144,8,48,2,65536,1,TRUE,FALSE,0,FALSE,2048 -mistralai/Mistral-7B-Instruct-v0.2,1,4352,128,2,14483464192,2,2,13.48877716,5.5,10,1,1,128,1,32,1,32,128,256,128,256,1,32,4096,8,32,2,32768,1,TRUE,FALSE,0,FALSE,2048 -meta-llama/Llama-3.1-405B-Instruct,8,4352,1024,2,8.11707E+11,2,2,755.9608459,5.5,10,1,1,128,1,32,1,32,128,256,128,256,1,126,16384,8,128,2,131072,1,TRUE,FALSE,0,FALSE,2048 -Qwen/Qwen2.5-14B-Instruct,1,4352,128,2,29540067328,2,2,27.51133156,5.5,10,0,3,128,1,32,1,32,128,256,128,256,1,48,5120,8,40,2,32768,1,TRUE,FALSE,0,FALSE,2048 -deepseek-ai/DeepSeek-R1-Distill-Llama-70B,4,4352,512,2,1.41107E+11,2,2,131.4165192,5.5,20,1,1,128,1,32,1,32,128,256,128,256,1,80,8192,8,64,2,131072,1,TRUE,FALSE,0,FALSE,2048 -Qwen/Qwen2.5-32B-Instruct,1,4352,128,2,65527752704,2,2,61.02747536,5.5,10,1,1,128,1,32,1,32,128,256,128,256,1,64,5120,8,40,2,32768,1,TRUE,FALSE,0,FALSE,2048 -Qwen/Qwen2.5-72B-Instruct,4,4352,512,2,1.45412E+11,2,2,135.4258575,5.5,10,0,3,128,1,32,1,32,128,256,128,256,1,80,8192,8,64,2,32768,1,TRUE,FALSE,0,FALSE,2048 -Qwen/Qwen2.5-7B-Instruct,1,4352,128,2,15231233024,2,2,14.18519115,5.5,10,0,3,128,1,32,1,32,128,256,128,256,1,28,3584,4,28,2,32768,1,TRUE,FALSE,0,FALSE,2048 -Qwen/Qwen2.5-32B-Instruct,1,4352,128,2,65527752704,2,2,61.02747536,5.5,10,0,3,128,1,32,1,32,128,256,128,256,1,64,5120,8,40,2,32768,1,TRUE,FALSE,0,FALSE,2048 -meta-llama/Llama-3.2-11B-Vision-Instruct,1,8448,128,2,21340441670,2,2,19.87483507,5.5,10,0,3,128,1,32,1,32,128,256,128,256,1,40,4096,8,32,2,131072,1,TRUE,FALSE,0,FALSE,2048 -meta-llama/Llama-3.2-90B-Vision-Instruct,4,8448,512,2,177186710646,2,2,165.0179835,5.5,10,0,3,128,1,32,1,32,128,256,128,256,1,100,8192,8,64,2,131072,1,TRUE,FALSE,0,FALSE,2048 -ibm-granite/granite-8b-code-instruct-4k,1,2048,128,2,21474836480,2,2,20.00000000,5.5,10,0,3,128,1,32,1,32,128,256,128,256,1,64,5120,8,40,2,32768,1,TRUE,FALSE,0,FALSE,2048 -ibm-granite/granite-20b-code-instruct-8k,1,2048,128,2,53687091200,2,2,48.00000000,5.5,10,0,3,128,1,32,1,32,128,256,128,256,1,80,8192,16,80,2,65536,1,TRUE,FALSE,0,FALSE,2048 -Qwen/Qwen2.5-VL-7B-Instruct,1,8448,128,2,15231233024,2,2,14.18519115,5.5,10,0,3,128,1,32,1,32,128,256,128,256,1,28,3584,4,28,2,32768,1,TRUE,FALSE,0,FALSE,2048 +MODEL,TENSOR_PARALLEL_SIZE,MAX_MODEL_LEN,TOTAL_GPU_MEM,UNAVAILABLE_MEM_ABS,MODEL_MEM_FROM_CONFIG,MODEL_DTYPE,QUANT_DTYPE,MODEL_MEM,PROFILER_MEM_OVERHEAD,APPROX_MEM_PER_GRAPH_MB,fsdpa,GPU_FREE_MEM_TARGET,BLOCK_SIZE,VLLM_PROMPT_BS_BUCKET_MIN,VLLM_PROMPT_BS_BUCKET_STEP,VLLM_DECODE_BS_BUCKET_MIN,VLLM_DECODE_BS_BUCKET_STEP,VLLM_PROMPT_SEQ_BUCKET_MIN,VLLM_PROMPT_SEQ_BUCKET_STEP,VLLM_DECODE_BLOCK_BUCKET_MIN,VLLM_DECODE_BLOCK_BUCKET_STEP,MAX_NUM_PREFILL_SEQS,NUM_HIDDEN_LAYERS,HIDDEN_SIZE,NUM_KEY_VALUE_HEADS,NUM_ATTENTION_HEADS,CACHE_DTYPE_BYTES,LIMIT_MODEL_LEN,PT_HPU_LAZY_MODE,VLLM_DELAYED_SAMPLING,VLLM_SKIP_WARMUP,EXPERIMENTAL_WEIGHT_SHARING,VLLM_EXPONENTIAL_BUCKETING,MAX_NUM_BATCHED_TOKENS,VLLM_CONTIGUOUS_PA,VLLM_DEFRAG,ASYNC_SCHEDULING,VLLM_WEIGHT_LOAD_FORCE_SYNC 
+meta-llama/Llama-3.1-8B-Instruct,1,4352,128,2,16060522496,2,2,14.95752716,5.5,10,1,1,128,1,32,1,32,128,256,128,256,1,32,4096,8,32,2,131072,1,TRUE,FALSE,0,FALSE,2048,true,true,0,0 +meta-llama/Llama-3.1-70B-Instruct,4,4352,512,2,1.41107E+11,2,2,131.4165192,5.5,20,1,1,128,1,32,1,32,128,256,128,256,1,80,8192,8,64,2,131072,1,TRUE,FALSE,0,FALSE,2048,true,true,0,0 +meta-llama/Llama-3.3-70B-Instruct,4,4352,512,2,1.41107E+11,2,2,131.4165192,5.5,20,1,1,128,1,32,1,32,128,256,128,256,1,80,8192,8,64,2,131072,1,TRUE,FALSE,0,FALSE,2048,true,true,0,0 +meta-llama/Llama-3.2-1B-Instruct,1,4352,128,2,2471645608,2,2,2.301899351,5.5,10,1,1,128,1,32,1,32,128,256,128,256,1,16,2048,8,32,2,131072,1,TRUE,FALSE,0,FALSE,2048,true,true,0,0 +meta-llama/Llama-3.2-3B-Instruct,1,4352,128,2,6425499648,2,2,5.984212875,5.5,10,1,1,128,1,32,1,32,128,256,128,256,1,28,3072,8,24,2,131072,1,TRUE,FALSE,0,FALSE,2048,true,true,0,0 +mistralai/Mixtral-8x7B-Instruct-v0.1,2,4352,256,2,93405585408,2,2,86.99073029,5.5,10,1,1,128,1,32,1,32,128,256,128,256,1,32,4096,8,32,2,32768,1,TRUE,FALSE,0,FALSE,2048,false,false,0,0 +mistralai/Mixtral-8x22B-Instruct-v0.1,4,4352,512,2,2.8126E+11,2,2,261.9439201,5.5,10,1,1,128,1,32,1,32,128,256,128,256,1,56,6144,8,48,2,65536,1,TRUE,FALSE,0,FALSE,2048,false,false,0,0 +mistralai/Mistral-7B-Instruct-v0.2,1,4352,128,2,14483464192,2,2,13.48877716,5.5,10,1,1,128,1,32,1,32,128,256,128,256,1,32,4096,8,32,2,32768,1,TRUE,FALSE,0,FALSE,2048,false,false,0,0 +meta-llama/Llama-3.1-405B-Instruct,8,4352,1024,2,8.11707E+11,2,2,755.9608459,5.5,20,1,1,128,1,32,1,32,128,256,128,256,1,126,16384,8,128,2,131072,1,TRUE,FALSE,0,FALSE,2048,true,true,0,1 +Qwen/Qwen2.5-14B-Instruct,1,4352,128,2,29540067328,2,2,27.51133156,5.5,10,0,3,128,1,32,1,32,128,256,128,256,1,48,5120,8,40,2,32768,1,TRUE,FALSE,0,FALSE,2048,false,false,0,0 +deepseek-ai/DeepSeek-R1-Distill-Llama-70B,4,4352,512,2,1.41107E+11,2,2,131.4165192,5.5,20,1,1,128,1,32,1,32,128,256,128,256,1,80,8192,8,64,2,131072,1,TRUE,FALSE,0,FALSE,2048,true,true,0,0 +Qwen/Qwen2.5-32B-Instruct,1,4352,128,2,65527752704,2,2,61.02747536,5.5,10,1,1,128,1,32,1,32,128,256,128,256,1,64,5120,8,40,2,32768,1,TRUE,FALSE,0,FALSE,2048,false,false,0,0 +Qwen/Qwen2.5-72B-Instruct,4,4352,512,2,1.45412E+11,2,2,135.4258575,5.5,10,0,3,128,1,32,1,32,128,256,128,256,1,80,8192,8,64,2,32768,1,TRUE,FALSE,0,FALSE,2048,false,false,0,0 +Qwen/Qwen2.5-7B-Instruct,1,4352,128,2,15231233024,2,2,14.18519115,5.5,10,0,3,128,1,32,1,32,128,256,128,256,1,28,3584,4,28,2,32768,1,TRUE,FALSE,0,FALSE,2048,false,false,0,0 +Qwen/Qwen2.5-32B-Instruct,1,4352,128,2,65527752704,2,2,61.02747536,5.5,10,0,3,128,1,32,1,32,128,256,128,256,1,64,5120,8,40,2,32768,1,TRUE,FALSE,0,FALSE,2048,false,false,0,0 +ibm-granite/granite-8b-code-instruct-4k,1,4096,128,2,21474836480,2,2,20.00000000,5.5,10,0,3,128,1,32,1,32,128,256,128,256,1,36,4096,8,32,2,32768,1,TRUE,FALSE,0,FALSE,2048,true,true,0,0 +ibm-granite/granite-20b-code-instruct-8k,1,4352,128,2,53687091200,2,2,48.00000000,5.5,10,0,3,128,1,32,1,32,128,256,128,256,1,52,6144,1,48,2,65536,1,TRUE,FALSE,0,FALSE,2048,true,true,0,0 +Qwen/Qwen2.5-VL-7B-Instruct,1,8448,128,2,15231233024,2,2,14.18519115,5.5,10,0,3,128,1,32,1,32,128,256,128,256,1,28,3584,4,28,2,32768,1,TRUE,FALSE,0,FALSE,2048,false,false,0,0 diff --git a/.cd/server/vllm_autocalc_rules.py b/.cd/server/vllm_autocalc_rules.py index 5841b670a..30290edb8 100644 --- a/.cd/server/vllm_autocalc_rules.py +++ b/.cd/server/vllm_autocalc_rules.py @@ -82,8 +82,12 @@ def calc_DECODE_BLOCK_STEP_GRAPHS(ctx): def calc_NUM_DECODE_GRAPHS(ctx): # 3d update - 
return ((ctx['DECODE_BS_RAMP_GRAPHS'] + ctx['DECODE_BS_STEP_GRAPHS']) * - (ctx['DECODE_BLOCK_RAMP_GRAPHS'] + ctx['DECODE_BLOCK_STEP_GRAPHS'])) / 2 + decode_graphs = ((ctx['DECODE_BS_RAMP_GRAPHS'] + ctx['DECODE_BS_STEP_GRAPHS']) * + (ctx['DECODE_BLOCK_RAMP_GRAPHS'] + ctx['DECODE_BLOCK_STEP_GRAPHS'])) + if ctx['VLLM_CONTIGUOUS_PA']: + return decode_graphs + else: + return decode_graphs / 2 def calc_PROMPT_BS_RAMP_GRAPHS(ctx): diff --git a/.cd/templates/template_vllm_benchmark.sh b/.cd/templates/template_vllm_benchmark.sh index 3af3e3f3d..19b23d215 100644 --- a/.cd/templates/template_vllm_benchmark.sh +++ b/.cd/templates/template_vllm_benchmark.sh @@ -3,7 +3,7 @@ #@VARS # Wait for vLLM server to be ready -until curl -s http://localhost:8000${ENDPOINT} > /dev/null; do +until curl -s http://localhost:8000/v1/models > /dev/null; do echo "Waiting for vLLM server to be ready..." sleep 15 done @@ -35,4 +35,4 @@ vllm bench serve \ --metric-percentiles 90 \ --ignore-eos \ --trust-remote-code \ -2>&1 | tee -a logs/perftest_inp${INPUT_TOK}_out${OUTPUT_TOK}_user${CONCURRENT_REQ}.log \ No newline at end of file +2>&1 | tee -a logs/perftest_inp${INPUT_TOK}_out${OUTPUT_TOK}_user${CONCURRENT_REQ}.log diff --git a/.cd/templates/template_vllm_server.sh b/.cd/templates/template_vllm_server.sh index c28cd3ed5..b6db4e8cd 100644 --- a/.cd/templates/template_vllm_server.sh +++ b/.cd/templates/template_vllm_server.sh @@ -2,6 +2,10 @@ #@VARS +if [ $ASYNC_SCHEDULING -gt 0 ]; then # Checks if using async scheduling + EXTRA_ARGS+=" --async_scheduling" +fi + ## Start server vllm serve $MODEL \ --block-size $BLOCK_SIZE \ @@ -11,5 +15,7 @@ vllm serve $MODEL \ --max-model-len $MAX_MODEL_LEN \ --gpu-memory-utilization $GPU_MEM_UTILIZATION \ --max-num-seqs $MAX_NUM_SEQS \ - --disable-log-requests \ + --generation-config vllm \ + --max_num_batched_tokens $MAX_NUM_BATCHED_TOKENS \ + --disable-log-requests ${EXTRA_ARGS} \ 2>&1 | tee -a logs/vllm_server.log diff --git a/.cd/tests/test_vllm_autocalc_rules.py b/.cd/tests/test_vllm_autocalc_rules.py index d14a07bc6..17a504e16 100644 --- a/.cd/tests/test_vllm_autocalc_rules.py +++ b/.cd/tests/test_vllm_autocalc_rules.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: Apache-2.0 +import pytest import math import server.vllm_autocalc_rules as rules @@ -110,14 +111,16 @@ def test_calc_DECODE_BLOCK_STEP_GRAPHS(): assert rules.calc_DECODE_BLOCK_STEP_GRAPHS(ctx) == expected -def test_calc_NUM_DECODE_GRAPHS(): +@pytest.mark.parametrize("cpa", ["true", "false"]) +def test_calc_NUM_DECODE_GRAPHS(cpa): ctx = { 'DECODE_BS_RAMP_GRAPHS': 2, 'DECODE_BS_STEP_GRAPHS': 3, 'DECODE_BLOCK_RAMP_GRAPHS': 4, - 'DECODE_BLOCK_STEP_GRAPHS': 5 + 'DECODE_BLOCK_STEP_GRAPHS': 5, + 'VLLM_CONTIGUOUS_PA': cpa } - expected = ((2 + 3) * (4 + 5)) / 2 + expected = (2 + 3) * (4 + 5) if cpa else (2 + 3) * (4 + 5) / 2 assert rules.calc_NUM_DECODE_GRAPHS(ctx) == expected diff --git a/.github/actionlint.yaml b/.github/actionlint.yaml index 12c9134b7..e17f4f422 100644 --- a/.github/actionlint.yaml +++ b/.github/actionlint.yaml @@ -2,3 +2,4 @@ self-hosted-runner: labels: - ucb-vllm-cicd-g2 - hourly-ci + - pr-ci diff --git a/.github/workflows/create-release-branch.yaml b/.github/workflows/create-release-branch.yaml index 9941b11c8..cd40773ef 100644 --- a/.github/workflows/create-release-branch.yaml +++ b/.github/workflows/create-release-branch.yaml @@ -120,9 +120,27 @@ jobs: echo "branch_name=pre_${{ github.event.inputs.branch_name }}" >> "$GITHUB_OUTPUT" fi - setup_and_build: - runs-on: hourly-ci + # --- NEW JOB --- + # This job 
runs after prep, picks one 'hourly-ci' runner, + # and outputs its name so all other test jobs can target it. + discover_runner: + name: "Discover Self-Hosted Runner" needs: [prepare-release-branch] + runs-on: hourly-ci + outputs: + runner_name: ${{ steps.get_name.outputs.name }} + steps: + - name: Get runner name + id: get_name + run: | + echo "This workflow will run on: ${{ runner.name }}" + echo "name=${{ runner.name }}" >> "$GITHUB_OUTPUT" + + setup_and_build: + # --- UPDATED: Add discover_runner dependency --- + needs: [prepare-release-branch, discover_runner] + # --- UPDATED: Run on the specific node --- + runs-on: ${{ needs.discover_runner.outputs.runner_name }} steps: - name: "Checkout the release branch" uses: actions/checkout@v4 @@ -168,8 +186,10 @@ jobs: echo "Docker image built successfully." run_unit_tests: - needs: [prepare-release-branch, setup_and_build] - runs-on: hourly-ci + # --- UPDATED: Add discover_runner dependency --- + needs: [prepare-release-branch, setup_and_build, discover_runner] + # --- UPDATED: Run on the specific node --- + runs-on: ${{ needs.discover_runner.outputs.runner_name }} steps: - name: Run pytest in tests/unit_tests run: | @@ -191,8 +211,10 @@ jobs: echo "Test script exited with code: $EXITCODE" discover_tests: - runs-on: hourly-ci - needs: [prepare-release-branch] + # --- UPDATED: Add discover_runner dependency --- + needs: [prepare-release-branch, discover_runner] + # --- UPDATED: Run on the specific node --- + runs-on: ${{ needs.discover_runner.outputs.runner_name }} outputs: matrix: ${{ steps.set-matrix.outputs.matrix }} steps: @@ -204,9 +226,9 @@ jobs: id: set-matrix run: | TEST_FUNCTIONS=$( grep '^run_' ./tests/full_tests/ci_gsm8k_tests.sh | \ - awk '{print $1}' | \ - sed 's/()//' | \ - jq -R . | jq -s -c . ) + awk '{print $1}' | \ + sed 's/()//' | \ + jq -R . | jq -s -c . 
) echo "Discovered test matrix: $TEST_FUNCTIONS" if [ "$TEST_FUNCTIONS" = "[]" ]; then @@ -216,8 +238,10 @@ jobs: echo "matrix=$TEST_FUNCTIONS" >> "$GITHUB_OUTPUT" e2e: - needs: [prepare-release-branch, setup_and_build, discover_tests] - runs-on: hourly-ci + # --- UPDATED: Add discover_runner dependency --- + needs: [prepare-release-branch, setup_and_build, discover_tests, discover_runner] + # --- UPDATED: Run on the specific node --- + runs-on: ${{ needs.discover_runner.outputs.runner_name }} strategy: fail-fast: false matrix: @@ -245,8 +269,10 @@ jobs: echo "Test script exited with code: $EXITCODE" run_data_parallel_test: - needs: [prepare-release-branch, setup_and_build] - runs-on: hourly-ci + # --- UPDATED: Add discover_runner dependency --- + needs: [prepare-release-branch, setup_and_build, discover_runner] + # --- UPDATED: Run on the specific node --- + runs-on: ${{ needs.discover_runner.outputs.runner_name }} steps: - name: Run Data Parallel test run: | @@ -271,8 +297,10 @@ jobs: echo "Test script exited with code: $EXITCODE" run_pd_disaggregate_test: - needs: [prepare-release-branch, setup_and_build] - runs-on: hourly-ci + # --- UPDATED: Add discover_runner dependency --- + needs: [prepare-release-branch, setup_and_build, discover_runner] + # --- UPDATED: Run on the specific node --- + runs-on: ${{ needs.discover_runner.outputs.runner_name }} steps: - name: Run PD disaggregate test run: | @@ -298,8 +326,10 @@ jobs: echo "Test script exited with code: $EXITCODE" run_hpu_perf_tests: - needs: [prepare-release-branch, setup_and_build] - runs-on: hourly-ci + # --- UPDATED: Add discover_runner dependency --- + needs: [prepare-release-branch, setup_and_build, discover_runner] + # --- UPDATED: Run on the specific node --- + runs-on: ${{ needs.discover_runner.outputs.runner_name }} steps: - name: Run Sharegpt performance tests with warmup run: | @@ -324,6 +354,8 @@ jobs: summarize_and_notify: name: "Summarize Test Results and Notify" runs-on: ubuntu-latest + # --- This job runs on ubuntu-latest, so no runner change is needed --- + # It will correctly wait for all the test jobs to finish if: needs.prepare-release-branch.result == 'success' needs: - prepare-release-branch diff --git a/.github/workflows/hourly-ci.yaml b/.github/workflows/hourly-ci.yaml index 2131cc8f2..4805c5de2 100644 --- a/.github/workflows/hourly-ci.yaml +++ b/.github/workflows/hourly-ci.yaml @@ -13,13 +13,29 @@ on: workflow_dispatch: {} jobs: - # JOB 1: Sets up the environment and builds the Docker image needed for all tests. + # JOB 1: (NEW) Discovers an available runner and locks it for all subsequent jobs. + discover_runner: + runs-on: hourly-ci # Picks any available runner from the 'hourly-ci' pool + outputs: + runner_name: ${{ steps.get_name.outputs.name }} + steps: + - name: Get runner name + id: get_name + # This command gets the unique name of the runner (e.g., "my-runner-123") + # and saves it as an output variable + run: | + echo "This workflow will run on: ${{ runner.name }}" + echo "name=${{ runner.name }}" >> "$GITHUB_OUTPUT" + + # JOB 2: (UPDATED) Sets up the environment and builds the Docker image. 
setup_and_build: if: | github.event_name == 'workflow_dispatch' || github.ref == 'refs/heads/main' - runs-on: hourly-ci - needs: discover_tests + # <-- UPDATED: Now needs 'discover_tests' AND 'discover_runner' + needs: [discover_tests, discover_runner] + # <-- UPDATED: Runs on the specific runner from the discover_runner job + runs-on: ${{ needs.discover_runner.outputs.runner_name }} permissions: contents: read # Required to checkout code and read history outputs: @@ -103,9 +119,12 @@ jobs: EOF echo "Docker image built successfully." + # JOB 3: (UPDATED) run_unit_tests: - needs: setup_and_build - runs-on: hourly-ci + # <-- UPDATED: Now needs 'setup_and_build' AND 'discover_runner' + needs: [setup_and_build, discover_runner] + # <-- UPDATED: Runs on the specific runner + runs-on: ${{ needs.discover_runner.outputs.runner_name }} steps: - name: Run pytest in tests/unit_tests run: | @@ -126,8 +145,12 @@ jobs: EXITCODE=$? echo "Test script exited with code: $EXITCODE" + # JOB 4: (UPDATED) discover_tests: - runs-on: hourly-ci + # <-- UPDATED: Now needs 'discover_runner' + needs: discover_runner + # <-- UPDATED: Runs on the specific runner + runs-on: ${{ needs.discover_runner.outputs.runner_name }} outputs: matrix: ${{ steps.set-matrix.outputs.matrix }} steps: @@ -140,9 +163,9 @@ jobs: # naming convention, excluding the main 'run_all_tests' function itself. # The final list is formatted into a JSON array required for the matrix strategy. TEST_FUNCTIONS=$( grep '^run_' ./tests/full_tests/ci_gsm8k_tests.sh | \ - awk '{print $1}' | \ - sed 's/()//' | \ - jq -R . | jq -s -c . ) + awk '{print $1}' | \ + sed 's/()//' | \ + jq -R . | jq -s -c . ) echo "Discovered test matrix: $TEST_FUNCTIONS" # Fail the job if no tests were found. @@ -152,9 +175,12 @@ jobs: fi echo "matrix=$TEST_FUNCTIONS" >> "$GITHUB_OUTPUT" + # JOB 5: (UPDATED) e2e: - needs: [setup_and_build, discover_tests] - runs-on: hourly-ci + # <-- UPDATED: Now needs 'setup_and_build', 'discover_tests', AND 'discover_runner' + needs: [setup_and_build, discover_tests, discover_runner] + # <-- UPDATED: Runs on the specific runner + runs-on: ${{ needs.discover_runner.outputs.runner_name }} strategy: fail-fast: false matrix: @@ -183,9 +209,12 @@ jobs: EXITCODE=$? echo "Test script exited with code: $EXITCODE" + # JOB 6: (UPDATED) run_data_parallel_test: - needs: setup_and_build - runs-on: hourly-ci + # <-- UPDATED: Now needs 'setup_and_build' AND 'discover_runner' + needs: [setup_and_build, discover_runner] + # <-- UPDATED: Runs on the specific runner + runs-on: ${{ needs.discover_runner.outputs.runner_name }} steps: - name: Run Data Parallel test run: | @@ -209,9 +238,12 @@ jobs: EXITCODE=$? echo "Test script exited with code: $EXITCODE" + # JOB 7: (UPDATED) run_pd_disaggregate_test: - needs: setup_and_build - runs-on: hourly-ci + # <-- UPDATED: Now needs 'setup_and_build' AND 'discover_runner' + needs: [setup_and_build, discover_runner] + # <-- UPDATED: Runs on the specific runner + runs-on: ${{ needs.discover_runner.outputs.runner_name }} steps: - name: Run PD disaggregate test run: | @@ -236,9 +268,12 @@ jobs: EXITCODE=$? 
echo "Test script exited with code: $EXITCODE" + # JOB 8: (UPDATED) store_last_stable_vllm_commit: - needs: [setup_and_build, run_unit_tests, e2e, run_data_parallel_test, run_pd_disaggregate_test] - runs-on: hourly-ci + # <-- UPDATED: Now needs all test jobs AND 'discover_runner' + needs: [setup_and_build, run_unit_tests, e2e, run_data_parallel_test, run_pd_disaggregate_test, discover_runner] + # <-- UPDATED: Runs on the specific runner + runs-on: ${{ needs.discover_runner.outputs.runner_name }} permissions: contents: write # Permission is required to push a commit steps: diff --git a/.github/workflows/pre-merge.yaml b/.github/workflows/pre-merge.yaml index 7968fefe4..59707351b 100644 --- a/.github/workflows/pre-merge.yaml +++ b/.github/workflows/pre-merge.yaml @@ -22,7 +22,6 @@ concurrency: jobs: gatekeeper: runs-on: ubuntu-latest - if: github.event.action == 'opened' || github.event.before != github.event.after permissions: # Required to read the status of checks and PR details checks: read @@ -107,8 +106,26 @@ jobs: echo "Failing this job to prevent the main CI from running." exit 1 + # --- NEW JOB --- + # This job runs first on the self-hosted pool, picks a runner, + # and outputs its name so all other jobs can target it. + discover_runner: + needs: gatekeeper + runs-on: pr-ci + outputs: + runner_name: ${{ steps.get_name.outputs.name }} + steps: + - name: Get runner name + id: get_name + run: | + echo "This workflow will run on: ${{ runner.name }}" + echo "name=${{ runner.name }}" >> "$GITHUB_OUTPUT" + discover_tests: - runs-on: ucb-vllm-cicd-g2 + # --- UPDATED: Add discover_runner dependency --- + needs: discover_runner + # --- UPDATED: Run on the specific node --- + runs-on: ${{ needs.discover_runner.outputs.runner_name }} outputs: matrix: ${{ steps.set-matrix.outputs.matrix }} steps: @@ -124,9 +141,9 @@ jobs: # naming convention, excluding the main 'run_all_tests' function itself. # The final list is formatted into a JSON array required for the matrix strategy. TEST_FUNCTIONS=$( grep '^run_' ./tests/full_tests/ci_gsm8k_tests.sh | \ - awk '{print $1}' | \ - sed 's/()//' | \ - jq -R . | jq -s -c . ) + awk '{print $1}' | \ + sed 's/()//' | \ + jq -R . | jq -s -c . ) echo "Discovered test matrix: $TEST_FUNCTIONS" # Fail the job if no tests were found. @@ -137,7 +154,7 @@ jobs: echo "matrix=$TEST_FUNCTIONS" >> "$GITHUB_OUTPUT" pre-commit: - # This job now runs in parallel with the build job + # This job runs in parallel with the build job needs: gatekeeper runs-on: ubuntu-latest steps: @@ -165,8 +182,10 @@ jobs: pre_merge_hpu_test_build: if: > !contains(github.event.pull_request.labels.*.name, 'skip-gaudi-tests') - runs-on: ucb-vllm-cicd-g2 - needs: [pre-commit, discover_tests] + # --- UPDATED: Add discover_runner dependency --- + needs: [pre-commit, discover_tests, discover_runner] + # --- UPDATED: Run on the specific node --- + runs-on: ${{ needs.discover_runner.outputs.runner_name }} permissions: contents: read # Required to checkout code and read history outputs: @@ -318,8 +337,10 @@ jobs: echo "Docker image built successfully." hpu_unit_tests: - runs-on: ucb-vllm-cicd-g2 - needs: pre_merge_hpu_test_build + # --- UPDATED: Add discover_runner dependency --- + needs: [pre_merge_hpu_test_build, discover_runner] + # --- UPDATED: Run on the specific node --- + runs-on: ${{ needs.discover_runner.outputs.runner_name }} steps: - name: Run pytest in tests/unit_tests run: | @@ -340,10 +361,10 @@ jobs: EXITCODE=$? 
echo "Test script exited with code: $EXITCODE" hpu_pd_tests: - runs-on: ucb-vllm-cicd-g2 - # This is a final job that runs after the build and unit tests - # Unit tests are supposed to fail fast if anything goes wrong, removing the need for this job to run. - needs: [pre_merge_hpu_test_build, hpu_unit_tests] + # --- UPDATED: Add discover_runner dependency --- + needs: [pre_merge_hpu_test_build, hpu_unit_tests, discover_runner] + # --- UPDATED: Run on the specific node --- + runs-on: ${{ needs.discover_runner.outputs.runner_name }} steps: - name: Run test scripts run: | @@ -369,10 +390,10 @@ jobs: EXITCODE=$? echo "Test script exited with code: $EXITCODE" hpu_perf_tests: - runs-on: ucb-vllm-cicd-g2 - # This is a final job that runs after the build and unit tests - # Unit tests are supposed to fail fast if anything goes wrong, removing the need for this job to run. - needs: [pre_merge_hpu_test_build, hpu_unit_tests] + # --- UPDATED: Add discover_runner dependency --- + needs: [pre_merge_hpu_test_build, hpu_unit_tests, discover_runner] + # --- UPDATED: Run on the specific node --- + runs-on: ${{ needs.discover_runner.outputs.runner_name }} steps: - name: Run test scripts run: | @@ -393,10 +414,10 @@ jobs: EXITCODE=$? echo "Test script exited with code: $EXITCODE" hpu_dp_tests: - runs-on: ucb-vllm-cicd-g2 - # This is a final job that runs after the build and unit tests - # Unit tests are supposed to fail fast if anything goes wrong, removing the need for this job to run. - needs: [pre_merge_hpu_test_build, hpu_unit_tests] + # --- UPDATED: Add discover_runner dependency --- + needs: [pre_merge_hpu_test_build, hpu_unit_tests, discover_runner] + # --- UPDATED: Run on the specific node --- + runs-on: ${{ needs.discover_runner.outputs.runner_name }} steps: - name: Run test scripts run: | @@ -420,8 +441,10 @@ jobs: EXITCODE=$? echo "Test script exited with code: $EXITCODE" e2e: - runs-on: ucb-vllm-cicd-g2 - needs: [pre_merge_hpu_test_build, hpu_unit_tests, discover_tests] + # --- UPDATED: Add discover_runner dependency --- + needs: [pre_merge_hpu_test_build, hpu_unit_tests, discover_tests, discover_runner] + # --- UPDATED: Run on the specific node --- + runs-on: ${{ needs.discover_runner.outputs.runner_name }} strategy: fail-fast: false matrix: @@ -451,8 +474,10 @@ jobs: echo "Test script exited with code: $EXITCODE" pre_merge_hpu_test: - runs-on: ucb-vllm-cicd-g2 - needs: [hpu_unit_tests, e2e, hpu_perf_tests] + # --- UPDATED: Add discover_runner dependency --- + needs: [hpu_unit_tests, e2e, hpu_perf_tests, discover_runner] + # --- UPDATED: Run on the specific node --- + runs-on: ${{ needs.discover_runner.outputs.runner_name }} # This job is required to pass for pre-merge CI. By itself it does nothing, and will only pass if all jobs specified in "needs" list pass. 
steps: - name: Succeeded if all previous jobs passed @@ -476,4 +501,3 @@ jobs: ### ✅ CI Passed All checks passed successfully against the following vllm commit: **`${{ needs.pre_merge_hpu_test_build.outputs.target_commit }}`** - diff --git a/.jenkins/vision/run-tests.sh b/.jenkins/vision/run-tests.sh index aed0f9f50..fd0d358f3 100644 --- a/.jenkins/vision/run-tests.sh +++ b/.jenkins/vision/run-tests.sh @@ -42,13 +42,12 @@ do export PT_HPU_ENABLE_LAZY_COLLECTIVES=true export VLLM_SKIP_WARMUP=true export TQDM_BAR_FORMAT="{desc}: {percentage:3.0f}% {bar:10} | {n_fmt}/{total_fmt} [{elapsed}<{remaining}]" - RANDOM_SUFFIX=$(tr -dc A-Za-z0-9 [!IMPORTANT] -> This is an early developer preview of the vLLM Gaudi Plugin and is not yet intended for general use. For a more stable experience, consider using the [HabanaAI/vllm-fork](https://github.com/HabanaAI/vllm-fork) or the in-tree Gaudi implementation available in [vllm-project/vllm](https://github.com/vllm-project/vllm). - -# Welcome to vLLM x Intel Gaudi -

- vLLM
+ vLLM x Intel-Gaudi
+Welcome to vLLM x Intel Gaudi
+| Documentation | Intel® Gaudi® Documentation | Optimizing Training Platform Guide |

+ +--- +*Latest News* 🔥 + +- [2025/06] We introduced an early developer preview of the vLLM Gaudi Plugin. It is not yet intended for general use. For a more stable experience, consider using the [HabanaAI/vllm-fork](https://github.com/HabanaAI/vllm-fork) or the in-tree Gaudi implementation available in [vllm-project/vllm](https://github.com/vllm-project/vllm). + +--- + +## About vLLM Gaudi plugin (vllm-gaudi) integrates Intel Gaudi accelerators with vLLM to optimize large language model inference. This plugin follows the [[RFC]: Hardware pluggable](https://github.com/vllm-project/vllm/issues/11162) and [[RFC]: Enhancing vLLM Plugin Architecture](https://github.com/vllm-project/vllm/issues/19161) principles, providing a modular interface for Intel Gaudi hardware. -Learn more: - -📚 [Intel Gaudi Documentation](https://docs.habana.ai/en/v1.21.1/index.html) -🚀 [vLLM Plugin System Overview](https://docs.vllm.ai/en/latest/design/plugin_system.html) +Learn more: 🚀 [vLLM Plugin System Overview](https://docs.vllm.ai/en/latest/design/plugin_system.html) ## Getting Started +0. Preparation of the Setup + + To set up the execution environment, please follow the instructions in the [Gaudi Installation Guide](https://docs.habana.ai/en/latest/Installation_Guide/index.html). + To achieve the best performance on HPU, please follow the methods outlined in the + [Optimizing Training Platform Guide](https://docs.habana.ai/en/latest/PyTorch/Model_Optimization_PyTorch/Optimization_in_Training_Platform.html). + 1. Get Last good commit on vllm NOTE: vllm-gaudi is always follow latest vllm commit, however, vllm upstream API update may crash vllm-gaudi, this commit saved is verified with vllm-gaudi @@ -37,7 +51,7 @@ Learn more: git clone https://github.com/vllm-project/vllm cd vllm git checkout $VLLM_COMMIT_HASH - pip install -r <(sed '/^[torch]/d' requirements/build.txt) + pip install -r <(sed '/^torch/d' requirements/build.txt) VLLM_TARGET_DEVICE=empty pip install --no-build-isolation -e . cd .. ``` @@ -50,46 +64,17 @@ Learn more: cd .. ``` -4. (Optional) Install nixl: + - ```bash - cd vllm-gaudi - python install_nixl.sh - cd .. - ``` +4. To explore all installation methods, such as NixL, follow the [link](https://vllm-gaudi.readthedocs.io/en/latest/getting_started/installation.html) +## Contributing + +We welcome and value any contributions and collaborations. + +## Contact Us - -## Install with Docker file - -```bash -docker build -t ubuntu.pytorch.vllm.nixl.latest \ - -f .cd/Dockerfile.ubuntu.pytorch.vllm.nixl.latest github.com/vllm-project/vllm-gaudi -docker run -it --rm --runtime=habana \ - --name=ubuntu.pytorch.vllm.nixl.latest \ - --network=host \ - -e HABANA_VISIBLE_DEVICES=all \ - vllm-gaudi-for-llmd /bin/bash -``` - -### Full installation from source (vLLM and vLLM-Gaudi): - -```bash -# Fetch last good commit on vllm -git clone https://github.com/vllm-project/vllm-gaudi -cd vllm-gaudi -export VLLM_COMMIT_HASH=$(git show "origin/vllm/last-good-commit-for-vllm-gaudi:VLLM_STABLE_COMMIT" 2>/dev/null) - -# Build vLLM from source for empty platform, reusing existing torch installation -git clone https://github.com/vllm-project/vllm -cd vllm -git checkout $VLLM_COMMIT_HASH -pip install -r <(sed '/^[torch]/d' requirements/build.txt) -VLLM_TARGET_DEVICE=empty pip install --no-build-isolation -e . -cd .. - -# Build vLLM-Gaudi from source -cd vllm-gaudi -pip install -e . 
- -# Build nixl -python install_nixl.sh -``` + +- For technical questions and feature requests, please use GitHub [Issues](https://github.com/vllm-project/vllm-gaudi/issues) +- For discussing with fellow users, please use the [vLLM Forum](https://discuss.vllm.ai) +- For coordinating contributions and development, please use [Slack](https://slack.vllm.ai) +- For security disclosures, please use GitHub's [Security Advisories](https://github.com/vllm-project/vllm/security/advisories) feature + diff --git a/calibration/README.md b/calibration/README.md new file mode 100644 index 000000000..de0416031 --- /dev/null +++ b/calibration/README.md @@ -0,0 +1,202 @@ +# FP8 Calibration Procedure + +Running inference via [vLLM](https://github.com/vllm-project/vllm) on HPU with FP8 precision is achieved using the [Intel® Neural Compressor (INC)](https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Quantization/Inference_Using_FP8.html#inference-using-fp8) package. This approach requires a model calibration procedure to generate measurements, quantization files, and configurations first. To simplify this process, we've provided the `calibrate_model.sh` script. It requires the following arguments: + +- `-m`, i.e., **model stub or path:** Path to your model (if stored locally) or the model ID from the Hugging Face Hub. + - `-d`, i.e., **path to the source dataset:** Path to your dataset in pickle format (".pkl"). +- `-o`, i.e., **output path:** Path to the directory where the generated measurements, etc., will be stored. + +There are also optional arguments, and you can read about them by executing the script with the `-h` option. + +The calibration procedure works with any dataset that contains the following fields: `system_prompt` and `question`. These fields are used to prepare a calibration dataset with prompts formatted specifically for your model. We recommend using the public dataset used by MLCommons in the Llama2-70b inference submission: https://github.com/mlcommons/inference/tree/master/language/llama2-70b#preprocessed. + +> [!TIP] +> For the [DeepSeek-R1](https://huggingface.co/collections/deepseek-ai/deepseek-r1-678e1e131c0169c0bc89728d) series models, which contain 256 experts, it’s important to provide a diverse and +> sufficiently large sample set to ensure that all experts are properly activated during calibration. +> Through our experiments, we found that using [NeelNanda/pile-10k](https://huggingface.co/datasets/NeelNanda/pile-10k) and selecting 512 samples with at least 1024 tokens each yields good calibration coverage. + +## Options and Usage + +To run the ```calibrate_model.sh``` script, follow the steps below: + +1. Build and install the latest [vllm-plugin](https://vllm-gaudi.readthedocs.io/en/latest/getting_started/installation.html). +2. Go to the ```calibration``` subdirectory: + +```bash +cd calibration +pip install -r requirements.txt +``` + +3. Download the dataset. +> [!NOTE] +> For [DeepSeek-R1](https://huggingface.co/collections/deepseek-ai/deepseek-r1-678e1e131c0169c0bc89728d) series models, it is recommended to use `NeelNanda/pile-10k` as the dataset. + +4. Run the ```calibrate_model.sh``` script. Refer to the script options and run examples below. The script generates the ```maxabs_quant_g3.json``` file, which is used for FP8 inference. 
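Because a malformed source dataset usually only surfaces as a failure partway through calibration, it can be worth sanity-checking the `.pkl` before launching the script. Below is a minimal sketch, assuming the pickle holds a pandas DataFrame (as the MLCommons preprocessed dataset does); the helper name and command-line usage are illustrative and not part of this repository:

```python
# Illustrative pre-flight check for a calibration .pkl; not part of calibrate_model.sh.
# Assumes the pickle stores a pandas DataFrame with one row per sample.
import sys

import pandas as pd

REQUIRED_FIELDS = {"system_prompt", "question"}


def check_calibration_dataset(path: str) -> None:
    df = pd.read_pickle(path)
    missing = REQUIRED_FIELDS - set(df.columns)
    if missing:
        sys.exit(f"{path} is missing required fields: {sorted(missing)}")
    print(f"{path}: {len(df)} samples, required fields present")


if __name__ == "__main__":
    check_calibration_dataset(sys.argv[1])
```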
+ +### Here are some examples of how to use the script: + +```bash +./calibrate_model.sh -m /path/to/local/llama3.1/Meta-Llama-3.1-405B-Instruct/ -d dataset-processed.pkl -o /path/to/measurements/vllm-benchmarks/inc -b 128 -t 8 -l 4096 +# OR +./calibrate_model.sh -m facebook/opt-125m -d dataset-processed.pkl -o inc/ +# OR Calibrate DeepSeek models with dataset NeelNanda/pile-10k +PT_HPU_LAZY_MODE=1 ./calibrate_model.sh -m deepseek-ai/DeepSeek-R1 -d NeelNanda/pile-10k -o inc/ -t 8 +``` + +> [!WARNING] +> Measurements are device-dependent, so you can't use scales collected on Gaudi3 on Gaudi2 accelerators. This behavior can cause accuracy issues. + +> [!TIP] +> If you get the following error, ensure you set a valid tensor parallelism value, e.g. `-t 8`: +> +> ``` +> RuntimeError: [Rank:0] FATAL ERROR :: MODULE:PT_DEVMEM Allocation failed for size::939524096 (896)MB +> ``` + +# Run inference with FP8 models + +Inference with FP8 precision models using vLLM is described in the [Documentation](https://vllm-gaudi.readthedocs.io/en/latest/configuration/model_calibration.html). + +# Multi-node FP8 Calibration + +The following section details the procedure for calibrating models that do not fit into a single Gaudi node. For illustration, we use the Llama 3.1 405B model running in Tensor Parallelism (TP) 16 mode, spanning two Gaudi2 nodes.
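For a rough sense of why the 405B example cannot stay on a single node, the estimate below is illustrative only: it assumes bf16 weights (2 bytes per parameter) and 96 GB of HBM per Gaudi2 card, and it ignores KV cache and HPU graph memory entirely:

```python
# Back-of-the-envelope estimate (illustrative): why Llama 3.1 405B spans two Gaudi2 nodes.
params = 405e9
weights_gb = params * 2 / 1e9        # ~810 GB of bf16 weights
node_gb = 8 * 96                     # 768 GB of HBM on a single 8-card Gaudi2 node
two_nodes_gb = 2 * node_gb           # 1536 GB across two nodes (TP=16)

print(f"weights ~{weights_gb:.0f} GB, one node {node_gb} GB, two nodes {two_nodes_gb} GB")
# The weights alone already exceed one node before any KV cache or graphs are counted,
# which is why this example runs with tensor parallelism 16 across two nodes.
```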
+ +> [!NOTE] +> Following steps are to be executed within a [Gaudi Pytorch container](https://docs.habana.ai/en/latest/Installation_Guide/Additional_Installation/Docker_Installation.html#use-intel-gaudi-containers) + +## Step 1: Pre-requisites + +- Install latest [vllm-plugin](https://vllm-gaudi.readthedocs.io/en/latest/getting_started/installation.html) +- Ensure that all nodes in the multi-node setup are connected to an NFS mount (Network File System). +- Create workspace directory on NFS, clone the calibration scripts repo and create an empty file `quant_config_buffer.json`. + +```bash +mkdir /my_workspace && cd /my_workspace +cd /calibration +touch quant_config_buffer.json +``` + +- Check if all Gaudi NIC ports are up
+Note : Following commands should be run on the host and NOT inside the container.
+ +```bash +cd /opt/habanalabs/qual/gaudi2/bin +./manage_network_ifs.sh --status +# All the ports should be in 'up' state. Try flipping the state +./manage_network_ifs.sh --down +./manage_network_ifs.sh --up +# Give it a minute for the NIC's to flip and check the status again +``` + +- Set following envs at all nodes: + +```bash +# Check the network interface for outbound/inbound comms. Command 'ip a' or 'ifconfig' should list all the interfaces +export GLOO_SOCKET_IFNAME=eth0 +export HCCL_SOCKET_IFNAME=eth0 +export QUANT_CONFIG="/quant_config_buffer.json" +``` + +### Step 2: Start a Ray cluster to accommodate the required TP size. + +```bash +# Start Ray on head node +ray start --head --port=6379 + +# Add worker nodes to the Ray cluster +ray start --address=':6379' + +# Check if the cluster has required number of HPU's +ray status +``` + +#### Step 3: Run model calibration script + +```bash +./calibrate_model.sh -m meta-llama/Llama-3.1-405B-Instruct -d /open_orca_gpt4_tokenized_llama.calibration_1000.pkl -o /fp8_output -l 4096 -t 16 -b 128 +``` + +Running the above command will create calibration measurement files in the specified output directory, organized into model-specific subdirectories. + +> [!NOTE] +> The current calibration procedure works correctly only when the multi-node configuration has more than 8 cards. + +#### Step 4: (Optional) Measurement unification + +This is an optional step and is used to reduce the target tensor parallelism level by unifying the measurement scales. For example, you can perform FP8 calibration on the Llama 3.1 405B model using 2x Gaudi2 nodes with Tensor Parallelism (TP) set to 16, and then use the unification script to reduce the TP to 8. This can be achieved in two ways: +1. Add `-r` optional parameter to `calibration_model.sh` script, e.g. + +```bash +./calibrate_model.sh -m meta-llama/Llama-3.1-405B-Instruct -d /open_orca_gpt4_tokenized_llama.calibration_1000.pkl -o /fp8_output -l 4096 -t 16 -b 128 -r 8 +``` + +1. If calibration has already been performed, use the following command to convert existing scales: + +```bash +python3 step-5-unify_measurements.py -r 8 -m /fp8_output/llama-3.1-405b-instruct/g2/ -o /fp8_output/llama-3.1-405b-instruct/g2/ +``` + +- `-r`, i.e. **rank number** of unified measurements. +- `-m`, i.e. **calibration output path** containing the measurement files. +- `-o`, i.e. **unification output directory** where unification output will be written. +- `-u`, i.e. unify original measurement results based on **expert parallelism** rules. + +> [!TIP] +> It is a good practice to store unification results in the source directory. This allows you to run the vLLM server with FP8 precision and different TP values without modifying the directory specified in the `QUANT_CONFIG` environment variable. 
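For intuition about what the `-r` option does, unification merges groups of source ranks into each target rank. The sketch below is an assumption made for illustration (consecutive grouping by the reduction factor implied by the "factor of the original rank number" constraint), not the actual logic of `step-5-unify_measurements.py`:

```python
# Illustrative only: one plausible grouping of source ranks into target ranks
# when -r reduces the measurement TP degree by an integer factor.
def rank_groups(src_tp: int, dst_tp: int) -> dict[int, list[int]]:
    assert src_tp % dst_tp == 0, "-r must be a factor of the original rank count"
    group = src_tp // dst_tp
    return {dst: list(range(dst * group, (dst + 1) * group)) for dst in range(dst_tp)}


print(rank_groups(16, 8))  # {0: [0, 1], 1: [2, 3], ..., 7: [14, 15]}
print(rank_groups(16, 2))  # {0: [0, ..., 7], 1: [8, ..., 15]}
```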
+ +Below examples in case you want to convert scales from TP=16 to TP=4 and 2: +- conversion of scales TP=16 -> TP=4: + +```bash +python3 step-5-unify_measurements.py -r 4 -m /fp8_output/llama-3.1-405b-instruct/g2/ -o /fp8_output/llama-3.1-405b-instruct/g2/ +``` + +- conversion of scales TP=16 -> TP=2: + +```bash +python3 step-5-unify_measurements.py -r 2 -m /fp8_output/llama-3.1-405b-instruct/g2/ -o /fp8_output/llama-3.1-405b-instruct/g2/ +``` + +In case the model contains MoE layers and is calibrated with expert parallelism, `-u` is required for unification: + +```bash +python3 step-5-unify_measurements.py -r 4 -m /fp8_output/model_name/g2 -o /fp8_output/model_name/g2 -u +``` + +#### Step 5: Serving the FP8 quantized model + +```bash +export QUANT_CONFIG='/fp8_output/llama-3.1-405b-instruct/maxabs_quant_g2.json' +vllm serve meta-llama/Llama-3.1-405B-Instruct --quantization inc --kv-cache-dtype fp8_inc --weights-load-device cpu --tensor-parallel-size 8 --max-model-len 2048 +``` + +> [!NOTE] +> Detailed information about serving with vLLM (including multi-node serving) you can find in [Documentation](https://vllm-gaudi.readthedocs.io/en/latest/configuration/model_calibration.html). + +#### Advanced Usage for MoE Models + +For models with Mixture of Experts (MoE), like Deepseek-R1, you may want to run calibration once and use the results for different expert parallelism and data parallelism scenarios (e.g., 8, 16, or 32 cards). To do this: + +1. Unify all measurement files onto a single card (TP1). +2. (Optional) Postprocess the unified measurement for better performance. +3. Expand the unified results to the number of expert-parallel cards you need. The `step-6-expand-measurements.py` splits expert measurements across the target number of cards, while other values are reused. + +The diagram below shows an example where calibration is done on 2 cards and deployment is on 4 cards. + +![unify-and-expand](./unify-and-expand.png) + +Here is a real example that calibrates Deepseek-R1 on 8 cards and deploys on 16 or 32 cards: + +```bash +# Unify measurements: TP8 -> TP1 +python step-5-unify_measurements.py -m /path/to/measurements/deepseek-r1/g3/ -r 1 -o /path/to/measurements/deepseek-r1/g3-unified-tp1/ -u -s + +# (Optional) Postprocess unified TP1 +python step-3-postprocess-measure.py -m /path/to/measurements/deepseek-r1/g3-unified-tp1/ -o /path/to/measurements/deepseek-r1/g3-unified-tp1-post/ -d + +# Expand to EP16TP1 +python step-6-expand-measurements.py -m /path/to/measurements/deepseek-r1/g3-unified-tp1-post/ -o /path/to/measurements/deepseek-r1/g3-unified-tp1-post-expand-ep16 -w 16 + +# Expand to EP32TP1 +python step-6-expand-measurements.py -m /path/to/measurements/deepseek-r1/g3-unified-tp1-post/ -o /path/to/measurements/deepseek-r1/g3-unified-tp1-post-expand-ep32 -w 32 +``` diff --git a/calibration/calibrate_model.sh b/calibration/calibrate_model.sh new file mode 100755 index 000000000..8d3df4e66 --- /dev/null +++ b/calibration/calibrate_model.sh @@ -0,0 +1,285 @@ +#!/bin/bash +############################################################################### +# Copyright (C) 2024 Habana Labs, Ltd. 
an Intel Company +############################################################################### + +set -e +cd "$(dirname "$0")" + +ALLOWED_DEVICES=("g2" "g3") + +usage() { + echo + echo "Calibrate given MODEL_PATH for FP8 inference" + echo + echo "usage: ${0} " + echo + echo " -m - [required] huggingface stub or local directory of the MODEL_PATH" + echo " -d - [required] path to source dataset (details in README)" + echo " -o - [required] path to output directory for fp8 measurements" + echo " -b - batch size to run the measurements at (default: 32)" + echo " -l - limit number of samples in calibration dataset" + echo " -t - tensor parallel size to run at (default: 1); NOTE: if t > 8 then we need a multi-node setup" + echo " -r - rank of unified measurements, it should be smaller than original rank number and should be a factor of the original rank number" + echo " -u - unify measurement results based on expert parallelism rules (default: False), expert parallelism unification rule is unique, card 1 expert measurement will be extended to card 0 if unified to x from 2x cards number" + echo " -e - set this flag to enable enforce_eager execution" + echo +} + +cleanup_tmp() { + if [[ $(pwd) == *vllm-gaudi/calibration ]]; then + echo "Clearing temporary directory" + rm -rf nc_workspace + rm -rf inc_tmp + else + echo "Skipping temporary directory removal" + fi +} + +create_measure_config() { + mkdir -p $1/$2/$3 + + model_name_lower=$(echo "$2" | tr '[:upper:]' '[:lower:]') + + if [[ $model_name_lower =~ ^mixtral ]]; then + tmp_config="{\"method\": \"HOOKS\",\"mode\": \"MEASURE\",\"observer\": \"maxabs\",\"allowlist\": {\"types\": [], \"names\": []},\"blocklist\": {\"types\": [], \"names\": [\"self_attn\", \"lm_head\"]},\"quantize_weight\": false,\"dump_stats_path\": \"$1/$2/$3/inc_output\"}" + elif [[ $model_name_lower =~ ^deepseek ]]; then + tmp_config="{\"method\": \"HOOKS\",\"mode\": \"MEASURE\",\"observer\": \"maxabs\",\"allowlist\": {\"types\": [], \"names\": []},\"blocklist\": {\"types\": [], \"names\": [\"lm_head\", \"mlp\\\.gate\\\b\"]},\"quantize_weight\": false,\"dump_stats_path\": \"$1/$2/$3/inc_output\"}" + else + tmp_config="{\"method\": \"HOOKS\",\"mode\": \"MEASURE\",\"observer\": \"maxabs\",\"allowlist\": {\"types\": [], \"names\": []},\"blocklist\": {\"types\": [], \"names\": []},\"quantize_weight\": false,\"dump_stats_path\": \"$1/$2/$3/inc_output\"}" + fi + echo "$tmp_config" > $1/$2/maxabs_measure_$3.json +} + +create_quant_config() { + mkdir -p $1/$2/$3 + + model_name_lower=$(echo "$2" | tr '[:upper:]' '[:lower:]') + + #note(kwisniewski98): mixtral models has attention masked to not cause regression in accuracy + if [[ $model_name_lower =~ ^mixtral ]]; then + if [[ $PT_HPU_LAZY_MODE == 0 ]]; then + tmp_config="{\"mode\": \"QUANTIZE\",\"observer\": \"maxabs\",\"scale_method\": \"maxabs_hw\", \"scale_format\": \"CONST\",\"allowlist\": {\"types\": [],\"names\": []},\"blocklist\": {\"types\": [],\"names\": [\"self_attn\", \"lm_head\"]},\"dump_stats_path\": \"$1/$2/$3/inc_output\"}" + else + tmp_config="{\"mode\": \"QUANTIZE\",\"observer\": \"maxabs\",\"scale_method\": \"maxabs_hw\",\"allowlist\": {\"types\": [],\"names\": []},\"blocklist\": {\"types\": [],\"names\": [\"self_attn\", \"lm_head\"]},\"dump_stats_path\": \"$1/$2/$3/inc_output\"}" + fi + elif [[ $model_name_lower =~ ^deepseek ]]; then + tmp_config="{\"mode\": \"QUANTIZE\",\"observer\": \"maxabs\",\"scale_method\": \"maxabs_hw\", \"scale_format\": \"scalar\", \"allowlist\": {\"types\": [],\"names\": 
[]},\"blocklist\": {\"types\": [],\"names\": [\"lm_head\", \"mlp\\\.gate\\\b\"]},\"dump_stats_path\": \"$1/$2/$3/inc_output\"}" + else + tmp_config="{\"mode\": \"QUANTIZE\",\"observer\": \"maxabs\",\"scale_method\": \"maxabs_hw\",\"allowlist\": {\"types\": [],\"names\": []},\"blocklist\": {\"types\": [],\"names\": []},\"dump_stats_path\": \"$1/$2/$3/inc_output\"}" + fi + echo "$tmp_config" > $1/$2/maxabs_quant_$3.json +} + +extract_last_folder_name() { + local path="$1" + + path="${path%/}" + last_folder="$(basename "$path")" + last_folder="${last_folder,,}" + + echo "$last_folder" +} + +cleanup_tmp + +EXTRA_FLAGS_STEP_1="" +EXTRA_FLAGS_STEP_2="" +EXTRA_FLAGS_STEP_3="" +EXTRA_FLAGS_STEP_4="" +BATCH_SIZE=32 +TP_SIZE=1 +MULTI_NODE_SETUP=false + +USE_EP="" +ENFORCE_EAGER=false + +while getopts "m:b:l:t:d:h:o:r:u:e" OPT; do + case ${OPT} in + m ) + MODEL_PATH="$OPTARG" + ;; + d ) + DATASET_PATH_OR_NAME="$OPTARG" + ;; + b ) + BATCH_SIZE="$OPTARG" + ;; + o ) + FP8_DIR=$(realpath "$OPTARG") + ;; + l ) + LIMIT="$OPTARG" + ;; + t ) + TP_SIZE="$OPTARG" + ;; + r ) + RANK="$OPTARG" + ;; + u ) + USE_EP="--use_expert_paral" + ;; + e ) + ENFORCE_EAGER=true + ;; + h ) + usage + ;; + \? ) + usage + exit 1 + ;; + esac +done + +if [[ -z "$MODEL_PATH" && -z "$FP8_DIR" && -z "$DATASET_PATH_OR_NAME" ]]; then + echo "Model stub, source dataset path and output path for fp8 measurements must be provided." + usage + exit 1 +fi + +# Store the provided MODEL_PATH name in a variable +MODEL_NAME=$(extract_last_folder_name "$MODEL_PATH") +model_name_lower=$(echo "$MODEL_NAME" | tr '[:upper:]' '[:lower:]') + +echo "Step 0 - detecting used device type [g2, g3]" +DEVICE_TYPE=$(python3 step-0-detect-device.py) || (echo "Detecting device process failed" && exit 1) +DEVICE_TYPE="g$DEVICE_TYPE" +echo "Detected device type: $DEVICE_TYPE" +echo "Step 0 done" + +# Check if the provided device type is valid +if [[ ! " ${ALLOWED_DEVICES[*]} " =~ " $DEVICE_TYPE " ]]; then + echo "Invalid device type: $DEVICE_TYPE. Allowed devices: ${ALLOWED_DEVICES[*]}" + exit 1 +fi + +if [[ $TP_SIZE -gt 8 ]]; then + MULTI_NODE_SETUP=true +fi + +if $MULTI_NODE_SETUP; then + RAY_AVAILABLE_RESOURCES=$(python3 -c 'import ray; ray.init(); print(int(ray.available_resources()["HPU"]))') + if [[ $RAY_AVAILABLE_RESOURCES -lt $TP_SIZE ]]; then + echo "Required TP size : $TP_SIZE" + echo "Available HPU's : $RAY_AVAILABLE_RESOURCES " + echo "!! Exiting since not enough HPU resources available. You can run 'ray status' to see available resources" + echo "Refer https://vllm-gaudi.readthedocs.io/en/latest/configuration/multi_node.html for multi-node runs" + exit 1 + fi + + if [[ ! -e $QUANT_CONFIG ]]; then + echo " !! Exiting. Invalid QUANT_CONFIG env" + echo " Multi-node calibration requires QUANT_CONFIG to point to an empty buffer.json file. 
Refer https://vllm-gaudi.readthedocs.io/en/latest/configuration/multi_node.html" + exit 1 + fi +fi + +create_measure_config $FP8_DIR $MODEL_NAME $DEVICE_TYPE +create_quant_config $FP8_DIR $MODEL_NAME $DEVICE_TYPE + +if [[ $TP_SIZE > 1 ]]; then + export PT_HPU_ENABLE_LAZY_COLLECTIVES=true +fi + +if [[ $MODEL_PATH_NAME == llama.*2.* ]]; then + EXTRA_FLAGS_STEP_1+="--chat-template template/llama-2-chat.jinja " +elif [[ "$MODEL_PATH" == *"Mixtral-8x7B"* ]]; then + EXTRA_FLAGS_STEP_1+="--chat-template template/mistral_mixtral.jinja " +fi + +if [[ -n $LIMIT ]]; then + EXTRA_FLAGS_STEP_1+="--max-dataset-samples $LIMIT " +fi + +SKIP_STEP_1=false +if [[ $DATASET_PATH_OR_NAME == *.pkl ]]; then + SKIP_STEP_1=false +else + echo "DATASET_PATH_OR_NAME is not a .pkl file, will prepare calibration dataset based on it." + SKIP_STEP_1=true +fi + + +if [[ "$model_name_lower" == *"deepseek"* ]]; then + EXTRA_FLAGS_STEP_2+="--block-quant --expert-parallel " + EXTRA_ENVS_STEP_2="VLLM_HPU_FORCE_CHANNEL_FP8=0" + EXTRA_FLAGS_STEP_3+="--deepseek " + EXTRA_ENVS_STEP_4="VLLM_HPU_FORCE_CHANNEL_FP8=0" + EXTRA_FLAGS_STEP_4+="--block-quant --expert-parallel " +fi + +# Skip step 1 if the DATASET_PATH_OR_NAME is a .pkl file +if $SKIP_STEP_1; then + EXTRA_FLAGS_STEP_2+="--max-dataset-samples 512 --batch-size 1 --max-tokens 32 " + EXTRA_FLAGS_STEP_2+="--auto-process-dataset --sample-len 1024 --max-model-len 2048 " + EXTRA_FLAGS_STEP_2+="--dataset ${DATASET_PATH_OR_NAME} " +fi + +if [[ -z "$VLLM_USE_V1" || $VLLM_USE_V1 != "1" ]]; then + EXTRA_FLAGS_STEP_2+="--max-num-prefill-seqs 1 " + EXTRA_FLAGS_STEP_4+="--max-num-prefill-seqs 1 " +fi + +if $MULTI_NODE_SETUP; then + cat $FP8_DIR/$MODEL_NAME/maxabs_measure_$DEVICE_TYPE.json > $QUANT_CONFIG + sleep 2 +else + export QUANT_CONFIG=$FP8_DIR/$MODEL_NAME/maxabs_measure_$DEVICE_TYPE.json +fi + +if $ENFORCE_EAGER; then + EXTRA_FLAGS_STEP_2+="--enforce-eager " + EXTRA_FLAGS_STEP_4+="--enforce-eager " +fi + +if $SKIP_STEP_1; then + echo "Skipping step 1 - prepare calibration dataset with dataset ${DATASET_PATH_OR_NAME}" +else + echo "" + echo "1/4 Preparing calibration dataset" + python3 step-1-prepare-calibration-dataset.py -m $MODEL_PATH -d $DATASET_PATH_OR_NAME -o $MODEL_NAME $EXTRA_FLAGS_STEP_1 || (echo "Error in step 1" && exit 1) + echo "Step 1/4 done" +fi + +echo "" +echo "2/4 Measuring scales" +if $MULTI_NODE_SETUP; then + env $EXTRA_ENVS_STEP_2 python3 step-2-measure-scales.py -m $MODEL_PATH --tensor-parallel-size $TP_SIZE -d $MODEL_NAME-calibration-dataset.pkl --batch-size $BATCH_SIZE --distributed-executor-backend ray $EXTRA_FLAGS_STEP_2 || (echo "Error in step 2" && exit 1) +else + env $EXTRA_ENVS_STEP_2 python3 step-2-measure-scales.py -m $MODEL_PATH --tensor-parallel-size $TP_SIZE -d $MODEL_NAME-calibration-dataset.pkl --batch-size $BATCH_SIZE $EXTRA_FLAGS_STEP_2 || (echo "Error in step 2" && exit 1) +fi +echo "Step 2/4 done" + +echo "" +echo "3/4 Postprocessing scales" +python3 step-3-postprocess-measure.py -m $FP8_DIR/$MODEL_NAME/$DEVICE_TYPE/ -o inc_tmp/$MODEL_NAME/$DEVICE_TYPE/ $EXTRA_FLAGS_STEP_3 || (echo "Error in step 3" && exit 1) +cp inc_tmp/$MODEL_NAME/$DEVICE_TYPE/* $FP8_DIR/$MODEL_NAME/$DEVICE_TYPE/ +echo "Step 3/4 done" + + +if $MULTI_NODE_SETUP; then + cat $FP8_DIR/$MODEL_NAME/maxabs_quant_$DEVICE_TYPE.json > $QUANT_CONFIG + sleep 2 +else + export QUANT_CONFIG=$FP8_DIR/$MODEL_NAME/maxabs_quant_$DEVICE_TYPE.json +fi + +echo "" +echo "4/4 Quantize scales" +if $MULTI_NODE_SETUP; then + env $EXTRA_ENVS_STEP_4 python3 step-4-quantize-scales.py --model $MODEL_PATH 
--tensor-parallel-size $TP_SIZE --distributed-executor-backend ray $EXTRA_FLAGS_STEP_4 || (echo "Error in step 4" && exit 1) +else + env $EXTRA_ENVS_STEP_4 python3 step-4-quantize-scales.py --model $MODEL_PATH --tensor-parallel-size $TP_SIZE $EXTRA_FLAGS_STEP_4 || (echo "Error in step 4" && exit 1) +fi + +if [[ -n $RANK ]]; then + echo "" + echo "5/5 Unify scales" + QUANT_DIR=$FP8_DIR/$MODEL_NAME/$DEVICE_TYPE/ + python3 step-5-unify_measurements.py -r $RANK -m $QUANT_DIR -o $QUANT_DIR $USE_EP || (echo "Error in step 5" && exit 1) + echo "Step 5/5 done" +fi +cleanup_tmp +echo "Calibration process done" diff --git a/calibration/requirements.txt b/calibration/requirements.txt new file mode 100644 index 000000000..a0f34c7b1 --- /dev/null +++ b/calibration/requirements.txt @@ -0,0 +1,3 @@ +datasets +transformers +numpy \ No newline at end of file diff --git a/calibration/step-0-detect-device.py b/calibration/step-0-detect-device.py new file mode 100644 index 000000000..12c6af08b --- /dev/null +++ b/calibration/step-0-detect-device.py @@ -0,0 +1,12 @@ +############################################################################### +# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company +############################################################################### +import habana_frameworks.torch.hpu as hthpu + + +def detect_hpu(): + return hthpu.get_device_name()[-1] + + +if __name__ == "__main__": + print(detect_hpu()) diff --git a/calibration/step-1-prepare-calibration-dataset.py b/calibration/step-1-prepare-calibration-dataset.py new file mode 100755 index 000000000..374c086a6 --- /dev/null +++ b/calibration/step-1-prepare-calibration-dataset.py @@ -0,0 +1,93 @@ +############################################################################### +# Copyright (C) 2024 Habana Labs, Ltd. 
an Intel Company +############################################################################### +import argparse +import os + +import pandas as pd +import transformers + +os.environ["EXPERIMENTAL_WEIGHT_SHARING"] = "0" +os.environ["VLLM_SKIP_WARMUP"] = "true" + + +def get_ds(args): + print(f"Loading source dataset: {args.dataset}") + ds = pd.read_pickle(args.dataset) + + if args.max_dataset_samples: + ds = ds.sample(frac=1, random_state=42) + ds = ds.head(args.max_dataset_samples) + + return ds + + +def load_chat_template(chat_template_path: str) -> str: + + with open(chat_template_path) as f: + return f.read() + + +def main(args): + + calibration_ds = get_ds(args) + try: + tokenizer = transformers.AutoTokenizer.from_pretrained( + args.model, + model_max_length=args.max_model_length, + padding_side="left", + use_fast=False, + ) + except (OSError, ValueError, RuntimeError, ImportError): + tokenizer = transformers.AutoTokenizer.from_pretrained( + args.model, + model_max_length=args.max_model_length, + padding_side="left", + use_fast=True, + ) + + chat_template = load_chat_template(args.chat_template) if args.chat_template else None + + print("Creating calibration dataset...") + inputs = [] + for _, row in calibration_ds.iterrows(): + question = row["question"] + system_prompt = row["system_prompt"] + if "mixtral" in args.model or "Mixtral" in args.model: + tmp_conversation = [{"role": "user", "content": question}, {"role": "assistant", "content": system_prompt}] + else: + tmp_conversation = [{"role": "system", "content": system_prompt}, {"role": "user", "content": question}] + try: + tmp_input = tokenizer.apply_chat_template(tmp_conversation, + chat_template=chat_template, + tokenize=False, + truncation=True) + except ValueError: + # Case when given model don't need any chat-template and can process raw string without any system tokens, + # e.g. facebook/opt-125m + tmp_input = f"{system_prompt}. {question}" + inputs.append(tmp_input) + + calibration_ds['input'] = inputs + + print("Saving calibration dataset...") + calibration_ds.to_pickle(f"{args.output_name}-calibration-dataset.pkl") + print("Done.") + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser(description="Create a calibration dataset for a model.") + parser.add_argument("-d", "--dataset", type=str, required=True) + parser.add_argument("-m", "--model", type=str, required=True) + parser.add_argument("-o", "--output_name", type=str, required=True) + parser.add_argument("--max-model-length", type=int, default=1024) + parser.add_argument("--max-dataset-samples", type=int, default=0) + parser.add_argument("--chat-template", + type=str, + default="", + help="If not provided, the default chat-template from the model will be used.") + + args = parser.parse_args() + + main(args) diff --git a/calibration/step-2-measure-scales.py b/calibration/step-2-measure-scales.py new file mode 100755 index 000000000..22790106e --- /dev/null +++ b/calibration/step-2-measure-scales.py @@ -0,0 +1,197 @@ +############################################################################### +# Copyright (C) 2024 Habana Labs, Ltd. 
an Intel Company +############################################################################### +import vllm +import torch +import pandas as pd +import time +import argparse +import os + +os.environ["PT_HPU_WEIGHT_SHARING"] = "0" +os.environ["VLLM_SKIP_WARMUP"] = "true" + + +def get_ds(args): + print(f"Loading dataset: {args.dataset}") + ds = pd.read_pickle(args.dataset) + + if args.max_dataset_samples: + ds = ds.head(args.max_dataset_samples) + + return ds + + +def get_dataset(args): + + def reset_seed(seed=42): + import torch + import random + import numpy as np + + torch.manual_seed(seed) + np.random.seed(seed) + random.seed(seed) + + def get_prompt_token_ids(model_path, prompts, max_length=1024): + from transformers import AutoTokenizer + + tokenizer = AutoTokenizer.from_pretrained(model_path) + prompt_token_ids = [] + for prompt in prompts: + tokens = tokenizer( + prompt, + return_tensors="pt", + truncation=True, + max_length=max_length, + ) + if len(tokens.input_ids[0]) < max_length: + continue + prompt_token_ids.append([x.item() for x in tokens.input_ids[0]]) + return prompt_token_ids + + def get_prompts( + model_name, + dataset_name="NeelNanda/pile-10k", + num_samples=512, + least_tokens=1024, + ): + print(f"Loading {num_samples} samples with at least {least_tokens} tokens " + f"from {dataset_name} for model {model_name}...") + from datasets import load_dataset + from tqdm import tqdm + import transformers + + seed = 42 + + reset_seed(seed) + + dataset = load_dataset(dataset_name, split="train") + dataset = dataset.shuffle(seed=seed) + + tokenizer = transformers.AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) + num_sample = 0 + samples_lst = [] + for data in tqdm(dataset): + prompt = data["text"] + tokens = tokenizer(prompt, return_tensors="pt") + if len(tokens.input_ids[0]) < least_tokens: + continue + num_sample += 1 + samples_lst.append(prompt) + if num_sample >= num_samples: + break + return samples_lst + + least_tokens = args.sample_len + num_samples = args.max_dataset_samples + try: + prompts = get_prompts( + args.model, + dataset_name=args.dataset, + num_samples=num_samples, + least_tokens=least_tokens, + ) + except (OSError, ValueError, RuntimeError, ImportError, ConnectionError, FileNotFoundError) as e: + import sys + sys.exit(f"Failed to load prompts from dataset {args.dataset}. 
Error: {e}") + prompt_token_ids = get_prompt_token_ids(args.model, prompts, least_tokens) + print(f"Got {len(prompts)} prompts, length of first prompt: {len(prompt_token_ids[0])}.") + gt = None + return prompts, prompt_token_ids, gt + + +def generate_responses(llm, input_batch, args, sampling_params=None, prompt_token_ids=None): + if prompt_token_ids: + input_batch = [{"prompt_token_ids": p} for p in prompt_token_ids] + responses = llm.generate(input_batch, sampling_params, use_tqdm=True) + + total_input_tokens = 0 + total_generated_tokens = 0 + + for response in responses: + if args.verbose: + print(f"Prompt: {response.prompt};\nAnswer: {response.outputs[0].text}\n") + total_input_tokens += len(response.prompt_token_ids) + total_generated_tokens += len(response.outputs[0].token_ids) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("-d", "--dataset", type=str, required=True) + parser.add_argument("-m", "--model", type=str, required=True) + parser.add_argument("--batch-size", type=int, default=32) + parser.add_argument("--tensor-parallel-size", type=int, default=1) + parser.add_argument("--max-dataset-samples", type=int, default=0) + parser.add_argument("--max-num-prefill-seqs", type=int, default=None) + parser.add_argument("--block-quant", action="store_true", default=False) + parser.add_argument("--expert-parallel", action="store_true", default=False) + parser.add_argument( + "--auto-process-dataset", + action="store_true", + default=False, + help="Automatically generate a calibration dataset based on the provided dataset name.", + ) + parser.add_argument("--enforce-eager", action="store_true", default=False) + parser.add_argument("--max-model-len", type=int, default=2048) + parser.add_argument("--max-tokens", type=int, default=1024, help="Maximum number of tokens to generate.") + parser.add_argument("--sample-len", type=int, default=1024, help="Minimum number of tokens in each sample.") + parser.add_argument("-v", "--verbose", action="store_true") + parser.add_argument( + "--distributed-executor-backend", + choices=["mp", "ray"], + default="mp", + help= + "For single node calibration use the default multiprocessing backend. 
" \ + "For multi-node calibration use ray backend" + ) + + args = parser.parse_args() + if not args.auto_process_dataset: + calibration_ds = get_ds(args) + llm = vllm.LLM( + model=args.model, + dtype=torch.bfloat16, + enforce_eager=args.enforce_eager, + quantization="fp8" if args.block_quant else "inc", + max_num_seqs=args.batch_size, + tensor_parallel_size=args.tensor_parallel_size, + max_model_len=args.max_model_len, + trust_remote_code=True, + distributed_executor_backend=args.distributed_executor_backend, + enable_expert_parallel=args.expert_parallel, + ) + + sampling_params = vllm.SamplingParams(temperature=0.0, top_p=1, max_tokens=args.max_tokens) + + if not args.auto_process_dataset: + input_batch = [] + dataset_len = len(calibration_ds) + batch_num = dataset_len // args.batch_size if dataset_len % args.batch_size == 0 else (dataset_len // + args.batch_size) + 1 + batch_done = 0 + for i, (_, row) in enumerate(calibration_ds.iterrows()): + input_batch.append(row["input"]) + if i and i % args.batch_size == 0: + t_start = time.perf_counter() + generate_responses(llm, input_batch, args) + t_end = time.perf_counter() + batch_done += 1 + print(f"Batch finished: {i}/{calibration_ds.shape[0]} samples done; ETA: " + f"{int((t_end - t_start) * (batch_num - batch_done) // 60)} min") + input_batch = [] + generate_responses(llm, input_batch, args) + print(f"Last batch finished: {i + 1}/{calibration_ds.shape[0]} samples done") + else: + prompts, prompt_token_ids, gt = get_dataset(args) + generate_responses( + llm=llm, + input_batch=None, + args=args, + sampling_params=sampling_params, + prompt_token_ids=prompt_token_ids, + ) + + # Skip shutdown when VLLM_USE_V1 is set to "1" + if not os.environ.get("VLLM_USE_V1") or os.environ.get("VLLM_USE_V1") != "1": + llm.llm_engine.model_executor.shutdown() diff --git a/calibration/step-3-postprocess-measure.py b/calibration/step-3-postprocess-measure.py new file mode 100755 index 000000000..2d51cb698 --- /dev/null +++ b/calibration/step-3-postprocess-measure.py @@ -0,0 +1,122 @@ +############################################################################### +# Copyright (C) 2024 Habana Labs, Ltd. 
an Intel Company +############################################################################### +import argparse +import json +import os +import sys + +import numpy as np + + +def fix_cache_inputs(json_data, args): + layer_indexes = set([int(key.split('.')[2]) for key in json_data['Nodes'] if key.startswith('model.layers.')]) + for layer_index in range(len(layer_indexes)): + matmul_av_input = None + v_cache_input = None + matmul_qk_input = None + k_cache_input = None + + attn_name = "attn" + k_cache_name = "k_cache" + v_cache_name = "v_cache" + if args.deepseek: + attn_name = "mla_attn.mla_attn" + k_cache_name = "latent_cache_k" + + matmul_av_key = f'model.layers.{layer_index}.self_attn.{attn_name}.impl.matmul_av' + v_cache_key = f'model.layers.{layer_index}.self_attn.{attn_name}.impl.{v_cache_name}' + matmul_qk_key = f'model.layers.{layer_index}.self_attn.{attn_name}.impl.matmul_qk' + k_cache_key = f'model.layers.{layer_index}.self_attn.{attn_name}.impl.{k_cache_name}' + + matmul_av_input = json_data['Nodes'].get(matmul_av_key, {}).get('inputs', [None, None])[1] + v_cache_input = json_data['Nodes'].get(v_cache_key, {}).get('inputs', [None])[0] + matmul_qk_input = json_data['Nodes'].get(matmul_qk_key, {}).get('inputs', [None, None])[1] + k_cache_input = json_data['Nodes'].get(k_cache_key, {}).get('inputs', [None])[0] + + if matmul_av_input != v_cache_input: + if args.deepseek: + # For deepseek, there is one tensor for k_cache and v_cache + json_data['Nodes'][matmul_av_key]['inputs'][1] = k_cache_input + else: + json_data['Nodes'][matmul_av_key]['inputs'][1] = v_cache_input + if matmul_qk_input != k_cache_input: + json_data['Nodes'][matmul_qk_key]['inputs'][1] = k_cache_input + + return json_data + + +def parse_args(args): + parser = argparse.ArgumentParser(description="Run the measurements parser", + formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser.add_argument("-m", + "--measurements", + type=str, + help="full path to the directory of the measurements that should be fixed") + parser.add_argument( + "-o", + "--out", + type=str, + default=os.getcwd(), + help="path to the directory where the fixed measurements will be written", + ) + parser.add_argument( + "-d", + "--deepseek", + action="store_true", + help="if handle deepseek models, please set this flag", + ) + return parser.parse_args(args) + + +def main(args): + args = parse_args(args) + output_path = args.out + if not os.path.exists(output_path): + os.makedirs(output_path) + measurements_path = args.measurements + measurements_paths = os.listdir(measurements_path) + measurements_paths_ranges = [ + measurement_path for measurement_path in measurements_paths if measurement_path.endswith(".json") + and 'MAXABS_HW' not in measurement_path and "mod_list" not in measurement_path + ] + measurements_paths_scales = [ + measurement_path for measurement_path in measurements_paths + if measurement_path.endswith(".json") and 'MAXABS_HW' in measurement_path and "mod_list" not in measurement_path + ] + print(measurements_paths_ranges) + print(measurements_paths_scales) + for measurement in measurements_paths_ranges + measurements_paths_scales: + fixed_json_path = os.path.join(output_path, f"{measurement.split(os.sep)[-1]}") + with open(fixed_json_path, "w") as fixed_json_file, \ + open(os.path.join(measurements_path, measurement)) as json_file: + data_to_fix = json.load(json_file) + fixed_data = fix_cache_inputs(data_to_fix, args) + json.dump(fixed_data, fixed_json_file) + print("") + print("measurement=", measurement, flush=True) + 
print("measurements_paths_scales=", measurements_paths_scales, flush=True) + if measurement in measurements_paths_ranges + measurements_paths_scales: + global_rank = fixed_data["GlobalRank"] + local_rank = fixed_data["LocalRank"] + mode = fixed_data["Mode"] + nodes = fixed_data["Nodes"] + layers = {} + fixed_npz_path = fixed_json_path.replace(".json", ".npz") + for layer, dlayer in nodes.items(): + layers[layer] = {} + layers[layer]["inputs"] = [np.array(x) for x in dlayer["inputs"]] + if dlayer.get("outputs") is not None: + layers[layer]["outputs"] = [np.array(x) for x in dlayer["outputs"]] + if dlayer.get("params") is not None and dlayer["params"].get("weight") is not None: + layers[layer]["params"] = {} + layers[layer]["params"]["weight"] = np.array(dlayer["params"]["weight"]) + df = {"GlobalRank": global_rank, "LocalRank": local_rank, "Mode": mode, "Nodes": layers} + with open(fixed_npz_path, "w"): + np.savez(fixed_npz_path, df) + + print("finished fix_measurements script") + + +if __name__ == "__main__": + main(sys.argv[1:]) diff --git a/calibration/step-4-quantize-scales.py b/calibration/step-4-quantize-scales.py new file mode 100755 index 000000000..a13061498 --- /dev/null +++ b/calibration/step-4-quantize-scales.py @@ -0,0 +1,47 @@ +############################################################################### +# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company +############################################################################### +import vllm +import torch +import argparse +import os + +os.environ["PT_HPU_WEIGHT_SHARING"] = "0" +os.environ["VLLM_SKIP_WARMUP"] = "true" + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + parser.add_argument("--model", type=str, required=True) + parser.add_argument("--tensor-parallel-size", type=int, default=1) + parser.add_argument("--block-quant", action="store_true", default=False) + parser.add_argument("--enforce-eager", action="store_true", default=False) + parser.add_argument("--expert-parallel", action="store_true", default=False) + parser.add_argument("--max-num-prefill-seqs", type=int, default=None) + parser.add_argument( + "--distributed-executor-backend", + choices=["mp", "ray"], + default="mp", + help= + "For single node calibration use the default multiprocessing backend. " \ + "For multi-node calibration use ray backend" + ) + + args = parser.parse_args() + + llm = vllm.LLM( + model=args.model, + tensor_parallel_size=args.tensor_parallel_size, + enforce_eager=args.enforce_eager, + dtype=torch.bfloat16, + quantization="fp8" if args.block_quant else "inc", + kv_cache_dtype="fp8_inc", + max_model_len=128, + trust_remote_code=True, + distributed_executor_backend=args.distributed_executor_backend, + enable_expert_parallel=args.expert_parallel, + ) + + # Skip shutdown when VLLM_USE_V1 is set to "1" + if not os.environ.get("VLLM_USE_V1") or os.environ.get("VLLM_USE_V1") != "1": + llm.llm_engine.model_executor.shutdown() diff --git a/calibration/step-5-unify_measurements.py b/calibration/step-5-unify_measurements.py new file mode 100755 index 000000000..5d283ea00 --- /dev/null +++ b/calibration/step-5-unify_measurements.py @@ -0,0 +1,317 @@ +############################################################################### +# Copyright (C) 2024 Habana Labs, Ltd. 
an Intel Company +############################################################################### +import argparse +import glob +import json +import os +import re +import sys + +import numpy as np + + +def find_measurement_path(measurement, measurements_dir_path, scales, group_size): + measurment_card = "_" + measurement + "_" + str(group_size) + for measurment_file in os.listdir(measurements_dir_path): + filename = os.fsdecode(measurment_file) + if (not filename.endswith(".json") or "_mod_list" in filename or measurment_card not in filename): + continue + if scales: + if "MAXABS" in filename: + return os.path.join(measurements_dir_path, measurment_file) + else: + if "MAXABS" not in filename: + return os.path.join(measurements_dir_path, measurment_file) + + +def is_fused_moe_op(node_name): + return ("moe" in node_name.lower() and ".w13_list" not in node_name and ".w2_list" not in node_name) + + +def is_moe_experts(node_name): + return ("moe" in node_name.lower() and (".w13_list" in node_name or ".w2_list" in node_name)) + + +def get_expert_id(node_name): + parts = node_name.split(".") + assert parts[-1].isdigit() + expert_id = int(parts[-1]) + return expert_id + + +def get_expert_prefix(node_name): + parts = node_name.split(".") + assert parts[-1].isdigit() + prefix = ".".join(parts[:-1]) + return prefix + + +def get_local_expert_num(data): + expert_id = -1 + for mod_name in data: + if is_moe_experts(mod_name): + idx = get_expert_id(mod_name) + expert_id = max(expert_id, idx) + return expert_id + 1 + + +def unify_measurements(measurement_group, + measurements_dir_path, + output_path, + groups_size, + groups_num, + group_index, + scales=False, + use_ep=False): + measurements_paths = [] + group_name = "" + + # save all the jsons paths in the given measurement group + for measurement in measurement_group: + measurement_path = find_measurement_path(measurement, measurements_dir_path, scales, groups_size) + if measurement_path is not None: + measurements_paths.append(measurement_path) + group_name += measurement + + if len(measurements_paths) == 0: + print("Error: invalid measurement paths. 
No *.json files or no " + "*mod_list.json files.") + return + + # save all the jsons content in the given measurement group + measurements_jsons = [] + for measurement_path in measurements_paths: + with open(measurement_path) as f: + js = json.load(f) + measurements_jsons.append(js["Nodes"]) + # create a name for the unified json that will be created for this measurement group + base_path = find_measurement_path(measurement_group[0], measurements_dir_path, scales, groups_size) + old_suffix = "_" + measurement_group[0] + "_" + str(groups_size) + new_suffix = "_" + str(group_index) + "_" + str(groups_num) + unified_json_name = base_path.split("/")[-1].replace(old_suffix, new_suffix) + unified_json_path = os.path.join(output_path, unified_json_name) + + # open a unified json file + with open(measurements_paths[0]) as origin, open(unified_json_path, "w") as copy: + copy.write(origin.read()) + with open(unified_json_path) as json_file: + unified_json = json.load(json_file) + unified_json["LocalRank"] = group_index if groups_num != 1 else -1 + + moe_experts_data = {} + # expert_num is original local_expert_num, it is used only when use_ep is True + expert_num = get_local_expert_num(unified_json["Nodes"]) if use_ep else -1 + + # iterate all unified json nodes + for node_name, node_values in unified_json["Nodes"].items(): + max_inputs = node_values["inputs"] + max_outputs = None + if node_values.get("outputs") is not None: + max_outputs = node_values["outputs"] + max_weight = None + if node_values.get("params") is not None and node_values["params"].get("weight") is not None: + max_weight = node_values["params"]["weight"] + + # iterate over all the measurment group and take the maximum for each tensor and its channel + if scales: + for idx, measurement_json in enumerate(measurements_jsons): + # for experts of moe, append results in all measurements + if use_ep and is_moe_experts(node_name): + if node_name not in moe_experts_data: + moe_experts_data[node_name] = node_values + else: + prefix, local_expert_id = get_expert_prefix(node_name), get_expert_id(node_name) + # take original total_rank=8, total_expert_num=128, + # local_expert_num=16 and expert string.MoeOp.w13_list.11 on rank 3 + # if target total_rank=4, then new local_expert_num=32, + # new expert is string.MoeOp.w13_list.27(16*1+11) on rank 1 + new_node_name = ".".join((prefix, str(expert_num * idx + local_expert_id))) + assert new_node_name not in moe_experts_data + moe_experts_data[new_node_name] = measurement_json[node_name] + continue + + # for moe op, keep max of the first, retain rest from other measurements + if use_ep and is_fused_moe_op(node_name) and idx > 0: + # input 0 of moe is hidden_states, we should get the max value + # across ranks during unification + # input 1 ~ local_expert_num is the intermidiate_amax of each + # expert, we should extend them during unification + max_inputs[0] = max(measurement_json[node_name]["inputs"][0], max_inputs[0]) + max_inputs.extend(measurement_json[node_name]["inputs"][1:]) + else: + for i in range(0, len(max_inputs)): + max_inputs[i] = max(measurement_json[node_name]["inputs"][i], max_inputs[i]) + if max_outputs is not None: + max_outputs = max(measurement_json[node_name]["outputs"], max_outputs) + if max_weight is not None: + max_weight = max(measurement_json[node_name]["params"]["weight"], max_weight) + else: + for idx, measurement_json in enumerate(measurements_jsons): + # for experts of moe, append results in all measurements + if use_ep and is_moe_experts(node_name): + if node_name not 
in moe_experts_data: + moe_experts_data[node_name] = node_values + else: + prefix, local_expert_id = get_expert_prefix(node_name), get_expert_id(node_name) + new_node_name = ".".join((prefix, str(expert_num * idx + local_expert_id))) + assert new_node_name not in moe_experts_data + moe_experts_data[new_node_name] = measurement_json[node_name] + continue + + for i in range(0, len(max_inputs)): + for j in range(0, len(max_inputs[i])): + max_inputs[i][j][0] = max(measurement_json[node_name]["inputs"][i][j][0], max_inputs[i][j][0]) + if max_outputs is not None: + if use_ep and is_fused_moe_op(node_name) and idx > 0: + max_outputs[0][0] = max(measurement_json[node_name]["outputs"][0][0], max_outputs[0][0]) + max_outputs.extend(measurement_json[node_name]["outputs"][1:]) + else: + for i in range(0, len(max_outputs)): + max_outputs[i][0] = max(measurement_json[node_name]["outputs"][i][0], max_outputs[i][0]) + if max_weight is not None: + for i in range(0, len(max_weight)): + max_weight[i][0] = max(measurement_json[node_name]["params"]["weight"][i][0], max_weight[i][0]) + + # update the maximum in the unified json + if scales: + for i in range(0, len(max_inputs)): + unified_json["Nodes"][node_name]["inputs"][i] = max_inputs[i] + if max_outputs is not None: + unified_json["Nodes"][node_name]["outputs"] = max_outputs + if max_weight is not None: + unified_json["Nodes"][node_name]["params"]["weight"] = max_weight + else: + for i in range(0, len(max_inputs)): + for j in range(0, len(max_inputs[i])): + unified_json["Nodes"][node_name]["inputs"][i][j][0] = max_inputs[i][j][0] + if max_outputs is not None: + for i in range(0, len(max_outputs)): + unified_json["Nodes"][node_name]["outputs"][i][0] = max_outputs[i][0] + if max_weight is not None: + for i in range(0, len(max_weight)): + unified_json["Nodes"][node_name]["params"]["weight"][i][0] = max_weight[i][0] + if use_ep: + unified_json["Nodes"].update(moe_experts_data) + global_rank = None + local_rank = group_index if groups_num != 1 else -1 + mode = "" + layers = {} + with open(unified_json_path, "w") as json_file: + json.dump(unified_json, json_file, indent=4) + mode = unified_json["Mode"] + nodes = unified_json["Nodes"] + + # create unified npz file from the unified json + unified_npz_path = os.path.join(output_path, unified_json_name.replace(".json", ".npz")) + for layer, dlayer in nodes.items(): + layers[layer] = {} + layers[layer]["inputs"] = [np.array(x) for x in dlayer["inputs"]] + if dlayer.get("outputs") is not None: + layers[layer]["outputs"] = [np.array(x) for x in dlayer["outputs"]] + if dlayer.get("params") is not None and dlayer["params"].get("weight") is not None: + layers[layer]["params"] = {} + layers[layer]["params"]["weight"] = np.array(dlayer["params"]["weight"]) + df = {"GlobalRank": global_rank, "LocalRank": local_rank, "Mode": mode, "Nodes": layers} + with open(unified_npz_path, "w"): + np.savez(unified_npz_path, df) + + +def parse_args(args): + parser = argparse.ArgumentParser(description="Run the measurements parser", + formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser.add_argument("-m", + "--measurements", + type=str, + help="path to the directory of the measurements that will be " + "unified") + parser.add_argument("-r", "--rank", type=int, help="rank of unified measurements") + parser.add_argument( + "-o", + "--out", + type=str, + default=os.getcwd(), + help="path to the directory where the unified measurements will be " + "written", + ) + parser.add_argument( + "-u", + "--use_expert_paral", + 
action="store_true", + help="unify original measurement results based on expert parallelism " + "rules", + ) + parser.add_argument( + "-s", + "--skip_unify_scales", + action="store_true", + help="skip the scale unification step.", + ) + return parser.parse_args(args) + + +def prepare_group_list(measurements_path, rank): + measure_files = glob.glob(os.path.join(measurements_path, "*_mod_list.json")) + if len(measure_files) > 0: + # take original rank=8 as an example, target file name: string_0_8_mod_list.json + matched = re.match(r"^(\w+)_(\d+)_(\d+)_(\w+)_(\w+)\.json$", os.path.basename(measure_files[0])) + if matched: + total_rank = int(matched.group(3)) + assert (rank < total_rank) and (total_rank % rank) == 0, ( + f"Original total_rank {total_rank} should be larger than your " + f"target rank {rank} and be divisible by it") + group_size = total_rank // rank + group_list = [[str(i * group_size + j) for j in range(group_size)] for i in range(rank)] + print("Card grouping list >> {}".format(group_list)) + return group_list + else: + raise ValueError("Unrecognized file name!") + else: + raise ValueError("*_mod_list.json doesn't exist in {}".format(measurements_path)) + + +def main(args): + args = parse_args(args) + output_path = args.out + if not os.path.exists(output_path): + os.mkdir(output_path) + measurements_path = args.measurements + groups = prepare_group_list(measurements_path, args.rank) + + num_jsons_drange = 0 + num_jsons_scales = 0 + for path in os.listdir(measurements_path): + if path.endswith(".json"): + if "MAXABS" in path: + num_jsons_scales += 1 + elif "mod_list" not in path: + num_jsons_drange += 1 + assert (os.path.isdir(measurements_path) and (num_jsons_drange % len(groups)) == 0 + and (num_jsons_scales % len(groups)) == 0) + + for group_index, group in enumerate(groups): + unify_measurements(group, + measurements_path, + output_path, + num_jsons_drange, + len(groups), + group_index, + scales=False, + use_ep=args.use_expert_paral) + if not args.skip_unify_scales: + unify_measurements( + group, + measurements_path, + output_path, + num_jsons_scales, + len(groups), + group_index, + scales=True, + use_ep=args.use_expert_paral, + ) + + print("finished measurement unifier script") + + +if __name__ == "__main__": + main(sys.argv[1:]) diff --git a/calibration/step-6-expand-measurements.py b/calibration/step-6-expand-measurements.py new file mode 100644 index 000000000..c6dabb5da --- /dev/null +++ b/calibration/step-6-expand-measurements.py @@ -0,0 +1,213 @@ +############################################################################### +# Copyright (C) 2025 Habana Labs, Ltd. 
an Intel Company +############################################################################### +import argparse +import json +import os +import sys + +import numpy as np +import logging + +# from loguru import logger +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +def find_measurement_path(measurement, measurements_dir_path, group_size): + measurement_card = "_" + measurement + "_" + str(group_size) + for measurement_file in os.listdir(measurements_dir_path): + filename = os.fsdecode(measurement_file) + if (not filename.endswith(".json") or "_mod_list" in filename or measurement_card not in filename): + continue + if "MAXABS" not in filename: + return os.path.join(measurements_dir_path, measurement_file) + + +def is_fused_moe_op(node_name): + return ("moe" in node_name.lower() and ".w13_list" not in node_name and ".w2_list" not in node_name) + + +def is_moe_experts(node_name): + # model.layers.3.mlp.experts.moe_op.w13_list.0 + return ("moe" in node_name.lower() and (".w13_list" in node_name or ".w2_list" in node_name)) + + +def get_expert_id(node_name): + parts = node_name.split(".") + assert parts[-1].isdigit() + expert_id = int(parts[-1]) + return expert_id + + +def get_expert_prefix(node_name): + parts = node_name.split(".") + assert parts[-1].isdigit() + prefix = ".".join(parts[:-1]) + return prefix + + +def get_local_expert_num(data): + expert_id = -1 + for mod_name in data: + if is_moe_experts(mod_name): + idx = get_expert_id(mod_name) + expert_id = max(expert_id, idx) + return expert_id + 1 + + +def expand_measurements( + measurements_dir_path, + output_path, + local_rank, + world_size, +): + measurement_group = ["0"] + # save all the jsons paths in the given measurement group + groups_size = 1 + unified_measurement_index = "0" + measurement_path = find_measurement_path(unified_measurement_index, measurements_dir_path, groups_size) + measurements_jsons = [] + + with open(measurement_path) as f: + js = json.load(f) + measurements_jsons.append(js["Nodes"]) + # New json file name + new_json_name = (find_measurement_path(measurement_group[0], measurements_dir_path, + groups_size).split("/")[-1].replace( + "_" + measurement_group[0] + "_" + str(groups_size), + "_" + str(local_rank) + "_" + str(world_size), + )) + logger.info( + "Generating new json file: %s with local_rank %d and world_size %d", + new_json_name, + local_rank, + world_size, + ) + new_json_path = os.path.join(output_path, new_json_name) + + # Create a replica of the measurement json file + with open(measurement_path) as origin, open(new_json_path, "w") as copy: + copy.write(origin.read()) + with open(new_json_path) as json_file: + new_json = json.load(json_file) + new_json["LocalRank"] = local_rank + + expert_num = get_local_expert_num(new_json["Nodes"]) + total_experts = expert_num + + # Iterate all nodes + for node_name, node_values in new_json["Nodes"].items(): + max_outputs = None + if node_values.get("outputs") is not None: + max_outputs = node_values["outputs"] + + # iterate over all the measurements and update the fused moe op with selected experts data + + for idx, measurement_json in enumerate(measurements_jsons): + if is_fused_moe_op(node_name): + node_res = measurement_json[node_name]["outputs"] + node_res_output = node_res[0] + node_res_experts_intermediate_amax = node_res[1:] + num_intermediate_amax = len(node_res_experts_intermediate_amax) + assert num_intermediate_amax == total_experts, ( + f"the number of intermediate amax should be {total_experts}, but got 
{num_intermediate_amax}") + ep_size = world_size + ep_rank = local_rank + num_local_experts = total_experts // ep_size + expert_start_index = ep_rank * num_local_experts + expert_end_index = expert_start_index + num_local_experts + node_intermediate_amax = node_res_experts_intermediate_amax[expert_start_index:expert_end_index] + assert len(node_intermediate_amax) == num_local_experts, ( + f"len(node_intermediate_amax) should be {num_local_experts}, but got {len(node_intermediate_amax)}") + max_outputs = [node_res_output, *node_intermediate_amax] + logger.debug( + "Selecting %d outputs for %s " + "ep_rank %d with expert_start_index %d and expert_end_index %d", + len(max_outputs), + node_name, + ep_rank, + expert_start_index, + expert_end_index, + ) + + if max_outputs is not None and is_fused_moe_op(node_name): + new_json["Nodes"][node_name]["outputs"] = max_outputs + + global_rank = None + local_rank = local_rank + mode = "" + layers = {} + with open(new_json_path, "w") as json_file: + json.dump(new_json, json_file, indent=4) + mode = new_json["Mode"] + nodes = new_json["Nodes"] + + # create unified npz file from the new json + unified_npz_path = os.path.join(output_path, new_json_name.replace(".json", ".npz")) + for layer, dlayer in nodes.items(): + layers[layer] = {} + layers[layer]["inputs"] = [np.array(x) for x in dlayer["inputs"]] + if dlayer.get("outputs") is not None: + layers[layer]["outputs"] = [np.array(x) for x in dlayer["outputs"]] + if (dlayer.get("params") is not None and dlayer["params"].get("weight") is not None): + layers[layer]["params"] = {} + layers[layer]["params"]["weight"] = np.array(dlayer["params"]["weight"]) + df = { + "GlobalRank": global_rank, + "LocalRank": local_rank, + "Mode": mode, + "Nodes": layers, + } + with open(unified_npz_path, "w"): + np.savez(unified_npz_path, df) + + +def parse_args(args): + parser = argparse.ArgumentParser( + description="Run the measurements parser", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + ) + parser.add_argument( + "-m", + "--measurements", + type=str, + help="path to the directory of the measurements that have been unified", + ) + parser.add_argument( + "-o", + "--out", + type=str, + default=os.getcwd(), + help="path to the directory where the expand measurements will be written", + ) + parser.add_argument( + "-w", + "--target_world_size", + type=int, + help="The target number of ranks to expand the measurements to.", + ) + return parser.parse_args(args) + + +def main(args): + args = parse_args(args) + output_path = args.out + if not os.path.exists(output_path): + os.mkdir(output_path) + measurements_path = args.measurements + + target_world_size = args.target_world_size + for ep_rank in range(target_world_size): + expand_measurements( + measurements_dir_path=measurements_path, + output_path=output_path, + local_rank=ep_rank, + world_size=target_world_size, + ) + + logger.info("finished expanding measurements for %d ranks", target_world_size) + + +if __name__ == "__main__": + main(sys.argv[1:]) diff --git a/calibration/template/llama-2-chat.jinja b/calibration/template/llama-2-chat.jinja new file mode 100644 index 000000000..78c348f0f --- /dev/null +++ b/calibration/template/llama-2-chat.jinja @@ -0,0 +1,24 @@ +{% if messages[0]['role'] == 'system' %} + {% set system_message = '<>\n' + messages[0]['content'] | trim + '\n<>\n\n' %} + {% set messages = messages[1:] %} +{% else %} + {% set system_message = '' %} +{% endif %} + +{% for message in messages %} + {% if (message['role'] == 'user') != (loop.index0 % 2 
== 0) %}
+ {{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}
+ {% endif %}
+
+ {% if loop.index0 == 0 %}
+ {% set content = system_message + message['content'] %}
+ {% else %}
+ {% set content = message['content'] %}
+ {% endif %}
+
+ {% if message['role'] == 'user' %}
+ {{ bos_token + '[INST] ' + content | trim + ' [/INST]' }}
+ {% elif message['role'] == 'assistant' %}
+ {{ ' ' + content | trim + ' ' + eos_token }}
+ {% endif %}
+{% endfor %}
diff --git a/calibration/template/mistral_mixtral.jinja b/calibration/template/mistral_mixtral.jinja
new file mode 100644
index 000000000..3fd756364
--- /dev/null
+++ b/calibration/template/mistral_mixtral.jinja
@@ -0,0 +1,19 @@
+{% if messages[0]['role'] == 'system' %}
+ {% set system_message = messages[0]['content'] | trim + '\n\n' %}
+ {% set messages = messages[1:] %}
+{% else %}
+ {% set system_message = '' %}
+{% endif %}
+
+{{ bos_token + system_message}}
+{% for message in messages %}
+ {% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}
+ {{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}
+ {% endif %}
+
+ {% if message['role'] == 'user' %}
+ {{ '[INST] ' + message['content'] | trim + ' [/INST]' }}
+ {% elif message['role'] == 'assistant' %}
+ {{ ' ' + message['content'] | trim + eos_token }}
+ {% endif %}
+{% endfor %}
diff --git a/calibration/unify-and-expand.png b/calibration/unify-and-expand.png
new file mode 100644
index 000000000..3c113fbf8
Binary files /dev/null and b/calibration/unify-and-expand.png differ
diff --git a/calibration/vlm-calibration/README.md b/calibration/vlm-calibration/README.md
new file mode 100644
index 000000000..c83cd2189
--- /dev/null
+++ b/calibration/vlm-calibration/README.md
@@ -0,0 +1,19 @@
+# FP8 Calibration Procedure for VLM models
+
+The calibration procedure for VLM models differs slightly from the LLM flow and has been adapted accordingly. To simplify the process, we've provided the `calibrate_model.sh` script. It requires the following arguments:
+
+- `-m`, i.e., **model stub or path:** Path to your model (if stored locally) or the model ID from the Hugging Face Hub.
+- `-d`, i.e., **dir to the source dataset:** Path to a Hugging Face cache directory that stores the calibration dataset, which is currently hard-coded to MMMU. The script validates this path, so make sure it meets one of the following conditions (see the sketch after this list): (1) the directory contains both the raw and the processed MMMU dataset, i.e. `${your_dataset_dir}/hub` holds the raw dataset and `${your_dataset_dir}/datasets` holds the processed one; (2) the directory `${your_dataset_dir}` contains the processed dataset; (3) if no dataset path is provided, the dataset is downloaded directly from Hugging Face.
+- `-o`, i.e., **output path:** Path to the directory where the generated measurements, etc., will be stored.
+- `-t`, i.e., **tensor parallel size:** Tensor parallel size to run at. 
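+
+As a quick sanity check, the snippet below is a small sketch of how a local `-d` directory can be verified against the layouts listed above; it simply mirrors the directory checks performed in `calibrate_model.sh`, and the `DATASET_PATH` value is a placeholder for your own cache directory:
+
+```bash
+DATASET_PATH=/path/to/your_dataset_dir   # placeholder, replace with your local cache dir
+
+# Layout (1): raw + processed MMMU dataset under a Hugging Face cache-style directory
+ls -d "$DATASET_PATH/hub/datasets--MMMU--MMMU" "$DATASET_PATH/datasets/MMMU___mmmu"
+
+# Layout (2): processed MMMU dataset only
+ls -d "$DATASET_PATH/MMMU___mmmu"
+```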
+ +Here are some examples of how to use the script: + +```bash +cd vlm-calibration +./calibrate_model.sh \ + -m $MODEL_PATH \ + -o $INC_OUTPUT_PATH \ + -t $TP_SIZE \ + -d $DATASET_PATH +``` diff --git a/calibration/vlm-calibration/calibrate_model.sh b/calibration/vlm-calibration/calibrate_model.sh new file mode 100755 index 000000000..9a35d3fd5 --- /dev/null +++ b/calibration/vlm-calibration/calibrate_model.sh @@ -0,0 +1,219 @@ +#!/bin/bash +############################################################################### +# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company +############################################################################### + +set -e +cd "$(dirname "$0")" + +ALLOWED_DEVICES=("g2" "g3") + +usage() { + echo + echo "Calibrate given MODEL_PATH for FP8 inference" + echo + echo "usage: ${0} " + echo + echo " -m - [required] huggingface stub or local directory of the MODEL_PATH" + echo " -d - [optional] path to source dataset (details in README). If not provided, the dataset will be downloaded from HuggingFace." + echo " -o - [required] path to output directory for fp8 measurements" + echo " -b - batch size to run the measurements at (default: 32)" + echo " -l - limit number of samples in calibration dataset" + echo " -t - tensor parallel size to run at (default: 1); NOTE: if t > 8 then we need a multi-node setup" + echo " -g - groups of cards we want to unify. Card indices seperated by commas and groups seperated by double dash '--', e.g. 0,1--2,3--4,5--6,7 card 0 measurement will be unified with card 1 measurement and so on." + echo " -e - Turn on or off eager mode, default: off" + echo +} + +cleanup_tmp() { + if [[ $(pwd) == *vlm-calibration ]]; then + echo "Clearing temporary directory" + mkdir -p inc_tmp nc_workspace + rm -rf nc_workspace + rm -rf inc_tmp + else + echo "Skipping temporary directory removal" + fi +} + +create_measure_config() { + mkdir -p $1/$2/$3 + + model_name_lower=$(echo "$2" | tr '[:upper:]' '[:lower:]') + + tmp_config="{\"method\": \"HOOKS\",\"mode\": \"MEASURE\",\"observer\": \"maxabs\",\"allowlist\": {\"types\": [], \"names\": []},\"blocklist\": {\"types\": [], \"names\": [\"lm_head\"]},\"quantize_weight\": false,\"dump_stats_path\": \"$1/$2/$3/inc_output\"}" + + echo "$tmp_config" > $1/$2/maxabs_measure_$3.json +} + +create_quant_config() { + mkdir -p $1/$2/$3 + + model_name_lower=$(echo "$2" | tr '[:upper:]' '[:lower:]') + + tmp_config="{\"mode\": \"QUANTIZE\",\"observer\": \"maxabs\",\"scale_method\": \"maxabs_hw\",\"allowlist\": {\"types\": [],\"names\": []},\"blocklist\": {\"types\": [],\"names\": [\"lm_head\"]},\"dump_stats_path\": \"$1/$2/$3/inc_output\"}" + + echo "$tmp_config" > $1/$2/maxabs_quant_$3.json +} + +extract_last_folder_name() { + local path="$1" + + path="${path%/}" + last_folder="$(basename "$path")" + last_folder="${last_folder,,}" + + echo "$last_folder" +} + +cleanup_tmp + +# jump to the script directory +cd "$(dirname "$0")" +echo "downloading requirements..." +pip install -r requirements.txt + +EXTRA_FLAGS="" +BATCH_SIZE=32 +TP_SIZE=1 +eager_mode="off" +while getopts "m:b:l:t:d:h:o:g:e:" OPT; do + case ${OPT} in + m ) + MODEL_PATH="$OPTARG" + ;; + d ) + DATASET_PATH="$OPTARG" + ;; + b ) + BATCH_SIZE="$OPTARG" + ;; + o ) + FP8_DIR=$(realpath "$OPTARG") + ;; + l ) + LIMIT="$OPTARG" + ;; + t ) + TP_SIZE="$OPTARG" + ;; + g ) + CARD_GROUPS="$OPTARG" + ;; + h ) + usage + ;; + e ) + eager_mode="$OPTARG" + ;; + \? 
) + usage + exit 1 + ;; + esac +done + +if [[ -z "$MODEL_PATH" && -z "$FP8_DIR" ]]; then + echo "Model stub and output path for fp8 measurements must be provided." + usage + exit 1 +fi + +if [[ -z "$DATASET_PATH" ]]; then + echo "Local calibration dataset path not provided. Will download it from HuggingFace." +else + echo "Using local calibration dataset path: $DATASET_PATH" + if [[ -d "$DATASET_PATH/hub/datasets--MMMU--MMMU" && -d "$DATASET_PATH/datasets/MMMU___mmmu" ]]; then + export HF_HOME="/root/.cache/huggingface" + echo "copying local calibration dataset $DATASET_PATH to $HF_HOME" + mkdir -p $HF_HOME "$HF_HOME/hub" "$HF_HOME/datasets" + cp -rf "$DATASET_PATH/hub/datasets--MMMU--MMMU" "$HF_HOME/hub" + cp -rf "$DATASET_PATH/datasets/MMMU___mmmu" "$HF_HOME/datasets" + elif [[ -d "$DATASET_PATH/MMMU___mmmu" ]]; then + export HF_DATASETS_CACHE="/root/.cache/huggingface/datasets" + echo "copying local calibration dataset $DATASET_PATH to $HF_DATASETS_CACHE" + mkdir -p $HF_DATASETS_CACHE + cp -rf "$DATASET_PATH/MMMU___mmmu" $HF_DATASETS_CACHE + else + echo "Your provided dataset path doesn't contain MMMU dataset. Please refer to README for details." + exit 1 + fi +fi + + +if [[ $eager_mode == "on" ]]; then + EXTRA_FLAGS+="--enforce-eager " +fi + +# Store the provided MODEL_PATH name in a variable +MODEL_NAME=$(extract_last_folder_name "$MODEL_PATH") + +echo "" +echo "Step 1/3 - detecting used device type [g2, g3]" +DEVICE_TYPE=$(python3 ../step-0-detect-device.py) || (echo "Detecting device process failed" && exit 1) +DEVICE_TYPE="g$DEVICE_TYPE" +echo "Detected device type: $DEVICE_TYPE" +echo "Step 1 done" + +# Check if the provided device type is valid +if [[ ! " ${ALLOWED_DEVICES[*]} " =~ " $DEVICE_TYPE " ]]; then + echo "Invalid device type: $DEVICE_TYPE. 
Allowed devices: ${ALLOWED_DEVICES[*]}" + exit 1 +fi + + +create_measure_config $FP8_DIR $MODEL_NAME $DEVICE_TYPE +create_quant_config $FP8_DIR $MODEL_NAME $DEVICE_TYPE + +if [[ $TP_SIZE > 1 ]]; then + export PT_HPU_ENABLE_LAZY_COLLECTIVES=true +fi +export VLLM_SKIP_WARMUP=true +max_model_len=8192 + + +echo "" +echo "2/3 Measuring scales" +export QUANT_CONFIG=$FP8_DIR/$MODEL_NAME/maxabs_measure_$DEVICE_TYPE.json +# quantization='None' +# kv_cache_dtype='auto' +quantization='inc' +kv_cache_dtype='auto' + +python3 vision_lm_eval.py \ + --max-model-len $max_model_len \ + --model-path $MODEL_PATH \ + --quantization $quantization \ + --kv-cache-dtype $kv_cache_dtype \ + --tensor-parallel-size $TP_SIZE \ + $EXTRA_FLAGS +echo "Step 2/3 done" + + +echo "" +echo "3/3 Quantize scales" +export QUANT_CONFIG=$FP8_DIR/$MODEL_NAME/maxabs_quant_$DEVICE_TYPE.json +quantization='inc' +kv_cache_dtype='fp8_inc' + +python3 vision_lm_eval.py \ + --max-model-len $max_model_len \ + --model-path $MODEL_PATH \ + --quantization $quantization \ + --kv-cache-dtype $kv_cache_dtype \ + --tensor-parallel-size $TP_SIZE \ + $EXTRA_FLAGS + +echo "Step 3/3 done" + + + +if [[ -n $CARD_GROUPS ]]; then + echo "" + echo "Unify scales" + QUANT_DIR=$FP8_DIR/$MODEL_NAME/$DEVICE_TYPE/ + python3 ../step-5-unify_measurements.py -g "$CARD_GROUPS" -m $QUANT_DIR -o $QUANT_DIR || (echo "Error in step 5" && exit 1) + echo "Unify scales done" +fi +cleanup_tmp +echo "Calibration process done" \ No newline at end of file diff --git a/calibration/vlm-calibration/requirements.txt b/calibration/vlm-calibration/requirements.txt new file mode 100644 index 000000000..b0d742b52 --- /dev/null +++ b/calibration/vlm-calibration/requirements.txt @@ -0,0 +1,2 @@ +lm_eval +datasets diff --git a/calibration/vlm-calibration/vision_lm_eval.py b/calibration/vlm-calibration/vision_lm_eval.py new file mode 100644 index 000000000..8feba9d81 --- /dev/null +++ b/calibration/vlm-calibration/vision_lm_eval.py @@ -0,0 +1,96 @@ +# SPDX-License-Identifier: Apache-2.0 +""" +This example shows how to use vLLM for running offline inference with +multi-image input on vision language models for text generation, +using the chat template defined by the model. 
+""" + +from vllm.utils import FlexibleArgumentParser +from vllm.engine.arg_utils import AsyncEngineArgs + +from lm_eval import tasks, evaluator +from lm_eval.models.vllm_vlms import VLLM_VLM + +IMAGE_LIMIT = 1 + + +def run_generate(): + config_template_bf16 = { + "model_name": "REPLACE_ME", + "lm_eval_kwargs": { + "batch_size": "auto" + }, + "vllm_kwargs": { + "pretrained": "REPLACE_ME", + "max_num_seqs": 128, + "max_model_len": 2048, + "dtype": "bfloat16", + "data_parallel_size": 1, + "tensor_parallel_size": args.tensor_parallel_size, + "disable_log_stats": False, + }, + } + config_template_fp8 = { + **config_template_bf16, "vllm_kwargs": { + **config_template_bf16["vllm_kwargs"], + "quantization": args.quantization, + "kv_cache_dtype": args.kv_cache_dtype, + "weights_load_device": args.weights_load_device, + } + } + config_template_vision_fp8 = { + **config_template_fp8, + "lm_eval_kwargs": { + **config_template_fp8["lm_eval_kwargs"], + "max_images": IMAGE_LIMIT, + }, + "vllm_kwargs": { + **config_template_fp8["vllm_kwargs"], + "max_num_seqs": 32, + "use_padding_aware_scheduling": True, + "max_num_prefill_seqs": 1, # TODO: remove when higher prefill batch size will be supported + "disable_log_stats": True, # TODO: investigate error when running with log stats + }, + } + lm_instance_cfg = { + **config_template_vision_fp8, + "model_name": "Meta-Llama-3.2-11B-Vision-Instruct", + "lm_eval_kwargs": { + **config_template_vision_fp8["lm_eval_kwargs"], + "batch_size": 8, + }, + "vllm_kwargs": { + **config_template_vision_fp8["vllm_kwargs"], + "pretrained": args.model_path, + "enforce_eager": args.enforce_eager, + "max_model_len": args.max_model_len, + }, + } + lm = VLLM_VLM(**lm_instance_cfg["vllm_kwargs"], **lm_instance_cfg["lm_eval_kwargs"]) + + task_name = "mmmu_val" + task_manager = tasks.TaskManager(include_path="./meta-configs") + task_dict = tasks.get_task_dict(task_name, task_manager) + eval_kwargs = { + "limit": 1, + "fewshot_as_multiturn": True, + "apply_chat_template": True, + } + + results = evaluator.evaluate(lm=lm, task_dict=task_dict, **eval_kwargs) + return results + + +def main(args): + run_generate() + + +if __name__ == "__main__": + parser = FlexibleArgumentParser(description='Demo on using vLLM for offline inference with ' + 'vision language models that support multi-image input for text ' + 'generation') + parser.add_argument('--model-path', '-p', type=str, default="", help='Huggingface model path') + parser = AsyncEngineArgs.add_cli_args(parser) + + args = parser.parse_args() + main(args) diff --git a/docs/.nav.yml b/docs/.nav.yml index 42ace218e..510922d6a 100644 --- a/docs/.nav.yml +++ b/docs/.nav.yml @@ -15,7 +15,12 @@ nav: - user_guide/* - Configuration: - Summary: configuration/README.md - - configuration/* + - configuration/env_vars.md + - configuration/long_context.md + - configuration/model_calibration.md + - configuration/optimization.md + - configuration/pipeline_parallelism.md + #- configuration/* - Models: - models/validated_models.md - Features: diff --git a/docs/configuration/env_vars.md b/docs/configuration/env_vars.md index 8451c7a6d..7751c2e5f 100644 --- a/docs/configuration/env_vars.md +++ b/docs/configuration/env_vars.md @@ -16,6 +16,9 @@ - `VLLM_GRAPH_RESERVED_MEM`: percentage of memory dedicated to HPUGraph capture. The default is `0.1`. - `VLLM_EXPONENTIAL_BUCKETING`: if `true`, enables exponential bucket spacing instead of linear. The default is `true`. + +**Experimental Knobs:** + - `VLLM_SKIP_WARMUP`: if `true`, warmup is skipped. 
The default is `false`. !!! note @@ -23,7 +26,10 @@ !!! tip When a deployed workload does not utilize the full context that a model can handle, it is good practice to limit the maximum values upfront based on the input and output token lengths that will be generated after serving the vLLM server. -

**Example:**

Let's assume that we want to deploy text generation model Qwen2.5-1.5B, which has a defined `max_position_embeddings` of 131072 (our `max_model_len`). At the same time, we know that our workload pattern will not use the full context length because we expect a maximum input token size of 1K and predict generating a maximum of 2K tokens as output. In this case, starting the vLLM server to be ready for the full context length is unnecessary. Instead, we should limit it upfront to achieve faster service preparation and decrease warmup time. The recommended values in this example should be: + + **Example:** + + Let's assume that we want to deploy text generation model Qwen2.5-1.5B, which has a defined `max_position_embeddings` of 131072 (our `max_model_len`). At the same time, we know that our workload pattern will not use the full context length because we expect a maximum input token size of 1K and predict generating a maximum of 2K tokens as output. In this case, starting the vLLM server to be ready for the full context length is unnecessary. Instead, we should limit it upfront to achieve faster service preparation and decrease warmup time. The recommended values in this example should be: > - `--max_model_len`: `3072` - the sum of input and output sequences (1+2)*1024. > - `VLLM_PROMPT_SEQ_BUCKET_MAX`: `1024` - the maximum input token size that we expect to handle. @@ -48,11 +54,11 @@ Additionally, there are HPU PyTorch Bridge environment variables impacting vLLM - Default values: - Prompt: - batch size min (`VLLM_PROMPT_BS_BUCKET_MIN`): `1` - - batch size step (`VLLM_PROMPT_BS_BUCKET_STEP`): `32` + - batch size step (`VLLM_PROMPT_BS_BUCKET_STEP`): `1` - batch size max (`VLLM_PROMPT_BS_BUCKET_MAX`): `max_num_prefill_seqs` - - sequence length min (`VLLM_PROMPT_SEQ_BUCKET_MIN`): `block_size` - - sequence length step (`VLLM_PROMPT_SEQ_BUCKET_STEP`): `block_size` - - sequence length max (`VLLM_PROMPT_SEQ_BUCKET_MAX`): `max_model_len` + - query length min (`VLLM_PROMPT_SEQ_BUCKET_MIN`): `block_size` + - query length step (`VLLM_PROMPT_SEQ_BUCKET_STEP`): `block_size` + - query length max (`VLLM_PROMPT_SEQ_BUCKET_MAX`): `max_num_batched_tokens` - sequence ctx min (`VLLM_PROMPT_CTX_BUCKET_MIN`): `0` - sequence ctx step (`VLLM_PROMPT_CTX_BUCKET_STEP`): `1` - sequence ctx max (`VLLM_PROMPT_CTX_BUCKET_MAX`): `(max_model_len - block_size) // block_size` @@ -62,4 +68,4 @@ Additionally, there are HPU PyTorch Bridge environment variables impacting vLLM - batch size max (`VLLM_DECODE_BS_BUCKET_MAX`): `max_num_seqs` - block size min (`VLLM_DECODE_BLOCK_BUCKET_MIN`): `block_size` - block size step (`VLLM_DECODE_BLOCK_BUCKET_STEP`): `block_size` - - block size max (`VLLM_DECODE_BLOCK_BUCKET_MAX`): `max_blocks` + - block size max (`VLLM_DECODE_BLOCK_BUCKET_MAX`): `max_model_len * max_num_seqs // block_size` by default or `max_blocks` for CONTIGUOUS PA diff --git a/docs/dev_guide/ci-failures.md b/docs/dev_guide/ci-failures.md index 96f897c5f..712ac98f1 100644 --- a/docs/dev_guide/ci-failures.md +++ b/docs/dev_guide/ci-failures.md @@ -1,3 +1,53 @@ # CI Failures -WIP +## CI + +For all PRs created in the vllm-gaudi repository, all CI checks are required: +- pre-commit & DCO +- HPU tests +- HPU Gaudi tests + +### Pre-commit & DCO +To install, run: + +```pre-commit install``` + +This way, all of your commits should be correctly formatted and signed off. If you need to manually sign off your commits, remember to use ```git commit -s``` to pass DCO.
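For reference, a minimal sketch of the local workflow described above (assuming `pre-commit` is installed from PyPI and the hooks are registered once per clone):

```bash
# Install the tool and register the repository's hooks (one-time per clone)
pip install pre-commit
pre-commit install

# Optionally run all hooks against the whole tree before pushing
pre-commit run --all-files

# Sign off commits so the DCO check passes
git commit -s -m "Describe the change"
```

With the hooks registered, formatting problems are caught locally instead of failing in CI.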
+ +### HPU tests +HPU tests consist of several test groups: +- pre-merge tests +- unit tests +- perf test +- feature tests +- e2e tests + +All of the above tests are mandatory. They operate in fail-fast mode, meaning that if one test fails, the remaining tests won't be triggered. + +### HPU Gaudi tests +Additional Gaudi tests are expected to pass, but they aren't mandatory. These tests run on an internal Jenkins system, so the results are internal only, and they can be run by CODEOWNERs and TESTOWNERs only. + +## Docs Pull Requests +All PRs that do not modify code, such as docstring changes or README updates, can be merged without HPU tests and Gaudi tests. Passing the pre-commit check is still required. + +## Hourly Checks and Tests +In the vllm-gaudi repository, hourly tests can be found in ```Hourly Commit Check and Tests``` under the ```Actions``` tab. This tab also allows developers to manually trigger hourly tests on a selected branch. + +If the last hourly test is failing, it means that the vllm-gaudi main branch doesn't work with the newest upstream main commit. To find the last good commit, check [last good commit](https://github.com/vllm-project/vllm-gaudi/blob/vllm/last-good-commit-for-vllm-gaudi/VLLM_STABLE_COMMIT). + +Failing hourly checks will be fixed by developers as soon as possible. + +## Troubleshooting +### Unrelated failures +Sometimes failures are unrelated to your specific code changes and are often caused by connection problems. In this case, the failed checks should be rerun. Typical errors include: +- ```Error response from daemon: No such container``` +- ```ValueError: Unsupported device: the device type is 7.``` +- ```[Device not found] Device acquire failed.``` + +### Accuracy and functionality issues +Accuracy issues can be tracked in HPU Gaudi tests with gsm8k runs. If any check fails due to accuracy (too low compared to the measured baseline) or functionality issues, the **PR can't be merged** until the issue is solved. + +### Pre-commit failures +To run the pre-commit checks manually, run: + +```pre-commit run --show-diff-on-failure --color=always --all-files --hook-stage manual``` diff --git a/docs/features/bucketing_mechanism.md b/docs/features/bucketing_mechanism.md index 6a98cdc2d..cda63a654 100644 --- a/docs/features/bucketing_mechanism.md +++ b/docs/features/bucketing_mechanism.md @@ -17,6 +17,7 @@ In dynamic inference serving scenarios, minimizing the number of graph compilati ## Bucketing Strategies Bucketing is focused on three dimensions: + - `batch size`: number of samples in batch - `query lenght`: sequence length without context tokens - `num blocks`: context length counted in blocks @@ -44,6 +45,7 @@ a `(2, 1, 512)` bucket, or the context length increases beyond 512 tokens. It wi ### Exponential Strategy - Default Exponential strategy is the default warm-up mechanism. It is based on 4 parameters: + - `min`: the smallest value - `step`: the rounding value for bucket boundaries - `max`: the largest value @@ -60,7 +62,7 @@ Example distribution is shown below: ```{.} min = 128, step = 128, max = 4096, limit = 13 ``` -![exponential bucketing distribution for 4096 max query length](../../docs/assets/graphs/exponential_bucketing_example.png) +![exponential bucketing distribution for 4096 max query length](../assets/graphs/exponential_bucketing_example.png) This strategy creates more buckets with smaller values closer to `min`. As the values increase toward `max`, the buckets become less frequent, meaning the distance between them gets larger.
This helps prioritize warming up the smaller values more precisely, while still covering the full range. @@ -73,7 +75,7 @@ Linear strategy is determined with 3 parameters only - `min`, `step` and `max`. `min` determines the lowest value of the bucket. `step` determines the interval between buckets, and `max` determines the upper bound of the bucket. Furthermore, the interval between `min` and `step` has special handling: `min` is multiplied by consecutive powers of two until the multiplier is less than or equal to `step`. We refer to this as the ramp-up phase, which is used for handling lower batch sizes with minimal wastage, while allowing for larger padding on larger batch sizes. -**Example with ramp-up** +#### Example with ramp-up ```{.} min = 2, step = 32, max = 64 @@ -82,7 +84,7 @@ min = 2, step = 32, max = 64 => buckets = ramp_up + stable => (2, 4, 8, 16, 32, 64) ``` -**Example without ramp-up** +#### Example without ramp-up ```{.} min = 128, step = 128, max = 512 @@ -94,6 +96,7 @@ min = 128, step = 128, max = 512 ### Unified Strategy Unified strategy is dedicated strategy for Unified Attention. It's buckets are determined by different dimensions: + - `query length`: number of currently processed tokens, without context tokens - `shared num blocks`: context length counted in blocks, including only blocks that are either shared between at least two block tables (different requests) or is used by at least two tokens in query - `unique num blocks`: context length counted in blocks, including only blocks that are not shared between block tables and are used only by one token @@ -114,7 +117,7 @@ Example distribution is shown below: batch size = 64, max num batched tokens = 4096 ``` -![exponential bucketing distribution for 4096 max query length](../../docs/assets/graphs/unified_bucketing_example.png) +![exponential bucketing distribution for 4096 max query length](../assets/graphs/unified_bucketing_example.png) Additionaly for context blocks, both shared and unique, `0` value will be added as well. diff --git a/docs/features/supported_features.md b/docs/features/supported_features.md index 19fb4a2ac..55747f33b 100644 --- a/docs/features/supported_features.md +++ b/docs/features/supported_features.md @@ -3,29 +3,36 @@ title: Supported Features --- [](){ #supported-features } -# Supported Features +## Supported Features + | **Feature** | **Description** | **References** | |--- |--- |--- | -| Offline batched inference | Offline inference using LLM class from vLLM Python API | [Quickstart](https://docs.vllm.ai/en/stable/getting_started/quickstart.html#offline-batched-inference)
[Example](https://docs.vllm.ai/en/stable/getting_started/examples/offline_inference.html) | -| Online inference via OpenAI-Compatible Server | Online inference using HTTP server that implements OpenAI Chat and Completions API | [Documentation](https://docs.vllm.ai/en/stable/serving/openai_compatible_server.html)
[Example](https://docs.vllm.ai/en/stable/getting_started/examples/openai_chat_completion_client.html) | +| Offline batched inference | Offline inference using LLM class from vLLM Python API | [Quickstart](https://docs.vllm.ai/en/stable/getting_started/quickstart.html#offline-batched-inference) [Example](https://docs.vllm.ai/en/stable/getting_started/examples/offline_inference.html) | +| Online inference via OpenAI-Compatible Server | Online inference using HTTP server that implements OpenAI Chat and Completions API | [Documentation](https://docs.vllm.ai/en/stable/serving/openai_compatible_server.html) [Example](https://docs.vllm.ai/en/stable/getting_started/examples/openai_chat_completion_client.html) | | HPU autodetection | HPU users do not need to specify the target platform, it will be detected automatically upon vLLM startup | N/A | | Paged KV cache with algorithms enabled for Intel Gaudi accelerators | vLLM HPU backend contains a custom Paged Attention and cache operators implementations optimized for Gaudi devices. | N/A | | Custom Intel Gaudi operator implementations | vLLM HPU backend provides optimized implementations of operators such as prefill attention, Root Mean Square Layer Normalization, Rotary Positional Encoding. | N/A | -| Tensor parallel inference (single or multi-node multi-HPU) | vLLM HPU backend supports multi-HPU inference across multiple nodes with tensor parallelism with multiprocessing or Ray and HCCL. | [Documentation](https://docs.vllm.ai/en/stable/serving/distributed_serving.html)
[Example](https://docs.ray.io/en/latest/serve/tutorials/vllm-example.html)
[HCCL reference](https://docs.habana.ai/en/latest/API_Reference_Guides/HCCL_APIs/index.html) | -| Pipeline parallel inference (single or multi-node multi-HPU) | vLLM HPU backend supports multi-HPU inference across single or multi-node with pipeline parallelism. | [Documentation](https://docs.vllm.ai/en/stable/serving/distributed_serving.html)
[Running Pipeline Parallelism](https://github.com/HabanaAI/vllm-fork/blob/habana_main/README_GAUDI.md#pipeline-parallelism) | -| Inference with HPU Graphs | vLLM HPU backend uses HPU Graphs by default for optimal performance. When HPU Graphs are enabled, execution graphs will be recorded ahead of time and replayed later during inference, significantly reducing host overheads. | [Documentation](https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_HPU_Graphs.html)
[vLLM HPU backend execution modes](https://docs.vllm.ai/en/stable/getting_started/gaudi-installation.html#execution-modes)
[Optimization guide](https://docs.vllm.ai/en/latest/getting_started/gaudi-installation.html#hpu-graph-capture) | +| Tensor parallel inference | vLLM HPU backend supports multi-HPU inference with tensor parallelism with multiprocessing. | [Documentation](https://docs.vllm.ai/en/stable/serving/distributed_serving.html) [Example](https://docs.ray.io/en/latest/serve/tutorials/vllm-example.html) [HCCL reference](https://docs.habana.ai/en/latest/API_Reference_Guides/HCCL_APIs/index.html) | +| Pipeline parallel inference | vLLM HPU backend supports multi-HPU inference with pipeline parallelism. | [Documentation](https://docs.vllm.ai/en/stable/serving/distributed_serving.html) [Running Pipeline Parallelism](https://vllm-gaudi.readthedocs.io/en/latest/configuration/pipeline_parallelism.html) | +| Inference with HPU Graphs | vLLM HPU backend uses HPU Graphs by default for optimal performance. When HPU Graphs are enabled, execution graphs will be recorded ahead of time and replayed later during inference, significantly reducing host overheads. | [Documentation](https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_HPU_Graphs.html) [vLLM HPU backend execution modes](https://docs.vllm.ai/en/stable/getting_started/gaudi-installation.html#execution-modes) [Optimization guide](https://docs.vllm.ai/en/latest/getting_started/gaudi-installation.html#hpu-graph-capture) | | Inference with torch.compile | vLLM HPU backend supports inference with `torch.compile`. | [vLLM HPU backend execution modes](https://docs.vllm.ai/en/stable/getting_started/gaudi-installation.html#execution-modes) | | INC quantization | vLLM HPU backend supports FP8 model and KV cache quantization and calibration with Intel Neural Compressor (INC). (Not fully supported with torch.compile execution mode) | [Documentation](https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_FP8.html) | | AutoAWQ quantization | vLLM HPU backend supports inference with models quantized using AutoAWQ library. | [Library](https://github.com/casper-hansen/AutoAWQ) | | AutoGPTQ quantization | vLLM HPU backend supports inference with models quantized using AutoGPTQ library. | [Library](https://github.com/AutoGPTQ/AutoGPTQ) | -| LoRA/MultiLoRA support | vLLM HPU backend includes support for LoRA and MultiLoRA on supported models. | [Documentation](https://docs.vllm.ai/en/stable/models/lora.html)
[Example](https://docs.vllm.ai/en/stable/getting_started/examples/multilora_inference.html)
[vLLM supported models](https://docs.vllm.ai/en/latest/models/supported_models.html) | -| Multi-step scheduling support | vLLM HPU backend includes multi-step scheduling support for host overhead reduction, configurable by standard `--num-scheduler-seqs` parameter. | [Feature RFC](https://github.com/vllm-project/vllm/issues/6854) | -| Automatic prefix caching | vLLM HPU backend includes automatic prefix caching (APC) support for more efficient prefills, configurable by standard `--enable-prefix-caching` parameter. | [Documentation](https://docs.vllm.ai/en/stable/automatic_prefix_caching/apc.html)
[Details](https://docs.vllm.ai/en/stable/automatic_prefix_caching/details.html) | -| Speculative decoding (functional release) | vLLM HPU backend includes experimental speculative decoding support for improving inter-token latency in some scenarios, configurable via standard `--speculative_model` and `--num_speculative_tokens` parameters. (Not fully supported with torch.compile execution mode) | [Documentation](https://docs.vllm.ai/en/stable/models/spec_decode.html)
[Example](https://docs.vllm.ai/en/stable/getting_started/examples/mlpspeculator.html) | -| Multiprocessing backend | Multiprocessing is the default distributed runtime in vLLM. The vLLM HPU backend supports it alongside Ray. | [Documentation](https://docs.vllm.ai/en/latest/serving/distributed_serving.html) | +| LoRA/MultiLoRA support | vLLM HPU backend includes support for LoRA and MultiLoRA on supported models. | [Documentation](https://docs.vllm.ai/en/stable/models/lora.html) [Example](https://docs.vllm.ai/en/stable/getting_started/examples/multilora_inference.html) [vLLM supported models](https://docs.vllm.ai/en/latest/models/supported_models.html) | +| Fully async model executor | This allows the model runner to function asynchronously when using async scheduling. This allows full overlap of the cpu operations (including prepare_inputs) and the model forward pass. This does not support speculative decoding, PP, or guided decoding. Expected speedup is 5-10% over the current async scheduling. | [Feature description](https://github.com/vllm-project/vllm/pull/23569) | +| Automatic prefix caching | vLLM HPU backend includes automatic prefix caching (APC) support for more efficient prefills, configurable by standard `--enable-prefix-caching` parameter. | [Documentation](https://docs.vllm.ai/en/stable/automatic_prefix_caching/apc.html) [Details](https://docs.vllm.ai/en/stable/automatic_prefix_caching/details.html) | +| Speculative decoding (functional release) | vLLM HPU backend includes experimental speculative decoding support for improving inter-token latency in some scenarios, configurable via standard `--speculative_model` and `--num_speculative_tokens` parameters. (Not fully supported with torch.compile execution mode) | [Documentation](https://docs.vllm.ai/en/stable/models/spec_decode.html) [Example](https://docs.vllm.ai/en/stable/getting_started/examples/mlpspeculator.html) | +| Multiprocessing backend | Multiprocessing is the default distributed runtime in vLLM. | [Documentation](https://docs.vllm.ai/en/latest/serving/distributed_serving.html) | | Multimodal | vLLM HPU backend supports the inference for multi-modal models. (Not fully supported with t.compile execution mode) | [Documentation](https://docs.vllm.ai/en/latest/serving/multimodal_inputs.html) | -| Multinode support | vLLM HPU backend supports distributed, multiple-node inference with Ray. | | -| vLLM v1 architecture (early release) | V1 architecture is now available for the HPU backend, and will gradually enable it for every use case we plan to support. | [Documentation](https://docs.vllm.ai/en/latest/serving/distributed_serving.html) | | Guided decode | vLLM HPU supports a guided decoding backend for generating structured outputs. | [Documentation](https://docs.vllm.ai/en/latest/features/structured_outputs.html) | -| Delayed Sampling (experimental) | vLLM HPU supports delayed sampling scheduling for asynchronous execution, enabled by `VLLM_DELAYED_SAMPLING=true` environment variable. | N/A | | Exponential bucketing | vLLM HPU supports exponential bucketing spacing instead of linear to automate configuration of bucketing mechanism, enabled by default. It can be disabled via `VLLM_EXPONENTIAL_BUCKETING=false` environment variable. 
| N/A | +| Data Parallel support | vLLM HPU supports Data Parallel | [Documentation](https://docs.vllm.ai/en/stable/serving/data_parallel_deployment.html) [Example](https://docs.vllm.ai/en/latest/examples/offline_inference/data_parallel.html) | + +## Coming Soon + +- Sliding window attention +- P/D disaggregate support +- In-place weight update +- MLA with Unified Attention +- Multinode support diff --git a/docs/getting_started/installation.md b/docs/getting_started/installation.md index 8125c5365..5f01cb8fa 100644 --- a/docs/getting_started/installation.md +++ b/docs/getting_started/installation.md @@ -1,36 +1,35 @@ --- title: Installation --- -[](){ #installation } + This guide provides instructions on running vLLM with Intel Gaudi devices. ## Requirements - Python 3.10 - Intel Gaudi 2 or 3 AI accelerators -- Intel Gaudi software version 1.21.0 or above +- Intel Gaudi software version 1.22.0 or above !!! note To set up the execution environment, please follow the instructions in the [Gaudi Installation Guide](https://docs.habana.ai/en/latest/Installation_Guide/index.html). To achieve the best performance on HPU, please follow the methods outlined in the [Optimizing Training Platform Guide](https://docs.habana.ai/en/latest/PyTorch/Model_Optimization_PyTorch/Optimization_in_Training_Platform.html). +## Running vLLM on Gaudi with Docker Compose + +Starting with the 1.22 release, we are introducing ready-to-run container images that bundle vLLM and Gaudi software. Please follow the [instructions](https://github.com/vllm-project/vllm-gaudi/tree/main/.cd) to quickly launch vLLM on Gaudi using a prebuilt Docker image and Docker Compose, with options for custom parameters and benchmarking. + ## Quick Start Using Dockerfile -# --8<-- [start:docker_quickstart] -Set up the container with the latest Intel Gaudi Software Suite release using the Dockerfile. -=== "Ubuntu" +## --8<-- [start:docker_quickstart] - ``` - $ docker build -f Dockerfile.hpu -t vllm-hpu-env . - $ docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --rm vllm-hpu-env - ``` +Set up the container with the latest Intel Gaudi Software Suite release using the Dockerfile. -=== "Red Hat Enterprise Linux for Use with Red Hat OpenShift AI" +=== "Ubuntu" ``` - $ docker build -f Dockerfile.hpu.ubi -t vllm-hpu-env . - $ docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --rm vllm-hpu-env + $ docker build -f .cd/Dockerfile.ubuntu.pytorch.vllm -t vllm-hpu-env . + $ docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --entrypoint='' --rm vllm-hpu-env ``` !!! tip @@ -38,11 +37,13 @@ Set up the container with the latest Intel Gaudi Software Suite release using th of [Install Driver and Software](https://docs.habana.ai/en/latest/Installation_Guide/Driver_Installation.html#install-driver-and-software) and "Configure Container Runtime" section of [Docker Installation](https://docs.habana.ai/en/latest/Installation_Guide/Installation_Methods/Docker_Installation.html#configure-container-runtime). Make sure you have ``habanalabs-container-runtime`` package installed and that ``habana`` container runtime is registered.
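As a quick sanity check before relying on the image, the registered runtime and device visibility can be verified roughly as follows (a sketch; `vllm-hpu-env` is the image tag built above):

```bash
# Confirm the habana runtime is registered with Docker
docker info | grep -i runtimes

# Confirm Gaudi devices are visible from inside a container
docker run --rm --runtime=habana -e HABANA_VISIBLE_DEVICES=all \
    --entrypoint='' vllm-hpu-env hl-smi
```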
-# --8<-- [end:docker_quickstart] + +## --8<-- [end:docker_quickstart] ## Build from Source ### Environment Verification + To verify that the Intel Gaudi software was correctly installed, run the following: $ hl-smi # verify that hl-smi is in your PATH and each Gaudi accelerator is visible @@ -59,58 +60,89 @@ Refer to the [Intel Gaudi documentation](https://docs.habana.ai/en/latest/Instal Use the following commands to run a Docker image. Make sure to update the versions below as listed in the [Support Matrix](https://docs.habana.ai/en/latest/Support_Matrix/Support_Matrix.html): - docker pull vault.habana.ai/gaudi-docker/1.21.0/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest - docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host vault.habana.ai/gaudi-docker/1.21.0/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest + docker pull vault.habana.ai/gaudi-docker/1.22.0/ubuntu22.04/habanalabs/pytorch-installer-2.7.1:latest + docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host vault.habana.ai/gaudi-docker/1.22.0/ubuntu22.04/habanalabs/pytorch-installer-2.7.1:latest ### Build and Install vLLM -Currently, multiple ways are provided which can be used to install vLLM with Intel® Gaudi®: - -=== "Stable vLLM-fork version" +=== "Step 1: Get Last good commit on vllm" - vLLM releases are being performed periodically to align with Intel® Gaudi® software releases. The stable version is released with a tag, and supports fully validated features and performance optimizations in Gaudi's [vLLM-fork](https://github.com/HabanaAI/vllm-fork). To install the stable release from [HabanaAI/vLLM-fork](https://github.com/HabanaAI/vllm-fork), run the following: + !!! note + Vllm-gaudi always follows the latest vllm commit. However, updates to the upstream vLLM + API may cause vLLM-Gaudi to crash. This saved commit has been verified with vLLM-Gaudi + on an hourly basis. - ```{.console} - git clone https://github.com/HabanaAI/vllm-fork.git - cd vllm-fork - git checkout v0.7.2+Gaudi-1.21.0 - pip install -r requirements-hpu.txt - python setup.py develop + ```bash + git clone https://github.com/vllm-project/vllm-gaudi + cd vllm-gaudi + export VLLM_COMMIT_HASH=$(git show "origin/vllm/last-good-commit-for-vllm-gaudi:VLLM_STABLE_COMMIT" 2>/dev/null) ``` -=== "Latest vLLM-fork" +=== "Step 2: Install vLLM" + + Install vLLM with `pip` or [from source](https://docs.vllm.ai/en/latest/getting_started/installation/gpu/index.html#build-wheel-from-source) + + ```bash + # Build vLLM from source for empty platform, reusing existing torch installation + git clone https://github.com/vllm-project/vllm + cd vllm + git checkout $VLLM_COMMIT_HASH + pip install -r <(sed '/^torch/d' requirements/build.txt) + VLLM_TARGET_DEVICE=empty pip install --no-build-isolation -e . + cd .. + ``` - Currently, the latest features and performance optimizations are being developed in Gaudi's [vLLM-fork](https://github.com/HabanaAI/vllm-fork) and periodically upstreamed to the vLLM main repository. 
- To install latest [HabanaAI/vLLM-fork](https://github.com/HabanaAI/vllm-fork), run the following: +=== "Step 3: Install vLLM Plugin" - ```{.console} - git clone https://github.com/HabanaAI/vllm-fork.git - cd vllm-fork - git checkout habana_main - pip install --upgrade pip - pip install -r requirements-hpu.txt - python setup.py develop + Install vLLM-Gaudi from source: + ```bash + cd vllm-gaudi + pip install -e . + cd .. ``` -=== "vLLM Upstream" +### Build and Install vLLM with nixl - If you prefer to build and install directly from the main vLLM source, where periodically we are upstreaming new features, run the following: +=== "Install vLLM Plugin with nixl" - ```{.console} - git clone https://github.com/vllm-project/vllm.git - cd vllm - pip install -r requirements-hpu.txt - python setup.py develop + ```bash + cd vllm-gaudi + python install_nixl.py + cd .. ``` -=== "[EXPERIMENTAL] vLLM Upstream + Plugin" +=== "Install vLLM Gaudi and nixl with Docker file" - You're on the bleeding edge, good luck to you: + ```bash + docker build -t ubuntu.pytorch.vllm.nixl.latest \ + -f .cd/Dockerfile.ubuntu.pytorch.vllm.nixl.latest github.com/vllm-project/vllm-gaudi + docker run -it --rm --runtime=habana \ + --name=ubuntu.pytorch.vllm.nixl.latest \ + --network=host \ + -e HABANA_VISIBLE_DEVICES=all \ + vllm-gaudi-for-llmd /bin/bash + ``` - ```{.console} - VLLM_TARGET_DEVICE=hpu pip install git+https://github.com/HabanaAI/vllm-fork.git@dev/upstream_vllm_for_plugin - pip uninstall -y triton - git clone -b plugin_poc https://github.com/HabanaAI/vllm-hpu-extension.git vllm-hpu - cd vllm-hpu +=== "Full installation from source vLLM Gaudi with nixl" + + ```bash + # Fetch last good commit on vllm + git clone https://github.com/vllm-project/vllm-gaudi + cd vllm-gaudi + export VLLM_COMMIT_HASH=$(git show "origin/vllm/last-good-commit-for-vllm-gaudi:VLLM_STABLE_COMMIT" 2>/dev/null) + + # Build vLLM from source for empty platform, reusing existing torch installation + git clone https://github.com/vllm-project/vllm + cd vllm + git checkout $VLLM_COMMIT_HASH + pip install -r <(sed '/^torch/d' requirements/build.txt) + VLLM_TARGET_DEVICE=empty pip install --no-build-isolation -e . + cd .. + + # Build vLLM-Gaudi from source + cd vllm-gaudi pip install -e . + + # Build nixl + python install_nixl.py ``` diff --git a/docs/getting_started/quickstart.md b/docs/getting_started/quickstart.md index feaf0409a..5611ce4d0 100644 --- a/docs/getting_started/quickstart.md +++ b/docs/getting_started/quickstart.md @@ -1,12 +1,13 @@ --- title: Quickstart --- -[](){ #quickstart } -This guide will help you quickly get started with vLLM to perform: +## vLLM Quick Start Guide -- [Offline batched inference][quickstart-offline] -- [Online serving using OpenAI-compatible server][quickstart-online] +This guide shows how to quickly launch vLLM on Gaudi using a prebuilt Docker +image with Docker Compose which is supported on Ubuntu only. It supports model benchmarking, custom runtime parameters, +and a selection of validated models — including the LLama, Mistral, and Qwen. +The advanced configuration is available via environment variables or YAML files. ## Requirements @@ -19,38 +20,364 @@ This guide will help you quickly get started with vLLM to perform: To achieve the best performance on HPU, please follow the methods outlined in the [Optimizing Training Platform Guide](https://docs.habana.ai/en/latest/PyTorch/Model_Optimization_PyTorch/Optimization_in_Training_Platform.html). 
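Before moving on to Docker Compose, a short host-side check along these lines can confirm the requirements above are met (a sketch; exact versions come from the linked guides):

```bash
# Gaudi driver and software stack visible on the host
hl-smi

# Docker engine and the Compose plugin available
docker --version
docker compose version
```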
-## Quick Start Using Dockerfile +## Running vLLM on Gaudi with Docker Compose ---8<-- "docs/getting_started/installation.md:docker_quickstart" +Follow the steps below to run the vLLM server or launch benchmarks on Gaudi using Docker Compose. + +### 1. Clone the vllm-gaudi repository and navigate to the appropriate directory + + git clone https://github.com/vllm-project/vllm-gaudi.git + cd vllm-gaudi/.cd/ + +This ensures you have the required files and Docker Compose configurations. + +### 2. Set the following environment variables + +| **Variable** | **Description** | +| --- |--- | +| `MODEL` | Choose a model name from the [`vllm supported models`][supported-models] list. | +| `HF_TOKEN` | Your Hugging Face token (generate one at ). | +| `DOCKER_IMAGE` | The Docker image name or URL for the vLLM Gaudi container. When using the Gaudi repository, make sure to select Docker images with the *vllm-installer* prefix in the file name. | + +### 3. Run the vLLM server using Docker Compose + + MODEL="Qwen/Qwen2.5-14B-Instruct" \ + HF_TOKEN="" \ + DOCKER_IMAGE="vault.habana.ai/gaudi-docker/|Version|/ubuntu24.04/habanalabs/vllm-installer-|PT_VERSION|:latest" \ + docker compose up + +To automatically run benchmarking for a selected model using default settings, add the `--profile benchmark` option to `docker compose up` + + MODEL="Qwen/Qwen2.5-14B-Instruct" \ + HF_TOKEN="" \ + DOCKER_IMAGE="vault.habana.ai/gaudi-docker/|Version|/ubuntu24.04/habanalabs/vllm-installer-|PT_VERSION|:latest" \ + docker compose --profile benchmark up + +This command launches the vLLM server and runs the associated benchmark suite. + +## Advanced Options + +The following steps cover optional advanced configurations for +running the vLLM server and benchmark. These allow you to fine-tune performance, +memory usage, and request handling using additional environment variables or configuration files. +For most users, the basic setup is sufficient, but advanced users may benefit from these customizations. + +=== "Run vLLM Using Docker Compose with Custom Parameters" + + To override default settings, you can provide additional environment variables when starting the server. This advanced method allows fine-tuning for performance and memory usage. + + **Environment variables** + + | **Variable** | **Description** | + |---|---| + | `PT_HPU_LAZY_MODE` | Enables Lazy execution mode, potentially improving performance by batching operations. | + | `VLLM_SKIP_WARMUP` | Skips the model warmup phase to reduce startup time (may affect initial latency). | + | `MAX_MODEL_LEN` | Sets the maximum supported sequence length for the model. | + | `MAX_NUM_SEQS` | Specifies the maximum number of sequences processed concurrently. | + | `TENSOR_PARALLEL_SIZE` | Defines the degree of tensor parallelism. | + | `VLLM_EXPONENTIAL_BUCKETING` | Enables or disables exponential bucketing for warmup strategy. | + | `VLLM_DECODE_BLOCK_BUCKET_STEP` | Configures the step size for decode block allocation, affecting memory granularity. | + | `VLLM_DECODE_BS_BUCKET_STEP` | Sets the batch size step for decode operations, impacting how decode batches are grouped. | + | `VLLM_PROMPT_BS_BUCKET_STEP` | Adjusts the batch size step for prompt processing. | + | `VLLM_PROMPT_SEQ_BUCKET_STEP` | Controls the step size for prompt sequence allocation.
| + + **Example** + + ```bash + MODEL="Qwen/Qwen2.5-14B-Instruct" \ + HF_TOKEN="" \ + DOCKER_IMAGE="vault.habana.ai/gaudi-docker/|Version|/ubuntu24.04/habanalabs/vllm-installer-|PT_VERSION|:latest" \ + TENSOR_PARALLEL_SIZE=1 \ + MAX_MODEL_LEN=2048 \ + docker compose up + ``` + +=== "Run vLLM and Benchmark with Custom Parameters" + + You can customize benchmark behavior by setting additional environment variables before running Docker Compose. + + **Benchmark parameters:** + + | **Variable** | **Description** | + |---|---| + | `INPUT_TOK` | Number of input tokens per prompt. | + | `OUTPUT_TOK` | Number of output tokens to generate per prompt. | + | `CON_REQ` | Number of concurrent requests during benchmarking. | + | `NUM_PROMPTS`| Total number of prompts to use in the benchmark. | + + **Example:** + + ```bash + MODEL="Qwen/Qwen2.5-14B-Instruct" \ + HF_TOKEN="" \ + DOCKER_IMAGE="vault.habana.ai/gaudi-docker/|Version|/ubuntu24.04/habanalabs/vllm-installer-|PT_VERSION|:latest" \ + INPUT_TOK=128 \ + OUTPUT_TOK=128 \ + CON_REQ=16 \ + NUM_PROMPTS=64 \ + docker compose --profile benchmark up + ``` + + This launches the vLLM server and runs the benchmark using your specified parameters. + +=== "Run vLLM and Benchmark with Combined Custom Parameters" + + You can launch the vLLM server and benchmark together, providing any combination of server and benchmark-specific parameters. + + **Example:** + + ```bash + MODEL="Qwen/Qwen2.5-14B-Instruct" \ + HF_TOKEN="" \ + DOCKER_IMAGE="vault.habana.ai/gaudi-docker/|Version|/ubuntu22.04/habanalabs/vllm-installer-|PT_VERSION|:latest" \ + TENSOR_PARALLEL_SIZE=1 \ + MAX_MODEL_LEN=2048 \ + INPUT_TOK=128 \ + OUTPUT_TOK=128 \ + CON_REQ=16 \ + NUM_PROMPTS=64 \ + docker compose --profile benchmark up + ``` + + This command starts the server and executes benchmarking with the provided configuration. + +=== "Run vLLM and Benchmark Using Configuration Files" + + You can also configure the server and benchmark via YAML configuration files. Set the following environment variables: + + | **Variable** | **Description** | + |---|---| + | `VLLM_SERVER_CONFIG_FILE` | Path to the server config file inside the Docker container. | + | `VLLM_SERVER_CONFIG_NAME` | Name of the server config section. | + | `VLLM_BENCHMARK_CONFIG_FILE` | Path to the benchmark config file inside the container. | + | `VLLM_BENCHMARK_CONFIG_NAME` | Name of the benchmark config section. | + + **Example** + + ```bash + HF_TOKEN= \ + VLLM_SERVER_CONFIG_FILE=server_configurations/server_text.yaml \ + VLLM_SERVER_CONFIG_NAME=llama31_8b_instruct \ + VLLM_BENCHMARK_CONFIG_FILE=benchmark_configurations/benchmark_text.yaml \ + VLLM_BENCHMARK_CONFIG_NAME=llama31_8b_instruct \ + docker compose --profile benchmark up + ``` + + !!! note + When using configuration files, you do not need to set the `MODEL` variable as the model details are included in the config files. However, the `HF_TOKEN` flag is still required. + +=== "Run vLLM Directly Using Docker" + + For maximum control, you can run the server directly using the `docker run` command, allowing full customization of Docker runtime settings. + + **Example:** + + ```bash + docker run -it --rm \ + -e MODEL=$MODEL \ + -e HF_TOKEN=$HF_TOKEN \ + -e http_proxy=$http_proxy \ + -e https_proxy=$https_proxy \ + -e no_proxy=$no_proxy \ + --cap-add=sys_nice \ + --ipc=host \ + --runtime=habana \ + -e HABANA_VISIBLE_DEVICES=all \ + -p 8000:8000 \ + --name vllm-server \ + + ``` + + This method provides full flexibility over how the vLLM server is executed within the container. 
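    For reference, a complete invocation along the lines of the command above might look as follows; the image tag below is a placeholder assumption and should point at the vLLM Gaudi image used in the earlier steps:

    ```bash
    # Placeholder values; substitute your own model, token, and image tag
    MODEL="Qwen/Qwen2.5-14B-Instruct"
    HF_TOKEN="<your_hf_token>"
    DOCKER_IMAGE="vault.habana.ai/gaudi-docker/<version>/ubuntu24.04/habanalabs/vllm-installer-<pt_version>:latest"

    docker run -it --rm \
      -e MODEL=$MODEL \
      -e HF_TOKEN=$HF_TOKEN \
      --cap-add=sys_nice \
      --ipc=host \
      --runtime=habana \
      -e HABANA_VISIBLE_DEVICES=all \
      -p 8000:8000 \
      --name vllm-server \
      $DOCKER_IMAGE
    ```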
+ +--- + +## Supported Models + +| **Model Name** | **Validated TP Size** | +|---|---| +| deepseek-ai/DeepSeek-R1-Distill-Llama-70B | 8 | +| meta-llama/Llama-3.1-70B-Instruct | 4 | +| meta-llama/Llama-3.1-405B-Instruct | 8 | +| meta-llama/Llama-3.1-8B-Instruct | 1 | +| meta-llama/Llama-3.3-70B-Instruct | 4 | +| mistralai/Mistral-7B-Instruct-v0.2 | 1 | +| mistralai/Mixtral-8x7B-Instruct-v0.1 | 2 | +| mistralai/Mixtral-8x22B-Instruct-v0.1 | 4 | +| Qwen/Qwen2.5-7B-Instruct | 1 | +| Qwen/Qwen2.5-VL-7B-Instruct | 1 | +| Qwen/Qwen2.5-14B-Instruct | 1 | +| Qwen/Qwen2.5-32B-Instruct | 1 | +| Qwen/Qwen2.5-72B-Instruct | 4 | +| ibm-granite/granite-8b-code-instruct-4k | 1 | +| ibm-granite/granite-20b-code-instruct-8k | 1 | ## Executing inference === "Offline Batched Inference" [](){ #quickstart-offline } + + Offline inference processes multiple prompts in a batch without needing a running server. This is ideal for batch jobs and testing. + ```python from vllm import LLM, SamplingParams - prompts = [ - "Hello, my name is", - "The future of AI is", - ] - sampling_params = SamplingParams(temperature=0.8, top_p=0.95) - llm = LLM(model="facebook/opt-125m") + def main(): + prompts = [ + "Hello, my name is", + "The future of AI is", + ] + sampling_params = SamplingParams(temperature=0.8, top_p=0.95) + llm = LLM(model="meta-llama/Llama-3.1-8B-Instruct") + + outputs = llm.generate(prompts, sampling_params) + + for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") - outputs = llm.generate(prompts, sampling_params) + if __name__ == "__main__": + main() - for output in outputs: - prompt = output.prompt - generated_text = output.outputs[0].text - print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") ``` -=== "OpenAI Completions API" +=== "Online Inference" [](){ #quickstart-online } - WIP + + Online inference provides real-time text generation through a running vLLM server. + First, start the server: + + ```bash + python -m vllm.entrypoints.openai.api_server \ + --model meta-llama/Llama-3.1-8B-Instruct \ + --host 0.0.0.0 \ + --port 8000 + ``` + + Then query it from Python: + + ```python + import requests + + def main(): + url = "http://localhost:8000/v1/completions" + headers = {"Content-Type": "application/json"} + + payload = { + "model": "meta-llama/Llama-3.1-8B-Instruct", + "prompt": "The future of AI is", + "max_tokens": 50, + "temperature": 0.8 + } + + response = requests.post(url, headers=headers, json=payload) + result = response.json() + print(result["choices"][0]["text"])} + + if __name__ == "__main__": + main() + + ``` + +=== "OpenAI Completions API" + + [](){ #quickstart-oopenai-completions-api } + + vLLM provides an OpenAI-compatible completions API. 
+ Start the server: + + ```bash + python -m vllm.entrypoints.openai.api_server \ + --model meta-llama/Llama-3.1-8B-Instruct \ + --host 0.0.0.0 \ + --port 8000 + ``` + + Use the OpenAI Python client: + + ```python + from openai import OpenAI + + def main(): + client = OpenAI(api_key="EMPTY", base_url="http://localhost:8000/v1") + + result = client.completions.create( + model="meta-llama/Llama-3.1-8B-Instruct", + prompt="Explain quantum computing in simple terms:", + max_tokens=100, + temperature=0.7 + ) + print(result.choices[0].text) + + if __name__ == "__main__": + main() + ``` + + Or use curl: + + ```bash + curl http://localhost:8000/v1/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "meta-llama/Llama-3.1-8B-Instruct", + "prompt": "Explain quantum computing in simple terms:", + "max_tokens": 100, + "temperature": 0.7 + }' + ``` === "OpenAI Chat Completions API with vLLM" - WIP + [](){ #quickstart-oopenai-chat-completions-api } + + vLLM also supports the OpenAI chat completions API format. + Start the server: + + ```bash + python -m vllm.entrypoints.openai.api_server \ + --model meta-llama/Llama-3.1-8B-Instruct \ + --host 0.0.0.0 \ + --port 8000 + ``` + + Use the OpenAI Python client: + + ```python + from openai import OpenAI + + def main(): + client = OpenAI(api_key="EMPTY", base_url="http://localhost:8000/v1") + + chat = client.chat.completions.create( + model="meta-llama/Llama-3.1-8B-Instruct", + messages=[ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "What is the capital of France?"} + ], + max_tokens=50, + temperature=0.7 + ) + print(chat.choices[0].message.content) + + if __name__ == "__main__": + main() + ``` + + Or use curl: + + ```bash + curl http://localhost:8000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "meta-llama/Llama-3.1-8B-Instruct", + "messages": [ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "What is the capital of France?"} + ], + "max_tokens": 50, + "temperature": 0.7 + }' + ``` diff --git a/docs/models/validated_models.md b/docs/models/validated_models.md index cb8bf9242..ca4811691 100644 --- a/docs/models/validated_models.md +++ b/docs/models/validated_models.md @@ -5,6 +5,24 @@ title: Validated Models The following configurations have been validated to function with Gaudi 2 or Gaudi 3 devices with random or greedy sampling. Configurations that are not listed may or may not work. 
+| **Model** | **Tensor Parallelism [x HPU]** | **Datatype** | **Validated on** | +|:--- |:---: |:---: |:---: | +| [meta-llama/Meta-Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct) | 1 | BF16, FP8 | Gaudi 2, Gaudi 3| +| [meta-llama/Meta-Llama-3.1-70B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3.1-70B-Instruct) | 2, 4, 8 | BF16, FP8, FP16 (Gaudi 2) |Gaudi 2, Gaudi 3| +| [meta-llama/Meta-Llama-3.1-405B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3.1-405B-Instruct) | 8 | BF16, FP8 |Gaudi 3| +| [meta-llama/Meta-Llama-3.3-70B-Instruct](https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct) | 4 | BF16, FP8 | Gaudi 3| +| [meta-llama/Granite-3B-code-instruct-128k](https://huggingface.co/ibm-granite/granite-3b-code-instruct-128k) | 1 | BF16 | Gaudi 3| +| [meta-llama/Granite-8B-code-instruct-128k](https://huggingface.co/ibm-granite/granite-8b-code-instruct-128k) | 1 | BF16 | Gaudi 3| +| [meta-llama/Granite-3.1-8B-instruct](https://huggingface.co/ibm-granite/granite-3.1-8b-instruct) | 1 | BF16, FP8 | Gaudi 2, Gaudi 3| +| [meta-llama/Granite-20B-code-instruct-8k](https://huggingface.co/ibm-granite/granite-20b-code-instruct-8k) | 1 | BF16, FP8 | Gaudi 2, Gaudi 3| +| [meta-llama/Granite-34B-code-instruc-8k](https://huggingface.co/ibm-granite/granite-34b-code-instruct-8k) | 1 | BF16 | Gaudi 3| +| [mistralai/Mistral-Large-Instruct-2407](https://huggingface.co/mistralai/Mistral-Large-Instruct-2407) | 1, 4 | BF16 | Gaudi 3| +| [mistralai/Mixtral-8x7B-Instruct-v0.1](https://huggingface.co/mistralai/Mixtral-8x7B-v0.1) | 2 | FP8, BF16 |Gaudi 2, Gaudi 3| +| [meta-llama/CodeLlama-34b-Instruct-hf](https://huggingface.co/meta-llama/CodeLlama-34b-Instruct-hf) | 1 | BF16 |Gaudi 3| +| [Qwen/Qwen3-30B-A3B-Instruct](https://huggingface.co/Qwen/Qwen3-30B-A3B-Instruct-2507) | 8 | BF16 |Gaudi 3| + +Validation of following configurations is under progress. 
+ | **Model** | **Tensor Parallelism [x HPU]** | **Datatype** | **Validated on** | |:--- |:---: |:---: |:---: | | [meta-llama/Meta-Llama-3-8B](https://huggingface.co/meta-llama/Meta-Llama-3-8B) | 1, 2, 8 | BF16 | Gaudi 2, Gaudi 3| @@ -12,25 +30,12 @@ The following configurations have been validated to function with Gaudi 2 or Gau | [meta-llama/Meta-Llama-3-70B](https://huggingface.co/meta-llama/Meta-Llama-3-70B) | 8 | BF16 |Gaudi 2, Gaudi 3| | [meta-llama/Meta-Llama-3-70B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-70B-Instruct) | 8 | BF16 |Gaudi 2, Gaudi 3| | [meta-llama/Meta-Llama-3.1-8B](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B) | 1 | BF16, FP8, INT4, FP16 (Gaudi 2) | Gaudi 2, Gaudi 3| -| [meta-llama/Meta-Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct) | 1 | BF16, FP8 | Gaudi 2, Gaudi 3| | [meta-llama/Meta-Llama-3.1-70B](https://huggingface.co/meta-llama/Meta-Llama-3.1-70B) | 2, 4, 8 | BF16, FP8, INT4 |Gaudi 2, Gaudi 3| -| [meta-llama/Meta-Llama-3.1-70B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3.1-70B-Instruct) | 2, 4, 8 | BF16, FP8, FP16 (Gaudi 2) |Gaudi 2, Gaudi 3| | [meta-llama/Meta-Llama-3.1-405B](https://huggingface.co/meta-llama/Meta-Llama-3.1-405B) | 8 | BF16, FP8 |Gaudi 3| -| [meta-llama/Meta-Llama-3.1-405B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3.1-405B-Instruct) | 8 | BF16, FP8 |Gaudi 3| -| [meta-llama/Llama-3.2-11B-Vision](https://huggingface.co/meta-llama/Llama-3.2-11B-Vision) | 1 | BF16, FP8 | Gaudi 2, Gaudi 3| -| [meta-llama/Llama-3.2-90B-Vision](https://huggingface.co/meta-llama/Llama-3.2-90B-Vision) | 4, 8 (min. for Gaudi 2) | BF16, FP8 | Gaudi 2, Gaudi 3| -| [meta-llama/Llama-3.2-90B-Vision-Instruct](https://huggingface.co/meta-llama/Llama-3.2-90B-Vision-Instruct) | 4, 8 (min. for Gaudi 2) | BF16 | Gaudi 2, Gaudi 3 | | [meta-llama/Meta-Llama-3.3-70B](https://huggingface.co/meta-llama/Llama-3.3-70B) | 4 | BF16, FP8 | Gaudi 3| -| [meta-llama/Granite-3B-code-instruct-128k](https://huggingface.co/ibm-granite/granite-3b-code-instruct-128k) | 1 | BF16 | Gaudi 3| -| [meta-llama/Granite-3.0-8B-instruct](https://huggingface.co/ibm-granite/granite-3.0-8b-instruct) | 1 | BF16, FP8 | Gaudi 2, Gaudi 3| -| [meta-llama/Granite-20B-code-instruct-8k](https://huggingface.co/ibm-granite/granite-20b-code-instruct-8k) | 1 | BF16, FP8 | Gaudi 2, Gaudi 3| -| [meta-llama/Granite-34B-code-instruc-8k](https://huggingface.co/ibm-granite/granite-34b-code-instruct-8k) | 1 | BF16 | Gaudi 3| -| [mistralai/Mistral-Large-Instruct-2407](https://huggingface.co/mistralai/Mistral-Large-Instruct-2407) | 1, 4 | BF16 | Gaudi 3| | [mistralai/Mistral-7B-Instruct-v0.3](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3) | 1, 2 | BF16 | Gaudi 2| -| [mistralai/Mixtral-8x7B-Instruct-v0.1](https://huggingface.co/mistralai/Mixtral-8x7B-v0.1) | 2 | FP8, BF16 |Gaudi 2, Gaudi 3| | [llava-hf/llava-1.5-7b-hf](https://huggingface.co/llava-hf/llava-1.5-7b-hf) | 1, 8 | BF16 | Gaudi 2, Gaudi 3 | | [princeton-nlp/gemma-2-9b-it-SimPO](https://huggingface.co/princeton-nlp/gemma-2-9b-it-SimPO) | 1 | BF16 |Gaudi 2, Gaudi 3| | [Qwen/Qwen2-72B-Instruct](https://huggingface.co/Qwen/Qwen2-72B-Instruct) | 8 | BF16 |Gaudi 2| | [Qwen/Qwen2.5-72B-Instruct](https://huggingface.co/Qwen/Qwen2.5-72B-Instruct) | 8 | BF16 |Gaudi 2| -| [meta-llama/CodeLlama-34b-Instruct-hf](https://huggingface.co/meta-llama/CodeLlama-34b-Instruct-hf) | 1 | BF16 |Gaudi 3| -| [deepseek-ai/DeepSeek-R1](https://huggingface.co/deepseek-ai/DeepSeek-R1)
[quick start scripts](https://github.com/HabanaAI/vllm-fork/blob/deepseek_r1/scripts/DEEPSEEK_R1_ON_GAUDI.md) | 8 | FP8, BF16 |Gaudi 2, Gaudi 3| +| [deepseek-ai/DeepSeek-R1](https://huggingface.co/deepseek-ai/DeepSeek-R1) | 8 | FP8, BF16 |Gaudi 2, Gaudi 3| diff --git a/examples/data_parallel.py b/examples/data_parallel.py index 349145b5f..1d1eec2d4 100644 --- a/examples/data_parallel.py +++ b/examples/data_parallel.py @@ -30,7 +30,6 @@ """ import os -import sys from time import sleep import torch @@ -178,9 +177,6 @@ def start(rank): if __name__ == "__main__": args = parse_args() - print("Execution is currently disabled. Exiting. expected fix in SW-241972") - sys.exit(0) # Exits gracefully with an success code - dp_size = args.dp_size tp_size = args.tp_size node_size = args.node_size diff --git a/mkdocs.yaml b/mkdocs.yaml index 4b141430c..7b06f9b5f 100644 --- a/mkdocs.yaml +++ b/mkdocs.yaml @@ -82,6 +82,7 @@ plugins: markdown_extensions: - attr_list + - sane_lists - md_in_html - admonition - pymdownx.details @@ -109,6 +110,10 @@ markdown_extensions: # For math rendering - mdx_math: enable_dollar_delimiter: true + # For checkbox feature + - pymdownx.tasklist: + custom_checkbox: true + extra_javascript: - https://cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS_HTML diff --git a/requirements.txt b/requirements.txt index 8b8ff73f1..701716ea9 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,8 @@ # Dependencies for HPU code -ray +numexpr==2.13.1 +ray<2.49.0 pandas -numpy==1.26.4 +numpy tabulate setuptools>=77.0.3,<80.0.0 setuptools-scm>=8 diff --git a/tests/full_tests/ci_gsm8k_tests.sh b/tests/full_tests/ci_gsm8k_tests.sh index 8c0136c2d..776301d88 100644 --- a/tests/full_tests/ci_gsm8k_tests.sh +++ b/tests/full_tests/ci_gsm8k_tests.sh @@ -13,8 +13,12 @@ echo $VLLM_GAUDI_PREFIX # Gemma3 with image input run_gemma3_test() { echo "➡️ Testing gemma-3-4b-it..." - VLLM_SKIP_WARMUP=true PT_HPU_LAZY_MODE=1 VLLM_USE_V1=1 python -u "${VLLM_GAUDI_PREFIX}/tests/models/language/generation/generation_mm.py" --model-card-path "${VLLM_GAUDI_PREFIX}/tests/full_tests/model_cards/gemma-3-4b-it.yaml" - echo "✅ Test with multimodal-support with gemma-3-4b-it passed." + #VLLM_SKIP_WARMUP=true PT_HPU_LAZY_MODE=1 VLLM_USE_V1=1 python -u "${VLLM_GAUDI_PREFIX}/tests/models/language/generation/generation_mm.py" --model-card-path "${VLLM_GAUDI_PREFIX}/tests/full_tests/model_cards/gemma-3-4b-it.yaml" + #echo "✅ Test with multimodal-support with gemma-3-4b-it passed." + #echo "➡️ Testing gemma-3-27b-it..." + #VLLM_SKIP_WARMUP=true PT_HPU_LAZY_MODE=1 VLLM_USE_V1=1 python -u "${VLLM_GAUDI_PREFIX}/tests/models/language/generation/generation_mm_multi.py" --model-card-path "${VLLM_GAUDI_PREFIX}/tests/full_tests/model_cards/gemma-3-27b-it.yaml" + #echo "✅ Test with multimodal-support with multiple images gemma-3-27b-it passed." + # echo "Skipping gemma-3-4b-it due to changes from https://github.com/vllm-project/vllm/pull/26715 } # Basic model test @@ -158,6 +162,14 @@ run_gsm8k_granite_test() { echo "✅ Test with granite-8b passed." } +# GSM8K on granite-8b (unified attn) +run_gsm8k_granite_test_unified_attn() { + echo "➡️ Testing GSM8K on granite-8b with unified attention..." + VLLM_UNIFIED_ATTN=True VLLM_SKIP_WARMUP=True PT_HPU_LAZY_MODE=1 VLLM_USE_V1=1 \ + pytest -v -s "${VLLM_GAUDI_PREFIX}/tests/models/language/generation/test_common.py" --model_card_path "${VLLM_GAUDI_PREFIX}/tests/full_tests/model_cards/granite-8b.yaml" + echo "✅ Test with granite-8b unified attention passed." 
+} + # GSM8K on granite-8b with async scheduling run_gsm8k_granite_async_test() { echo "➡️ Testing GSM8K on granite-8b with async scheduling..." @@ -230,6 +242,7 @@ launch_all_tests() { run_compressed_w4a16_channelwise_test run_compressed_w4a16_moe_gidx_test run_gsm8k_granite_test + run_gsm8k_granite_test_unified_attn run_gsm8k_granite_async_test run_gsm8k_deepseek_test run_gsm8k_qwen3_30b_test diff --git a/tests/full_tests/model_cards/gemma-3-27b-it.yaml b/tests/full_tests/model_cards/gemma-3-27b-it.yaml new file mode 100644 index 000000000..5631346a2 --- /dev/null +++ b/tests/full_tests/model_cards/gemma-3-27b-it.yaml @@ -0,0 +1,26 @@ +model_name: "google/gemma-3-27b-it" +test_config: + # Single image test + - modality: image + extra_engine_args: + mm_processor_kwargs: + min_pixels: 802816 # 896x896 + max_pixels: 1003520 + fps: 1 + input_data_config: + num_prompts: 4 + media_source: default + + # Multi-image test + - modality: multi_image + extra_engine_args: + mm_processor_kwargs: + min_pixels: 802816 # 896x896 + max_pixels: 1003520 + fps: 1 + limit_mm_per_prompt: + image: 10 # Allow up to 10 images per prompt + input_data_config: + num_prompts: 2 + media_source: default # Uses default images + num_images: 6 diff --git a/tests/models/language/generation/generation_mm_multi.py b/tests/models/language/generation/generation_mm_multi.py new file mode 100644 index 000000000..7ec7ff974 --- /dev/null +++ b/tests/models/language/generation/generation_mm_multi.py @@ -0,0 +1,246 @@ +from argparse import ArgumentParser +from vllm import LLM, EngineArgs, SamplingParams +from vllm.assets.image import ImageAsset, ImageAssetName +from vllm.assets.video import VideoAsset +from vllm.multimodal.image import convert_image_mode +from dataclasses import asdict +from typing import Union, get_args +from PIL import Image +from dataclasses import dataclass +import yaml +import os +from vllm_gaudi.extension.logger import logger as init_logger + +logger = init_logger() + + +@dataclass +class PROMPT_DATA: + _questions = { + "image": [ + "What is the most prominent object in this image?", "Describe the scene in the image.", + "What is the weather like in the image?", "Write a short poem about this image." + ], + "multi_image": [ + "Compare and contrast these images. What are the similarities and differences?", + "Tell a story that connects all these images together.", + "What common themes do you see across these images?", + "Describe the progression or sequence shown in these images.", "Which image stands out the most and why?", + "What emotions or moods are conveyed by these images collectively?" 
+ ], + "video": ["Describe this video", "Which movie would you associate this video with?"] + } + + def __post_init__(self): + self._questions = self._questions + + def _load_single_image(self, source: str) -> Image.Image: + """Load a single image""" + if source == "default": + return convert_image_mode(ImageAsset("cherry_blossom").pil_image, "RGB") + else: + return convert_image_mode(Image.open(source), "RGB") + + def _load_video(self, source: str): + """Load video data""" + return VideoAsset(name="baby_reading" if source == "default" else source, num_frames=16).np_ndarrays + + def _load_multiple_images(self, source: Union[str, list[str]]) -> list[Image.Image]: + images = [] + """Load multiple images from various sources""" + if source == "default": + # Get all available ImageAsset names from the Literal type + available_assets = list(get_args(ImageAssetName)) + logger.info("Available ImageAssets: %(available_assets)s", {"available_assets": available_assets}) + + # Load up to 6 different assets (or more if needed) + target_count = 6 + loaded_count = 0 + for asset_name in available_assets: + if loaded_count >= target_count: + break + + try: + img = ImageAsset(asset_name).pil_image + converted_img = convert_image_mode(img, "RGB") + images.append(converted_img) + loaded_count += 1 + logger.info("Successfully loaded ImageAsset: %(asset_name)s (Size: %(size)s)", + dict(asset_name=asset_name, size=converted_img.size)) + except Exception as e: + logger.warning("Failed to load ImageAsset '%(asset_name)s': %(e)s", dict(asset_name=asset_name, + e=e)) + continue + + elif isinstance(source, list): + # Load from list of file paths + for img_path in source: + try: + img = Image.open(img_path) + images.append(convert_image_mode(img, "RGB")) + except Exception as e: + logger.warning("Failed to load image %(img_path)s: %(e)s", dict(img_path=img_path, e=e)) + + logger.info("Loaded %(num_images)s images for multi-image processing", {"num_images": len(images)}) + return images + + def _get_data(self, modality: str, source: str): + """Get data based on modality""" + if modality == "image": + return self._load_single_image(source) + elif modality == "multi_image": + return self._load_multiple_images(source) + elif modality == "video": + return self._load_video(source) + else: + raise ValueError(f"Unsupported modality: {modality}") + + def get_prompts(self, + model_name: str = "", + modality: str = "image", + media_source: str = "default", + num_prompts: int = 1, + num_images: int = 1, + skip_vision_data=False): + + # Handle multi-image modality + if modality == "multi_image" or modality == "image": + pholder = "" * num_images if "gemma" in model_name.lower() else "<|image_pad|>" * num_images + elif modality == "video": + pholder = "