From d95fcc8d90d8f04b49ff537a3b30d09e1e18b763 Mon Sep 17 00:00:00 2001
From: Radoslaw Smyrek
Date: Tue, 23 Dec 2025 16:48:47 +0200
Subject: [PATCH 1/2] Add monkey-patch for Llama4Attention._get_attn_scale

Signed-off-by: Radoslaw Smyrek
---
 vllm_gaudi/v1/worker/hpu_model_runner.py | 30 ++++++++++++++++++++++++
 1 file changed, 30 insertions(+)

diff --git a/vllm_gaudi/v1/worker/hpu_model_runner.py b/vllm_gaudi/v1/worker/hpu_model_runner.py
index c23acfe3b..1ab73fdb5 100644
--- a/vllm_gaudi/v1/worker/hpu_model_runner.py
+++ b/vllm_gaudi/v1/worker/hpu_model_runner.py
@@ -16,6 +16,7 @@
     from neural_compressor.torch.quantization import finalize_calibration
 else:
     finalize_calibration = None
+import types
 
 import habana_frameworks.torch as htorch
 import habana_frameworks.torch.internal.bridge_config as bc
@@ -354,6 +355,34 @@ def is_mm_optimized(model):
         'Gemma3ForConditionalGeneration' in str(type(model))
 
 
+def patch_llama4_get_attn_scale(model):
+
+    config = getattr(model, "config", None)
+    is_llama4 = (getattr(config, "model_type", None) == "llama4") or ("llama4" in type(model).__name__.lower())
+    if not is_llama4:
+        return
+
+    for layer in model.language_model.model.layers:
+
+        if "Llama4Attention" not in type(layer.self_attn).__name__:
+            continue
+
+        attn = layer.self_attn
+        orig = attn._get_attn_scale
+
+        def my_get_attn_scale(self, positions, _orig=orig):
+            positions = positions.flatten()
+            return _orig(positions)
+
+        attn._get_attn_scale = types.MethodType(my_get_attn_scale, attn)
+
+
+def apply_model_specific_patches(model):
+    """The function applies model-specific monkey patches."""
+
+    patch_llama4_get_attn_scale(model)
+
+
 class HpuModelAdapter(torch.nn.Module, KVConnectorModelRunnerMixin):
 
     def __init__(self, model, vllm_config):
@@ -3806,6 +3835,7 @@ def load_model(self) -> None:
         self.model = self.model.to("hpu")
         htcore.mark_step()
 
+        apply_model_specific_patches(self.model)
         hidden_layer_markstep_interval = int(os.getenv('VLLM_CONFIG_HIDDEN_LAYERS', '1'))
         model_config = getattr(self.model, "config", None)
         modify_model_layers(self.model,

From 322c26c0a5aeb0cd6e83ae5837a32e2598773533 Mon Sep 17 00:00:00 2001
From: Radoslaw Smyrek
Date: Tue, 23 Dec 2025 17:04:21 +0200
Subject: [PATCH 2/2] Minor change - renaming

Signed-off-by: Radoslaw Smyrek
---
 vllm_gaudi/v1/worker/hpu_model_runner.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/vllm_gaudi/v1/worker/hpu_model_runner.py b/vllm_gaudi/v1/worker/hpu_model_runner.py
index 1ab73fdb5..11c5b9225 100644
--- a/vllm_gaudi/v1/worker/hpu_model_runner.py
+++ b/vllm_gaudi/v1/worker/hpu_model_runner.py
@@ -370,11 +370,11 @@ def patch_llama4_get_attn_scale(model):
         attn = layer.self_attn
         orig = attn._get_attn_scale
 
-        def my_get_attn_scale(self, positions, _orig=orig):
+        def _get_attn_scale_for_hpu(self, positions, _orig=orig):
             positions = positions.flatten()
             return _orig(positions)
 
-        attn._get_attn_scale = types.MethodType(my_get_attn_scale, attn)
+        attn._get_attn_scale = types.MethodType(_get_attn_scale_for_hpu, attn)
 
 
 def apply_model_specific_patches(model):
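
The patch relies on per-instance method rebinding via types.MethodType: each
Llama4Attention instance gets a wrapper that flattens positions before
delegating to the original bound method, so the class itself and any other
loaded models stay untouched. Below is a minimal, self-contained sketch of
that pattern; FakeLlama4Attention and its scale formula are hypothetical
stand-ins for illustration, not the real vLLM implementation.

import types

import torch


class FakeLlama4Attention:
    """Stand-in for Llama4Attention (hypothetical; the real class lives in vLLM)."""

    def _get_attn_scale(self, positions: torch.Tensor) -> torch.Tensor:
        # Like the real method, this assumes 1-D positions; the formula
        # here is made up and only stands in for the actual scaling math.
        assert positions.dim() == 1, "expects flattened positions"
        return torch.log1p(positions.float()).unsqueeze(-1)


attn = FakeLlama4Attention()
orig = attn._get_attn_scale  # bound method: `attn` is already baked in

def _get_attn_scale_for_hpu(self, positions, _orig=orig):
    # Flatten so e.g. a [batch, seq] positions tensor becomes 1-D before
    # delegating to the original method.
    return _orig(positions.flatten())

# Rebind on this instance only; the class and other instances are untouched.
attn._get_attn_scale = types.MethodType(_get_attn_scale_for_hpu, attn)

print(attn._get_attn_scale(torch.arange(6).reshape(2, 3)).shape)  # torch.Size([6, 1])

One detail worth noting in the patch: capturing the original method through
the default argument (_orig=orig) freezes the binding at definition time.
Since the patch defines the wrapper inside a loop over layers, reading orig
from the enclosing scope instead would hit Python's late-binding closure
behavior and make every layer's wrapper call the last layer's method.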