From d95fcc8d90d8f04b49ff537a3b30d09e1e18b763 Mon Sep 17 00:00:00 2001
From: Radoslaw Smyrek
Date: Tue, 23 Dec 2025 16:48:47 +0200
Subject: [PATCH 1/2] Add monkey-patch for Llama4Attention._get_attn_scale

Signed-off-by: Radoslaw Smyrek
---
 vllm_gaudi/v1/worker/hpu_model_runner.py | 30 ++++++++++++++++++++++++
 1 file changed, 30 insertions(+)

diff --git a/vllm_gaudi/v1/worker/hpu_model_runner.py b/vllm_gaudi/v1/worker/hpu_model_runner.py
index c23acfe3b..1ab73fdb5 100644
--- a/vllm_gaudi/v1/worker/hpu_model_runner.py
+++ b/vllm_gaudi/v1/worker/hpu_model_runner.py
@@ -16,6 +16,7 @@
     from neural_compressor.torch.quantization import finalize_calibration
 else:
     finalize_calibration = None
+import types
 
 import habana_frameworks.torch as htorch
 import habana_frameworks.torch.internal.bridge_config as bc
@@ -354,6 +355,34 @@ def is_mm_optimized(model):
         'Gemma3ForConditionalGeneration' in str(type(model))
 
 
+def patch_llama4_get_attn_scale(model):
+
+    config = getattr(model, "config", None)
+    is_llama4 = (getattr(config, "model_type", None) == "llama4") or ("llama4" in type(model).__name__.lower())
+    if not is_llama4:
+        return
+
+    for layer in model.language_model.model.layers:
+
+        if "Llama4Attention" not in type(layer.self_attn).__name__:
+            continue
+
+        attn = layer.self_attn
+        orig = attn._get_attn_scale
+
+        def my_get_attn_scale(self, positions, _orig=orig):
+            positions = positions.flatten()
+            return _orig(positions)
+
+        attn._get_attn_scale = types.MethodType(my_get_attn_scale, attn)
+
+
+def apply_model_specific_patches(model):
+    """The function applies model-specific monkey patches."""
+
+    patch_llama4_get_attn_scale(model)
+
+
 class HpuModelAdapter(torch.nn.Module, KVConnectorModelRunnerMixin):
 
     def __init__(self, model, vllm_config):
@@ -3806,6 +3835,7 @@ def load_model(self) -> None:
         self.model = self.model.to("hpu")
         htcore.mark_step()
 
+        apply_model_specific_patches(self.model)
         hidden_layer_markstep_interval = int(os.getenv('VLLM_CONFIG_HIDDEN_LAYERS', '1'))
         model_config = getattr(self.model, "config", None)
         modify_model_layers(self.model,

From 322c26c0a5aeb0cd6e83ae5837a32e2598773533 Mon Sep 17 00:00:00 2001
From: Radoslaw Smyrek
Date: Tue, 23 Dec 2025 17:04:21 +0200
Subject: [PATCH 2/2] Minor change - renaming

Signed-off-by: Radoslaw Smyrek
---
 vllm_gaudi/v1/worker/hpu_model_runner.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/vllm_gaudi/v1/worker/hpu_model_runner.py b/vllm_gaudi/v1/worker/hpu_model_runner.py
index 1ab73fdb5..11c5b9225 100644
--- a/vllm_gaudi/v1/worker/hpu_model_runner.py
+++ b/vllm_gaudi/v1/worker/hpu_model_runner.py
@@ -370,11 +370,11 @@ def patch_llama4_get_attn_scale(model):
         attn = layer.self_attn
         orig = attn._get_attn_scale
 
-        def my_get_attn_scale(self, positions, _orig=orig):
+        def _get_attn_scale_for_hpu(self, positions, _orig=orig):
             positions = positions.flatten()
             return _orig(positions)
 
-        attn._get_attn_scale = types.MethodType(my_get_attn_scale, attn)
+        attn._get_attn_scale = types.MethodType(_get_attn_scale_for_hpu, attn)
 
 
 def apply_model_specific_patches(model):
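
The patch relies on per-instance method rebinding via types.MethodType: each
Llama4Attention instance gets a wrapper that flattens positions before
delegating to the original bound method, so the class itself and any other
loaded models stay untouched. Below is a minimal, self-contained sketch of
that pattern; FakeLlama4Attention and its scale formula are hypothetical
stand-ins for illustration, not the real vLLM implementation.

import types

import torch


class FakeLlama4Attention:
    """Stand-in for Llama4Attention (hypothetical; the real class lives in vLLM)."""

    def _get_attn_scale(self, positions: torch.Tensor) -> torch.Tensor:
        # Like the real method, this assumes 1-D positions; the formula
        # here is made up and only stands in for the actual scaling math.
        assert positions.dim() == 1, "expects flattened positions"
        return torch.log1p(positions.float()).unsqueeze(-1)


attn = FakeLlama4Attention()
orig = attn._get_attn_scale  # bound method: `attn` is already baked in

def _get_attn_scale_for_hpu(self, positions, _orig=orig):
    # Flatten so e.g. a [batch, seq] positions tensor becomes 1-D before
    # delegating to the original method.
    return _orig(positions.flatten())

# Rebind on this instance only; the class and other instances are untouched.
attn._get_attn_scale = types.MethodType(_get_attn_scale_for_hpu, attn)

print(attn._get_attn_scale(torch.arange(6).reshape(2, 3)).shape)  # torch.Size([6, 1])

One detail worth noting in the patch: capturing the original method through
the default argument (_orig=orig) freezes the binding at definition time.
Since the patch defines the wrapper inside a loop over layers, reading orig
from the enclosing scope instead would hit Python's late-binding closure
behavior and make every layer's wrapper call the last layer's method.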