From bbebcc16b30b5b46726afaeb5bcbdccd2ff593d1 Mon Sep 17 00:00:00 2001 From: linoy buchnik Date: Mon, 5 Jan 2026 14:52:07 +0200 Subject: [PATCH 1/7] [GAUDISW-245117] add b2b matmul Signed-off-by: linoy buchnik --- vllm_gaudi/attention/backends/hpu_attn.py | 10 +++++----- vllm_gaudi/extension/utils.py | 4 ++++ 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/vllm_gaudi/attention/backends/hpu_attn.py b/vllm_gaudi/attention/backends/hpu_attn.py index f619be467..5bc592dac 100644 --- a/vllm_gaudi/attention/backends/hpu_attn.py +++ b/vllm_gaudi/attention/backends/hpu_attn.py @@ -13,7 +13,7 @@ import vllm_gaudi.extension.kernels as kernels import vllm_gaudi.extension.ops as ops from vllm_gaudi.extension.runtime import get_config -from vllm_gaudi.extension.utils import (FP8Matmul, Matmul, ModuleFusedSDPA, Softmax, VLLMFP8KVCache, VLLMKVCache) +from vllm_gaudi.extension.utils import (FP8Matmul, Matmul, B2BMatmul, ModuleFusedSDPA, Softmax, VLLMFP8KVCache, VLLMKVCache) from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, AttentionLayer, AttentionMetadata, AttentionType) @@ -226,9 +226,9 @@ def __init__( self.softmax = Softmax() self.matmul_av = Matmul() if not self.enable_fp8_attn \ else FP8Matmul() - self.batch2block_matmul = Matmul() if not self.enable_fp8_attn \ + self.batch2block_matmul = B2BMatmul() if not self.enable_fp8_attn \ else FP8Matmul() - self.block2batch_matmul = Matmul() if not self.enable_fp8_attn \ + self.block2batch_matmul = B2BMatmul() if not self.enable_fp8_attn \ else FP8Matmul() self.latent_cache_k = VLLMKVCache() if not self.enable_fp8_attn \ else VLLMFP8KVCache() @@ -445,9 +445,9 @@ def __init__( self.softmax = Softmax() self.matmul_av = Matmul() if not self.enable_fp8_attn \ else FP8Matmul() - self.batch2block_matmul = Matmul() if not self.enable_fp8_attn \ + self.batch2block_matmul = B2BMatmul() if not self.enable_fp8_attn \ else FP8Matmul() - self.block2batch_matmul = Matmul() if not self.enable_fp8_attn \ + 
self.block2batch_matmul = B2BMatmul() if not self.enable_fp8_attn \ else FP8Matmul() self.k_cache = VLLMKVCache() if not self.enable_fp8_attn \ else VLLMFP8KVCache() diff --git a/vllm_gaudi/extension/utils.py b/vllm_gaudi/extension/utils.py index bcdd05b21..2068f5149 100644 --- a/vllm_gaudi/extension/utils.py +++ b/vllm_gaudi/extension/utils.py @@ -29,6 +29,10 @@ def __init__(self): def forward(self, x, y, **kwargs): return torch.matmul(x, y, **kwargs) +class B2BMatmul(Matmul): + def __init__(self): + super().__init__() + class Softmax(torch.nn.Module): From 4626d66d799936d79b9138daded63e6c14f2e385 Mon Sep 17 00:00:00 2001 From: Linoy Buchnik Date: Tue, 6 Jan 2026 09:28:13 +0200 Subject: [PATCH 2/7] Update vllm_gaudi/extension/utils.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> Signed-off-by: Linoy Buchnik --- vllm_gaudi/extension/utils.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/vllm_gaudi/extension/utils.py b/vllm_gaudi/extension/utils.py index 2068f5149..c06630e90 100644 --- a/vllm_gaudi/extension/utils.py +++ b/vllm_gaudi/extension/utils.py @@ -30,6 +30,13 @@ def forward(self, x, y, **kwargs): return torch.matmul(x, y, **kwargs) class B2BMatmul(Matmul): + """Specialized alias for back-to-back matmul operations. + + This class is intentionally kept functionally identical to ``Matmul``. + It exists to provide semantic distinction in the codebase (e.g., for + patterns that specifically require back-to-back matmul) and to allow + future customization without changing call sites. 
+ """ def __init__(self): super().__init__() From 8a8ea09fb855c5839f67ce0e44ad50984987099b Mon Sep 17 00:00:00 2001 From: Linoy Buchnik Date: Wed, 7 Jan 2026 11:22:34 +0200 Subject: [PATCH 3/7] Update utils.py Signed-off-by: Linoy Buchnik --- vllm_gaudi/extension/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm_gaudi/extension/utils.py b/vllm_gaudi/extension/utils.py index c06630e90..b7e10cbba 100644 --- a/vllm_gaudi/extension/utils.py +++ b/vllm_gaudi/extension/utils.py @@ -34,7 +34,7 @@ class B2BMatmul(Matmul): This class is intentionally kept functionally identical to ``Matmul``. It exists to provide semantic distinction in the codebase (e.g., for - patterns that specifically require back-to-back matmul) and to allow + patterns that specifically require batch2block and block2batch matmul) and to allow future customization without changing call sites. """ def __init__(self): From fc8aab7065628dfbcda410aa3dbd317e92a3c25a Mon Sep 17 00:00:00 2001 From: Linoy Buchnik Date: Wed, 7 Jan 2026 11:26:00 +0200 Subject: [PATCH 4/7] Update hpu_attn.py Signed-off-by: Linoy Buchnik --- vllm_gaudi/attention/backends/hpu_attn.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm_gaudi/attention/backends/hpu_attn.py b/vllm_gaudi/attention/backends/hpu_attn.py index 5bc592dac..f20de5195 100644 --- a/vllm_gaudi/attention/backends/hpu_attn.py +++ b/vllm_gaudi/attention/backends/hpu_attn.py @@ -13,7 +13,8 @@ import vllm_gaudi.extension.kernels as kernels import vllm_gaudi.extension.ops as ops from vllm_gaudi.extension.runtime import get_config -from vllm_gaudi.extension.utils import (FP8Matmul, Matmul, B2BMatmul, ModuleFusedSDPA, Softmax, VLLMFP8KVCache, VLLMKVCache) +from vllm_gaudi.extension.utils import (FP8Matmul, Matmul, B2BMatmul, ModuleFusedSDPA, Softmax, VLLMFP8KVCache, + VLLMKVCache) from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, AttentionLayer, AttentionMetadata, AttentionType) From 
c64b0881754cf3ee0f19870959cf8884f78c896c Mon Sep 17 00:00:00 2001 From: Linoy Buchnik Date: Wed, 7 Jan 2026 11:29:29 +0200 Subject: [PATCH 5/7] Update utils.py Signed-off-by: Linoy Buchnik --- vllm_gaudi/extension/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm_gaudi/extension/utils.py b/vllm_gaudi/extension/utils.py index b7e10cbba..367c08c48 100644 --- a/vllm_gaudi/extension/utils.py +++ b/vllm_gaudi/extension/utils.py @@ -30,7 +30,7 @@ def forward(self, x, y, **kwargs): return torch.matmul(x, y, **kwargs) class B2BMatmul(Matmul): - """Specialized alias for back-to-back matmul operations. + """Specialized alias for batch2block and block2batch matmul operations. This class is intentionally kept functionally identical to ``Matmul``. It exists to provide semantic distinction in the codebase (e.g., for From 48bb32dc348ad5582bceb03697d629f2f32096bd Mon Sep 17 00:00:00 2001 From: Linoy Buchnik Date: Wed, 14 Jan 2026 08:50:25 +0200 Subject: [PATCH 6/7] Update utils.py Signed-off-by: Linoy Buchnik --- vllm_gaudi/extension/utils.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/vllm_gaudi/extension/utils.py b/vllm_gaudi/extension/utils.py index 367c08c48..0fd06bc31 100644 --- a/vllm_gaudi/extension/utils.py +++ b/vllm_gaudi/extension/utils.py @@ -30,12 +30,13 @@ def forward(self, x, y, **kwargs): return torch.matmul(x, y, **kwargs) class B2BMatmul(Matmul): - """Specialized alias for batch2block and block2batch matmul operations. - This class is intentionally kept functionally identical to ``Matmul``. - It exists to provide semantic distinction in the codebase (e.g., for - patterns that specifically require batch2block and block2batch matmul) and to allow - future customization without changing call sites. + """Specialized alias for batch2block and block2batch matmul operations. + + This class remains functionally identical to ``Matmul`` but is used to + semantically mark B2B-related matmuls. 
This enables the system to apply the + fix that uses the B2B output measurements as the input measurements during + calibration, avoiding corrupted scales from the KV-cache. """ def __init__(self): super().__init__() From 9806ecb4b451f549e35adc9ab88bdd302d51fc0b Mon Sep 17 00:00:00 2001 From: Linoy Buchnik Date: Thu, 15 Jan 2026 14:39:56 +0200 Subject: [PATCH 7/7] Update utils.py Signed-off-by: Linoy Buchnik --- vllm_gaudi/extension/utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm_gaudi/extension/utils.py b/vllm_gaudi/extension/utils.py index 0fd06bc31..3e9428870 100644 --- a/vllm_gaudi/extension/utils.py +++ b/vllm_gaudi/extension/utils.py @@ -29,8 +29,8 @@ def __init__(self): def forward(self, x, y, **kwargs): return torch.matmul(x, y, **kwargs) -class B2BMatmul(Matmul): +class B2BMatmul(Matmul): """Specialized alias for batch2block and block2batch matmul operations. This class remains functionally identical to ``Matmul`` but is used to @@ -38,6 +38,7 @@ class B2BMatmul(Matmul): fix that uses the B2B output measurements as the input measurements during calibration, avoiding corrupted scales from the KV-cache. """ + def __init__(self): super().__init__()