From bbebcc16b30b5b46726afaeb5bcbdccd2ff593d1 Mon Sep 17 00:00:00 2001 From: linoy buchnik Date: Mon, 5 Jan 2026 14:52:07 +0200 Subject: [PATCH 1/7] [GAUDISW-245117] add b2b matmul Signed-off-by: linoy buchnik --- vllm_gaudi/attention/backends/hpu_attn.py | 10 +++++----- vllm_gaudi/extension/utils.py | 4 ++++ 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/vllm_gaudi/attention/backends/hpu_attn.py b/vllm_gaudi/attention/backends/hpu_attn.py index f619be467..5bc592dac 100644 --- a/vllm_gaudi/attention/backends/hpu_attn.py +++ b/vllm_gaudi/attention/backends/hpu_attn.py @@ -13,7 +13,7 @@ import vllm_gaudi.extension.kernels as kernels import vllm_gaudi.extension.ops as ops from vllm_gaudi.extension.runtime import get_config -from vllm_gaudi.extension.utils import (FP8Matmul, Matmul, ModuleFusedSDPA, Softmax, VLLMFP8KVCache, VLLMKVCache) +from vllm_gaudi.extension.utils import (FP8Matmul, Matmul, B2BMatmul, ModuleFusedSDPA, Softmax, VLLMFP8KVCache, VLLMKVCache) from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, AttentionLayer, AttentionMetadata, AttentionType) @@ -226,9 +226,9 @@ def __init__( self.softmax = Softmax() self.matmul_av = Matmul() if not self.enable_fp8_attn \ else FP8Matmul() - self.batch2block_matmul = Matmul() if not self.enable_fp8_attn \ + self.batch2block_matmul = B2BMatmul() if not self.enable_fp8_attn \ else FP8Matmul() - self.block2batch_matmul = Matmul() if not self.enable_fp8_attn \ + self.block2batch_matmul = B2BMatmul() if not self.enable_fp8_attn \ else FP8Matmul() self.latent_cache_k = VLLMKVCache() if not self.enable_fp8_attn \ else VLLMFP8KVCache() @@ -445,9 +445,9 @@ def __init__( self.softmax = Softmax() self.matmul_av = Matmul() if not self.enable_fp8_attn \ else FP8Matmul() - self.batch2block_matmul = Matmul() if not self.enable_fp8_attn \ + self.batch2block_matmul = B2BMatmul() if not self.enable_fp8_attn \ else FP8Matmul() - self.block2batch_matmul = Matmul() if not self.enable_fp8_attn \ + 
self.block2batch_matmul = B2BMatmul() if not self.enable_fp8_attn \ else FP8Matmul() self.k_cache = VLLMKVCache() if not self.enable_fp8_attn \ else VLLMFP8KVCache() diff --git a/vllm_gaudi/extension/utils.py b/vllm_gaudi/extension/utils.py index bcdd05b21..2068f5149 100644 --- a/vllm_gaudi/extension/utils.py +++ b/vllm_gaudi/extension/utils.py @@ -29,6 +29,10 @@ def __init__(self): def forward(self, x, y, **kwargs): return torch.matmul(x, y, **kwargs) +class B2BMatmul(Matmul): + def __init__(self): + super().__init__() + class Softmax(torch.nn.Module): From 4626d66d799936d79b9138daded63e6c14f2e385 Mon Sep 17 00:00:00 2001 From: Linoy Buchnik Date: Tue, 6 Jan 2026 09:28:13 +0200 Subject: [PATCH 2/7] Update vllm_gaudi/extension/utils.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> Signed-off-by: Linoy Buchnik --- vllm_gaudi/extension/utils.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/vllm_gaudi/extension/utils.py b/vllm_gaudi/extension/utils.py index 2068f5149..c06630e90 100644 --- a/vllm_gaudi/extension/utils.py +++ b/vllm_gaudi/extension/utils.py @@ -30,6 +30,13 @@ def forward(self, x, y, **kwargs): return torch.matmul(x, y, **kwargs) class B2BMatmul(Matmul): + """Specialized alias for back-to-back matmul operations. + + This class is intentionally kept functionally identical to ``Matmul``. + It exists to provide semantic distinction in the codebase (e.g., for + patterns that specifically require back-to-back matmul) and to allow + future customization without changing call sites. 
+ """ def __init__(self): super().__init__() From 8a8ea09fb855c5839f67ce0e44ad50984987099b Mon Sep 17 00:00:00 2001 From: Linoy Buchnik Date: Wed, 7 Jan 2026 11:22:34 +0200 Subject: [PATCH 3/7] Update utils.py Signed-off-by: Linoy Buchnik --- vllm_gaudi/extension/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm_gaudi/extension/utils.py b/vllm_gaudi/extension/utils.py index c06630e90..b7e10cbba 100644 --- a/vllm_gaudi/extension/utils.py +++ b/vllm_gaudi/extension/utils.py @@ -34,7 +34,7 @@ class B2BMatmul(Matmul): This class is intentionally kept functionally identical to ``Matmul``. It exists to provide semantic distinction in the codebase (e.g., for - patterns that specifically require back-to-back matmul) and to allow + patterns that specifically require batch2block and block2batch matmul) and to allow future customization without changing call sites. """ def __init__(self): From fc8aab7065628dfbcda410aa3dbd317e92a3c25a Mon Sep 17 00:00:00 2001 From: Linoy Buchnik Date: Wed, 7 Jan 2026 11:26:00 +0200 Subject: [PATCH 4/7] Update hpu_attn.py Signed-off-by: Linoy Buchnik --- vllm_gaudi/attention/backends/hpu_attn.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm_gaudi/attention/backends/hpu_attn.py b/vllm_gaudi/attention/backends/hpu_attn.py index 5bc592dac..f20de5195 100644 --- a/vllm_gaudi/attention/backends/hpu_attn.py +++ b/vllm_gaudi/attention/backends/hpu_attn.py @@ -13,7 +13,8 @@ import vllm_gaudi.extension.kernels as kernels import vllm_gaudi.extension.ops as ops from vllm_gaudi.extension.runtime import get_config -from vllm_gaudi.extension.utils import (FP8Matmul, Matmul, B2BMatmul, ModuleFusedSDPA, Softmax, VLLMFP8KVCache, VLLMKVCache) +from vllm_gaudi.extension.utils import (FP8Matmul, Matmul, B2BMatmul, ModuleFusedSDPA, Softmax, VLLMFP8KVCache, + VLLMKVCache) from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, AttentionLayer, AttentionMetadata, AttentionType) From 
c64b0881754cf3ee0f19870959cf8884f78c896c Mon Sep 17 00:00:00 2001 From: Linoy Buchnik Date: Wed, 7 Jan 2026 11:29:29 +0200 Subject: [PATCH 5/7] Update utils.py Signed-off-by: Linoy Buchnik --- vllm_gaudi/extension/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm_gaudi/extension/utils.py b/vllm_gaudi/extension/utils.py index b7e10cbba..367c08c48 100644 --- a/vllm_gaudi/extension/utils.py +++ b/vllm_gaudi/extension/utils.py @@ -30,7 +30,7 @@ def forward(self, x, y, **kwargs): return torch.matmul(x, y, **kwargs) class B2BMatmul(Matmul): - """Specialized alias for back-to-back matmul operations. + """Specialized alias for batch2block and block2batch matmul operations. This class is intentionally kept functionally identical to ``Matmul``. It exists to provide semantic distinction in the codebase (e.g., for From 48bb32dc348ad5582bceb03697d629f2f32096bd Mon Sep 17 00:00:00 2001 From: Linoy Buchnik Date: Wed, 14 Jan 2026 08:50:25 +0200 Subject: [PATCH 6/7] Update utils.py Signed-off-by: Linoy Buchnik --- vllm_gaudi/extension/utils.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/vllm_gaudi/extension/utils.py b/vllm_gaudi/extension/utils.py index 367c08c48..0fd06bc31 100644 --- a/vllm_gaudi/extension/utils.py +++ b/vllm_gaudi/extension/utils.py @@ -30,12 +30,13 @@ def forward(self, x, y, **kwargs): return torch.matmul(x, y, **kwargs) class B2BMatmul(Matmul): - """Specialized alias for batch2block and block2batch matmul operations. - This class is intentionally kept functionally identical to ``Matmul``. - It exists to provide semantic distinction in the codebase (e.g., for - patterns that specifically require batch2block and block2batch matmul) and to allow - future customization without changing call sites. + """Specialized alias for batch2block and block2batch matmul operations. + + This class remains functionally identical to ``Matmul`` but is used to + semantically mark B2B-related matmuls. 
This enables the system to apply the + fix that uses the B2B output measurements as the input measurements during + calibration, avoiding corrupted scales from the KV-cache. """ def __init__(self): super().__init__() From 9806ecb4b451f549e35adc9ab88bdd302d51fc0b Mon Sep 17 00:00:00 2001 From: Linoy Buchnik Date: Thu, 15 Jan 2026 14:39:56 +0200 Subject: [PATCH 7/7] Update utils.py Signed-off-by: Linoy Buchnik --- vllm_gaudi/extension/utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm_gaudi/extension/utils.py b/vllm_gaudi/extension/utils.py index 0fd06bc31..3e9428870 100644 --- a/vllm_gaudi/extension/utils.py +++ b/vllm_gaudi/extension/utils.py @@ -29,8 +29,8 @@ def __init__(self): def forward(self, x, y, **kwargs): return torch.matmul(x, y, **kwargs) -class B2BMatmul(Matmul): +class B2BMatmul(Matmul): """Specialized alias for batch2block and block2batch matmul operations. This class remains functionally identical to ``Matmul`` but is used to @@ -38,6 +38,7 @@ class B2BMatmul(Matmul): fix that uses the B2B output measurements as the input measurements during calibration, avoiding corrupted scales from the KV-cache. """ + def __init__(self): super().__init__()