11 changes: 6 additions & 5 deletions vllm_gaudi/attention/backends/hpu_attn.py
@@ -13,7 +13,8 @@
 import vllm_gaudi.extension.kernels as kernels
 import vllm_gaudi.extension.ops as ops
 from vllm_gaudi.extension.runtime import get_config
-from vllm_gaudi.extension.utils import (FP8Matmul, Matmul, ModuleFusedSDPA, Softmax, VLLMFP8KVCache, VLLMKVCache)
+from vllm_gaudi.extension.utils import (FP8Matmul, Matmul, B2BMatmul, ModuleFusedSDPA, Softmax, VLLMFP8KVCache,
+                                        VLLMKVCache)
 
 from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, AttentionLayer, AttentionMetadata,
                                               AttentionType)
@@ -226,9 +227,9 @@ def __init__(
         self.softmax = Softmax()
         self.matmul_av = Matmul() if not self.enable_fp8_attn \
             else FP8Matmul()
-        self.batch2block_matmul = Matmul() if not self.enable_fp8_attn \
+        self.batch2block_matmul = B2BMatmul() if not self.enable_fp8_attn \
             else FP8Matmul()
-        self.block2batch_matmul = Matmul() if not self.enable_fp8_attn \
+        self.block2batch_matmul = B2BMatmul() if not self.enable_fp8_attn \
             else FP8Matmul()
         self.latent_cache_k = VLLMKVCache() if not self.enable_fp8_attn \
             else VLLMFP8KVCache()
@@ -445,9 +446,9 @@ def __init__(
         self.softmax = Softmax()
         self.matmul_av = Matmul() if not self.enable_fp8_attn \
             else FP8Matmul()
-        self.batch2block_matmul = Matmul() if not self.enable_fp8_attn \
+        self.batch2block_matmul = B2BMatmul() if not self.enable_fp8_attn \
             else FP8Matmul()
-        self.block2batch_matmul = Matmul() if not self.enable_fp8_attn \
+        self.block2batch_matmul = B2BMatmul() if not self.enable_fp8_attn \
             else FP8Matmul()
         self.k_cache = VLLMKVCache() if not self.enable_fp8_attn \
             else VLLMFP8KVCache()
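Background note (an illustrative sketch, not code from this PR): per the review discussion below, batch2block_matmul and block2batch_matmul multiply activations by a second operand that is essentially a 0/1 block-to-batch mask mapping, scattering per-sequence data into cache blocks and gathering per-block results back per sequence, so part of the input is simply discarded. The snippet below assumes a one-hot mapping matrix; the names block_mapping and batch_data and the shapes are hypothetical.

# Illustrative sketch only; tensor names and shapes are hypothetical.
# It shows the kind of matmul performed by batch2block_matmul /
# block2batch_matmul, where the second operand is a one-hot mapping.
import torch

batch_size, num_blocks, hidden = 2, 5, 4

# One-hot mapping: each cache block is owned by at most one batch element.
block_mapping = torch.zeros(num_blocks, batch_size)
block_mapping[0, 0] = 1.0
block_mapping[1, 0] = 1.0
block_mapping[2, 1] = 1.0  # blocks 3 and 4 stay unassigned

batch_data = torch.randn(batch_size, hidden)

# batch2block-style: scatter per-batch tensors out to their blocks.
block_data = torch.matmul(block_mapping, batch_data)    # (num_blocks, hidden)

# block2batch-style: gather per-block partial results back per batch element.
gathered = torch.matmul(block_mapping.t(), block_data)  # (batch_size, hidden)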
11 changes: 11 additions & 0 deletions vllm_gaudi/extension/utils.py
@@ -29,6 +29,17 @@ def __init__(self):
     def forward(self, x, y, **kwargs):
         return torch.matmul(x, y, **kwargs)
 
+class B2BMatmul(Matmul):
+    """Specialized alias for batch2block and block2batch matmul operations.
+
+    This class is intentionally kept functionally identical to ``Matmul``.
+    It exists to provide semantic distinction in the codebase (e.g., for
+    patterns that specifically require batch2block and block2batch matmul) and to allow
+    future customization without changing call sites.
+    """
+    def __init__(self):
+        super().__init__()
+
 
 class Softmax(torch.nn.Module):
 

@dudilester (Contributor) commented on Jan 6, 2026, on the docstring above:
Maybe edit the comment to be more specific: change "back-to-back" to batch2block/block2batch and explain the reasoning for it, namely that it is used by INC to adjust the scale to the values needed for the input tensor, since some of them are discarded by the 2nd input, which is essentially a mask mapping.
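Following up on the review comment above (a hypothetical illustration, not INC's actual API): keeping B2BMatmul as a separate class gives quantization tooling a distinct patch target, so scale measurement for these matmuls can account for the fact that the mask-mapping operand discards part of the input. A minimal type-keyed policy lookup, with invented policy names and an invented pick_policy helper, might look like this.

# Hypothetical sketch: a module-type-keyed registry is one way a
# quantization tool could treat B2BMatmul differently from plain Matmul.
# Policy names and pick_policy are made up for illustration.
import torch


class Matmul(torch.nn.Module):

    def forward(self, x, y, **kwargs):
        return torch.matmul(x, y, **kwargs)


class B2BMatmul(Matmul):
    """Same math as Matmul; exists only as a distinct patch target."""


SCALE_POLICY = {
    Matmul: "max_abs_of_both_inputs",     # generic measurement
    B2BMatmul: "max_abs_of_first_input",  # mapping operand is just 0/1
}


def pick_policy(module: torch.nn.Module) -> str:
    # Exact-type lookup so the subclass resolves to its own policy entry.
    return SCALE_POLICY[type(module)]


print(pick_policy(B2BMatmul()))  # -> max_abs_of_first_input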