MetaX-MACA · sxvvv · May 28, 2026 · May 31, 2026 · Jun 6, 2026 · Jun 6, 2026
diff --git a/tests/kernels/quant_utils.py b/tests/kernels/quant_utils.py
@@ -9,7 +9,10 @@
 from vllm.model_executor.layers.quantization.utils.quant_utils import (
     group_broadcast)
 from vllm.platforms import current_platform
-from vllm.utils import round_up
+
+
+def round_up(value: int, multiple: int) -> int:
+    return ((value + multiple - 1) // multiple) * multiple
 
 # Using the default value (240.0) from pytorch will cause accuracy
 # issue on dynamic quantization models. Here use 224.0 for rocm.

diff --git a/tests/kernels/test_fused_quant_activation.py b/tests/kernels/test_fused_quant_activation.py
@@ -6,6 +6,7 @@
 
 import vllm._custom_ops as ops
 from tests.kernels.utils import opcheck
+from vllm.config import VllmConfig, set_current_vllm_config
 from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.platforms import current_platform
 
@@ -55,18 +56,21 @@ def test_silu_and_mul(
         torch.cuda.manual_seed(seed)
     torch.set_default_device(device)
 
-    layer = SiluAndMul()
+    with set_current_vllm_config(VllmConfig()):
+        layer = SiluAndMul()
 
-    # Make inputs
-    scale = (torch.randn((1), device=device, dtype=torch.float32))
-    x = torch.randn(num_tokens, hidden_size, dtype=dtype)
+        # Make inputs
+        scale = (torch.randn((1), device=device, dtype=torch.float32))
-        scale = (torch.randn((1), device=device, dtype=torch.float32))
+        scale = torch.randn((1), device=device, dtype=torch.float32).abs() + 1e-5
-        scale = (torch.randn((1), device=device, dtype=torch.float32))
+        scale = torch.randn((1), device=device, dtype=torch.float32).abs() + 1e-5
+        x = torch.randn(num_tokens, hidden_size, dtype=dtype)
 
-    ref_out = ref_impl(layer, x, scale)
-    ops_out = ops_impl(x, scale)
+        ref_out = ref_impl(layer, x, scale)
+        ops_out = ops_impl(x, scale)
 
-    assert ref_out.dtype == quant_dtype
-    assert ops_out.dtype == quant_dtype
-    assert ref_out.shape == ops_out.shape
-    assert torch.allclose(ref_out.to(dtype=torch.float32),
-                          ops_out.to(dtype=torch.float32))
-    opcheck(torch.ops._C.silu_and_mul_quant, (ops_out, x, scale))
+        assert ref_out.dtype == quant_dtype
+        assert ops_out.dtype == quant_dtype
+        assert ref_out.shape == ops_out.shape
+        assert torch.allclose(ref_out.to(dtype=torch.float32),
+                              ops_out.to(dtype=torch.float32),
+                              atol=1 / 128,
+                              rtol=0)
-        assert torch.allclose(ref_out.to(dtype=torch.float32),
-                              ops_out.to(dtype=torch.float32),
-                              atol=1 / 128,
-                              rtol=0)
+        assert torch.allclose(ref_out.to(dtype=torch.float32),
+                              ops_out.to(dtype=torch.float32),
+                              atol=0.125,
+                              rtol=0)
-        assert torch.allclose(ref_out.to(dtype=torch.float32),
-                              ops_out.to(dtype=torch.float32),
-                              atol=1 / 128,
-                              rtol=0)
+        assert torch.allclose(ref_out.to(dtype=torch.float32),
+                              ops_out.to(dtype=torch.float32),
+                              atol=0.125,
+                              rtol=0)
+        opcheck(torch.ops._C.silu_and_mul_quant, (ops_out, x, scale))
diff --git a/tests/kernels/utils.py b/tests/kernels/utils.py
@@ -15,13 +15,41 @@
 from torch._prims_common import TensorLikeType
 
 from tests.kernels.quant_utils import native_w8a8_block_matmul
-from vllm.attention import AttentionBackend, AttentionMetadata, AttentionType
+try:
+    from vllm.attention import AttentionBackend, AttentionMetadata, AttentionType
+except ModuleNotFoundError:
-try:
-    from vllm.attention import AttentionBackend, AttentionMetadata, AttentionType
-except ModuleNotFoundError:
+try:
+    from vllm.attention import AttentionBackend, AttentionMetadata, AttentionType
+except ImportError:
-try:
-    from vllm.attention import AttentionBackend, AttentionMetadata, AttentionType
-except ModuleNotFoundError:
+try:
+    from vllm.attention import AttentionBackend, AttentionMetadata, AttentionType
+except ImportError:
+    from enum import Enum
+
+    AttentionBackend = Any
+    AttentionMetadata = Any
+
+    class AttentionType(Enum):
+        ENCODER_DECODER = "encoder_decoder"
+
 from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.model_executor.layers.fused_moe.utils import (
     moe_kernel_quantize_input)
-from vllm.platforms.interface import _Backend
-from vllm.utils import (STR_BACKEND_ENV_VAR, STR_FLASH_ATTN_VAL,
-                        STR_XFORMERS_ATTN_VAL, make_tensor_with_pad)
+try:
+    from vllm.platforms.interface import _Backend
+except ImportError:
+    _Backend = Any
+try:
+    from vllm.utils import (STR_BACKEND_ENV_VAR, STR_FLASH_ATTN_VAL,
+                            STR_XFORMERS_ATTN_VAL, make_tensor_with_pad)
+except ImportError:
+    STR_BACKEND_ENV_VAR = "VLLM_ATTENTION_BACKEND"
+    STR_FLASH_ATTN_VAL = "FLASH_ATTN"
+    STR_XFORMERS_ATTN_VAL = "XFORMERS"
+
+    def make_tensor_with_pad(
+        x: list[list[int]],
+        max_len: int,
+        pad: int,
+        dtype: torch.dtype,
+        device: Union[torch.device, str],
+    ) -> torch.Tensor:
+        padded = [item + [pad] * (max_len - len(item)) for item in x]
-        padded = [item + [pad] * (max_len - len(item)) for item in x]
+        padded = [item[:max_len] + [pad] * (max_len - len(item)) for item in x]
-        padded = [item + [pad] * (max_len - len(item)) for item in x]
+        padded = [item[:max_len] + [pad] * (max_len - len(item)) for item in x]
+        return torch.tensor(padded, dtype=dtype, device=device)
 
 # For now, disable "test_aot_dispatch_dynamic" since there are some
 # bugs related to this test in PyTorch 2.4.