MetaX-MACA · sxvvv · May 28, 2026 · May 31, 2026 · Jun 6, 2026 · Jun 6, 2026
diff --git a/tests/kernels/quant_utils.py b/tests/kernels/quant_utils.py
@@ -9,7 +9,10 @@
 from vllm.model_executor.layers.quantization.utils.quant_utils import (
     group_broadcast)
 from vllm.platforms import current_platform
-from vllm.utils import round_up
+
+
+def round_up(value: int, multiple: int) -> int:  # expects multiple > 0
+    return ((value + multiple - 1) // multiple) * multiple  # round toward +inf
 
 # Using the default value (240.0) from pytorch will cause accuracy
 # issue on dynamic quantization models. Here use 224.0 for rocm.

diff --git a/tests/kernels/test_fused_quant_activation.py b/tests/kernels/test_fused_quant_activation.py
@@ -6,6 +6,7 @@
 
 import vllm._custom_ops as ops
 from tests.kernels.utils import opcheck
+from vllm.config import VllmConfig, set_current_vllm_config
 from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.platforms import current_platform
 
@@ -55,18 +56,28 @@ def test_silu_and_mul(
         torch.cuda.manual_seed(seed)
     torch.set_default_device(device)
 
-    layer = SiluAndMul()
+    with set_current_vllm_config(VllmConfig()):
+        layer = SiluAndMul()
 
-    # Make inputs
-    scale = (torch.randn((1), device=device, dtype=torch.float32))
-    x = torch.randn(num_tokens, hidden_size, dtype=dtype)
+        # Make inputs. The quantization scale must be strictly positive: a
+        # negative scale would flip signs and a near-zero one risks
+        # divide-by-zero / overflow, so take abs and nudge off zero.
+        scale = torch.randn((1), device=device, dtype=torch.float32).abs() + 1e-5
+        x = torch.randn(num_tokens, hidden_size, dtype=dtype)
 
-    ref_out = ref_impl(layer, x, scale)
-    ops_out = ops_impl(x, scale)
+        ref_out = ref_impl(layer, x, scale)
+        ops_out = ops_impl(x, scale)
 
-    assert ref_out.dtype == quant_dtype
-    assert ops_out.dtype == quant_dtype
-    assert ref_out.shape == ops_out.shape
-    assert torch.allclose(ref_out.to(dtype=torch.float32),
-                          ops_out.to(dtype=torch.float32))
-    opcheck(torch.ops._C.silu_and_mul_quant, (ops_out, x, scale))
+        assert ref_out.dtype == quant_dtype
+        assert ops_out.dtype == quant_dtype
+        assert ref_out.shape == ops_out.shape
+        # fp8 is highly discrete: a tiny float rounding difference between the
+        # fused kernel and the reference can land in an adjacent fp8 bucket, so
+        # an absolute-only tolerance is flaky. Compare with a relative tolerance
+        # of 1 ULP for e4m3 (2**-3 = 0.125), which holds at any magnitude, plus
+        # a small atol to absorb values near zero.
+        assert torch.allclose(ref_out.to(dtype=torch.float32),
+                              ops_out.to(dtype=torch.float32),
+                              atol=1 / 128,
+                              rtol=0.125)
+        opcheck(torch.ops._C.silu_and_mul_quant, (ops_out, x, scale))
diff --git a/tests/kernels/utils.py b/tests/kernels/utils.py
@@ -15,13 +15,45 @@
 from torch._prims_common import TensorLikeType
 
 from tests.kernels.quant_utils import native_w8a8_block_matmul
-from vllm.attention import AttentionBackend, AttentionMetadata, AttentionType
+try:
+    from vllm.attention import AttentionBackend, AttentionMetadata, AttentionType
+except ImportError:
+    from enum import Enum
+
+    AttentionBackend = Any
+    AttentionMetadata = Any
+
+    class AttentionType(Enum):  # stub for when vllm.attention is unavailable
+        ENCODER_DECODER = "encoder_decoder"  # NOTE(review): only member stubbed
+
 from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.model_executor.layers.fused_moe.utils import (
     moe_kernel_quantize_input)
-from vllm.platforms.interface import _Backend
-from vllm.utils import (STR_BACKEND_ENV_VAR, STR_FLASH_ATTN_VAL,
-                        STR_XFORMERS_ATTN_VAL, make_tensor_with_pad)
+try:
+    from vllm.platforms.interface import _Backend
+except ImportError:
+    _Backend = Any
+try:
+    from vllm.utils import (STR_BACKEND_ENV_VAR, STR_FLASH_ATTN_VAL,
+                            STR_XFORMERS_ATTN_VAL, make_tensor_with_pad)
+except ImportError:
+    STR_BACKEND_ENV_VAR = "VLLM_ATTENTION_BACKEND"
+    STR_FLASH_ATTN_VAL = "FLASH_ATTN"
+    STR_XFORMERS_ATTN_VAL = "XFORMERS"
+
+    def make_tensor_with_pad(
+        x: list[list[int]],
+        max_len: int,
+        pad: int,
+        dtype: torch.dtype,
+        device: Union[torch.device, str],
+    ) -> torch.Tensor:
+        # Right-pad each row of x to max_len with `pad`. Over-length rows are
+        # truncated first: otherwise max_len - len(item) is negative, [pad] *
+        # negative is [], the row keeps its over-length size, and
+        # torch.tensor() raises on the ragged result.
+        padded = [item[:max_len] + [pad] * (max_len - len(item)) for item in x]
+        return torch.tensor(padded, dtype=dtype, device=device)
 
 # For now, disable "test_aot_dispatch_dynamic" since there are some
 # bugs related to this test in PyTorch 2.4.

diff --git a/vllm_metax/platform.py b/vllm_metax/platform.py
@@ -7,12 +7,14 @@
 import importlib
 import math
 import os
+import random
 from collections.abc import Callable
 from datetime import timedelta
 from functools import cache, wraps
 from pathlib import Path
 from typing import TYPE_CHECKING, TypeVar
 
+import numpy as np
 import torch
 from torch.distributed import PrefixStore, ProcessGroup
 from torch.distributed.distributed_c10d import is_nccl_available
@@ -169,6 +171,14 @@ def set_device(cls, device: torch.device) -> None:
     def manual_seed_all(cls, seed: int) -> None:
         torch.cuda.manual_seed_all(seed)
 
+    @classmethod
+    def seed_everything(cls, seed: int | None = None) -> None:
+        if seed is not None:  # seed=None is a no-op: RNG state is left alone
+            random.seed(seed)  # Python stdlib RNG
+            np.random.seed(seed)  # NumPy legacy global RNG
+            torch.manual_seed(seed)  # torch default generator
+            torch.cuda.manual_seed_all(seed)  # generators of all CUDA devices
+
     @classmethod
     def get_device_capability(cls, device_id: int = 0) -> DeviceCapability | None:
         raise NotImplementedError