diff --git a/tests/kernels/quant_utils.py b/tests/kernels/quant_utils.py index e0c3947c7..e5128052e 100644 --- a/tests/kernels/quant_utils.py +++ b/tests/kernels/quant_utils.py @@ -9,7 +9,10 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import ( group_broadcast) from vllm.platforms import current_platform -from vllm.utils import round_up + + +def round_up(value: int, multiple: int) -> int: + return ((value + multiple - 1) // multiple) * multiple # Using the default value (240.0) from pytorch will cause accuracy # issue on dynamic quantization models. Here use 224.0 for rocm. diff --git a/tests/kernels/test_fused_quant_activation.py b/tests/kernels/test_fused_quant_activation.py index 006a51158..c218865a6 100644 --- a/tests/kernels/test_fused_quant_activation.py +++ b/tests/kernels/test_fused_quant_activation.py @@ -6,6 +6,7 @@ import vllm._custom_ops as ops from tests.kernels.utils import opcheck +from vllm.config import VllmConfig, set_current_vllm_config from vllm.model_executor.layers.activation import SiluAndMul from vllm.platforms import current_platform @@ -55,18 +56,28 @@ def test_silu_and_mul( torch.cuda.manual_seed(seed) torch.set_default_device(device) - layer = SiluAndMul() + with set_current_vllm_config(VllmConfig()): + layer = SiluAndMul() - # Make inputs - scale = (torch.randn((1), device=device, dtype=torch.float32)) - x = torch.randn(num_tokens, hidden_size, dtype=dtype) + # Make inputs. The quantization scale must be strictly positive: a + # negative scale would flip signs and a near-zero one risks + # divide-by-zero / overflow, so take abs and nudge off zero. + scale = torch.randn((1), device=device, dtype=torch.float32).abs() + 1e-5 + x = torch.randn(num_tokens, hidden_size, dtype=dtype) - ref_out = ref_impl(layer, x, scale) - ops_out = ops_impl(x, scale) + ref_out = ref_impl(layer, x, scale) + ops_out = ops_impl(x, scale) - assert ref_out.dtype == quant_dtype - assert ops_out.dtype == quant_dtype - assert ref_out.shape == ops_out.shape - assert torch.allclose(ref_out.to(dtype=torch.float32), - ops_out.to(dtype=torch.float32)) - opcheck(torch.ops._C.silu_and_mul_quant, (ops_out, x, scale)) + assert ref_out.dtype == quant_dtype + assert ops_out.dtype == quant_dtype + assert ref_out.shape == ops_out.shape + # fp8 is highly discrete: a tiny float rounding difference between the + # fused kernel and the reference can land in an adjacent fp8 bucket, so + # an absolute-only tolerance is flaky. Compare with a relative tolerance + # of 1 ULP for e4m3 (2**-3 = 0.125), which holds at any magnitude, plus + # a small atol to absorb values near zero. + assert torch.allclose(ref_out.to(dtype=torch.float32), + ops_out.to(dtype=torch.float32), + atol=1 / 128, + rtol=0.125) + opcheck(torch.ops._C.silu_and_mul_quant, (ops_out, x, scale)) diff --git a/tests/kernels/utils.py b/tests/kernels/utils.py index 01e1b9f2a..4b03c6e6c 100644 --- a/tests/kernels/utils.py +++ b/tests/kernels/utils.py @@ -15,13 +15,45 @@ from torch._prims_common import TensorLikeType from tests.kernels.quant_utils import native_w8a8_block_matmul -from vllm.attention import AttentionBackend, AttentionMetadata, AttentionType +try: + from vllm.attention import AttentionBackend, AttentionMetadata, AttentionType +except ImportError: + from enum import Enum + + AttentionBackend = Any + AttentionMetadata = Any + + class AttentionType(Enum): + ENCODER_DECODER = "encoder_decoder" + from vllm.model_executor.layers.activation import SiluAndMul from vllm.model_executor.layers.fused_moe.utils import ( moe_kernel_quantize_input) -from vllm.platforms.interface import _Backend -from vllm.utils import (STR_BACKEND_ENV_VAR, STR_FLASH_ATTN_VAL, - STR_XFORMERS_ATTN_VAL, make_tensor_with_pad) +try: + from vllm.platforms.interface import _Backend +except ImportError: + _Backend = Any +try: + from vllm.utils import (STR_BACKEND_ENV_VAR, STR_FLASH_ATTN_VAL, + STR_XFORMERS_ATTN_VAL, make_tensor_with_pad) +except ImportError: + STR_BACKEND_ENV_VAR = "VLLM_ATTENTION_BACKEND" + STR_FLASH_ATTN_VAL = "FLASH_ATTN" + STR_XFORMERS_ATTN_VAL = "XFORMERS" + + def make_tensor_with_pad( + x: list[list[int]], + max_len: int, + pad: int, + dtype: torch.dtype, + device: Union[torch.device, str], + ) -> torch.Tensor: + # Truncate items longer than max_len before padding: otherwise + # max_len - len(item) is negative, [pad] * negative is [], the item is + # left at its original (over-length) size, and torch.tensor() raises on + # the ragged result. + padded = [item[:max_len] + [pad] * (max_len - len(item)) for item in x] + return torch.tensor(padded, dtype=dtype, device=device) # For now, disable "test_aot_dispatch_dynamic" since there are some # bugs related to this test in PyTorch 2.4. diff --git a/vllm_metax/platform.py b/vllm_metax/platform.py index b15f0664a..ea310176c 100644 --- a/vllm_metax/platform.py +++ b/vllm_metax/platform.py @@ -7,12 +7,14 @@ import importlib import math import os +import random from collections.abc import Callable from datetime import timedelta from functools import cache, wraps from pathlib import Path from typing import TYPE_CHECKING, TypeVar +import numpy as np import torch from torch.distributed import PrefixStore, ProcessGroup from torch.distributed.distributed_c10d import is_nccl_available @@ -169,6 +171,14 @@ def set_device(cls, device: torch.device) -> None: def manual_seed_all(cls, seed: int) -> None: torch.cuda.manual_seed_all(seed) + @classmethod + def seed_everything(cls, seed: int | None = None) -> None: + if seed is not None: + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + @classmethod def get_device_capability(cls, device_id: int = 0) -> DeviceCapability | None: raise NotImplementedError