Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion tests/kernels/quant_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,10 @@
from vllm.model_executor.layers.quantization.utils.quant_utils import (
group_broadcast)
from vllm.platforms import current_platform
from vllm.utils import round_up


def round_up(value: int, multiple: int) -> int:
    """Round ``value`` up to the nearest multiple of ``multiple``.

    Args:
        value: Integer to round.
        multiple: Step size to round to; must be positive.

    Returns:
        The smallest multiple of ``multiple`` that is ``>= value``.

    Raises:
        ValueError: If ``multiple`` is zero or negative. Without this check a
            negative step silently produces a meaningless result and a zero
            step fails with an opaque ``ZeroDivisionError``.
    """
    if multiple <= 0:
        raise ValueError(f"multiple must be positive, got {multiple}")
    # Adding ``multiple - 1`` before the floor division turns it into a
    # ceiling division while staying in exact integer arithmetic.
    return ((value + multiple - 1) // multiple) * multiple

# Using the default value (240.0) from PyTorch causes accuracy
# issues on dynamically quantized models, so use 224.0 on ROCm.
Expand Down
35 changes: 23 additions & 12 deletions tests/kernels/test_fused_quant_activation.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

import vllm._custom_ops as ops
from tests.kernels.utils import opcheck
from vllm.config import VllmConfig, set_current_vllm_config
from vllm.model_executor.layers.activation import SiluAndMul
from vllm.platforms import current_platform

Expand Down Expand Up @@ -55,18 +56,28 @@ def test_silu_and_mul(
torch.cuda.manual_seed(seed)
torch.set_default_device(device)

layer = SiluAndMul()
with set_current_vllm_config(VllmConfig()):
layer = SiluAndMul()

# Make inputs
scale = (torch.randn((1), device=device, dtype=torch.float32))
x = torch.randn(num_tokens, hidden_size, dtype=dtype)
# Make inputs. The quantization scale must be strictly positive: a
# negative scale would flip signs and a near-zero one risks
# divide-by-zero / overflow, so take abs and nudge off zero.
scale = torch.randn((1), device=device, dtype=torch.float32).abs() + 1e-5
x = torch.randn(num_tokens, hidden_size, dtype=dtype)

ref_out = ref_impl(layer, x, scale)
ops_out = ops_impl(x, scale)
ref_out = ref_impl(layer, x, scale)
ops_out = ops_impl(x, scale)

assert ref_out.dtype == quant_dtype
assert ops_out.dtype == quant_dtype
assert ref_out.shape == ops_out.shape
assert torch.allclose(ref_out.to(dtype=torch.float32),
ops_out.to(dtype=torch.float32))
opcheck(torch.ops._C.silu_and_mul_quant, (ops_out, x, scale))
assert ref_out.dtype == quant_dtype
assert ops_out.dtype == quant_dtype
assert ref_out.shape == ops_out.shape
# fp8 is highly discrete: a tiny float rounding difference between the
# fused kernel and the reference can land in an adjacent fp8 bucket, so
# an absolute-only tolerance is flaky. Compare with a relative tolerance
# of 1 ULP for e4m3 (2**-3 = 0.125), which holds at any magnitude, plus
# a small atol to absorb values near zero.
assert torch.allclose(ref_out.to(dtype=torch.float32),
ops_out.to(dtype=torch.float32),
atol=1 / 128,
rtol=0.125)
opcheck(torch.ops._C.silu_and_mul_quant, (ops_out, x, scale))
40 changes: 36 additions & 4 deletions tests/kernels/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,13 +15,45 @@
from torch._prims_common import TensorLikeType

from tests.kernels.quant_utils import native_w8a8_block_matmul
from vllm.attention import AttentionBackend, AttentionMetadata, AttentionType
try:
    from vllm.attention import AttentionBackend, AttentionMetadata, AttentionType
except ImportError:
    # This vLLM build does not export the attention types from
    # `vllm.attention`; define local stand-ins so the module still imports.
    from enum import Enum

    # Permissive placeholders for the two backend classes.
    AttentionBackend = Any
    AttentionMetadata = Any

    class AttentionType(Enum):
        """Local stand-in for vLLM's ``AttentionType``.

        All four upstream attention kinds are listed, not just
        ``ENCODER_DECODER``, so that code referring to any of them keeps
        working on the fallback path instead of raising ``AttributeError``.
        """

        DECODER = "decoder"
        ENCODER = "encoder"
        ENCODER_ONLY = "encoder_only"
        ENCODER_DECODER = "encoder_decoder"

from vllm.model_executor.layers.activation import SiluAndMul
from vllm.model_executor.layers.fused_moe.utils import (
moe_kernel_quantize_input)
from vllm.platforms.interface import _Backend
from vllm.utils import (STR_BACKEND_ENV_VAR, STR_FLASH_ATTN_VAL,
STR_XFORMERS_ATTN_VAL, make_tensor_with_pad)
# Import `_Backend` from vLLM when this build provides it; otherwise bind the
# name to `Any` so that importing this module does not fail.
try:
    from vllm.platforms.interface import _Backend
except ImportError:
    # NOTE(review): presumably `_Backend` is only needed as an annotation on
    # the fallback path -- confirm no code reads enum members from it.
    _Backend = Any
try:
    from vllm.utils import (STR_BACKEND_ENV_VAR, STR_FLASH_ATTN_VAL,
                            STR_XFORMERS_ATTN_VAL, make_tensor_with_pad)
except ImportError:
    # This vLLM build no longer exports these helpers from `vllm.utils`;
    # provide local equivalents so the kernel tests can still run.
    STR_BACKEND_ENV_VAR = "VLLM_ATTENTION_BACKEND"
    STR_FLASH_ATTN_VAL = "FLASH_ATTN"
    STR_XFORMERS_ATTN_VAL = "XFORMERS"

    def make_tensor_with_pad(
        x: list[list[int]],
        max_len: int,
        pad: int,
        dtype: torch.dtype,
        device: Union[torch.device, str],
    ) -> torch.Tensor:
        """Pack variable-length rows into a ``(len(x), max_len)`` tensor.

        Rows shorter than ``max_len`` are right-padded with ``pad``; rows
        longer than ``max_len`` are truncated to their first ``max_len``
        elements.

        Args:
            x: Sequence of integer rows (lists or tuples).
            max_len: Width of the resulting tensor; must be non-negative.
            pad: Value used to fill the missing trailing positions.
            dtype: Element type of the returned tensor.
            device: Device on which the tensor is created.

        Returns:
            A 2-D tensor of shape ``(len(x), max_len)``, including when ``x``
            is empty or ``max_len`` is zero.

        Raises:
            ValueError: If ``max_len`` is negative.
        """
        if max_len < 0:
            # A negative width would make the slice below drop trailing
            # elements instead of truncating to a width.
            raise ValueError(f"max_len must be non-negative, got {max_len}")
        # Truncate before padding: for an over-length row the pad count is
        # negative, `[pad] * negative` is empty, and the row would otherwise
        # keep its original length and make the result ragged. `list(...)`
        # lets tuple rows be concatenated with the padding list.
        rows = [
            list(item[:max_len]) + [pad] * (max_len - len(item))
            for item in x
        ]
        # torch.tensor([]) is 1-D, so pin the 2-D shape explicitly; this is a
        # no-op for ordinary input.
        packed = torch.tensor(rows, dtype=dtype, device=device)
        return packed.reshape(len(x), max_len)

# For now, disable "test_aot_dispatch_dynamic" since there are some
# bugs related to this test in PyTorch 2.4.
Expand Down
10 changes: 10 additions & 0 deletions vllm_metax/platform.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,14 @@
import importlib
import math
import os
import random
from collections.abc import Callable
from datetime import timedelta
from functools import cache, wraps
from pathlib import Path
from typing import TYPE_CHECKING, TypeVar

import numpy as np
import torch
from torch.distributed import PrefixStore, ProcessGroup
from torch.distributed.distributed_c10d import is_nccl_available
Expand Down Expand Up @@ -169,6 +171,14 @@ def set_device(cls, device: torch.device) -> None:
def manual_seed_all(cls, seed: int) -> None:
    """Forward ``seed`` to ``torch.cuda.manual_seed_all``."""
    torch.cuda.manual_seed_all(seed)

@classmethod
def seed_everything(cls, seed: int | None = None) -> None:
if seed is not None:
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

@classmethod
def get_device_capability(cls, device_id: int = 0) -> DeviceCapability | None:
    """Return the capability of device ``device_id``.

    This implementation is a stub and unconditionally raises
    ``NotImplementedError``.
    NOTE(review): presumably concrete platform subclasses override it --
    confirm against the rest of the file.
    """
    raise NotImplementedError
Expand Down