Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion tests/kernels/quant_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,10 @@
from vllm.model_executor.layers.quantization.utils.quant_utils import (
group_broadcast)
from vllm.platforms import current_platform
from vllm.utils import round_up


def round_up(value: int, multiple: int) -> int:
    """Round ``value`` up to a multiple of ``multiple``.

    For a positive ``multiple`` the result is the smallest multiple of
    ``multiple`` that is greater than or equal to ``value``; a ``value``
    that is already a multiple is returned unchanged.

    Args:
        value: The integer to round up.
        multiple: The step to round to.

    Returns:
        ``value`` rounded up to a multiple of ``multiple``.

    Raises:
        ZeroDivisionError: If ``multiple`` is 0.
    """
    # Adding ``multiple - 1`` before the floor division turns it into a
    # ceiling division without going through floating point.
    return ((value + multiple - 1) // multiple) * multiple

# Using the default value (240.0) from pytorch will cause accuracy
# issue on dynamic quantization models. Here use 224.0 for rocm.
Expand Down
28 changes: 16 additions & 12 deletions tests/kernels/test_fused_quant_activation.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

import vllm._custom_ops as ops
from tests.kernels.utils import opcheck
from vllm.config import VllmConfig, set_current_vllm_config
from vllm.model_executor.layers.activation import SiluAndMul
from vllm.platforms import current_platform

Expand Down Expand Up @@ -55,18 +56,21 @@ def test_silu_and_mul(
torch.cuda.manual_seed(seed)
torch.set_default_device(device)

layer = SiluAndMul()
with set_current_vllm_config(VllmConfig()):
layer = SiluAndMul()

# Make inputs
scale = (torch.randn((1), device=device, dtype=torch.float32))
x = torch.randn(num_tokens, hidden_size, dtype=dtype)
# Make inputs
scale = (torch.randn((1), device=device, dtype=torch.float32))

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

The quantization scale is generated using torch.randn, which can produce negative or zero values. Quantization scales must be strictly positive. A negative scale will incorrectly flip the signs of the quantized values, and a scale close to zero can cause division-by-zero or overflow issues. Consider using the absolute value of the random tensor and adding a small epsilon to ensure a valid, positive scale.

Suggested change
scale = (torch.randn((1), device=device, dtype=torch.float32))
scale = torch.randn((1), device=device, dtype=torch.float32).abs() + 1e-5

x = torch.randn(num_tokens, hidden_size, dtype=dtype)

ref_out = ref_impl(layer, x, scale)
ops_out = ops_impl(x, scale)
ref_out = ref_impl(layer, x, scale)
ops_out = ops_impl(x, scale)

assert ref_out.dtype == quant_dtype
assert ops_out.dtype == quant_dtype
assert ref_out.shape == ops_out.shape
assert torch.allclose(ref_out.to(dtype=torch.float32),
ops_out.to(dtype=torch.float32))
opcheck(torch.ops._C.silu_and_mul_quant, (ops_out, x, scale))
assert ref_out.dtype == quant_dtype
assert ops_out.dtype == quant_dtype
assert ref_out.shape == ops_out.shape
assert torch.allclose(ref_out.to(dtype=torch.float32),
ops_out.to(dtype=torch.float32),
atol=1 / 128,
rtol=0)

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

Using a fixed atol=1 / 128 with rtol=0 for comparing FP8 quantized outputs can lead to flaky tests. Since FP8 is a highly discrete format, a tiny rounding difference in the intermediate float calculations between the fused kernel and the reference implementation can cause the two results to round to different FP8 buckets. The difference between adjacent FP8 values (1 ULP) for values in [1.0, 2.0) is 0.125 (for E4M3), which is much larger than 1/128 (0.0078125). If even a single element of that magnitude rounds differently, the test will fail. Consider using a larger tolerance or allowing a small percentage of mismatched elements to prevent flaky CI runs.

Suggested change
assert torch.allclose(ref_out.to(dtype=torch.float32),
ops_out.to(dtype=torch.float32),
atol=1 / 128,
rtol=0)
assert torch.allclose(ref_out.to(dtype=torch.float32),
ops_out.to(dtype=torch.float32),
atol=0.125,
rtol=0)

opcheck(torch.ops._C.silu_and_mul_quant, (ops_out, x, scale))
36 changes: 32 additions & 4 deletions tests/kernels/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,13 +15,41 @@
from torch._prims_common import TensorLikeType

from tests.kernels.quant_utils import native_w8a8_block_matmul
from vllm.attention import AttentionBackend, AttentionMetadata, AttentionType
try:
from vllm.attention import AttentionBackend, AttentionMetadata, AttentionType
except ModuleNotFoundError:

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

Catching only ModuleNotFoundError when importing from vllm.attention is narrower than catching ImportError. If vllm.attention exists but the import still fails for another reason (such as a circular import, or one of the requested names no longer being exported), a plain ImportError is raised, which this handler does not catch, and the test suite will crash. Using except ImportError (the parent class of ModuleNotFoundError) is more robust and consistent with the other import blocks in this file.

Suggested change
try:
from vllm.attention import AttentionBackend, AttentionMetadata, AttentionType
except ModuleNotFoundError:
try:
from vllm.attention import AttentionBackend, AttentionMetadata, AttentionType
except ImportError:

from enum import Enum

AttentionBackend = Any
AttentionMetadata = Any

class AttentionType(Enum):
    """Minimal stand-in enum for ``AttentionType``.

    Defines only the ``ENCODER_DECODER`` member, whose value is the
    string ``"encoder_decoder"``.
    """

    ENCODER_DECODER = "encoder_decoder"

from vllm.model_executor.layers.activation import SiluAndMul
from vllm.model_executor.layers.fused_moe.utils import (
moe_kernel_quantize_input)
from vllm.platforms.interface import _Backend
from vllm.utils import (STR_BACKEND_ENV_VAR, STR_FLASH_ATTN_VAL,
STR_XFORMERS_ATTN_VAL, make_tensor_with_pad)
try:
from vllm.platforms.interface import _Backend
except ImportError:
_Backend = Any
try:
from vllm.utils import (STR_BACKEND_ENV_VAR, STR_FLASH_ATTN_VAL,
STR_XFORMERS_ATTN_VAL, make_tensor_with_pad)
except ImportError:
STR_BACKEND_ENV_VAR = "VLLM_ATTENTION_BACKEND"
STR_FLASH_ATTN_VAL = "FLASH_ATTN"
STR_XFORMERS_ATTN_VAL = "XFORMERS"

def make_tensor_with_pad(
x: list[list[int]],
max_len: int,
pad: int,
dtype: torch.dtype,
device: Union[torch.device, str],
) -> torch.Tensor:
padded = [item + [pad] * (max_len - len(item)) for item in x]

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

If any list in x has a length greater than max_len, max_len - len(item) will be negative, resulting in [pad] * negative_number which evaluates to []. This means the item will not be truncated, and the resulting padded list will contain sublists of different lengths, causing torch.tensor(padded) to crash with a ValueError. To make this helper robust, truncate the items to max_len before padding.

Suggested change
padded = [item + [pad] * (max_len - len(item)) for item in x]
padded = [item[:max_len] + [pad] * (max_len - len(item)) for item in x]

return torch.tensor(padded, dtype=dtype, device=device)

# For now, disable "test_aot_dispatch_dynamic" since there are some
# bugs related to this test in PyTorch 2.4.
Expand Down