Commit ef42dd8

fix tests
Signed-off-by: Kacper Pietkun <[email protected]>
1 parent: d4f1c7f

File tree: 5 files changed (+64, -32 lines)


tests/unit_tests/ops/test_hpu_awq.py

Lines changed: 16 additions & 10 deletions
@@ -1,5 +1,9 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
 import torch
 import habana_frameworks.torch as htorch
+from utils import get_data_path
 from vllm_gaudi.ops.hpu_awq import AWQHPULinearMethod, AWQHPUConfig
 from vllm_gaudi.utils import HPUCompileConfig
 from vllm.model_executor.layers.linear import RowParallelLinear
@@ -22,27 +26,29 @@ def test_awq_linear_method(dist_init):
                                disable_tp=False).to("hpu")
     assert isinstance(oot_op.quant_method, AWQHPULinearMethod)
 
-    if not htorch.utils.internal.is_lazy():
-        compile_config = HPUCompileConfig()
-        oot_op = torch.compile(oot_op, **compile_config.get_compile_args())
-
     # qweight, qzeros, scales were extracted from first RowParallelLinear of TheBloke/Llama-2-7B-Chat-AWQ
     # (with adjusted shape, to make tensors smaller)
-    qweight = torch.load("data/awq/qweight.pt", weights_only=False, map_location="hpu")
+    qweight = torch.load(get_data_path("data/awq/qweight.pt"), weights_only=False, map_location="hpu")
     oot_op.qweight.copy_(qweight)
-    qzeros = torch.load("data/awq/qzeros.pt", weights_only=False, map_location="hpu")
+    qzeros = torch.load(get_data_path("data/awq/qzeros.pt"), weights_only=False, map_location="hpu")
     oot_op.qzeros.copy_(qzeros)
-    scales = torch.load("data/awq/scales.pt", weights_only=False, map_location="hpu").to(torch.bfloat16)
+    scales = torch.load(get_data_path("data/awq/scales.pt"), weights_only=False, map_location="hpu").to(torch.bfloat16)
     oot_op.scales.copy_(scales)
 
+    oot_op.quant_method.process_weights_after_loading(oot_op)
+
+    if not htorch.utils.internal.is_lazy():
+        compile_config = HPUCompileConfig()
+        oot_op = torch.compile(oot_op, **compile_config.get_compile_args())
+
     # Input and expected output
     # Output tensor holds the data that was returned by cuda implementation of AWQLinearMethod for given input
     # (AWQLinearMethod was triggered offline with the same input as below to get the ref_output)
-    input = torch.load("data/awq/input.pt", weights_only=False, map_location="hpu").to(torch.bfloat16)
-    ref_output = torch.load("data/awq/output.pt", weights_only=False, map_location="hpu").to(torch.bfloat16)
+    input = torch.load(get_data_path("data/awq/input.pt"), weights_only=False, map_location="hpu").to(torch.bfloat16)
+    ref_output = torch.load(get_data_path("data/awq/output.pt"), weights_only=False,
+                            map_location="hpu").to(torch.bfloat16)
 
     # Execute layer
-    oot_op.quant_method.process_weights_after_loading(oot_op)
     out = oot_op(input)
 
     # Check correctness
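
The key fix here is ordering: process_weights_after_loading repacks the quantized tensors in place, so it has to run before the module is wrapped by torch.compile, presumably so the compiled graph traces the final weight layout rather than the raw checkpoint tensors. A minimal sketch of the setup order the test now follows (QuantLayer and load_fixture are hypothetical stand-ins, not names from this repo):

    import torch

    layer = QuantLayer().to("hpu")  # hypothetical quantized linear layer

    # 1. Copy raw quantized tensors into the layer's parameters.
    layer.qweight.copy_(load_fixture("qweight.pt"))

    # 2. Repack/convert weights *before* compilation, so the compiled
    #    graph captures the post-processed layout.
    layer.quant_method.process_weights_after_loading(layer)

    # 3. Only then compile the module (skipped in HPU lazy mode).
    layer = torch.compile(layer)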

tests/unit_tests/ops/test_hpu_fp8.py

Lines changed: 26 additions & 12 deletions
@@ -1,5 +1,9 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
 import torch
 import habana_frameworks.torch as htorch
+from utils import get_data_path
 from unittest.mock import MagicMock
 from vllm_gaudi.ops.hpu_fp8 import Fp8LinearMethod, HPUFp8MoEMethod
 from vllm_gaudi.utils import HPUCompileConfig
@@ -29,9 +33,11 @@ def test_fp8_linear_method(dist_init, monkeypatch):
 
     # Load weight and weight_scale_inv were extracted from first RowParallelLinear layer of Qwen/Qwen3-8B-FP8
     # (with adjusted shapes, to make tensors smaller)
-    weight = torch.load("data/fp8/linear_weight.pt", weights_only=False, map_location="hpu")
+    weight = torch.load(get_data_path("data/fp8/linear_weight.pt"), weights_only=False, map_location="hpu")
     oot_op.weight.copy_(weight)
-    weight_scale_inv = torch.load("data/fp8/linear_weight_scale_inv.pt", weights_only=False, map_location="hpu")
+    weight_scale_inv = torch.load(get_data_path("data/fp8/linear_weight_scale_inv.pt"),
+                                  weights_only=False,
+                                  map_location="hpu")
     oot_op.weight_scale_inv.copy_(weight_scale_inv)
 
     oot_op.quant_method.process_weights_after_loading(oot_op)
@@ -44,8 +50,8 @@ def test_fp8_linear_method(dist_init, monkeypatch):
     # Input and expected output
     # Output tensor holds the data that was returned by cuda implementation of Fp8LinearMethod for given input
     # (Fp8LinearMethod was triggered offline with the same input as below to get the ref_output)
-    input = torch.load("data/fp8/linear_input.pt", weights_only=False, map_location="hpu")
-    ref_output = torch.load("data/fp8/linear_output.pt", weights_only=False, map_location="hpu")
+    input = torch.load(get_data_path("data/fp8/linear_input.pt"), weights_only=False, map_location="hpu")
+    ref_output = torch.load(get_data_path("data/fp8/linear_output.pt"), weights_only=False, map_location="hpu")
 
     # Execute layer
     out = oot_op(input)
@@ -94,27 +100,35 @@ def test_fp8_moe_method(dist_init, monkeypatch):
 
     # Weights were extracted from first FusedMoE layer of Qwen/Qwen3-30B-A3B-FP8
     # (with adjusted shapes, to make tensors smaller)
-    w13_weight = torch.load("data/fp8/moe_w13_weight.pt", weights_only=False, map_location="hpu")
+    w13_weight = torch.load(get_data_path("data/fp8/moe_w13_weight.pt"), weights_only=False, map_location="hpu")
     oot_op.w13_weight.copy_(w13_weight.repeat(128, 1, 1))
-    w13_weight_scale_inv = torch.load("data/fp8/moe_w13_weight_scale_inv.pt", weights_only=False, map_location="hpu")
+    w13_weight_scale_inv = torch.load(get_data_path("data/fp8/moe_w13_weight_scale_inv.pt"),
+                                      weights_only=False,
+                                      map_location="hpu")
     oot_op.w13_weight_scale_inv.copy_(w13_weight_scale_inv.repeat(128, 1, 1))
-    w2_weight = torch.load("data/fp8/moe_w2_weight.pt", weights_only=False, map_location="hpu")
+    w2_weight = torch.load(get_data_path("data/fp8/moe_w2_weight.pt"), weights_only=False, map_location="hpu")
     oot_op.w2_weight.copy_(w2_weight.repeat(128, 1, 1))
-    w2_weight_scale_inv = torch.load("data/fp8/moe_w2_weight_scale_inv.pt", weights_only=False, map_location="hpu")
+    w2_weight_scale_inv = torch.load(get_data_path("data/fp8/moe_w2_weight_scale_inv.pt"),
+                                     weights_only=False,
+                                     map_location="hpu")
     oot_op.w2_weight_scale_inv.copy_(w2_weight_scale_inv.repeat(128, 1, 1))
 
     oot_op.quant_method.process_weights_after_loading(oot_op)
 
     if not htorch.utils.internal.is_lazy():
         compile_config = HPUCompileConfig()
-        oot_op.quant_method.apply = torch.compile(oot_op.quant_method.apply, **compile_config.get_compile_args())
+        oot_op = torch.compile(oot_op, **compile_config.get_compile_args())
 
     # Input and expected output
    # Output tensor holds the data that was returned by cuda implementation of Fp8MoEMethod for given input
     # (Fp8MoEMethod was triggered offline with the same input as below to get the ref_output)
-    hidden_states = torch.load("data/fp8/moe_input_hidden_states.pt", weights_only=False, map_location="hpu")
-    router_logits = torch.load("data/fp8/moe_input_router_logits.pt", weights_only=False, map_location="hpu")
-    ref_output = torch.load("data/fp8/moe_output.pt", weights_only=False, map_location="hpu")
+    hidden_states = torch.load(get_data_path("data/fp8/moe_input_hidden_states.pt"),
+                               weights_only=False,
+                               map_location="hpu")
+    router_logits = torch.load(get_data_path("data/fp8/moe_input_router_logits.pt"),
+                               weights_only=False,
+                               map_location="hpu")
+    ref_output = torch.load(get_data_path("data/fp8/moe_output.pt"), weights_only=False, map_location="hpu")
 
     # Execute layer
     mock_ctx = MagicMock(spec=["dp_metadata"])
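
Note the other change in the MoE test: instead of monkey-patching the bound method (oot_op.quant_method.apply = torch.compile(...)), the whole layer is now compiled, matching the linear tests. A hedged sketch of the two patterns, with MoELayer as a hypothetical stand-in:

    import torch

    layer = MoELayer().to("hpu")  # hypothetical fused-MoE layer

    # Before: compile only the quant method's apply(), leaving the rest
    # of forward() in eager mode.
    # layer.quant_method.apply = torch.compile(layer.quant_method.apply)

    # After: compile the module itself, so routing and expert matmuls
    # run through a single compiled graph.
    layer = torch.compile(layer)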

tests/unit_tests/ops/test_hpu_gptq.py

Lines changed: 16 additions & 10 deletions
@@ -1,5 +1,9 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
 import torch
 import habana_frameworks.torch as htorch
+from utils import get_data_path
 from vllm_gaudi.ops.hpu_gptq import GPTQHPULinearMethod, GPTQHPUConfig
 from vllm_gaudi.utils import HPUCompileConfig
 from vllm.model_executor.layers.linear import RowParallelLinear
@@ -22,27 +26,29 @@ def test_gptq_linear_method(dist_init):
                                disable_tp=False).to("hpu")
     assert isinstance(oot_op.quant_method, GPTQHPULinearMethod)
 
-    if not htorch.utils.internal.is_lazy():
-        compile_config = HPUCompileConfig()
-        oot_op = torch.compile(oot_op, **compile_config.get_compile_args())
-
     # qweight, qzeros, scales were extracted from first RowParallelLinear of TheBloke/Llama-2-7B-Chat-GPTQ
     # (with adjusted shape, to make tensors smaller)
-    qweight = torch.load("data/gptq/qweight.pt", weights_only=False, map_location="hpu")
+    qweight = torch.load(get_data_path("data/gptq/qweight.pt"), weights_only=False, map_location="hpu")
     oot_op.qweight.copy_(qweight)
-    qzeros = torch.load("data/gptq/qzeros.pt", weights_only=False, map_location="hpu")
+    qzeros = torch.load(get_data_path("data/gptq/qzeros.pt"), weights_only=False, map_location="hpu")
     oot_op.qzeros.copy_(qzeros)
-    scales = torch.load("data/gptq/scales.pt", weights_only=False, map_location="hpu").to(torch.bfloat16)
+    scales = torch.load(get_data_path("data/gptq/scales.pt"), weights_only=False, map_location="hpu").to(torch.bfloat16)
     oot_op.scales.copy_(scales)
 
+    oot_op.quant_method.process_weights_after_loading(oot_op)
+
+    if not htorch.utils.internal.is_lazy():
+        compile_config = HPUCompileConfig()
+        oot_op = torch.compile(oot_op, **compile_config.get_compile_args())
+
     # Input and expected output
     # Output tensor holds the data that was returned by cuda implementation of GPTQLinearMethod for given input
     # (GPTQLinearMethod was triggered offline with the same input as below to get the ref_output)
-    input = torch.load("data/gptq/input.pt", weights_only=False, map_location="hpu").to(torch.bfloat16)
-    ref_output = torch.load("data/gptq/output.pt", weights_only=False, map_location="hpu").to(torch.bfloat16)
+    input = torch.load(get_data_path("data/gptq/input.pt"), weights_only=False, map_location="hpu").to(torch.bfloat16)
+    ref_output = torch.load(get_data_path("data/gptq/output.pt"), weights_only=False,
+                            map_location="hpu").to(torch.bfloat16)
 
     # Execute layer
-    oot_op.quant_method.process_weights_after_loading(oot_op)
     out = oot_op(input)
 
     # Check correctness

tests/unit_tests/ops/test_hpu_rotary_embedding.py

Lines changed: 1 addition & 0 deletions
@@ -382,6 +382,7 @@ def test_m_rotary_embedding(
         "max_position_embeddings": max_position_embeddings,
         "base": base,
         "is_neox_style": is_neox_style,
+        "mrope_section": [rotary_dim // 2]
     }
     native_rotary_data = RotaryData(cls=MRotaryEmbedding, dtype=torch.bfloat16, device="hpu")
     oot_rotary_data = RotaryData(cls=HPUMRotaryEmbedding, dtype=torch.bfloat16, device="hpu")
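
For context: vLLM's multimodal RoPE (MRotaryEmbedding) expects mrope_section to partition half of the rotary dimensions, i.e. the section sizes should sum to rotary_dim // 2, so a single section covering everything is the simplest valid value for a unit test. A small illustrative check (the concrete value is an assumption, not taken from the test):

    rotary_dim = 128                   # illustrative rotary size
    mrope_section = [rotary_dim // 2]  # one section spanning all rotation pairs
    assert sum(mrope_section) == rotary_dim // 2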

tests/unit_tests/ops/utils.py

Lines changed: 5 additions & 0 deletions
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
+import os
 import contextlib
 from vllm.model_executor.custom_op import CustomOp
 
@@ -28,3 +29,7 @@ def register_op(base_cls, oot_cls):
     within temporary_op_registry_oot context manager.
     """
     CustomOp.op_registry_oot[base_cls.__name__] = oot_cls
+
+
+def get_data_path(filename):
+    return os.path.join(os.path.dirname(__file__), filename)
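
With this helper, the fixture paths in the tests above resolve relative to tests/unit_tests/ops/ rather than the process's working directory, so the tests pass regardless of where pytest is launched from. Usage, taken from the updated tests:

    import torch
    from utils import get_data_path

    # Resolves to <tests/unit_tests/ops>/data/awq/qweight.pt for any CWD.
    qweight = torch.load(get_data_path("data/awq/qweight.pt"), weights_only=False, map_location="hpu")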
