Commit 500c41d

fix tests
Signed-off-by: Kacper Pietkun <[email protected]>
1 parent: d4f1c7f

File tree: 4 files changed (+48, -25 lines)

tests/unit_tests/ops/test_hpu_awq.py (16 additions, 10 deletions)

@@ -1,5 +1,9 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
 import torch
 import habana_frameworks.torch as htorch
+from utils import get_data_path
 from vllm_gaudi.ops.hpu_awq import AWQHPULinearMethod, AWQHPUConfig
 from vllm_gaudi.utils import HPUCompileConfig
 from vllm.model_executor.layers.linear import RowParallelLinear
@@ -22,27 +26,29 @@ def test_awq_linear_method(dist_init):
                                disable_tp=False).to("hpu")
     assert isinstance(oot_op.quant_method, AWQHPULinearMethod)
 
-    if not htorch.utils.internal.is_lazy():
-        compile_config = HPUCompileConfig()
-        oot_op = torch.compile(oot_op, **compile_config.get_compile_args())
-
     # qweight, qzeros, scales were extracted from first RowParallelLinear of TheBloke/Llama-2-7B-Chat-AWQ
     # (with adjusted shape, to make tensors smaller)
-    qweight = torch.load("data/awq/qweight.pt", weights_only=False, map_location="hpu")
+    qweight = torch.load(get_data_path("data/awq/qweight.pt"), weights_only=False, map_location="hpu")
     oot_op.qweight.copy_(qweight)
-    qzeros = torch.load("data/awq/qzeros.pt", weights_only=False, map_location="hpu")
+    qzeros = torch.load(get_data_path("data/awq/qzeros.pt"), weights_only=False, map_location="hpu")
     oot_op.qzeros.copy_(qzeros)
-    scales = torch.load("data/awq/scales.pt", weights_only=False, map_location="hpu").to(torch.bfloat16)
+    scales = torch.load(get_data_path("data/awq/scales.pt"), weights_only=False, map_location="hpu").to(torch.bfloat16)
     oot_op.scales.copy_(scales)
 
+    oot_op.quant_method.process_weights_after_loading(oot_op)
+
+    if not htorch.utils.internal.is_lazy():
+        compile_config = HPUCompileConfig()
+        oot_op = torch.compile(oot_op, **compile_config.get_compile_args())
+
     # Input and expected output
     # Output tensor holds the data that was returned by cuda implementation of AWQLinearMethod for given input
     # (AWQLinearMethod was triggered offline with the same input as below to get the ref_output)
-    input = torch.load("data/awq/input.pt", weights_only=False, map_location="hpu").to(torch.bfloat16)
-    ref_output = torch.load("data/awq/output.pt", weights_only=False, map_location="hpu").to(torch.bfloat16)
+    input = torch.load(get_data_path("data/awq/input.pt"), weights_only=False, map_location="hpu").to(torch.bfloat16)
+    ref_output = torch.load(get_data_path("data/awq/output.pt"), weights_only=False,
+                            map_location="hpu").to(torch.bfloat16)
 
     # Execute layer
-    oot_op.quant_method.process_weights_after_loading(oot_op)
     out = oot_op(input)
 
     # Check correctness
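The substantive change in this test is the ordering, not just the path handling: process_weights_after_loading used to run only at execution time, after the module had already been wrapped by torch.compile, and is now called before compilation. Presumably the weight post-processing mutates the layer's tensors, so it must happen before the compiled graph captures them. A minimal sketch of the resulting setup order, factored into a helper for illustration (finalize_layer is hypothetical, not part of the commit):

    import torch
    import habana_frameworks.torch as htorch
    from vllm_gaudi.utils import HPUCompileConfig

    def finalize_layer(oot_op):
        # Finalize weights first; this step may repack or replace parameters.
        oot_op.quant_method.process_weights_after_loading(oot_op)
        # Only then compile the module (lazy mode builds graphs on its own).
        if not htorch.utils.internal.is_lazy():
            compile_config = HPUCompileConfig()
            oot_op = torch.compile(oot_op, **compile_config.get_compile_args())
        return oot_op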

tests/unit_tests/ops/test_hpu_fp8.py (11 additions, 5 deletions)

@@ -1,5 +1,9 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
 import torch
 import habana_frameworks.torch as htorch
+from utils import get_data_path
 from unittest.mock import MagicMock
 from vllm_gaudi.ops.hpu_fp8 import Fp8LinearMethod, HPUFp8MoEMethod
 from vllm_gaudi.utils import HPUCompileConfig
@@ -29,9 +33,11 @@ def test_fp8_linear_method(dist_init, monkeypatch):
 
     # Load weight and weight_scale_inv were extracted from first RowParallelLinear layer of Qwen/Qwen3-8B-FP8
     # (with adjusted shapes, to make tensors smaller)
-    weight = torch.load("data/fp8/linear_weight.pt", weights_only=False, map_location="hpu")
+    weight = torch.load(get_data_path("data/fp8/linear_weight.pt"), weights_only=False, map_location="hpu")
     oot_op.weight.copy_(weight)
-    weight_scale_inv = torch.load("data/fp8/linear_weight_scale_inv.pt", weights_only=False, map_location="hpu")
+    weight_scale_inv = torch.load(get_data_path("data/fp8/linear_weight_scale_inv.pt"),
+                                  weights_only=False,
+                                  map_location="hpu")
     oot_op.weight_scale_inv.copy_(weight_scale_inv)
 
     oot_op.quant_method.process_weights_after_loading(oot_op)
@@ -44,8 +50,8 @@ def test_fp8_linear_method(dist_init, monkeypatch):
     # Input and expected output
     # Output tensor holds the data that was returned by cuda implementation of Fp8LinearMethod for given input
     # (Fp8LinearMethod was triggered offline with the same input as below to get the ref_output)
-    input = torch.load("data/fp8/linear_input.pt", weights_only=False, map_location="hpu")
-    ref_output = torch.load("data/fp8/linear_output.pt", weights_only=False, map_location="hpu")
+    input = torch.load(get_data_path("data/fp8/linear_input.pt"), weights_only=False, map_location="hpu")
+    ref_output = torch.load(get_data_path("data/fp8/linear_output.pt"), weights_only=False, map_location="hpu")
 
     # Execute layer
     out = oot_op(input)
@@ -107,7 +113,7 @@ def test_fp8_moe_method(dist_init, monkeypatch):
 
     if not htorch.utils.internal.is_lazy():
         compile_config = HPUCompileConfig()
-        oot_op.quant_method.apply = torch.compile(oot_op.quant_method.apply, **compile_config.get_compile_args())
+        oot_op = torch.compile(oot_op, **compile_config.get_compile_args())
 
     # Input and expected output
     # Output tensor holds the data that was returned by cuda implementation of Fp8MoEMethod for given input
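The MoE fix changes what gets compiled: previously the bound method oot_op.quant_method.apply was swapped for its torch.compile-wrapped version; now the module is compiled as a whole, matching the linear-method tests. torch.compile on an nn.Module returns a wrapped module with a compiled forward, so the quant_method object is left untouched. A condensed before/after sketch (names as in the test above):

    if not htorch.utils.internal.is_lazy():
        compile_config = HPUCompileConfig()
        # before: only the quantization entry point was compiled, by patching
        # the bound method:
        #   oot_op.quant_method.apply = torch.compile(
        #       oot_op.quant_method.apply, **compile_config.get_compile_args())
        # after: compile the whole module, consistent with the linear tests
        oot_op = torch.compile(oot_op, **compile_config.get_compile_args())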

tests/unit_tests/ops/test_hpu_gptq.py (16 additions, 10 deletions)

@@ -1,5 +1,9 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
 import torch
 import habana_frameworks.torch as htorch
+from utils import get_data_path
 from vllm_gaudi.ops.hpu_gptq import GPTQHPULinearMethod, GPTQHPUConfig
 from vllm_gaudi.utils import HPUCompileConfig
 from vllm.model_executor.layers.linear import RowParallelLinear
@@ -22,27 +26,29 @@ def test_gptq_linear_method(dist_init):
                                disable_tp=False).to("hpu")
     assert isinstance(oot_op.quant_method, GPTQHPULinearMethod)
 
-    if not htorch.utils.internal.is_lazy():
-        compile_config = HPUCompileConfig()
-        oot_op = torch.compile(oot_op, **compile_config.get_compile_args())
-
    # qweight, qzeros, scales were extracted from first RowParallelLinear of TheBloke/Llama-2-7B-Chat-GPTQ
     # (with adjusted shape, to make tensors smaller)
-    qweight = torch.load("data/gptq/qweight.pt", weights_only=False, map_location="hpu")
+    qweight = torch.load(get_data_path("data/gptq/qweight.pt"), weights_only=False, map_location="hpu")
     oot_op.qweight.copy_(qweight)
-    qzeros = torch.load("data/gptq/qzeros.pt", weights_only=False, map_location="hpu")
+    qzeros = torch.load(get_data_path("data/gptq/qzeros.pt"), weights_only=False, map_location="hpu")
     oot_op.qzeros.copy_(qzeros)
-    scales = torch.load("data/gptq/scales.pt", weights_only=False, map_location="hpu").to(torch.bfloat16)
+    scales = torch.load(get_data_path("data/gptq/scales.pt"), weights_only=False, map_location="hpu").to(torch.bfloat16)
     oot_op.scales.copy_(scales)
 
+    oot_op.quant_method.process_weights_after_loading(oot_op)
+
+    if not htorch.utils.internal.is_lazy():
+        compile_config = HPUCompileConfig()
+        oot_op = torch.compile(oot_op, **compile_config.get_compile_args())
+
     # Input and expected output
     # Output tensor holds the data that was returned by cuda implementation of GPTQLinearMethod for given input
     # (GPTQLinearMethod was triggered offline with the same input as below to get the ref_output)
-    input = torch.load("data/gptq/input.pt", weights_only=False, map_location="hpu").to(torch.bfloat16)
-    ref_output = torch.load("data/gptq/output.pt", weights_only=False, map_location="hpu").to(torch.bfloat16)
+    input = torch.load(get_data_path("data/gptq/input.pt"), weights_only=False, map_location="hpu").to(torch.bfloat16)
+    ref_output = torch.load(get_data_path("data/gptq/output.pt"), weights_only=False,
+                            map_location="hpu").to(torch.bfloat16)
 
     # Execute layer
-    oot_op.quant_method.process_weights_after_loading(oot_op)
     out = oot_op(input)
 
     # Check correctness
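The GPTQ test receives the same two fixes as the AWQ one: fixture paths resolved through get_data_path, and process_weights_after_loading moved ahead of torch.compile. Both hunks end at the "# Check correctness" comment, so the assertion itself is outside this diff; a typical parity check for a bf16 kernel test of this shape might look like the following sketch (the tolerances are assumptions, not values from this commit):

    # Hypothetical correctness check; the real assertion lies outside the hunk.
    # Calling .cpu() forces outstanding HPU work to finish before comparing.
    assert torch.allclose(out.cpu(), ref_output.cpu(), rtol=2e-2, atol=2e-2)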

tests/unit_tests/ops/utils.py (5 additions, 0 deletions)

@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
+import os
 import contextlib
 from vllm.model_executor.custom_op import CustomOp
 
@@ -28,3 +29,7 @@ def register_op(base_cls, oot_cls):
     within temporary_op_registry_oot context manager.
     """
     CustomOp.op_registry_oot[base_cls.__name__] = oot_cls
+
+
+def get_data_path(filename):
+    return os.path.join(os.path.dirname(__file__), filename)
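get_data_path anchors fixture paths at the directory containing utils.py rather than at the process's working directory, which is presumably what the tests were tripping over: a bare torch.load("data/awq/qweight.pt") only works when pytest is launched from tests/unit_tests/ops. A short usage sketch:

    import torch
    from utils import get_data_path

    # Resolves to tests/unit_tests/ops/data/awq/qweight.pt no matter which
    # directory pytest is invoked from.
    qweight = torch.load(get_data_path("data/awq/qweight.pt"),
                         weights_only=False, map_location="hpu")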
