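Fix ops unit tests: resolve test data paths relative to the test directory and apply torch.compile after process_weights_after_loading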

Kacper-Pietkun · Kacper-Pietkun · commit 500c41df81fd · 2025-10-03T15:00:40.000+03:00
Signed-off-by: Kacper Pietkun <kpietkun@habana.ai>
diff --git a/tests/unit_tests/ops/test_hpu_awq.py b/tests/unit_tests/ops/test_hpu_awq.py
@@ -1,5 +1,9 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
 import torch
 import habana_frameworks.torch as htorch
+from utils import get_data_path
 from vllm_gaudi.ops.hpu_awq import AWQHPULinearMethod, AWQHPUConfig
 from vllm_gaudi.utils import HPUCompileConfig
 from vllm.model_executor.layers.linear import RowParallelLinear
@@ -22,27 +26,29 @@ def test_awq_linear_method(dist_init):
                                disable_tp=False).to("hpu")
     assert isinstance(oot_op.quant_method, AWQHPULinearMethod)
 
-    if not htorch.utils.internal.is_lazy():
-        compile_config = HPUCompileConfig()
-        oot_op = torch.compile(oot_op, **compile_config.get_compile_args())
-
     # qweight, qzeros, scales were extracted from first RowParallelLinear of TheBloke/Llama-2-7B-Chat-AWQ
     # (with adjusted shape, to make tensors smaller)
-    qweight = torch.load("data/awq/qweight.pt", weights_only=False, map_location="hpu")
+    qweight = torch.load(get_data_path("data/awq/qweight.pt"), weights_only=False, map_location="hpu")
     oot_op.qweight.copy_(qweight)
-    qzeros = torch.load("data/awq/qzeros.pt", weights_only=False, map_location="hpu")
+    qzeros = torch.load(get_data_path("data/awq/qzeros.pt"), weights_only=False, map_location="hpu")
     oot_op.qzeros.copy_(qzeros)
-    scales = torch.load("data/awq/scales.pt", weights_only=False, map_location="hpu").to(torch.bfloat16)
+    scales = torch.load(get_data_path("data/awq/scales.pt"), weights_only=False, map_location="hpu").to(torch.bfloat16)
     oot_op.scales.copy_(scales)
 
+    oot_op.quant_method.process_weights_after_loading(oot_op)
+
+    if not htorch.utils.internal.is_lazy():
+        compile_config = HPUCompileConfig()
+        oot_op = torch.compile(oot_op, **compile_config.get_compile_args())
+
     # Input and expected output
     # Output tensor holds the data that was returned by cuda implementation of AWQLinearMethod for given input
     # (AWQLinearMethod was triggered offline with the same input as below to get the ref_output)
-    input = torch.load("data/awq/input.pt", weights_only=False, map_location="hpu").to(torch.bfloat16)
-    ref_output = torch.load("data/awq/output.pt", weights_only=False, map_location="hpu").to(torch.bfloat16)
+    input = torch.load(get_data_path("data/awq/input.pt"), weights_only=False, map_location="hpu").to(torch.bfloat16)
+    ref_output = torch.load(get_data_path("data/awq/output.pt"), weights_only=False,
+                            map_location="hpu").to(torch.bfloat16)
 
     # Execute layer
-    oot_op.quant_method.process_weights_after_loading(oot_op)
     out = oot_op(input)
 
     # Check correctness
diff --git a/tests/unit_tests/ops/test_hpu_fp8.py b/tests/unit_tests/ops/test_hpu_fp8.py
@@ -1,5 +1,9 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
 import torch
 import habana_frameworks.torch as htorch
+from utils import get_data_path
 from unittest.mock import MagicMock
 from vllm_gaudi.ops.hpu_fp8 import Fp8LinearMethod, HPUFp8MoEMethod
 from vllm_gaudi.utils import HPUCompileConfig
@@ -29,9 +33,11 @@ def test_fp8_linear_method(dist_init, monkeypatch):
 
     # Load weight and weight_scale_inv were extracted from first RowParallelLinear layer of Qwen/Qwen3-8B-FP8
     # (with adjusted shapes, to make tensors smaller)
-    weight = torch.load("data/fp8/linear_weight.pt", weights_only=False, map_location="hpu")
+    weight = torch.load(get_data_path("data/fp8/linear_weight.pt"), weights_only=False, map_location="hpu")
     oot_op.weight.copy_(weight)
-    weight_scale_inv = torch.load("data/fp8/linear_weight_scale_inv.pt", weights_only=False, map_location="hpu")
+    weight_scale_inv = torch.load(get_data_path("data/fp8/linear_weight_scale_inv.pt"),
+                                  weights_only=False,
+                                  map_location="hpu")
     oot_op.weight_scale_inv.copy_(weight_scale_inv)
 
     oot_op.quant_method.process_weights_after_loading(oot_op)
@@ -44,8 +50,8 @@ def test_fp8_linear_method(dist_init, monkeypatch):
     # Input and expected output
     # Output tensor holds the data that was returned by cuda implementation of Fp8LinearMethod for given input
     # (Fp8LinearMethod was triggered offline with the same input as below to get the ref_output)
-    input = torch.load("data/fp8/linear_input.pt", weights_only=False, map_location="hpu")
-    ref_output = torch.load("data/fp8/linear_output.pt", weights_only=False, map_location="hpu")
+    input = torch.load(get_data_path("data/fp8/linear_input.pt"), weights_only=False, map_location="hpu")
+    ref_output = torch.load(get_data_path("data/fp8/linear_output.pt"), weights_only=False, map_location="hpu")
 
     # Execute layer
     out = oot_op(input)
@@ -107,7 +113,7 @@ def test_fp8_moe_method(dist_init, monkeypatch):
 
     if not htorch.utils.internal.is_lazy():
         compile_config = HPUCompileConfig()
-        oot_op.quant_method.apply = torch.compile(oot_op.quant_method.apply, **compile_config.get_compile_args())
+        oot_op = torch.compile(oot_op, **compile_config.get_compile_args())
 
     # Input and expected output
     # Output tensor holds the data that was returned by cuda implementation of Fp8MoEMethod for given input
diff --git a/tests/unit_tests/ops/test_hpu_gptq.py b/tests/unit_tests/ops/test_hpu_gptq.py
@@ -1,5 +1,9 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
 import torch
 import habana_frameworks.torch as htorch
+from utils import get_data_path
 from vllm_gaudi.ops.hpu_gptq import GPTQHPULinearMethod, GPTQHPUConfig
 from vllm_gaudi.utils import HPUCompileConfig
 from vllm.model_executor.layers.linear import RowParallelLinear
@@ -22,27 +26,29 @@ def test_gptq_linear_method(dist_init):
                                disable_tp=False).to("hpu")
     assert isinstance(oot_op.quant_method, GPTQHPULinearMethod)
 
-    if not htorch.utils.internal.is_lazy():
-        compile_config = HPUCompileConfig()
-        oot_op = torch.compile(oot_op, **compile_config.get_compile_args())
-
     # qweight, qzeros, scales were extracted from first RowParallelLinear of TheBloke/Llama-2-7B-Chat-GPTQ
     # (with adjusted shape, to make tensors smaller)
-    qweight = torch.load("data/gptq/qweight.pt", weights_only=False, map_location="hpu")
+    qweight = torch.load(get_data_path("data/gptq/qweight.pt"), weights_only=False, map_location="hpu")
     oot_op.qweight.copy_(qweight)
-    qzeros = torch.load("data/gptq/qzeros.pt", weights_only=False, map_location="hpu")
+    qzeros = torch.load(get_data_path("data/gptq/qzeros.pt"), weights_only=False, map_location="hpu")
     oot_op.qzeros.copy_(qzeros)
-    scales = torch.load("data/gptq/scales.pt", weights_only=False, map_location="hpu").to(torch.bfloat16)
+    scales = torch.load(get_data_path("data/gptq/scales.pt"), weights_only=False, map_location="hpu").to(torch.bfloat16)
     oot_op.scales.copy_(scales)
 
+    oot_op.quant_method.process_weights_after_loading(oot_op)
+
+    if not htorch.utils.internal.is_lazy():
+        compile_config = HPUCompileConfig()
+        oot_op = torch.compile(oot_op, **compile_config.get_compile_args())
+
     # Input and expected output
     # Output tensor holds the data that was returned by cuda implementation of GPTQLinearMethod for given input
     # (GPTQLinearMethod was triggered offline with the same input as below to get the ref_output)
-    input = torch.load("data/gptq/input.pt", weights_only=False, map_location="hpu").to(torch.bfloat16)
-    ref_output = torch.load("data/gptq/output.pt", weights_only=False, map_location="hpu").to(torch.bfloat16)
+    input = torch.load(get_data_path("data/gptq/input.pt"), weights_only=False, map_location="hpu").to(torch.bfloat16)
+    ref_output = torch.load(get_data_path("data/gptq/output.pt"), weights_only=False,
+                            map_location="hpu").to(torch.bfloat16)
 
     # Execute layer
-    oot_op.quant_method.process_weights_after_loading(oot_op)
     out = oot_op(input)
 
     # Check correctness
diff --git a/tests/unit_tests/ops/utils.py b/tests/unit_tests/ops/utils.py
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
+import os
 import contextlib
 from vllm.model_executor.custom_op import CustomOp
 
@@ -28,3 +29,7 @@ def register_op(base_cls, oot_cls):
     within temporary_op_registry_oot context manager.
     """
     CustomOp.op_registry_oot[base_cls.__name__] = oot_cls
+
+
+def get_data_path(filename):
+    return os.path.join(os.path.dirname(__file__), filename)