Commit ef42dd8

fix tests
Signed-off-by: Kacper Pietkun <[email protected]>
1 parent: d4f1c7f

File tree: 5 files changed (+64, -32 lines)


tests/unit_tests/ops/test_hpu_awq.py

Lines changed: 16 additions & 10 deletions
@@ -1,5 +1,9 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
 import torch
 import habana_frameworks.torch as htorch
+from utils import get_data_path
 from vllm_gaudi.ops.hpu_awq import AWQHPULinearMethod, AWQHPUConfig
 from vllm_gaudi.utils import HPUCompileConfig
 from vllm.model_executor.layers.linear import RowParallelLinear
@@ -22,27 +26,29 @@ def test_awq_linear_method(dist_init):
                                disable_tp=False).to("hpu")
     assert isinstance(oot_op.quant_method, AWQHPULinearMethod)
 
-    if not htorch.utils.internal.is_lazy():
-        compile_config = HPUCompileConfig()
-        oot_op = torch.compile(oot_op, **compile_config.get_compile_args())
-
     # qweight, qzeros, scales were extracted from first RowParallelLinear of TheBloke/Llama-2-7B-Chat-AWQ
     # (with adjusted shape, to make tensors smaller)
-    qweight = torch.load("data/awq/qweight.pt", weights_only=False, map_location="hpu")
+    qweight = torch.load(get_data_path("data/awq/qweight.pt"), weights_only=False, map_location="hpu")
     oot_op.qweight.copy_(qweight)
-    qzeros = torch.load("data/awq/qzeros.pt", weights_only=False, map_location="hpu")
+    qzeros = torch.load(get_data_path("data/awq/qzeros.pt"), weights_only=False, map_location="hpu")
     oot_op.qzeros.copy_(qzeros)
-    scales = torch.load("data/awq/scales.pt", weights_only=False, map_location="hpu").to(torch.bfloat16)
+    scales = torch.load(get_data_path("data/awq/scales.pt"), weights_only=False, map_location="hpu").to(torch.bfloat16)
     oot_op.scales.copy_(scales)
 
+    oot_op.quant_method.process_weights_after_loading(oot_op)
+
+    if not htorch.utils.internal.is_lazy():
+        compile_config = HPUCompileConfig()
+        oot_op = torch.compile(oot_op, **compile_config.get_compile_args())
+
     # Input and expected output
     # Output tensor holds the data that was returned by cuda implementation of AWQLinearMethod for given input
     # (AWQLinearMethod was triggered offline with the same input as below to get the ref_output)
-    input = torch.load("data/awq/input.pt", weights_only=False, map_location="hpu").to(torch.bfloat16)
-    ref_output = torch.load("data/awq/output.pt", weights_only=False, map_location="hpu").to(torch.bfloat16)
+    input = torch.load(get_data_path("data/awq/input.pt"), weights_only=False, map_location="hpu").to(torch.bfloat16)
+    ref_output = torch.load(get_data_path("data/awq/output.pt"), weights_only=False,
+                            map_location="hpu").to(torch.bfloat16)
 
     # Execute layer
-    oot_op.quant_method.process_weights_after_loading(oot_op)
     out = oot_op(input)
 
     # Check correctness
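
The key fix here is ordering: process_weights_after_loading repacks the quantized tensors in place, so it has to run before the module is wrapped by torch.compile, presumably so the compiled graph traces the final weight layout rather than the raw checkpoint tensors. A minimal sketch of the setup order the test now follows (QuantLayer and load_fixture are hypothetical stand-ins, not names from this repo):

    import torch

    layer = QuantLayer().to("hpu")  # hypothetical quantized linear layer

    # 1. Copy raw quantized tensors into the layer's parameters.
    layer.qweight.copy_(load_fixture("qweight.pt"))

    # 2. Repack/convert weights *before* compilation, so the compiled
    #    graph captures the post-processed layout.
    layer.quant_method.process_weights_after_loading(layer)

    # 3. Only then compile the module (skipped in HPU lazy mode).
    layer = torch.compile(layer)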

tests/unit_tests/ops/test_hpu_fp8.py

Lines changed: 26 additions & 12 deletions
@@ -1,5 +1,9 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
 import torch
 import habana_frameworks.torch as htorch
+from utils import get_data_path
 from unittest.mock import MagicMock
 from vllm_gaudi.ops.hpu_fp8 import Fp8LinearMethod, HPUFp8MoEMethod
 from vllm_gaudi.utils import HPUCompileConfig
@@ -29,9 +33,11 @@ def test_fp8_linear_method(dist_init, monkeypatch):
 
     # Load weight and weight_scale_inv were extracted from first RowParallelLinear layer of Qwen/Qwen3-8B-FP8
     # (with adjusted shapes, to make tensors smaller)
-    weight = torch.load("data/fp8/linear_weight.pt", weights_only=False, map_location="hpu")
+    weight = torch.load(get_data_path("data/fp8/linear_weight.pt"), weights_only=False, map_location="hpu")
     oot_op.weight.copy_(weight)
-    weight_scale_inv = torch.load("data/fp8/linear_weight_scale_inv.pt", weights_only=False, map_location="hpu")
+    weight_scale_inv = torch.load(get_data_path("data/fp8/linear_weight_scale_inv.pt"),
+                                  weights_only=False,
+                                  map_location="hpu")
     oot_op.weight_scale_inv.copy_(weight_scale_inv)
 
     oot_op.quant_method.process_weights_after_loading(oot_op)
@@ -44,8 +50,8 @@ def test_fp8_linear_method(dist_init, monkeypatch):
     # Input and expected output
     # Output tensor holds the data that was returned by cuda implementation of Fp8LinearMethod for given input
     # (Fp8LinearMethod was triggered offline with the same input as below to get the ref_output)
-    input = torch.load("data/fp8/linear_input.pt", weights_only=False, map_location="hpu")
-    ref_output = torch.load("data/fp8/linear_output.pt", weights_only=False, map_location="hpu")
+    input = torch.load(get_data_path("data/fp8/linear_input.pt"), weights_only=False, map_location="hpu")
+    ref_output = torch.load(get_data_path("data/fp8/linear_output.pt"), weights_only=False, map_location="hpu")
 
     # Execute layer
     out = oot_op(input)
@@ -94,27 +100,35 @@ def test_fp8_moe_method(dist_init, monkeypatch):
 
     # Weights were extracted from first FusedMoE layer of Qwen/Qwen3-30B-A3B-FP8
     # (with adjusted shapes, to make tensors smaller)
-    w13_weight = torch.load("data/fp8/moe_w13_weight.pt", weights_only=False, map_location="hpu")
+    w13_weight = torch.load(get_data_path("data/fp8/moe_w13_weight.pt"), weights_only=False, map_location="hpu")
     oot_op.w13_weight.copy_(w13_weight.repeat(128, 1, 1))
-    w13_weight_scale_inv = torch.load("data/fp8/moe_w13_weight_scale_inv.pt", weights_only=False, map_location="hpu")
+    w13_weight_scale_inv = torch.load(get_data_path("data/fp8/moe_w13_weight_scale_inv.pt"),
+                                      weights_only=False,
+                                      map_location="hpu")
     oot_op.w13_weight_scale_inv.copy_(w13_weight_scale_inv.repeat(128, 1, 1))
-    w2_weight = torch.load("data/fp8/moe_w2_weight.pt", weights_only=False, map_location="hpu")
+    w2_weight = torch.load(get_data_path("data/fp8/moe_w2_weight.pt"), weights_only=False, map_location="hpu")
     oot_op.w2_weight.copy_(w2_weight.repeat(128, 1, 1))
-    w2_weight_scale_inv = torch.load("data/fp8/moe_w2_weight_scale_inv.pt", weights_only=False, map_location="hpu")
+    w2_weight_scale_inv = torch.load(get_data_path("data/fp8/moe_w2_weight_scale_inv.pt"),
+                                     weights_only=False,
+                                     map_location="hpu")
     oot_op.w2_weight_scale_inv.copy_(w2_weight_scale_inv.repeat(128, 1, 1))
 
     oot_op.quant_method.process_weights_after_loading(oot_op)
 
     if not htorch.utils.internal.is_lazy():
         compile_config = HPUCompileConfig()
-        oot_op.quant_method.apply = torch.compile(oot_op.quant_method.apply, **compile_config.get_compile_args())
+        oot_op = torch.compile(oot_op, **compile_config.get_compile_args())
 
     # Input and expected output
    # Output tensor holds the data that was returned by cuda implementation of Fp8MoEMethod for given input
     # (Fp8MoEMethod was triggered offline with the same input as below to get the ref_output)
-    hidden_states = torch.load("data/fp8/moe_input_hidden_states.pt", weights_only=False, map_location="hpu")
-    router_logits = torch.load("data/fp8/moe_input_router_logits.pt", weights_only=False, map_location="hpu")
-    ref_output = torch.load("data/fp8/moe_output.pt", weights_only=False, map_location="hpu")
+    hidden_states = torch.load(get_data_path("data/fp8/moe_input_hidden_states.pt"),
+                               weights_only=False,
+                               map_location="hpu")
+    router_logits = torch.load(get_data_path("data/fp8/moe_input_router_logits.pt"),
+                               weights_only=False,
+                               map_location="hpu")
+    ref_output = torch.load(get_data_path("data/fp8/moe_output.pt"), weights_only=False, map_location="hpu")
 
     # Execute layer
     mock_ctx = MagicMock(spec=["dp_metadata"])
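
Note the other change in the MoE test: instead of monkey-patching the bound method (oot_op.quant_method.apply = torch.compile(...)), the whole layer is now compiled, matching the linear tests. A hedged sketch of the two patterns, with MoELayer as a hypothetical stand-in:

    import torch

    layer = MoELayer().to("hpu")  # hypothetical fused-MoE layer

    # Before: compile only the quant method's apply(), leaving the rest
    # of forward() in eager mode.
    # layer.quant_method.apply = torch.compile(layer.quant_method.apply)

    # After: compile the module itself, so routing and expert matmuls
    # run through a single compiled graph.
    layer = torch.compile(layer)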

tests/unit_tests/ops/test_hpu_gptq.py

Lines changed: 16 additions & 10 deletions
@@ -1,5 +1,9 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
 import torch
 import habana_frameworks.torch as htorch
+from utils import get_data_path
 from vllm_gaudi.ops.hpu_gptq import GPTQHPULinearMethod, GPTQHPUConfig
 from vllm_gaudi.utils import HPUCompileConfig
 from vllm.model_executor.layers.linear import RowParallelLinear
@@ -22,27 +26,29 @@ def test_gptq_linear_method(dist_init):
                                disable_tp=False).to("hpu")
     assert isinstance(oot_op.quant_method, GPTQHPULinearMethod)
 
-    if not htorch.utils.internal.is_lazy():
-        compile_config = HPUCompileConfig()
-        oot_op = torch.compile(oot_op, **compile_config.get_compile_args())
-
     # qweight, qzeros, scales were extracted from first RowParallelLinear of TheBloke/Llama-2-7B-Chat-GPTQ
     # (with adjusted shape, to make tensors smaller)
-    qweight = torch.load("data/gptq/qweight.pt", weights_only=False, map_location="hpu")
+    qweight = torch.load(get_data_path("data/gptq/qweight.pt"), weights_only=False, map_location="hpu")
     oot_op.qweight.copy_(qweight)
-    qzeros = torch.load("data/gptq/qzeros.pt", weights_only=False, map_location="hpu")
+    qzeros = torch.load(get_data_path("data/gptq/qzeros.pt"), weights_only=False, map_location="hpu")
     oot_op.qzeros.copy_(qzeros)
-    scales = torch.load("data/gptq/scales.pt", weights_only=False, map_location="hpu").to(torch.bfloat16)
+    scales = torch.load(get_data_path("data/gptq/scales.pt"), weights_only=False, map_location="hpu").to(torch.bfloat16)
     oot_op.scales.copy_(scales)
 
+    oot_op.quant_method.process_weights_after_loading(oot_op)
+
+    if not htorch.utils.internal.is_lazy():
+        compile_config = HPUCompileConfig()
+        oot_op = torch.compile(oot_op, **compile_config.get_compile_args())
+
     # Input and expected output
     # Output tensor holds the data that was returned by cuda implementation of GPTQLinearMethod for given input
     # (GPTQLinearMethod was triggered offline with the same input as below to get the ref_output)
-    input = torch.load("data/gptq/input.pt", weights_only=False, map_location="hpu").to(torch.bfloat16)
-    ref_output = torch.load("data/gptq/output.pt", weights_only=False, map_location="hpu").to(torch.bfloat16)
+    input = torch.load(get_data_path("data/gptq/input.pt"), weights_only=False, map_location="hpu").to(torch.bfloat16)
+    ref_output = torch.load(get_data_path("data/gptq/output.pt"), weights_only=False,
+                            map_location="hpu").to(torch.bfloat16)
 
     # Execute layer
-    oot_op.quant_method.process_weights_after_loading(oot_op)
     out = oot_op(input)
 
     # Check correctness

tests/unit_tests/ops/test_hpu_rotary_embedding.py

Lines changed: 1 addition & 0 deletions
@@ -382,6 +382,7 @@ def test_m_rotary_embedding(
         "max_position_embeddings": max_position_embeddings,
         "base": base,
         "is_neox_style": is_neox_style,
+        "mrope_section": [rotary_dim // 2]
     }
     native_rotary_data = RotaryData(cls=MRotaryEmbedding, dtype=torch.bfloat16, device="hpu")
     oot_rotary_data = RotaryData(cls=HPUMRotaryEmbedding, dtype=torch.bfloat16, device="hpu")
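
For context: vLLM's multimodal RoPE (MRotaryEmbedding) expects mrope_section to partition half of the rotary dimensions, i.e. the section sizes should sum to rotary_dim // 2, so a single section covering everything is the simplest valid value for a unit test. A small illustrative check (the concrete value is an assumption, not taken from the test):

    rotary_dim = 128                   # illustrative rotary size
    mrope_section = [rotary_dim // 2]  # one section spanning all rotation pairs
    assert sum(mrope_section) == rotary_dim // 2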

tests/unit_tests/ops/utils.py

Lines changed: 5 additions & 0 deletions
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
+import os
 import contextlib
 from vllm.model_executor.custom_op import CustomOp
 
@@ -28,3 +29,7 @@ def register_op(base_cls, oot_cls):
     within temporary_op_registry_oot context manager.
     """
     CustomOp.op_registry_oot[base_cls.__name__] = oot_cls
+
+
+def get_data_path(filename):
+    return os.path.join(os.path.dirname(__file__), filename)
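
With this helper, the fixture paths in the tests above resolve relative to tests/unit_tests/ops/ rather than the process's working directory, so the tests pass regardless of where pytest is launched from. Usage, taken from the updated tests:

    import torch
    from utils import get_data_path

    # Resolves to <tests/unit_tests/ops>/data/awq/qweight.pt for any CWD.
    qweight = torch.load(get_data_path("data/awq/qweight.pt"), weights_only=False, map_location="hpu")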
