1+ # SPDX-License-Identifier: Apache-2.0 
2+ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project 
3+ 
14import  torch 
25import  habana_frameworks .torch  as  htorch 
6+ from  utils  import  get_data_path 
37from  unittest .mock  import  MagicMock 
48from  vllm_gaudi .ops .hpu_fp8  import  Fp8LinearMethod , HPUFp8MoEMethod 
59from  vllm_gaudi .utils  import  HPUCompileConfig 
@@ -29,9 +33,11 @@ def test_fp8_linear_method(dist_init, monkeypatch):
2933
3034    # Load weight and weight_scale_inv, extracted from the first RowParallelLinear layer of Qwen/Qwen3-8B-FP8 
3135    # (with adjusted shapes, to make tensors smaller) 
32-     weight  =  torch .load ("data/fp8/linear_weight.pt" , weights_only = False , map_location = "hpu" )
36+     weight  =  torch .load (get_data_path ( "data/fp8/linear_weight.pt" ) , weights_only = False , map_location = "hpu" )
3337    oot_op .weight .copy_ (weight )
34-     weight_scale_inv  =  torch .load ("data/fp8/linear_weight_scale_inv.pt" , weights_only = False , map_location = "hpu" )
38+     weight_scale_inv  =  torch .load (get_data_path ("data/fp8/linear_weight_scale_inv.pt" ),
39+                                   weights_only = False ,
40+                                   map_location = "hpu" )
3541    oot_op .weight_scale_inv .copy_ (weight_scale_inv )
3642
3743    oot_op .quant_method .process_weights_after_loading (oot_op )
@@ -44,8 +50,8 @@ def test_fp8_linear_method(dist_init, monkeypatch):
4450    # Input and expected output 
4551    # Output tensor holds the data that was returned by the CUDA implementation of Fp8LinearMethod for the given input 
4652    # (Fp8LinearMethod was triggered offline with the same input as below to get the ref_output) 
47-     input  =  torch .load ("data/fp8/linear_input.pt" , weights_only = False , map_location = "hpu" )
48-     ref_output  =  torch .load ("data/fp8/linear_output.pt" , weights_only = False , map_location = "hpu" )
53+     input  =  torch .load (get_data_path ( "data/fp8/linear_input.pt" ) , weights_only = False , map_location = "hpu" )
54+     ref_output  =  torch .load (get_data_path ( "data/fp8/linear_output.pt" ) , weights_only = False , map_location = "hpu" )
4955
5056    # Execute layer 
5157    out  =  oot_op (input )
@@ -94,27 +100,35 @@ def test_fp8_moe_method(dist_init, monkeypatch):
94100
95101    # Weights were extracted from first FusedMoE layer of Qwen/Qwen3-30B-A3B-FP8 
96102    # (with adjusted shapes, to make tensors smaller) 
97-     w13_weight  =  torch .load ("data/fp8/moe_w13_weight.pt" , weights_only = False , map_location = "hpu" )
103+     w13_weight  =  torch .load (get_data_path ( "data/fp8/moe_w13_weight.pt" ) , weights_only = False , map_location = "hpu" )
98104    oot_op .w13_weight .copy_ (w13_weight .repeat (128 , 1 , 1 ))
99-     w13_weight_scale_inv  =  torch .load ("data/fp8/moe_w13_weight_scale_inv.pt" , weights_only = False , map_location = "hpu" )
105+     w13_weight_scale_inv  =  torch .load (get_data_path ("data/fp8/moe_w13_weight_scale_inv.pt" ),
106+                                       weights_only = False ,
107+                                       map_location = "hpu" )
100108    oot_op .w13_weight_scale_inv .copy_ (w13_weight_scale_inv .repeat (128 , 1 , 1 ))
101-     w2_weight  =  torch .load ("data/fp8/moe_w2_weight.pt" , weights_only = False , map_location = "hpu" )
109+     w2_weight  =  torch .load (get_data_path ( "data/fp8/moe_w2_weight.pt" ) , weights_only = False , map_location = "hpu" )
102110    oot_op .w2_weight .copy_ (w2_weight .repeat (128 , 1 , 1 ))
103-     w2_weight_scale_inv  =  torch .load ("data/fp8/moe_w2_weight_scale_inv.pt" , weights_only = False , map_location = "hpu" )
111+     w2_weight_scale_inv  =  torch .load (get_data_path ("data/fp8/moe_w2_weight_scale_inv.pt" ),
112+                                      weights_only = False ,
113+                                      map_location = "hpu" )
104114    oot_op .w2_weight_scale_inv .copy_ (w2_weight_scale_inv .repeat (128 , 1 , 1 ))
105115
106116    oot_op .quant_method .process_weights_after_loading (oot_op )
107117
108118    if  not  htorch .utils .internal .is_lazy ():
109119        compile_config  =  HPUCompileConfig ()
110-         oot_op . quant_method . apply  =  torch .compile (oot_op . quant_method . apply , ** compile_config .get_compile_args ())
120+         oot_op  =  torch .compile (oot_op , ** compile_config .get_compile_args ())
111121
112122    # Input and expected output 
113123    # Output tensor holds the data that was returned by the CUDA implementation of Fp8MoEMethod for the given input 
114124    # (Fp8MoEMethod was triggered offline with the same input as below to get the ref_output) 
115-     hidden_states  =  torch .load ("data/fp8/moe_input_hidden_states.pt" , weights_only = False , map_location = "hpu" )
116-     router_logits  =  torch .load ("data/fp8/moe_input_router_logits.pt" , weights_only = False , map_location = "hpu" )
117-     ref_output  =  torch .load ("data/fp8/moe_output.pt" , weights_only = False , map_location = "hpu" )
125+     hidden_states  =  torch .load (get_data_path ("data/fp8/moe_input_hidden_states.pt" ),
126+                                weights_only = False ,
127+                                map_location = "hpu" )
128+     router_logits  =  torch .load (get_data_path ("data/fp8/moe_input_router_logits.pt" ),
129+                                weights_only = False ,
130+                                map_location = "hpu" )
131+     ref_output  =  torch .load (get_data_path ("data/fp8/moe_output.pt" ), weights_only = False , map_location = "hpu" )
118132
119133    # Execute layer 
120134    mock_ctx  =  MagicMock (spec = ["dp_metadata" ])
0 commit comments