1+ # SPDX-License-Identifier: Apache-2.0
2+ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3+
14import torch
25import habana_frameworks .torch as htorch
6+ from utils import get_data_path
37from vllm_gaudi .ops .hpu_gptq import GPTQHPULinearMethod , GPTQHPUConfig
48from vllm_gaudi .utils import HPUCompileConfig
59from vllm .model_executor .layers .linear import RowParallelLinear
@@ -22,27 +26,29 @@ def test_gptq_linear_method(dist_init):
2226 disable_tp = False ).to ("hpu" )
2327 assert isinstance (oot_op .quant_method , GPTQHPULinearMethod )
2428
25- if not htorch .utils .internal .is_lazy ():
26- compile_config = HPUCompileConfig ()
27- oot_op = torch .compile (oot_op , ** compile_config .get_compile_args ())
28-
2930 # qweight, qzeros and scales were extracted from the first RowParallelLinear layer of TheBloke/Llama-2-7B-Chat-GPTQ
3031 # (with shapes adjusted to make the tensors smaller)
31- qweight = torch .load ("data/gptq/qweight.pt" , weights_only = False , map_location = "hpu" )
31+ qweight = torch .load (get_data_path ( "data/gptq/qweight.pt" ) , weights_only = False , map_location = "hpu" )
3232 oot_op .qweight .copy_ (qweight )
33- qzeros = torch .load ("data/gptq/qzeros.pt" , weights_only = False , map_location = "hpu" )
33+ qzeros = torch .load (get_data_path ( "data/gptq/qzeros.pt" ) , weights_only = False , map_location = "hpu" )
3434 oot_op .qzeros .copy_ (qzeros )
35- scales = torch .load ("data/gptq/scales.pt" , weights_only = False , map_location = "hpu" ).to (torch .bfloat16 )
35+ scales = torch .load (get_data_path ( "data/gptq/scales.pt" ) , weights_only = False , map_location = "hpu" ).to (torch .bfloat16 )
3636 oot_op .scales .copy_ (scales )
3737
38+ oot_op .quant_method .process_weights_after_loading (oot_op )
39+
40+ if not htorch .utils .internal .is_lazy ():
41+ compile_config = HPUCompileConfig ()
42+ oot_op = torch .compile (oot_op , ** compile_config .get_compile_args ())
43+
3844 # Input and expected output
3945 # The output tensor holds the data that was returned by the CUDA implementation of GPTQLinearMethod for the given input
4046 # (GPTQLinearMethod was run offline with the same input as below to obtain ref_output)
41- input = torch .load ("data/gptq/input.pt" , weights_only = False , map_location = "hpu" ).to (torch .bfloat16 )
42- ref_output = torch .load ("data/gptq/output.pt" , weights_only = False , map_location = "hpu" ).to (torch .bfloat16 )
47+ input = torch .load (get_data_path ("data/gptq/input.pt" ), weights_only = False , map_location = "hpu" ).to (torch .bfloat16 )
48+ ref_output = torch .load (get_data_path ("data/gptq/output.pt" ), weights_only = False ,
49+ map_location = "hpu" ).to (torch .bfloat16 )
4350
4451 # Execute layer
45- oot_op .quant_method .process_weights_after_loading (oot_op )
4652 out = oot_op (input )
4753
4854 # Check correctness
0 commit comments