
Commit 322ad6e

fix inference issue (#529)
1 parent 03f3dcd commit 322ad6e

4 files changed, +24 -22 lines changed

README.md

Lines changed: 1 addition & 1 deletion
@@ -272,7 +272,7 @@ inference. **[2,3,4,8] bits are supported**. However, it has not yet gained wide
 **AutoGPTQ Format**: This format is well-suited for symmetric quantization on CUDA devices and is widely adopted by the
 community, **[2,3,4,8] bits are supported**. However, **the
 asymmetric kernel has issues** that can cause considerable accuracy drops, particularly at 2-bit quantization and small
-models.
+models. Besides, 3-bit quantization may currently have some accuracy issues in Transformers.

 **AutoAWQ Format**: This format is well-suited for asymmetric 4-bit quantization on CUDA devices and is widely
 adopted within the community, **only 4-bits quantization is supported**.
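
For context, a minimal sketch of how a model is typically quantized with this library and exported to the AutoGPTQ format discussed above; the model name, bit width, and group size are illustrative assumptions, not part of this commit.

# A minimal sketch, assuming the AutoRound API exposed by this repository;
# the settings below are illustrative only.
from transformers import AutoModelForCausalLM, AutoTokenizer
from auto_round import AutoRound

model_name = "facebook/opt-125m"  # hypothetical small model for demonstration
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Symmetric 4-bit quantization: per the README text above, the AutoGPTQ
# asymmetric kernel can cause accuracy drops, so sym=True is the safer choice
# when exporting to this format.
autoround = AutoRound(model, tokenizer, bits=4, group_size=128, sym=True)
autoround.quantize()
autoround.save_quantized("./opt-125m-w4g128", format="auto_gptq")

Passing format="auto_awq" instead (assuming the same save_quantized signature) targets the AutoAWQ path, which per the README supports 4-bit asymmetric quantization only.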

auto_round/inference/backend.py

Lines changed: 13 additions & 13 deletions
@@ -101,8 +101,8 @@ def feature_multiply_checker_group_size(in_feature, out_feature, group_size, in_
 bits=[4],
 priority=5,
 dtype=["float16"],
-group_size=[-1, 32, 64, 128, 256, 384, 512, 1024, 2048],
-##16 seems has accuracy issue
+##16, 384,768 accuracy issue
+group_size=[-1, 32, 64, 128, 256, 512, 1024, 2048],
 feature_checks=[exllamav2_feature_check],
 alias=['gptq', 'auto_gptq', 'exllamav2', "gptq:exllamav2",
 "auto_gptq:exllamav2"],
@@ -180,33 +180,33 @@ def feature_multiply_checker_group_size(in_feature, out_feature, group_size, in_
 bits=[4], group_size=None,
 priority=5,
 dtype=["float16"],
-alias=["auto_awq:gemm", "awq","awq:gemm",
+alias=["auto_awq:gemm", "awq", "awq:gemm",
 "auto_awq"],
 requirements=["autoawq"]
 )

 BackendInfos['qbits'] = BackendInfo(device=["cpu"], sym=[True, False],
-packing_format="qbits",
-bits=[2, 4, 8], group_size=None,
-priority=0,
-feature_checks=[],
-alias=["itrex", "qbits"],
-dtype=["float16", "bfloat16"],
-convertable_format=["int32"],
-requirements=["intel-extension-for-transformers"])
+packing_format="qbits",
+bits=[2, 4, 8], group_size=None,
+priority=0,
+feature_checks=[],
+alias=["itrex", "qbits"],
+dtype=["float16", "bfloat16"],
+convertable_format=["int32"],
+requirements=["intel-extension-for-transformers"])

 BackendInfos['qbits_zp'] = BackendInfo(device=["cpu"], sym=[True, False],
 packing_format="qbits_zp",
 bits=[2, 4, 8], group_size=None,
 dtype=["float16", "bfloat16"],
-priority=0 ,
+priority=0,
 feature_checks=[],
 alias=["itrex", "qbits"],
 convertable_format=["int32_zp"],
 requirements=["intel-extension-for-transformers"]
 )

-BackendInfos['auto_round:qbits_awq'] = BackendInfo(device=["cpu"], sym=[True,False], ## for awq, not robust
+BackendInfos['auto_round:qbits_awq'] = BackendInfo(device=["cpu"], sym=[True, False], ## for awq, not robust
 packing_format="awq",
 bits=[2, 4, 8], group_size=None,
 priority=0,
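
The substantive change in this file is dropping group size 384 from the exllamav2 GPTQ backend's supported list, with 16 and 768 also noted as problematic. As a self-contained illustration of the dispatch idea (SimpleBackendInfo below is hypothetical, not the library's BackendInfo), a backend advertises the group sizes it handles accurately and is skipped for anything else:

from dataclasses import dataclass, field

@dataclass
class SimpleBackendInfo:
    # Hypothetical stand-in for illustration only.
    name: str
    bits: list
    # Group sizes known to be accurate; 16, 384 and 768 are deliberately
    # excluded, mirroring the comment in the diff above.
    group_sizes: list = field(
        default_factory=lambda: [-1, 32, 64, 128, 256, 512, 1024, 2048])

    def supports(self, bits: int, group_size: int) -> bool:
        return bits in self.bits and group_size in self.group_sizes

backend = SimpleBackendInfo(name="gptq:exllamav2", bits=[4])
print(backend.supports(4, 128))  # True
print(backend.supports(4, 384))  # False: excluded due to accuracy issues

In the real code this gating is handled by the group_size field of BackendInfo together with the registered feature checks such as exllamav2_feature_check.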

auto_round/inference/convert_model.py

Lines changed: 7 additions & 6 deletions
@@ -385,8 +385,9 @@ def _import_exllamav2_kernels():
 from exllamav2_kernels import gemm_half_q_half, make_q_matrix # pylint: disable=E0611, E0401
 except ImportError:
 raise ImportError(
-"For better inference performance, install ExLlamaV2 kernel via: "
-"`pip install git+https://github.com/AutoGPTQ/AutoGPTQ.git@b8b4127`")
+"AutoGPTQ ExLlamaV2 has not been installed. Please install it using the following command: "
+"`pip install git+https://github.com/AutoGPTQ/AutoGPTQ.git@b8b4127`"
+)


 def _create_quant_layer(layer, layer_backend, config, in_features, out_features):
@@ -450,10 +451,10 @@ def post_init(model, used_backends):
 if need_autogptq_init:
 from auto_gptq.modeling._utils import autogptq_post_init as gptq_post_init # pylint: disable=E0401
 model = gptq_post_init(model, use_act_order=False)
-elif need_gptqmodel_init:
+if need_gptqmodel_init:
 from gptqmodel.utils.model import hf_gptqmodel_post_init as gptq_post_init # pylint: disable=E0401
 model = gptq_post_init(model, use_act_order=False)
-elif need_ipex_itrex_init:
+if need_ipex_itrex_init:
 message = "repacking to CPU/XPU format"
 layers = [] ## ipex post_init will add one more layer
 for n, m in model.named_modules():
@@ -464,8 +465,8 @@ def post_init(model, used_backends):
 leave=True):
 layer.post_init()

-if used_gptq_exllamav2:
-_import_exllamav2_kernels()
+if used_gptq_exllamav2:
+_import_exllamav2_kernels()

 ## convert datatype
 data_types = []
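
The elif → if change above matters when one model mixes layers handled by different backends: each post-initialization step must be applied independently rather than as mutually exclusive branches. A minimal sketch of the pattern, with hypothetical stand-in initializers rather than the library's helpers:

def init_autogptq(model):
    # Hypothetical stand-in for autogptq_post_init
    print("autogptq kernels initialized")
    return model

def init_gptqmodel(model):
    # Hypothetical stand-in for hf_gptqmodel_post_init
    print("gptqmodel kernels initialized")
    return model

def post_init(model, used_backends):
    # Independent `if` blocks: both initializers run when a model uses both
    # backends, whereas `elif` would silently skip all but the first match.
    if "auto_gptq" in used_backends:
        model = init_autogptq(model)
    if "gptqmodel" in used_backends:
        model = init_gptqmodel(model)
    return model

post_init(object(), {"auto_gptq", "gptqmodel"})  # both messages are printed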

test_cuda/test_exllamav2_backend.py

Lines changed: 3 additions & 2 deletions
@@ -143,7 +143,8 @@ def test_gptq_exllamav2_4bits_sym(self):
 shutil.rmtree(self.save_folder, ignore_errors=True)

 def test_gptq_exllamav2_4bits_sym_group_size(self):
-for group_size in [32, 512, 1024]:
+for group_size in [-1, 32, 64, 128, 256, 1024]: ## 384, 768 has accuracy issue
+print(f"!!!!!!!!!!!!!!!!!{group_size}!!!!!!!!!!!!!!!!!")
 model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True)
 tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True)
 bits, group_size, sym = 4, group_size, True
@@ -170,7 +171,7 @@ def test_gptq_exllamav2_4bits_sym_group_size(self):

 tokenizer = AutoTokenizer.from_pretrained(self.save_folder)
 self.model_infer(model, tokenizer)
-result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai")
+result = simple_evaluate_user_model(model, tokenizer, batch_size=64, tasks="lambada_openai")
 print(result['results']['lambada_openai']['acc,none'])
 self.assertGreater(result['results']['lambada_openai']['acc,none'], 0.15)
 torch.cuda.empty_cache()
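
The loop above repeats the full quantize-and-evaluate cycle for each group size; with unittest, such loops are often written with subTest so that one failing size is reported without hiding the rest. A hypothetical, CPU-only simplification (not the real GPU test):

import unittest

class TestGroupSizeLoop(unittest.TestCase):
    # Mirrors the supported list in backend.py above; 16, 384 and 768 are
    # excluded because of the reported accuracy issues.
    SUPPORTED = [-1, 32, 64, 128, 256, 512, 1024, 2048]

    def test_supported_group_sizes(self):
        for group_size in [-1, 32, 64, 128, 256, 1024]:
            with self.subTest(group_size=group_size):
                self.assertIn(group_size, self.SUPPORTED)

if __name__ == "__main__":
    unittest.main()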
