
Commit 322ad6e

fix inference issue (#529)
1 parent 03f3dcd commit 322ad6e

4 files changed, +24 -22 lines changed

README.md

Lines changed: 1 addition & 1 deletion
@@ -272,7 +272,7 @@ inference. **[2,3,4,8] bits are supported**. However, it has not yet gained wide
 **AutoGPTQ Format**: This format is well-suited for symmetric quantization on CUDA devices and is widely adopted by the
 community, **[2,3,4,8] bits are supported**. However, **the
 asymmetric kernel has issues** that can cause considerable accuracy drops, particularly at 2-bit quantization and small
-models.
+models. Besides, 3-bit quantization may currently have some accuracy issues in Transformers.

 **AutoAWQ Format**: This format is well-suited for asymmetric 4-bit quantization on CUDA devices and is widely
 adopted within the community, **only 4-bits quantization is supported**.
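
For context, a minimal sketch of how a model is typically quantized with this library and exported to the AutoGPTQ format discussed above; the model name, bit width, and group size are illustrative assumptions, not part of this commit.

# A minimal sketch, assuming the AutoRound API exposed by this repository;
# the settings below are illustrative only.
from transformers import AutoModelForCausalLM, AutoTokenizer
from auto_round import AutoRound

model_name = "facebook/opt-125m"  # hypothetical small model for demonstration
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Symmetric 4-bit quantization: per the README text above, the AutoGPTQ
# asymmetric kernel can cause accuracy drops, so sym=True is the safer choice
# when exporting to this format.
autoround = AutoRound(model, tokenizer, bits=4, group_size=128, sym=True)
autoround.quantize()
autoround.save_quantized("./opt-125m-w4g128", format="auto_gptq")

Passing format="auto_awq" instead (assuming the same save_quantized signature) targets the AutoAWQ path, which per the README supports 4-bit asymmetric quantization only.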

auto_round/inference/backend.py

Lines changed: 13 additions & 13 deletions
@@ -101,8 +101,8 @@ def feature_multiply_checker_group_size(in_feature, out_feature, group_size, in_
 bits=[4],
 priority=5,
 dtype=["float16"],
-group_size=[-1, 32, 64, 128, 256, 384, 512, 1024, 2048],
-##16 seems has accuracy issue
+##16, 384,768 accuracy issue
+group_size=[-1, 32, 64, 128, 256, 512, 1024, 2048],
 feature_checks=[exllamav2_feature_check],
 alias=['gptq', 'auto_gptq', 'exllamav2', "gptq:exllamav2",
 "auto_gptq:exllamav2"],
@@ -180,33 +180,33 @@ def feature_multiply_checker_group_size(in_feature, out_feature, group_size, in_
 bits=[4], group_size=None,
 priority=5,
 dtype=["float16"],
-alias=["auto_awq:gemm", "awq","awq:gemm",
+alias=["auto_awq:gemm", "awq", "awq:gemm",
 "auto_awq"],
 requirements=["autoawq"]
 )

 BackendInfos['qbits'] = BackendInfo(device=["cpu"], sym=[True, False],
-packing_format="qbits",
-bits=[2, 4, 8], group_size=None,
-priority=0,
-feature_checks=[],
-alias=["itrex", "qbits"],
-dtype=["float16", "bfloat16"],
-convertable_format=["int32"],
-requirements=["intel-extension-for-transformers"])
+packing_format="qbits",
+bits=[2, 4, 8], group_size=None,
+priority=0,
+feature_checks=[],
+alias=["itrex", "qbits"],
+dtype=["float16", "bfloat16"],
+convertable_format=["int32"],
+requirements=["intel-extension-for-transformers"])

 BackendInfos['qbits_zp'] = BackendInfo(device=["cpu"], sym=[True, False],
 packing_format="qbits_zp",
 bits=[2, 4, 8], group_size=None,
 dtype=["float16", "bfloat16"],
-priority=0 ,
+priority=0,
 feature_checks=[],
 alias=["itrex", "qbits"],
 convertable_format=["int32_zp"],
 requirements=["intel-extension-for-transformers"]
 )

-BackendInfos['auto_round:qbits_awq'] = BackendInfo(device=["cpu"], sym=[True,False], ## for awq, not robust
+BackendInfos['auto_round:qbits_awq'] = BackendInfo(device=["cpu"], sym=[True, False], ## for awq, not robust
 packing_format="awq",
 bits=[2, 4, 8], group_size=None,
 priority=0,
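
The substantive change in this file is dropping group size 384 from the exllamav2 GPTQ backend's supported list, with 16 and 768 also noted as problematic. As a self-contained illustration of the dispatch idea (SimpleBackendInfo below is hypothetical, not the library's BackendInfo), a backend advertises the group sizes it handles accurately and is skipped for anything else:

from dataclasses import dataclass, field

@dataclass
class SimpleBackendInfo:
    # Hypothetical stand-in for illustration only.
    name: str
    bits: list
    # Group sizes known to be accurate; 16, 384 and 768 are deliberately
    # excluded, mirroring the comment in the diff above.
    group_sizes: list = field(
        default_factory=lambda: [-1, 32, 64, 128, 256, 512, 1024, 2048])

    def supports(self, bits: int, group_size: int) -> bool:
        return bits in self.bits and group_size in self.group_sizes

backend = SimpleBackendInfo(name="gptq:exllamav2", bits=[4])
print(backend.supports(4, 128))  # True
print(backend.supports(4, 384))  # False: excluded due to accuracy issues

In the real code this gating is handled by the group_size field of BackendInfo together with the registered feature checks such as exllamav2_feature_check.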

auto_round/inference/convert_model.py

Lines changed: 7 additions & 6 deletions
@@ -385,8 +385,9 @@ def _import_exllamav2_kernels():
 from exllamav2_kernels import gemm_half_q_half, make_q_matrix # pylint: disable=E0611, E0401
 except ImportError:
 raise ImportError(
-"For better inference performance, install ExLlamaV2 kernel via: "
-"`pip install git+https://github.com/AutoGPTQ/AutoGPTQ.git@b8b4127`")
+"AutoGPTQ ExLlamaV2 has not been installed. Please install it using the following command: "
+"`pip install git+https://github.com/AutoGPTQ/AutoGPTQ.git@b8b4127`"
+)


 def _create_quant_layer(layer, layer_backend, config, in_features, out_features):
@@ -450,10 +451,10 @@ def post_init(model, used_backends):
 if need_autogptq_init:
 from auto_gptq.modeling._utils import autogptq_post_init as gptq_post_init # pylint: disable=E0401
 model = gptq_post_init(model, use_act_order=False)
-elif need_gptqmodel_init:
+if need_gptqmodel_init:
 from gptqmodel.utils.model import hf_gptqmodel_post_init as gptq_post_init # pylint: disable=E0401
 model = gptq_post_init(model, use_act_order=False)
-elif need_ipex_itrex_init:
+if need_ipex_itrex_init:
 message = "repacking to CPU/XPU format"
 layers = [] ## ipex post_init will add one more layer
 for n, m in model.named_modules():
@@ -464,8 +465,8 @@ def post_init(model, used_backends):
 leave=True):
 layer.post_init()

-if used_gptq_exllamav2:
-_import_exllamav2_kernels()
+if used_gptq_exllamav2:
+_import_exllamav2_kernels()

 ## convert datatype
 data_types = []
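
The elif → if change above matters when one model mixes layers handled by different backends: each post-initialization step must be applied independently rather than as mutually exclusive branches. A minimal sketch of the pattern, with hypothetical stand-in initializers rather than the library's helpers:

def init_autogptq(model):
    # Hypothetical stand-in for autogptq_post_init
    print("autogptq kernels initialized")
    return model

def init_gptqmodel(model):
    # Hypothetical stand-in for hf_gptqmodel_post_init
    print("gptqmodel kernels initialized")
    return model

def post_init(model, used_backends):
    # Independent `if` blocks: both initializers run when a model uses both
    # backends, whereas `elif` would silently skip all but the first match.
    if "auto_gptq" in used_backends:
        model = init_autogptq(model)
    if "gptqmodel" in used_backends:
        model = init_gptqmodel(model)
    return model

post_init(object(), {"auto_gptq", "gptqmodel"})  # both messages are printed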

test_cuda/test_exllamav2_backend.py

Lines changed: 3 additions & 2 deletions
@@ -143,7 +143,8 @@ def test_gptq_exllamav2_4bits_sym(self):
 shutil.rmtree(self.save_folder, ignore_errors=True)

 def test_gptq_exllamav2_4bits_sym_group_size(self):
-for group_size in [32, 512, 1024]:
+for group_size in [-1, 32, 64, 128, 256, 1024]: ## 384, 768 has accuracy issue
+print(f"!!!!!!!!!!!!!!!!!{group_size}!!!!!!!!!!!!!!!!!")
 model = AutoModelForCausalLM.from_pretrained(self.model_name, torch_dtype="auto", trust_remote_code=True)
 tokenizer = AutoTokenizer.from_pretrained(self.model_name, trust_remote_code=True)
 bits, group_size, sym = 4, group_size, True
@@ -170,7 +171,7 @@ def test_gptq_exllamav2_4bits_sym_group_size(self):

 tokenizer = AutoTokenizer.from_pretrained(self.save_folder)
 self.model_infer(model, tokenizer)
-result = simple_evaluate_user_model(model, tokenizer, batch_size=16, tasks="lambada_openai")
+result = simple_evaluate_user_model(model, tokenizer, batch_size=64, tasks="lambada_openai")
 print(result['results']['lambada_openai']['acc,none'])
 self.assertGreater(result['results']['lambada_openai']['acc,none'], 0.15)
 torch.cuda.empty_cache()
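
The loop above repeats the full quantize-and-evaluate cycle for each group size; with unittest, such loops are often written with subTest so that one failing size is reported without hiding the rest. A hypothetical, CPU-only simplification (not the real GPU test):

import unittest

class TestGroupSizeLoop(unittest.TestCase):
    # Mirrors the supported list in backend.py above; 16, 384 and 768 are
    # excluded because of the reported accuracy issues.
    SUPPORTED = [-1, 32, 64, 128, 256, 512, 1024, 2048]

    def test_supported_group_sizes(self):
        for group_size in [-1, 32, 64, 128, 256, 1024]:
            with self.subTest(group_size=group_size):
                self.assertIn(group_size, self.SUPPORTED)

if __name__ == "__main__":
    unittest.main()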
