12
12
import pytest
13
13
from transformers import AutoModelForCausalLM
14
14
15
- from QEfficient .exporter .export_hf_to_cloud_ai_100 import qualcomm_efficient_converter
16
15
from QEfficient .transformers .models .modeling_auto import QEFFAutoModelForCausalLM
17
16
from QEfficient .transformers .quantizers .auto import replace_transformers_quantizers
18
17
from QEfficient .utils import hf_download
22
21
from QEfficient .utils .run_utils import ApiRunner
23
22
24
23
test_models_qaic = [
25
- "TinyLlama/TinyLlama-1.1B-Chat-v1.0" ,
26
- "gpt2" ,
27
- "Salesforce/codegen-350M-mono" ,
28
- "microsoft/Phi-3-mini-4k-instruct" ,
24
+ # "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
25
+ # "gpt2",
26
+ # "Salesforce/codegen-350M-mono",
27
+ # "microsoft/Phi-3-mini-4k-instruct",
29
28
"tiiuae/falcon-7b" ,
30
- "Qwen/Qwen2-0.5B" ,
31
- "bigcode/starcoder2-3b" ,
32
- "Felladrin/Minueza-32M-Base" ,
33
- "wtang06/mpt-125m-c4" ,
34
- "hakurei/gpt-j-random-tinier" ,
35
- "mistralai/Mixtral-8x7B-Instruct-v0.1" ,
36
- "meta-llama/Llama-3.2-1B" ,
37
- "unsloth/gemma-2b" ,
38
- "unsloth/gemma-2-2b" ,
39
- "TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ" , # AWQ model
40
- "TheBloke/Llama-2-7B-GPTQ" , # GPTQ model
41
- "ibm-granite/granite-20b-code-base" ,
42
- # "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8-dynamic", # naive-quantized compressed-tensor FP8 model per-channel weight, per-token activations
43
- "neuralmagic/Llama-3.2-3B-Instruct-FP8" , # float quantized compressed-tensor per tensor both weight and activations
44
- "neuralmagic/Qwen2-0.5B-Instruct-FP8" , # fp8 quant method, static, with lm head ignored
45
- "ibm-granite/granite-3.1-2b-instruct" ,
46
- "ibm-granite/granite-guardian-3.1-2b" ,
29
+ # "Qwen/Qwen2-0.5B",
30
+ # "bigcode/starcoder2-3b",
31
+ # "Felladrin/Minueza-32M-Base",
32
+ # "wtang06/mpt-125m-c4",
33
+ # "hakurei/gpt-j-random-tinier",
34
+ # "mistralai/Mixtral-8x7B-Instruct-v0.1",
35
+ # "meta-llama/Llama-3.2-1B",
36
+ # "unsloth/gemma-2b",
37
+ # "unsloth/gemma-2-2b",
38
+ # "TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ", # AWQ model
39
+ # "TheBloke/Llama-2-7B-GPTQ", # GPTQ model
40
+ # "ibm-granite/granite-20b-code-base",
41
+ # # "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8-dynamic", # naive-quantized compressed-tensor FP8 model per-channel weight, per-token activations
42
+ # "neuralmagic/Llama-3.2-3B-Instruct-FP8", # float quantized compressed-tensor per tensor both weight and activations
43
+ # "neuralmagic/Qwen2-0.5B-Instruct-FP8", # fp8 quant method, static, with lm head ignored
44
+ # "ibm-granite/granite-3.1-2b-instruct",
45
+ # "ibm-granite/granite-guardian-3.1-2b",
47
46
]
48
47
49
48
test_models_qnn = [
50
- "mistralai/Mixtral-8x7B-Instruct-v0.1" ,
51
- "meta-llama/Llama-3.2-1B" ,
52
- "unsloth/gemma-2b" ,
53
- "ibm-granite/granite-guardian-3.1-2b" ,
49
+ # "mistralai/Mixtral-8x7B-Instruct-v0.1",
50
+ # "meta-llama/Llama-3.2-1B",
51
+ # "unsloth/gemma-2b",
52
+ # "ibm-granite/granite-guardian-3.1-2b",
54
53
]
55
54
56
55
spd_test_models = [
57
- "TinyLlama/TinyLlama-1.1B-Chat-v1.0" ,
58
- "Qwen/Qwen2-0.5B" ,
56
+ # "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
57
+ # "Qwen/Qwen2-0.5B",
59
58
]
60
59
61
60
@@ -215,33 +214,33 @@ def check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(
215
214
216
215
217
216
# FIXME: there should be a CB test here
218
- @pytest .mark .parametrize ("model_name" , ["gpt2" ], ids = lambda x : x )
219
- def test_causal_lm_export_with_deprecated_api (model_name ):
220
- model_config = {"model_name" : model_name }
221
- model_config ["n_layer" ] = 1
222
- model , _ = load_causal_lm_model (model_config )
223
- tokenizer = load_hf_tokenizer (pretrained_model_name_or_path = model_name )
224
- qeff_model = QEFFAutoModelForCausalLM (model , model_name = model_name , pretrained_model_name_or_path = model_name )
225
- new_api_onnx_model_path = qeff_model .export ()
226
- _ , old_api_onnx_model_path = qualcomm_efficient_converter (
227
- model_name = model_name , model_kv = qeff_model , tokenizer = tokenizer
228
- )
229
-
230
- api_runner = ApiRunner (
231
- batch_size = 1 ,
232
- tokenizer = tokenizer ,
233
- config = model .config ,
234
- prompt = Constants .INPUT_STR ,
235
- prompt_len = Constants .PROMPT_LEN ,
236
- ctx_len = Constants .CTX_LEN ,
237
- )
238
-
239
- new_api_ort_tokens = api_runner .run_kv_model_on_ort (new_api_onnx_model_path )
240
- old_api_ort_tokens = api_runner .run_kv_model_on_ort (old_api_onnx_model_path )
241
-
242
- assert (new_api_ort_tokens == old_api_ort_tokens ).all (), (
243
- "New API output does not match old API output for ONNX export function"
244
- )
217
+ # @pytest.mark.parametrize("model_name", ["gpt2"], ids=lambda x: x)
218
+ # def test_causal_lm_export_with_deprecated_api(model_name):
219
+ # model_config = {"model_name": model_name}
220
+ # model_config["n_layer"] = 1
221
+ # model, _ = load_causal_lm_model(model_config)
222
+ # tokenizer = load_hf_tokenizer(pretrained_model_name_or_path=model_name)
223
+ # qeff_model = QEFFAutoModelForCausalLM(model, model_name=model_name, pretrained_model_name_or_path=model_name)
224
+ # new_api_onnx_model_path = qeff_model.export()
225
+ # _, old_api_onnx_model_path = qualcomm_efficient_converter(
226
+ # model_name=model_name, model_kv=qeff_model, tokenizer=tokenizer
227
+ # )
228
+
229
+ # api_runner = ApiRunner(
230
+ # batch_size=1,
231
+ # tokenizer=tokenizer,
232
+ # config=model.config,
233
+ # prompt=Constants.INPUT_STR,
234
+ # prompt_len=Constants.PROMPT_LEN,
235
+ # ctx_len=Constants.CTX_LEN,
236
+ # )
237
+
238
+ # new_api_ort_tokens = api_runner.run_kv_model_on_ort(new_api_onnx_model_path)
239
+ # old_api_ort_tokens = api_runner.run_kv_model_on_ort(old_api_onnx_model_path)
240
+
241
+ # assert (new_api_ort_tokens == old_api_ort_tokens).all(), (
242
+ # "New API output does not match old API output for ONNX export function"
243
+ # )
245
244
246
245
247
246
@pytest .mark .on_qaic
@@ -260,84 +259,84 @@ def test_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(model_name):
260
259
check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100 (model_name = model_name , n_layer = n_layer )
261
260
262
261
263
- @pytest .mark .on_qaic
264
- @pytest .mark .qnn
265
- @pytest .mark .parametrize ("model_name" , test_models_qnn )
266
- def test_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100_qnn (model_name ):
267
- """
268
- QNN Compilation Test
269
- Test function to validate the PyTorch model, the PyTorch model after KV changes, the ONNX model, and the Cloud AI 100 model, both with and without continuous batching.
270
- ``Mandatory`` Args:
271
- :model_name (str): Hugging Face Model Card name, Example: ``gpt2``
272
- """
273
- if model_name == "microsoft/Phi-3-mini-4k-instruct" :
274
- n_layer = 2 # test only 2 layer models
275
- else :
276
- n_layer = 1
277
-
278
- qnn_config_json_path = os .path .join (os .getcwd (), "qnn_config.json" )
279
- create_json (qnn_config_json_path , QnnConstants .QNN_SAMPLE_CONFIG )
280
-
281
- check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100 (
282
- model_name = model_name , n_layer = n_layer , enable_qnn = True , qnn_config = qnn_config_json_path
283
- )
284
-
285
-
286
- @pytest .mark .skip () # remove when the SDK 1.20.0 issue solved for compiling this model
287
- @pytest .mark .on_qaic
288
- @pytest .mark .parametrize ("model_name" , spd_test_models )
289
- def test_causal_tlm_pytorch_vs_kv_vs_ort_vs_ai100 (model_name ):
290
- """
291
- Test function to validate the PyTorch model, the PyTorch model after KV changes, the ONNX model, and the Cloud AI 100 model, both with and without continuous batching.
292
- ``Mandatory`` Args:
293
- :model_name (str): Hugging Face Model Card name, Example: ``gpt2``
294
- """
295
-
296
- if model_name == "microsoft/Phi-3-mini-4k-instruct" :
297
- n_layer = 2 # test only 2 layer models
298
- else :
299
- n_layer = 1
300
-
301
- check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100 (
302
- model_name = model_name , n_layer = n_layer , num_speculative_tokens = Constants .NUM_SPECULATIVE_TOKENS
303
- )
304
-
305
-
306
- @pytest .mark .on_qaic
307
- def test_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100_pl1 ():
308
- """
309
- Test function to validate the PyTorch model, the PyTorch model after KV changes, the ONNX model, and the Cloud AI 100 model for a prompt length of 1, both with and without continuous batching.
310
- """
311
- model_name = "gpt2"
312
- prompt_len = 1
313
-
314
- check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100 (model_name = model_name , prompt_len = prompt_len )
315
-
316
-
317
- @pytest .mark .on_qaic
318
- @pytest .mark .qnn
319
- def test_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100_pl1_qnn ():
320
- """
321
- Test function to validate the PyTorch model, the PyTorch model after KV changes, the ONNX model, and the Cloud AI 100 model for a prompt length of 1, both with and without continuous batching.
322
- """
323
- model_name = "gpt2"
324
- prompt_len = 1
325
-
326
- qnn_config_json_path = os .path .join (os .getcwd (), "qnn_config.json" )
327
- create_json (qnn_config_json_path , QnnConstants .QNN_SAMPLE_CONFIG )
328
-
329
- check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100 (
330
- model_name = model_name , prompt_len = prompt_len , enable_qnn = True , qnn_config = qnn_config_json_path
331
- )
332
-
333
-
334
- @pytest .mark .on_qaic
335
- def test_prefiill_only_pytorch_vs_kv_vs_ort_vs_ai100 ():
336
- model_name = "gpt2"
337
- n_layer = 1
338
- check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100 (model_name , n_layer = n_layer , prefill_only = True )
339
-
340
- check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100 (model_name , n_layer = n_layer , prefill_only = False )
262
+ # @pytest.mark.on_qaic
263
+ # @pytest.mark.qnn
264
+ # @pytest.mark.parametrize("model_name", test_models_qnn)
265
+ # def test_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100_qnn(model_name):
266
+ # """
267
+ # QNN Compilation Test
268
+ # Test function to validate the PyTorch model, the PyTorch model after KV changes, the ONNX model, and the Cloud AI 100 model, both with and without continuous batching.
269
+ # ``Mandatory`` Args:
270
+ # :model_name (str): Hugging Face Model Card name, Example: ``gpt2``
271
+ # """
272
+ # if model_name == "microsoft/Phi-3-mini-4k-instruct":
273
+ # n_layer = 2 # test only 2 layer models
274
+ # else:
275
+ # n_layer = 1
276
+
277
+ # qnn_config_json_path = os.path.join(os.getcwd(), "qnn_config.json")
278
+ # create_json(qnn_config_json_path, QnnConstants.QNN_SAMPLE_CONFIG)
279
+
280
+ # check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(
281
+ # model_name=model_name, n_layer=n_layer, enable_qnn=True, qnn_config=qnn_config_json_path
282
+ # )
283
+
284
+
285
+ # @pytest.mark.skip() # remove when the SDK 1.20.0 issue solved for compiling this model
286
+ # @pytest.mark.on_qaic
287
+ # @pytest.mark.parametrize("model_name", spd_test_models)
288
+ # def test_causal_tlm_pytorch_vs_kv_vs_ort_vs_ai100(model_name):
289
+ # """
290
+ # Test function to validate the PyTorch model, the PyTorch model after KV changes, the ONNX model, and the Cloud AI 100 model, both with and without continuous batching.
291
+ # ``Mandatory`` Args:
292
+ # :model_name (str): Hugging Face Model Card name, Example: ``gpt2``
293
+ # """
294
+
295
+ # if model_name == "microsoft/Phi-3-mini-4k-instruct":
296
+ # n_layer = 2 # test only 2 layer models
297
+ # else:
298
+ # n_layer = 1
299
+
300
+ # check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(
301
+ # model_name=model_name, n_layer=n_layer, num_speculative_tokens=Constants.NUM_SPECULATIVE_TOKENS
302
+ # )
303
+
304
+
305
+ # @pytest.mark.on_qaic
306
+ # def test_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100_pl1():
307
+ # """
308
+ # Test function to validate the PyTorch model, the PyTorch model after KV changes, the ONNX model, and the Cloud AI 100 model for a prompt length of 1, both with and without continuous batching.
309
+ # """
310
+ # model_name = "gpt2"
311
+ # prompt_len = 1
312
+
313
+ # check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(model_name=model_name, prompt_len=prompt_len)
314
+
315
+
316
+ # @pytest.mark.on_qaic
317
+ # @pytest.mark.qnn
318
+ # def test_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100_pl1_qnn():
319
+ # """
320
+ # Test function to validate the PyTorch model, the PyTorch model after KV changes, the ONNX model, and the Cloud AI 100 model for a prompt length of 1, both with and without continuous batching.
321
+ # """
322
+ # model_name = "gpt2"
323
+ # prompt_len = 1
324
+
325
+ # qnn_config_json_path = os.path.join(os.getcwd(), "qnn_config.json")
326
+ # create_json(qnn_config_json_path, QnnConstants.QNN_SAMPLE_CONFIG)
327
+
328
+ # check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(
329
+ # model_name=model_name, prompt_len=prompt_len, enable_qnn=True, qnn_config=qnn_config_json_path
330
+ # )
331
+
332
+
333
+ # @pytest.mark.on_qaic
334
+ # def test_prefiill_only_pytorch_vs_kv_vs_ort_vs_ai100():
335
+ # model_name = "gpt2"
336
+ # n_layer = 1
337
+ # check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(model_name, n_layer=n_layer, prefill_only=True)
338
+
339
+ # check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(model_name, n_layer=n_layer, prefill_only=False)
341
340
342
341
343
342
@pytest .mark .on_qaic
0 commit comments