Commit d547d6f

Ruff Check and format
Signed-off-by: Amit Raj <[email protected]>
1 parent 384634c commit d547d6f

3 files changed: +123 additions, 0 deletions


QEfficient/transformers/modeling_utils.py

Lines changed: 1 addition & 0 deletions
@@ -385,4 +385,5 @@ def _create_causal_mask(
 
     return attention_mask
 
+
 VLM_SPLIT_GATE_UP_WEIGHTS = ["Llama4ForConditionalGeneration"]
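
As an aside, a minimal sketch of how this registry might be consulted; the helper below is hypothetical and not part of this commit, only the VLM_SPLIT_GATE_UP_WEIGHTS name comes from the diff:

# Hypothetical helper (not in this commit): architectures listed in
# VLM_SPLIT_GATE_UP_WEIGHTS presumably carry fused gate/up projection
# weights that need splitting into separate tensors before export.
def needs_split_gate_up(model) -> bool:
    return model.__class__.__name__ in VLM_SPLIT_GATE_UP_WEIGHTS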

examples/llama4_lm_example.py

Lines changed: 55 additions & 0 deletions
# -----------------------------------------------------------------------------
#
# Copyright (c) 2025 Qualcomm Innovation Center, Inc. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# -----------------------------------------------------------------------------

import torch
from transformers import Llama4ForCausalLM

from QEfficient import QEFFAutoModelForCausalLM
from QEfficient.utils._utils import load_hf_tokenizer
from QEfficient.utils.constants import Constants
from QEfficient.utils.run_utils import ApiRunner

torch.manual_seed(42)

# Load the HuggingFace reference model with eager attention for export.
model_id = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
model = Llama4ForCausalLM.from_pretrained(
    model_id, torch_dtype=torch.float32, use_cache=True, attn_implementation="eager"
)
model.eval()

# Handle to the original state dict (not used further in this example).
original_sd = model.state_dict()

tokenizer = load_hf_tokenizer(pretrained_model_name_or_path=model_id)
config = model.config
batch_size = len(Constants.INPUT_STR)
# ApiRunner prepares reference runs for the given prompts (not exercised further here).
api_runner = ApiRunner(
    batch_size,
    tokenizer,
    config,
    Constants.INPUT_STR,
    Constants.PROMPT_LEN,
    Constants.CTX_LEN,
)

# Wrap the HF model for QEfficient export and compilation.
qeff_model = QEFFAutoModelForCausalLM(model)

# Export to ONNX, then compile a QPC targeting 8 Cloud AI 100 devices.
onnx_model_path = qeff_model.export()
qpc_path = qeff_model.compile(
    prefill_seq_len=128,
    ctx_len=2048,
    num_cores=16,
    mxfp6_matmul=True,
    mxint8_kv_cache=True,
    num_devices=8,
    mos=1,
    aic_enable_depth_first=True,
    num_speculative_tokens=None,
)
print(f"qpc path is {qpc_path}")

# Run generation on the compiled model.
exec_info = qeff_model.generate(
    tokenizer, prompts=Constants.INPUT_STR, generation_len=32, device_ids=[0, 1, 2, 3, 4, 5, 6, 7]
)
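
For reference, a minimal sketch of decoding the result, assuming exec_info exposes a generated_ids field as the multimodal example below relies on (verify against your installed QEfficient version):

# Assumption: exec_info.generated_ids holds one token-id sequence per prompt.
for ids in exec_info.generated_ids:
    print(tokenizer.decode(ids, skip_special_tokens=True))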

examples/llama4_mm_single.py

Lines changed: 67 additions & 0 deletions
# -----------------------------------------------------------------------------
#
# Copyright (c) 2025 Qualcomm Innovation Center, Inc. All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
#
# -----------------------------------------------------------------------------

import torch
import transformers
from transformers import AutoConfig, AutoModelForImageTextToText, AutoProcessor, TextStreamer

from QEfficient import QEFFAutoModelForImageTextToText

model_id = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
config = AutoConfig.from_pretrained(model_id)
# For testing purposes only: uncomment to shrink the model to a few layers.
# config.text_config.num_hidden_layers = 1
# config.vision_config.num_hidden_layers = 2

model = AutoModelForImageTextToText.from_pretrained(model_id, attn_implementation="eager", config=config)
model.eval()

# kv_offload=True keeps the vision encoder and language model as separate compiled components.
qeff_model = QEFFAutoModelForImageTextToText(model, kv_offload=True)

# TODO: Map the vision encoder to FP16 only and disable MXFP6 for better accuracy.
qeff_model.compile(
    prefill_seq_len=128,
    ctx_len=3072,
    img_size=336,
    num_cores=16,
    num_devices=8,
    batch_size_times_num_tiles=17,
    mxfp6_matmul=True,
    mxint8_kv_cache=True,
    aic_enable_depth_first=True,
    mos=1,
)

image_url = (
    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/cat_style_layout.png"
)

messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "url": image_url},
            {"type": "text", "text": "Can you describe the image in detail?"},
        ],
    },
]

tokenizer = transformers.AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(model_id)
# Build model inputs (token ids plus pixel values) from the chat template.
inputs = processor.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
)
inputs["pixel_values"] = inputs["pixel_values"].to(torch.float32)
streamer = TextStreamer(tokenizer)  # created but not passed to generate in this example
output = qeff_model.generate(inputs=inputs, device_ids=[0, 1, 2, 3, 4, 5, 6, 7], generation_len=100)
print(output.generated_ids)
print(tokenizer.batch_decode(output.generated_ids))
print(output)
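
For a quick functional check without exercising the full 17B weights, the commented-out config lines above suggest a truncated variant; the sketch below assumes that intent, and compile settings such as num_devices and batch_size_times_num_tiles would likely need retuning:

# Hypothetical quick-test setup based on the commented lines above.
config = AutoConfig.from_pretrained(model_id)
config.text_config.num_hidden_layers = 1  # shrink the language tower
config.vision_config.num_hidden_layers = 2  # shrink the vision tower
model = AutoModelForImageTextToText.from_pretrained(
    model_id, attn_implementation="eager", config=config
)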
