Ruff Check and format

quic-amitraj · quic-amitraj · commit d547d6fde356 · 2025-05-08T05:31:20.000Z
Signed-off-by: Amit Raj <quic_amitraj@quicinc.com>
diff --git a/QEfficient/transformers/modeling_utils.py b/QEfficient/transformers/modeling_utils.py
@@ -385,4 +385,5 @@ def _create_causal_mask(
 
     return attention_mask
 
+
 VLM_SPLIT_GATE_UP_WEIGHTS = ["Llama4ForConditionalGeneration"]
diff --git a/examples/llama4_lm_example.py b/examples/llama4_lm_example.py
@@ -0,0 +1,55 @@
+# -----------------------------------------------------------------------------
+#
+# Copyright (c) 2025 Qualcomm Innovation Center, Inc. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# -----------------------------------------------------------------------------
+
+import torch
+from transformers import Llama4ForCausalLM
+
+from QEfficient import QEFFAutoModelForCausalLM
+from QEfficient.utils._utils import load_hf_tokenizer
+from QEfficient.utils.constants import Constants
+from QEfficient.utils.run_utils import ApiRunner
+
+torch.manual_seed(42)
+
+model_id = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
+model = Llama4ForCausalLM.from_pretrained(
+    model_id, torch_dtype=torch.float32, use_cache=True, attn_implementation="eager"
+)
+model.eval()
+
+original_sd = model.state_dict()
+
+tokenizer = load_hf_tokenizer(pretrained_model_name_or_path=model_id)
+config = model.config
+batch_size = len(Constants.INPUT_STR)
+api_runner = ApiRunner(
+    batch_size,
+    tokenizer,
+    config,
+    Constants.INPUT_STR,
+    Constants.PROMPT_LEN,
+    Constants.CTX_LEN,
+)
+
+qeff_model = QEFFAutoModelForCausalLM(model)
+
+onnx_model_path = qeff_model.export()
+qpc_path = qeff_model.compile(
+    prefill_seq_len=128,
+    ctx_len=2048,
+    num_cores=16,
+    mxfp6_matmul=True,
+    mxint8_kv_cache=True,
+    num_devices=8,
+    mos=1,
+    aic_enable_depth_first=True,
+    num_speculative_tokens=None,
+)
+print(f"qpc path is {qpc_path}")
+exec_info = qeff_model.generate(
+    tokenizer, prompts=Constants.INPUT_STR, generation_len=32, device_ids=[0, 1, 2, 3, 4, 5, 6, 7]
+)
diff --git a/examples/llama4_mm_single.py b/examples/llama4_mm_single.py
@@ -0,0 +1,67 @@
+# -----------------------------------------------------------------------------
+#
+# Copyright (c) 2025 Qualcomm Innovation Center, Inc. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# -----------------------------------------------------------------------------
+
+import torch
+import transformers
+from transformers import AutoConfig, AutoModelForImageTextToText, AutoProcessor, TextStreamer
+
+from QEfficient import QEFFAutoModelForImageTextToText
+
+model_id = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
+config = AutoConfig.from_pretrained(model_id)
+# For Testing Purpose Only
+# config.text_config.num_hidden_layers = 1
+# config.vision_config.num_hidden_layers = 2
+
+model = AutoModelForImageTextToText.from_pretrained(model_id, attn_implementation="eager", config=config)
+model.eval()
+
+qeff_model = QEFFAutoModelForImageTextToText(model, kv_offload=True)
+
+# TODO: Map the Vision Encoder to FP16 Only and Disable MXFP6 For Better Accuracy.
+qeff_model.compile(
+    prefill_seq_len=128,
+    ctx_len=3072,
+    img_size=336,
+    num_cores=16,
+    num_devices=8,
+    batch_size_times_num_tiles=17,
+    mxfp6_matmul=True,
+    mxint8_kv_cache=True,
+    aic_enable_depth_first=True,
+    mos=1,
+)
+
+image_url = (
+    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/cat_style_layout.png"
+)
+
+messages = [
+    {
+        "role": "user",
+        "content": [
+            {"type": "image", "url": image_url},
+            {"type": "text", "text": "Can you describe the image in detail."},
+        ],
+    },
+]
+
+tokenizer = transformers.AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
+processor = AutoProcessor.from_pretrained(model_id)
+inputs = processor.apply_chat_template(
+    messages,
+    add_generation_prompt=True,
+    tokenize=True,
+    return_dict=True,
+    return_tensors="pt",
+)
+inputs["pixel_values"] = inputs["pixel_values"].to(torch.float32)
+streamer = TextStreamer(tokenizer)
+output = qeff_model.generate(inputs=inputs, device_ids=[0, 1, 2, 3, 4, 5, 6, 7], generation_len=100)
+print(output.generated_ids)
+print(tokenizer.batch_decode(output.generated_ids))
+print(output)

Original file line number	Diff line number	Diff line change
`@@ -385,4 +385,5 @@ def _create_causal_mask(`
`385`	`385`
`386`	`386`	`return attention_mask`
`387`	`387`
	`388`	`+`
`388`	`389`	`VLM_SPLIT_GATE_UP_WEIGHTS = ["Llama4ForConditionalGeneration"]`