Enable debug #416

Draft · wants to merge 4 commits into main

2 changes: 1 addition & 1 deletion QEfficient/generation/cloud_infer.py
@@ -47,7 +47,7 @@ def __init__(
qpc_path: Union[Path, str],
device_ids: Optional[List[int]] = None,
activate: bool = True,
enable_debug_logs: bool = False,
enable_debug_logs: bool = True,
):
"""
Initialise for QAIC inference Session
6 changes: 3 additions & 3 deletions QEfficient/generation/text_generation_inference.py
@@ -316,7 +316,7 @@ def cloud_ai_100_exec_kv(
prompts_txt_file_path: Optional[str] = None,
device_id: Optional[List[int]] = None,
generation_len: Optional[int] = None,
enable_debug_logs: bool = False,
enable_debug_logs: bool = True,
stream: bool = True,
write_io_dir: Optional[str] = None,
automation=False,
@@ -408,7 +408,7 @@ def __init__(
full_batch_size: Optional[int] = None,
ctx_len: Optional[int] = None,
device_id: Optional[List[int]] = None,
enable_debug_logs: bool = False,
enable_debug_logs: bool = True,
write_io_dir: Optional[str] = None,
is_tlm: Optional[int] = None,
) -> None:
@@ -902,7 +902,7 @@ def __init__(
full_batch_size: Optional[int] = None,
ctx_len: Optional[int] = None,
device_id: Optional[List[int]] = None,
enable_debug_logs: bool = False,
enable_debug_logs: bool = True,
write_io_dir: Optional[str] = None,
is_tlm: bool = False,
) -> None:
2 changes: 1 addition & 1 deletion QEfficient/transformers/models/modeling_auto.py
@@ -1017,7 +1017,7 @@ def cloud_ai_100_generate(
self,
inputs: torch.Tensor,
device_ids: List[int],
enable_debug_logs: bool = False,
enable_debug_logs: bool = True,
generation_len: int = None,
streamer: Optional[TextStreamer] = None,
) -> np.ndarray:
2 changes: 1 addition & 1 deletion examples/cpp_execution/text_inference_using_cpp.py
@@ -146,7 +146,7 @@ def cloud_ai_100_exec_kv_cpp(
prompts_txt_file_path: Optional[str] = None,
device_id: Optional[List[int]] = None,
generation_len: Optional[int] = None,
enable_debug_logs: bool = False,
enable_debug_logs: bool = True,
stream: bool = True,
full_batch_size: Optional[int] = None,
):
4 changes: 3 additions & 1 deletion scripts/replicate_kv_head/README.md
@@ -30,4 +30,6 @@ Replace `<hf_token>` with your actual token.
### Arguments
- **--model_name**: Model card name to use (default: “meta-llama/Meta-Llama-3-8B-Instruct”).
- **--prompt**: Prompt to use for the model (default: “My name is”).
- **--repeat**: Factor to repeat key-value heads (default: 2).
- **--repeat**: Factor to repeat key-value heads (default: 2).
- **--num_attention_heads**: Number of attention heads (default: None). This is an optional parameter; if not given explicitly, it will be read from config.json.
- **--hidden_size**: Hidden size (default: None). This is an optional parameter; if not given explicitly, it will be read from config.json.
80 changes: 68 additions & 12 deletions scripts/replicate_kv_head/replicate_kv_heads.py
@@ -6,6 +6,7 @@
# -----------------------------------------------------------------------------

import argparse
from typing import Optional

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
@@ -70,46 +71,78 @@ def duplicate_weights_for_linear_layer(
)


def main(args):
def replicate_kv_heads(
model_name: str = "meta-llama/Meta-Llama-3-8B-Instruct",
prompt: str = "My name is",
repeat: int = 2,
full_batch_size: Optional[int] = None,
num_hidden_layers: Optional[int] = None,
num_attention_heads: Optional[int] = None,
hidden_size: Optional[int] = None,
):
"""
Replicate the KV heads. The script performs the following steps:
1. Runs inference with the original model.
2. Replicates the KV heads.
3. Runs inference on the modified model to validate the changes.
4. Exports the modified model to ONNX format.

``Mandatory`` Args:
:model_name (str): Model card name to use; default is meta-llama/Meta-Llama-3-8B-Instruct.
:prompt (str): Prompt to use for the model; default is "My name is".
:repeat (int): Factor to repeat key-value heads.
``Optional`` Args:
:full_batch_size (int): Set full batch size to enable continuous batching mode, default is None.
:num_hidden_layers (int): Number of hidden layers to use, default is None.
:num_attention_heads (int): Number of attention heads; if not passed explicitly, it will be read from config.json.
:hidden_size (int): Hidden size to use; if not passed explicitly, it will be read from config.json.

"""
# Load the model and tokenizer
model_name = args.model_name
model_base_name = model_name.split("/")[-1]
# Replace quantizers for loading Quantized AWQ/GPTQ models on CPU.
replace_transformers_quantizers()
# Prepare kwargs for model loading
model_kwargs = {"attn_implementation": "eager"}
if args.num_hidden_layers:
model_kwargs["num_hidden_layers"] = args.num_hidden_layers

if num_hidden_layers:
model_kwargs["num_hidden_layers"] = num_hidden_layers

pretrained_model_name_or_path = login_and_download_hf_lm(model_name)
model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path, **model_kwargs)

# Undo the effect of replace_transformers_quantizers
undo_transformers_quantizers()
tokenizer = AutoTokenizer.from_pretrained(model_name)
inputs = tokenizer(args.prompt, return_tensors="pt")
inputs = tokenizer(prompt, return_tensors="pt")

# Generate original outputs and tokens
with torch.inference_mode():
_ = model(**inputs) # original output
orig_tokens = model.generate(**inputs, max_new_tokens=10, num_beams=1, do_sample=False)

# Modify the number of key-value heads
repeat = args.repeat
orig_kv_heads = model.config.num_key_value_heads
new_kv_heads = repeat * orig_kv_heads
model.config.num_key_value_heads = new_kv_heads

print("Original KV heads:", orig_kv_heads)
print("Modified KV heads:", new_kv_heads)

# Check if hidden size and number of attention heads are explicitly passed as arguments or not
if num_attention_heads is None:
num_attention_heads = model.config.num_attention_heads

if hidden_size is None:
hidden_size = model.config.hidden_size

# Update the model's attention layers with new key-value heads
for block in model.model.layers:
attn = block.self_attn
attn.num_key_value_heads = new_kv_heads
attn.num_key_value_groups = block.self_attn.num_heads // new_kv_heads
duplicate_weights_for_linear_layer(attn.k_proj, orig_kv_heads, repeat, attn.head_dim, attn.hidden_size)
duplicate_weights_for_linear_layer(attn.v_proj, orig_kv_heads, repeat, attn.head_dim, attn.hidden_size)
attn.num_key_value_groups = num_attention_heads // new_kv_heads
duplicate_weights_for_linear_layer(attn.k_proj, orig_kv_heads, repeat, attn.head_dim, hidden_size)
duplicate_weights_for_linear_layer(attn.v_proj, orig_kv_heads, repeat, attn.head_dim, hidden_size)

# Generate modified outputs and tokens
with torch.inference_mode():
@@ -126,13 +159,13 @@ def main(args)
)

# Export the modified model
q_model = QEFFAutoModelForCausalLM(model, continuous_batching=(True if args.full_batch_size else False))
q_model = QEFFAutoModelForCausalLM(model, continuous_batching=(True if full_batch_size else False))
export(
model_name,
q_model,
tokenizer=tokenizer,
onnx_dir_path=f"{model_base_name}-{new_kv_heads}kvheads",
full_batch_size=(args.full_batch_size if args.full_batch_size else None),
full_batch_size=(full_batch_size if full_batch_size else None),
)


@@ -162,6 +195,29 @@ def main(args)
default=None,
help="Number of hidden layers to use, default is None",
)
parser.add_argument(
"--num_attention_heads",
"--num-attention-heads",
type=int,
default=None,
help="Number of attention heads, if not passed explicitly then will be picked from config.json",
)
parser.add_argument(
"--hidden_size",
"--hidden-size",
type=int,
default=None,
help="Hidden size to use, if not passed explicitly then will be picked from config.json",
)

args = parser.parse_args()
main(args)

replicate_kv_heads(
model_name=args.model_name,
prompt=args.prompt,
repeat=args.repeat,
full_batch_size=args.full_batch_size,
num_hidden_layers=args.num_hidden_layers,
num_attention_heads=args.num_attention_heads,
hidden_size=args.hidden_size,
)
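
For reference, here is a minimal sketch of how the refactored helper could be called programmatically instead of through the CLI. This is illustrative only: it assumes replicate_kv_heads.py is importable from the working directory, and the keyword arguments simply mirror the new function signature and its defaults shown above.

# Illustrative sketch; assumes scripts/replicate_kv_heads/replicate_kv_heads.py is on PYTHONPATH.
from replicate_kv_heads import replicate_kv_heads

# Duplicate each KV head twice; num_attention_heads and hidden_size stay None so they
# are read from the model's config.json, matching the new CLI defaults.
replicate_kv_heads(
    model_name="meta-llama/Meta-Llama-3-8B-Instruct",
    prompt="My name is",
    repeat=2,
    full_batch_size=None,
    num_attention_heads=None,
    hidden_size=None,
)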