
[QEff. Finetune]: Enabled FT CI tests. #420


Merged 9 commits on Jul 1, 2025
QEfficient/finetune/dataset/samsum_dataset.py (2 changes: 1 addition & 1 deletion)
@@ -9,7 +9,7 @@


def get_preprocessed_samsum(dataset_config, tokenizer, split, context_length=None):
dataset = datasets.load_dataset("Samsung/samsum", split=split, trust_remote_code=True)
dataset = datasets.load_dataset("knkarthick/samsum", split=split, trust_remote_code=True)
Contributor:
Is this an approved dataset?

Contributor Author:
Will be removed in #482


prompt = "Summarize this dialog:\n{dialog}\n---\nSummary:\n"

scripts/Jenkinsfile (32 changes: 25 additions & 7 deletions)
@@ -25,6 +25,7 @@ pipeline {
pip install junitparser pytest-xdist &&
pip install librosa==0.10.2 soundfile==0.13.1 && #packages needed to load example for whisper testing
pip install --extra-index-url https://download.pytorch.org/whl/cpu timm==1.0.14 torchvision==0.19.1+cpu einops==0.8.1 && #packages to load VLMs
pip install /opt/qti-aic/integrations/torch_qaic/py310/torch_qaic-0.1.0-cp310-cp310-linux_x86_64.whl && # For finetuning tests
Contributor:
Will this replace the torch that gets installed on line 27? If yes, we should install this package only in the finetuning tests stage.

Contributor Author:
No, it won't update the existing PyTorch installation; torch_qaic is an add-on that provides the qaic backend.
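
As a rough illustration of that add-on pattern, a minimal sketch follows; the idea that importing torch_qaic registers a "qaic" device alongside the stock torch install (and the exact device string) is an assumption about the wheel, not something verified in this PR.

import torch

# Assumption: importing torch_qaic registers the "qaic" device with the
# already-installed torch wheel instead of replacing it.
try:
    import torch_qaic  # noqa: F401
    device = "qaic"
except ImportError:
    # Without the add-on wheel, plain torch still works; only the qaic
    # device is unavailable, so fall back to CPU.
    device = "cpu"

# Standard torch API; the tensor lands on whichever backend was selected.
x = torch.randn(4, 4).to(device)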

Contributor:
Did the e2e CI run pass after this change?

Contributor:
But most QEff users won't have this package installed. We should reflect the exact environment that customers will use. I understand it should not affect anything, but why deviate from the golden standard?

Contributor Author:
> Did the e2e CI run pass after this change?

Yes, it ran.

Contributor Author:
> But most QEff users won't have this package installed. We should reflect the exact environment that customers will use. I understand it should not affect anything, but why deviate from the golden standard?

Thanks for pointing it out. We will have subsequent PRs for the FT test cases, and we will try to take care of this there by adding a new stage before the FT test cases.

rm -rf QEfficient"
'''
}
@@ -41,7 +42,7 @@ pipeline {
mkdir -p $PWD/Non_cli_qaic &&
export TOKENIZERS_PARALLELISM=false &&
export QEFF_HOME=$PWD/Non_cli_qaic &&
pytest tests -m '(not cli) and (not on_qaic)' --ignore tests/vllm -n auto --junitxml=tests/tests_log1.xml &&
pytest tests -m '(not cli) and (not on_qaic) and (not finetune)' --ignore tests/vllm -n auto --junitxml=tests/tests_log1.xml &&
junitparser merge tests/tests_log1.xml tests/tests_log.xml &&
deactivate"
'''
@@ -58,7 +59,7 @@ pipeline {
mkdir -p $PWD/Non_qaic &&
export TOKENIZERS_PARALLELISM=false &&
export QEFF_HOME=$PWD/Non_qaic &&
pytest tests -m '(not cli) and (on_qaic) and (not multimodal) and (not qnn)' --ignore tests/vllm -n 4 --junitxml=tests/tests_log2.xml &&
pytest tests -m '(not cli) and (on_qaic) and (not multimodal) and (not qnn) and (not finetune)' --ignore tests/vllm -n 4 --junitxml=tests/tests_log2.xml &&
junitparser merge tests/tests_log2.xml tests/tests_log.xml &&
deactivate"
'''
@@ -77,14 +78,14 @@ pipeline {
mkdir -p $PWD/Non_cli_qaic_multimodal &&
export TOKENIZERS_PARALLELISM=false &&
export QEFF_HOME=$PWD/Non_cli_qaic_multimodal &&
pytest tests -m '(not cli) and (on_qaic) and (multimodal) and (not qnn)' --ignore tests/vllm -n 4 --junitxml=tests/tests_log6.xml &&
pytest tests -m '(not cli) and (on_qaic) and (multimodal) and (not qnn) and (not finetune)' --ignore tests/vllm -n 4 --junitxml=tests/tests_log6.xml &&
junitparser merge tests/tests_log6.xml tests/tests_log.xml &&
deactivate"
'''
}
}
}
stage('CLI Tests') {
stage('Inference Tests') {
steps {
timeout(time: 60, unit: 'MINUTES') {
sh '''
@@ -96,7 +97,7 @@ pipeline {
mkdir -p $PWD/cli &&
export TOKENIZERS_PARALLELISM=false &&
export QEFF_HOME=$PWD/cli &&
pytest tests -m '(cli and not qnn)' --ignore tests/vllm --junitxml=tests/tests_log3.xml &&
pytest tests -m '(cli and not qnn) and (not finetune)' --ignore tests/vllm --junitxml=tests/tests_log3.xml &&
junitparser merge tests/tests_log3.xml tests/tests_log.xml &&
deactivate"
'''
@@ -125,7 +126,7 @@
mkdir -p $PWD/Qnn_cli &&
export TOKENIZERS_PARALLELISM=false &&
export QEFF_HOME=$PWD/Qnn_cli &&
pytest tests -m '(cli and qnn)' --ignore tests/vllm --junitxml=tests/tests_log4.xml &&
pytest tests -m '(cli and qnn) and (not finetune)' --ignore tests/vllm --junitxml=tests/tests_log4.xml &&
junitparser merge tests/tests_log4.xml tests/tests_log.xml &&
deactivate"
'''
@@ -144,7 +145,7 @@
mkdir -p $PWD/Qnn_non_cli &&
export TOKENIZERS_PARALLELISM=false &&
export QEFF_HOME=$PWD/Qnn_non_cli &&
pytest tests -m '(not cli) and (qnn) and (on_qaic) and (not multimodal)' --ignore tests/vllm --junitxml=tests/tests_log5.xml &&
pytest tests -m '(not cli) and (qnn) and (on_qaic) and (not multimodal) and (not finetune)' --ignore tests/vllm --junitxml=tests/tests_log5.xml &&
junitparser merge tests/tests_log5.xml tests/tests_log.xml &&
deactivate"
'''
@@ -170,6 +171,23 @@
}
}
}
stage('Finetune CLI Tests') {
steps {
timeout(time: 5, unit: 'MINUTES') {
sh '''
sudo docker exec ${BUILD_TAG} bash -c "
cd /efficient-transformers &&
. preflight_qeff/bin/activate &&
mkdir -p $PWD/cli_qaic_finetuning &&
export TOKENIZERS_PARALLELISM=false &&
export QEFF_HOME=$PWD/cli_qaic_finetuning &&
pytest tests -m '(cli) and (on_qaic) and (not qnn) and (not multimodal) and (finetune)' --ignore tests/vllm --junitxml=tests/tests_log_finetune.xml &&
junitparser merge tests/tests_log_finetune.xml tests/tests_log.xml &&
deactivate"
'''
}
}
}
}

post {
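Taken together, the marker changes above steer finetuning tests out of every existing stage (via the added "(not finetune)" clauses) and into the new Finetune CLI Tests stage. Below is a minimal sketch of a test that the new stage's expression would collect; the test name and body are hypothetical, while the markers mirror those used in tests/finetune/test_finetune.py further down.

import pytest


@pytest.mark.cli
@pytest.mark.on_qaic
@pytest.mark.finetune
def test_finetune_smoke():
    # Collected by: pytest -m '(cli) and (on_qaic) and (not qnn) and (not multimodal) and (finetune)'
    # Excluded from every stage whose expression contains '(not finetune)'.
    assert True
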
tests/finetune/test_finetune.py (109 changes: 93 additions & 16 deletions)
@@ -7,94 +7,164 @@

import os
import shutil
from pathlib import Path

import numpy as np
import pytest
import requests
import torch.optim as optim
from torch.utils.data import DataLoader

import QEfficient
import QEfficient.cloud.finetune
from QEfficient.cloud.finetune import main as finetune

alpaca_json_path = Path.cwd() / "alpaca_data.json"


def clean_up(path):
if os.path.exists(path):
if os.path.isdir(path) and os.path.exists(path):
shutil.rmtree(path)
if os.path.isfile(path):
os.remove(path)


def download_alpaca():
alpaca_url = "https://raw.githubusercontent.com/tatsu-lab/stanford_alpaca/refs/heads/main/alpaca_data.json"
response = requests.get(alpaca_url)

with open(alpaca_json_path, "wb") as f:
f.write(response.content)


configs = [
pytest.param(
"meta-llama/Llama-3.2-1B", # model_name
"generation", # task_type
10, # max_eval_step
20, # max_train_step
"gsm8k_dataset", # dataset_name
None, # data_path
1, # intermediate_step_save
None, # context_length
True, # run_validation
True, # use_peft
"qaic", # device
id="llama_config", # config name
)
0.0043353, # expected_train_loss
1.0043447, # expected_train_metric
0.0117334, # expected_eval_loss
1.0118025, # expected_eval_metric
id="llama_config_gsm8k", # config name
),
pytest.param(
"meta-llama/Llama-3.2-1B", # model_name
"generation", # task_type
10, # max_eval_step
20, # max_train_step
"alpaca_dataset", # dataset_name
alpaca_json_path, # data_path
1, # intermediate_step_save
None, # context_length
True, # run_validation
True, # use_peft
"qaic", # device
0.0006099, # expected_train_loss
1.0006101, # expected_train_metric
0.0065296, # expected_eval_loss
1.0065510, # expected_eval_metric
id="llama_config_alpaca", # config name
),
pytest.param(
"google-bert/bert-base-uncased", # model_name
"seq_classification", # task_type
10, # max_eval_step
20, # max_train_step
"imdb_dataset", # dataset_name
None, # data_path
1, # intermediate_step_save
None, # context_length
True, # run_validation
False, # use_peft
"qaic", # device
0.00052981, # expected_train_loss
0.55554199, # expected_train_metric
0.00738618, # expected_eval_loss
0.70825195, # expected_eval_metric
id="bert_config_imdb", # config name
),
]


@pytest.mark.skip(reason="Currently CI is broken. Once it is fixed we will enable this test.")
@pytest.mark.cli
@pytest.mark.on_qaic
@pytest.mark.finetune
@pytest.mark.parametrize(
"model_name,max_eval_step,max_train_step,intermediate_step_save,context_length,run_validation,use_peft,device",
"model_name,task_type,max_eval_step,max_train_step,dataset_name,data_path,intermediate_step_save,context_length,run_validation,use_peft,device,expected_train_loss,expected_train_metric,expected_eval_loss,expected_eval_metric",
configs,
)
def test_finetune(
def test_finetune_llama(
model_name,
task_type,
max_eval_step,
max_train_step,
dataset_name,
data_path,
intermediate_step_save,
context_length,
run_validation,
use_peft,
device,
expected_train_loss,
expected_train_metric,
expected_eval_loss,
expected_eval_metric,
mocker,
):
train_config_spy = mocker.spy(QEfficient.cloud.finetune, "TrainConfig")
generate_dataset_config_spy = mocker.spy(QEfficient.cloud.finetune, "generate_dataset_config")
generate_peft_config_spy = mocker.spy(QEfficient.cloud.finetune, "generate_peft_config")
get_dataloader_kwargs_spy = mocker.spy(QEfficient.cloud.finetune, "get_dataloader_kwargs")
get_dataloader_kwargs_spy = mocker.spy(QEfficient.finetune.utils.dataset_utils, "get_dataloader_kwargs")
update_config_spy = mocker.spy(QEfficient.cloud.finetune, "update_config")
get_custom_data_collator_spy = mocker.spy(QEfficient.cloud.finetune, "get_custom_data_collator")
get_preprocessed_dataset_spy = mocker.spy(QEfficient.cloud.finetune, "get_preprocessed_dataset")
get_custom_data_collator_spy = mocker.spy(QEfficient.finetune.utils.dataset_utils, "get_custom_data_collator")
get_preprocessed_dataset_spy = mocker.spy(QEfficient.finetune.utils.dataset_utils, "get_preprocessed_dataset")
get_longest_seq_length_spy = mocker.spy(QEfficient.cloud.finetune, "get_longest_seq_length")
print_model_size_spy = mocker.spy(QEfficient.cloud.finetune, "print_model_size")
train_spy = mocker.spy(QEfficient.cloud.finetune, "train")

kwargs = {
"model_name": model_name,
"task_type": task_type,
"max_eval_step": max_eval_step,
"max_train_step": max_train_step,
"dataset": dataset_name,
"data_path": data_path,
"intermediate_step_save": intermediate_step_save,
"context_length": context_length,
"run_validation": run_validation,
"use_peft": use_peft,
"device": device,
}

if dataset_name == "alpaca_dataset":
download_alpaca()

results = finetune(**kwargs)
assert np.allclose(results["avg_train_loss"], 0.00232327, atol=1e-5), "Train loss is not matching."
assert np.allclose(results["avg_train_metric"], 1.002326, atol=1e-5), "Train metric is not matching."
assert np.allclose(results["avg_eval_loss"], 0.0206124, atol=1e-5), "Eval loss is not matching."
assert np.allclose(results["avg_eval_metric"], 1.020826, atol=1e-5), "Eval metric is not matching."
assert np.allclose(results["avg_train_loss"], expected_train_loss, atol=1e-3), "Train loss is not matching."
assert np.allclose(results["avg_train_metric"], expected_train_metric, atol=1e-3), "Train metric is not matching."
assert np.allclose(results["avg_eval_loss"], expected_eval_loss, atol=1e-3), "Eval loss is not matching."
assert np.allclose(results["avg_eval_metric"], expected_eval_metric, atol=1e-3), "Eval metric is not matching."
assert results["avg_epoch_time"] < 60, "Training should complete within 60 seconds."

train_config_spy.assert_called_once()
generate_dataset_config_spy.assert_called_once()
generate_peft_config_spy.assert_called_once()
get_custom_data_collator_spy.assert_called_once()
if task_type == "generation":
generate_peft_config_spy.assert_called_once()
get_longest_seq_length_spy.assert_called_once()
print_model_size_spy.assert_called_once()
train_spy.assert_called_once()

assert update_config_spy.call_count == 2
assert get_custom_data_collator_spy.call_count == 2
assert get_dataloader_kwargs_spy.call_count == 2
assert get_preprocessed_dataset_spy.call_count == 2

@@ -123,12 +193,19 @@ def test_finetune(
f"{train_config.gradient_accumulation_steps} which is gradient accumulation steps."
)

saved_file = os.path.join(train_config.output_dir, "complete_epoch_1/adapter_model.safetensors")
if use_peft:
saved_file = os.path.join(train_config.output_dir, "complete_epoch_1/adapter_model.safetensors")
else:
saved_file = os.path.join(train_config.output_dir, "complete_epoch_1/model.safetensors")
assert os.path.isfile(saved_file)

clean_up(train_config.output_dir)
clean_up("runs")
clean_up("qaic-dumps")
clean_up(train_config.dump_root_dir)

if dataset_name == "alpaca_dataset":
clean_up(alpaca_json_path)


# TODO (Meet): Add seperate tests for BERT FT and LLama FT