diff --git a/.gitignore b/.gitignore index 5f514f7..00ae19d 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,5 @@ *.ipynb -*.parquet \ No newline at end of file +*.parquet +dataset/ +models/ +local_util/ \ No newline at end of file diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000..c282e9a --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,14 @@ +# Contributing + +This project welcomes contributions and suggestions. Most contributions require you to +agree to a Contributor License Agreement (CLA) declaring that you have the right to, +and actually do, grant us the rights to use your contribution. For details, visit +https://cla.microsoft.com. + +When you submit a pull request, a CLA-bot will automatically determine whether you need +to provide a CLA and decorate the PR appropriately (e.g., label, comment). Simply follow the +instructions provided by the bot. You will only need to do this once across all repositories using our CLA. + +This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). +For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) +or contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments. \ No newline at end of file diff --git a/README.md b/README.md index 086a60f..f80411b 100644 --- a/README.md +++ b/README.md @@ -112,3 +112,15 @@ trademarks or logos is subject to and must follow [Microsoft's Trademark & Brand Guidelines](https://www.microsoft.com/en-us/legal/intellectualproperty/trademarks/usage/general). Use of Microsoft trademarks or logos in modified versions of this project must not cause confusion or imply Microsoft sponsorship. Any use of third-party trademarks or logos are subject to those third-party's policies. 
+ +## Citation + +```bibtex +@inproceedings{aggarwal2025nextcoder, +author = {Aggarwal, Tushar and Singh, Swayam and Awasthi, Abhijeet and Kanade, Aditya and Natarajan, Nagarajan}, +title = {NextCoder: Robust Adaptation of Code LMs to Diverse Code Edits}, +booktitle = {International Conference on Machine Learning}, +year = {2025}, +url = {https://www.microsoft.com/en-us/research/publication/nextcoder-robust-adaptation-of-code-lms-to-diverse-code-edits/}, +} +``` \ No newline at end of file diff --git a/src/train/README.md b/src/train/README.md index 64b81b0..cf0b489 100644 --- a/src/train/README.md +++ b/src/train/README.md @@ -1,12 +1,10 @@ # Model Training scripts ## Folder Structure -- `ds_config.json` contains the deepspeed configuration -- `general_acc.yaml` contains the accelerate configuration (might need to be modified as per desired system) -- `lora.py` contains the code for training model with LoRA -- `merge_lora.py` contains the code for merging trained LoRA adapters back to model for inference -- `seletkt.py` contains the code for training model with our algorithm explained in our paper -- `sft.py` contains the code for training model with Full Supervised Finetuning +- `configs` contains the deepspeed and accelerate configurations (modifiable as per the system) +- `lora` contains the code for training model with LoRA +- `selekt` contains the code for training model with SeleKT algorithm explained in our paper +- `sft` contains the code for training model with Full Supervised Finetuning ## Usgae ### Preparing the dataset @@ -23,122 +21,23 @@ ### Training with SFT - modify or replace the `general_acc.yaml` file as per the desired system configuration - set the `zero_optimization-stage` to `3` and `overlap_comm` to `false` in `ds_config` for better memory optimizations -- Run the following command to start training - ```bash - deepspeed sft.py \ - --model_name_or_path "path to pretrained LLM" \ - --train_data_path "path to training data" \ - --output_dir 
"path to output dir" \ - --num_train_epochs 3 \ - --model_max_length 8192 \ - --per_device_train_batch_size 4 \ - --gradient_accumulation_steps 4 \ - --save_strategy "epoch" \ - --save_steps 760 \ - --save_total_limit 25 \ - --learning_rate 1e-5 \ - --warmup_ratio 0.1 \ - --logging_steps 5 \ - --report_to "wandb" \ - --gradient_checkpointing True \ - --deepspeed ds_config.json \ - --bf16 True \ - --run_name "Run name for logs" \ - --debug True \ - ``` - Update the above command as per the model -- To train on conversation data by only applying loss on the response, uncomment the lines 175, 176 and 185 and run the same command with proper conversational dataset path - ```python - response_template = "#RESPONSE\n" - collator = DataCollatorForCompletionOnlyLM(response_template=response_template, tokenizer=tokenizer) - - # Initialize trainer - trainer = SFTTrainer( - model=model, - processing_class=tokenizer, - train_dataset=dataset, - args=training_config, - callbacks=[Callback(flush_steps=1)], - data_collator=collator, # pass the collator in the trainer - ) - ``` +- Add the respective variables like `MODEL_PATH`, `TRAIN_DATA`, `OUTPUT_DIR` etc. 
in the `run.sh` script and run +```bash +bash ./sft/run.sh +``` ### Training with LoRA - modify or replace the `general_acc.yaml` file as per the desired system configuration -- set the `zero_optimization-stage` to `2` and `overlap_comm` to `false` in `ds_config` for better memory optimizations -- Run the following command to start training - ```bash - deepspeed lora.py \ - --model_name_or_path "path to pretrained LLM" \ - --train_data_path "path to training data" \ - --output_dir "path to output dir" \ - --num_train_epochs 3 \ - --model_max_length 8192 \ - --per_device_train_batch_size 4 \ - --gradient_accumulation_steps 4 \ - --save_strategy "epoch" \ - --save_steps 760 \ - --save_total_limit 25 \ - --learning_rate 1e-5 \ - --warmup_ratio 0.1 \ - --logging_steps 5 \ - --report_to "wandb" \ - --gradient_checkpointing True \ - --deepspeed ds_config.json \ - --bf16 True \ - --run_name "Run name for logs" \ - --debug True \ - ``` - Update the above command as per the model -- Put the path of output LoRA adapters inside `merge_lora.py` and run following to get the final checkpoints - ```bash - python merge_lora.py - ``` +- set the `zero_optimization-stage` to `2` and `overlap_comm` to `false` in `ds_config` +- Add the respective variables like `MODEL_PATH`, `TRAIN_DATA`, `OUTPUT_DIR` etc. in the `run.sh` script and run +```bash +bash ./lora/run.sh +``` +>`lora/lora.py` uses `use_reentrant: True` for gradient checkpointing, and this can allow using deepspeed zero-3 optimization for large models. 
### Training with SeleKT - modify or replace the `general_acc.yaml` file as per the desired system configuration -- set the `zero_optimization-stage` to `2` and `overlap_comm` to `false` in `ds_config` for better memory optimizations -- Run the following command to start training - ```bash - accelerate launch \ - --config_file=general_acc.yaml \ - selekt.py \ - --model_name_or_path "path to pretrained LLM" \ - --base_model_path "path to pretrained LLM" \ - --train_data_path "path to training data" \ - --output_dir "path to output directory" \ - --num_train_epochs 3 \ - --model_max_length 8192 \ - --per_device_train_batch_size 4 \ - --gradient_accumulation_steps 4 \ - --save_strategy "steps" \ - --save_steps "Enter the periodicity value M for seleKT" \ - --save_total_limit 50 \ - --learning_rate 1e-5 \ - --warmup_ratio 0.1 \ - --logging_steps 5 \ - --report_to "wandb" \ - --gradient_checkpointing True \ - --deepspeed ds_config.json \ - --bf16 True \ - --run_name "Name for logs" \ - --debug True \ - --alpha "Enter value for desired alpha parameter for SeleKT" \ - ``` - Update the above command as per the model -- To train on conversation data by only applying loss on the response, uncomment the lines 291, 292 and 301 and run the same command with proper conversational dataset path - ```python - ```python - response_template = "#RESPONSE\n" - collator = DataCollatorForCompletionOnlyLM(response_template=response_template, tokenizer=tokenizer) - - # Initialize trainer - trainer = SFTTrainer( - model=model, - processing_class=tokenizer, - train_dataset=dataset, - args=training_config, - callbacks=[Callback(flush_steps=1)], - data_collator=collator, # pass the collator in the trainer - ) - ``` \ No newline at end of file +- set the `zero_optimization-stage` to `3` and `overlap_comm` to `false` in `ds_config` for better memory optimizations +- Add the respective variables like `MODEL_PATH`, `TRAIN_DATA`, `OUTPUT_DIR` etc. 
in the `run.sh` script and run +```bash +bash ./selekt/run.sh \ No newline at end of file diff --git a/src/train/SeleKT/run.sh b/src/train/SeleKT/run.sh new file mode 100644 index 0000000..5faa5a5 --- /dev/null +++ b/src/train/SeleKT/run.sh @@ -0,0 +1,112 @@ +#!/bin/bash + +export MODEL_NAME="" +export DESC="" + +# Stage 1: Instruction Training +OUTPUT_DIR_STAGE1="./output/selekt_stage1_instruction" +TRAIN_DATA_STAGE1="" +MODEL_PATH="" + +# Stage 2: Conversational Training +OUTPUT_DIR_STAGE2="./output/selekt_stage2_conversational" +TRAIN_DATA_STAGE2="" + +find_latest_checkpoint() { + local output_dir=$1 + local latest_checkpoint=$(find "$output_dir" -name "checkpoint-*" -type d | sort -V | tail -1) + echo "$latest_checkpoint" +} + +echo "Starting Stage 1: SeleKT Instruction Training..." +echo "Model: $MODEL_PATH" +echo "Training data: $TRAIN_DATA_STAGE1" +echo "Output directory: $OUTPUT_DIR_STAGE1" + +mkdir -p $OUTPUT_DIR_STAGE1 + +# Stage 1: Instruction Training +accelerate launch \ + --config_file=../configs/general_acc.yaml \ + selekt.py \ + --model_name_or_path "$MODEL_PATH" \ + --train_data_path "$TRAIN_DATA_STAGE1" \ + --output_dir ${OUTPUT_DIR_STAGE1} \ + --num_train_epochs 3 \ + --model_max_length 16384 \ + --per_device_train_batch_size 1 \ + --gradient_accumulation_steps 4 \ + --save_strategy "epoch" \ + --save_steps 760 \ + --save_total_limit 25 \ + --learning_rate 1e-5 \ + --warmup_ratio 0.1 \ + --weight_decay 0.1 \ + --logging_steps 5 \ + --lr_scheduler_type "cosine" \ + --report_to "wandb" \ + --gradient_checkpointing True \ + --deepspeed ../configs/ds_config.json \ + --bf16 True \ + --run_name "${MODEL_NAME}_stage1_instruction" \ + --alpha 0.05 \ + +if [ $? -ne 0 ]; then + echo "Error: Stage 1 training failed!" + exit 1 +fi + +echo "Stage 1 completed successfully!" 
+ +LATEST_CHECKPOINT=$(find_latest_checkpoint "$OUTPUT_DIR_STAGE1") + +if [ -z "$LATEST_CHECKPOINT" ]; then + echo "Error: No checkpoint found in $OUTPUT_DIR_STAGE1" + exit 1 +fi + +echo "Found latest checkpoint: $LATEST_CHECKPOINT" +echo "Starting Stage 2: SeleKT Conversational Training..." +echo "Model: $LATEST_CHECKPOINT" +echo "Training data: $TRAIN_DATA_STAGE2" +echo "Output directory: $OUTPUT_DIR_STAGE2" + +mkdir -p $OUTPUT_DIR_STAGE2 + +# Stage 2: Conversational Training +accelerate launch \ + --config_file=../configs/general_acc.yaml \ + selekt.py \ + --model_name_or_path "${LATEST_CHECKPOINT}" \ + --train_data_path "$TRAIN_DATA_STAGE2" \ + --output_dir ${OUTPUT_DIR_STAGE2} \ + --num_train_epochs 3 \ + --model_max_length 16384 \ + --per_device_train_batch_size 1 \ + --gradient_accumulation_steps 4 \ + --save_strategy "epoch" \ + --save_steps 760 \ + --save_total_limit 25 \ + --learning_rate 1e-5 \ + --warmup_ratio 0.1 \ + --weight_decay 0.1 \ + --logging_steps 5 \ + --lr_scheduler_type "cosine" \ + --report_to "wandb" \ + --gradient_checkpointing True \ + --deepspeed ../configs/ds_config.json \ + --bf16 True \ + --run_name "${MODEL_NAME}_stage2_conversational" \ + --alpha 0.05 \ + --is_conversational_training \ + + +# Check if stage 2 completed successfully +if [ $? -ne 0 ]; then + echo "Error: Stage 2 training failed!" + exit 1 +fi + +echo "Stage 2 training completed!" +echo "Both training stages completed successfully!" 
+echo "Final model saved in: $OUTPUT_DIR_STAGE2" \ No newline at end of file diff --git a/src/train/selekt.py b/src/train/SeleKT/selekt.py similarity index 96% rename from src/train/selekt.py rename to src/train/SeleKT/selekt.py index b7c987b..45a16c4 100644 --- a/src/train/selekt.py +++ b/src/train/SeleKT/selekt.py @@ -70,10 +70,11 @@ def parse_args(): help="Whether to use bf16 mixed precision training") parser.add_argument("--run_name", type=str, default=None) parser.add_argument("--use_liger", type=bool, default=False) - parser.add_argument("--debug", type=bool, default=False) parser.add_argument("--packing", type=bool, default=True, help="Whether to use packing for training") - parser.add_argument("--alpha", type=float, default=0.05,) + parser.add_argument("--alpha", type=float, default=0.05, help="Alpha value for SeleKT") + parser.add_argument("--is_conversational_training", action='store_true', + help="Whether to use conversational training format") args, _ = parser.parse_known_args() return args @@ -300,8 +301,10 @@ def train(args): print(f'Resuming from checkpoint: {last_checkpoint}') - # response_template = "#RESPONSE\n" - # collator = DataCollatorForCompletionOnlyLM(response_template=response_template, tokenizer=tokenizer) + collator = None + if args.is_conversational_training: + response_template = "#RESPONSE\n" + collator = DataCollatorForCompletionOnlyLM(response_template=response_template, tokenizer=tokenizer) callback = Callback(base_model_path=args.base_model_path, flush_steps=1, alpha=args.alpha) trainer = SFTTrainer( @@ -310,7 +313,7 @@ def train(args): train_dataset=dataset, args=training_config, callbacks=[callback], - # data_collator=collator, + data_collator=collator, ) callback.set_trainer(trainer) print(f"Starting training for epoch {args.num_train_epochs}") diff --git a/src/train/ds_config.json b/src/train/configs/ds_config.json similarity index 100% rename from src/train/ds_config.json rename to src/train/configs/ds_config.json diff --git 
a/src/train/general_acc.yaml b/src/train/configs/general_acc.yaml similarity index 100% rename from src/train/general_acc.yaml rename to src/train/configs/general_acc.yaml diff --git a/src/train/lora.py b/src/train/lora/lora.py similarity index 94% rename from src/train/lora.py rename to src/train/lora/lora.py index 01fcd6a..6385456 100644 --- a/src/train/lora.py +++ b/src/train/lora/lora.py @@ -66,9 +66,10 @@ def parse_args(): help="Whether to use bf16 mixed precision training") parser.add_argument("--run_name", type=str, default=None) parser.add_argument("--use_liger", type=bool, default=False) - parser.add_argument("--debug", type=bool, default=False) parser.add_argument("--packing", type=bool, default=True, help="Whether to use packing for training") + parser.add_argument("--is_conversational_training", action='store_true', + help="Whether to use conversational training format") args, _ = parser.parse_known_args() return args @@ -151,12 +152,13 @@ def main(): output_dir=args.output_dir, report_to="none", gradient_checkpointing=args.gradient_checkpointing, - gradient_checkpointing_kwargs={"use_reentrant": False}, + gradient_checkpointing_kwargs={"use_reentrant": True}, deepspeed=args.deepspeed, dataset_num_proc=80, run_name=args.run_name, use_liger=args.use_liger, ) + lora_config = LoraConfig( r=64, # target_modules= ['q_proj', 'k_proj', 'v_proj', 'o_proj', 'gate_proj', 'up_proj', 'down_proj'], @@ -175,6 +177,11 @@ def main(): dataset = setup_training_data(args, local_rank, tokenizer) + collator = None + if args.is_conversational_training: + response_template = "#RESPONSE\n" + collator = DataCollatorForCompletionOnlyLM(response_template=response_template, tokenizer=tokenizer) + trainer = SFTTrainer( model=model, processing_class=tokenizer, @@ -182,6 +189,7 @@ def main(): args=training_config, peft_config=lora_config, callbacks=[Callback(flush_steps=1)], + data_collator=collator ) print("Starting LoRA training...") diff --git a/src/train/lora/merge_lora.py 
b/src/train/lora/merge_lora.py new file mode 100644 index 0000000..de4927b --- /dev/null +++ b/src/train/lora/merge_lora.py @@ -0,0 +1,73 @@ +#!/usr/bin/env python3 + +import argparse +import torch +from peft import AutoPeftModelForCausalLM +from transformers import AutoTokenizer +import os + +def parse_args(): + parser = argparse.ArgumentParser(description="Merge LoRA weights with base model") + parser.add_argument("--lora_checkpoint", type=str, required=True, + help="Path to the LoRA checkpoint directory") + parser.add_argument("--output_dir", type=str, required=True, + help="Directory to save the merged model") + parser.add_argument("--max_shard_size", type=str, default="5GB", + help="Maximum size of each shard when saving") + parser.add_argument("--safe_serialization", action="store_true", default=True, + help="Use safe serialization format") + return parser.parse_args() + +def merge_lora_weights(lora_checkpoint, output_dir, max_shard_size="5GB", safe_serialization=True): + """ + Merge LoRA adapter weights with the base model + """ + print(f"Loading LoRA model from: {lora_checkpoint}") + + peft_model = AutoPeftModelForCausalLM.from_pretrained( + lora_checkpoint, + torch_dtype=torch.bfloat16, + device_map="auto" + ) + + print(f"Loading tokenizer from: {lora_checkpoint}") + tokenizer = AutoTokenizer.from_pretrained(lora_checkpoint) + + print("Merging LoRA adapters with base model...") + merged_model = peft_model.merge_and_unload() + + print(f"Saving merged model to: {output_dir}") + os.makedirs(output_dir, exist_ok=True) + + merged_model.save_pretrained( + output_dir, + max_shard_size=max_shard_size, + safe_serialization=safe_serialization + ) + + # Save the tokenizer + tokenizer.save_pretrained(output_dir) + + print(f"✅ Successfully merged and saved model to: {output_dir}") + + del peft_model, merged_model + torch.cuda.empty_cache() + + return output_dir + +def main(): + args = parse_args() + + try: + merge_lora_weights( + lora_checkpoint=args.lora_checkpoint, + 
output_dir=args.output_dir, + max_shard_size=args.max_shard_size, + safe_serialization=args.safe_serialization + ) + except Exception as e: + print(f"❌ Error during merging: {str(e)}") + raise e + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/src/train/lora/run.sh b/src/train/lora/run.sh new file mode 100644 index 0000000..2607b44 --- /dev/null +++ b/src/train/lora/run.sh @@ -0,0 +1,154 @@ +#!/bin/bash + +export MODEL_NAME="" +export DESC="" + +# Stage 1: Instruction Training +OUTPUT_DIR_STAGE1="./output/stage1_instruction_lora" +TRAIN_DATA_STAGE1="" +MODEL_PATH="" + +# Stage 2: Conversational Training +OUTPUT_DIR_STAGE2="./output/stage2_conversational_lora" +TRAIN_DATA_STAGE2="" + +# Merged model directory +MERGED_MODEL_DIR="./output/stage1_merged" + +find_latest_checkpoint() { + local output_dir=$1 + local latest_checkpoint=$(find "$output_dir" -name "checkpoint-*" -type d | sort -V | tail -1) + echo "$latest_checkpoint" +} + +merge_lora_weights() { + local lora_checkpoint=$1 + local output_dir=$2 + + echo "Merging LoRA weights..." + echo "LoRA checkpoint: $lora_checkpoint" + echo "Output: $output_dir" + + python3 merge_lora.py \ + --lora_checkpoint "$lora_checkpoint" \ + --output_dir "$output_dir" \ + --safe_serialization + + return $? +} + +echo "Starting Stage 1: Instruction Training (LoRA)..." 
+echo "Model: $MODEL_PATH" +echo "Training data: $TRAIN_DATA_STAGE1" +echo "Output directory: $OUTPUT_DIR_STAGE1" + +mkdir -p $OUTPUT_DIR_STAGE1 + +# Stage 1: LoRA Instruction Training +accelerate launch \ + --config_file=../configs/general_acc.yaml \ + lora.py \ + --model_name_or_path "$MODEL_PATH" \ + --train_data_path "$TRAIN_DATA_STAGE1" \ + --output_dir ${OUTPUT_DIR_STAGE1} \ + --num_train_epochs 3 \ + --model_max_length 16384 \ + --per_device_train_batch_size 1 \ + --gradient_accumulation_steps 4 \ + --save_strategy "epoch" \ + --save_steps 760 \ + --save_total_limit 25 \ + --learning_rate 1e-5 \ + --warmup_ratio 0.1 \ + --weight_decay 0.1 \ + --logging_steps 5 \ + --lr_scheduler_type "cosine" \ + --report_to "wandb" \ + --gradient_checkpointing True \ + --deepspeed ../configs/ds_config.json \ + --bf16 True \ + --run_name "${MODEL_NAME}_stage1_instruction_lora" \ + +if [ $? -ne 0 ]; then + echo "Error: Stage 1 training failed!" + exit 1 +fi + +echo "Stage 1 completed successfully!" + +# Find latest checkpoint +LATEST_CHECKPOINT=$(find_latest_checkpoint "$OUTPUT_DIR_STAGE1") + +if [ -z "$LATEST_CHECKPOINT" ]; then + echo "Error: No checkpoint found in $OUTPUT_DIR_STAGE1" + exit 1 +fi + +echo "Found latest checkpoint: $LATEST_CHECKPOINT" + +# Merge LoRA weights with base model +mkdir -p $MERGED_MODEL_DIR +merge_lora_weights "$LATEST_CHECKPOINT" "$MERGED_MODEL_DIR" + +if [ $? -ne 0 ]; then + echo "Error: LoRA merging failed!" + exit 1 +fi + +echo "LoRA weights merged successfully!" +echo "Starting Stage 2: Conversational Training (LoRA)..." 
+echo "Model: $MERGED_MODEL_DIR" +echo "Training data: $TRAIN_DATA_STAGE2" +echo "Output directory: $OUTPUT_DIR_STAGE2" + +mkdir -p $OUTPUT_DIR_STAGE2 + +# Stage 2: LoRA Conversational Training +accelerate launch \ + --config_file=../configs/general_acc.yaml \ + lora.py \ + --model_name_or_path "${MERGED_MODEL_DIR}" \ + --train_data_path "$TRAIN_DATA_STAGE2" \ + --output_dir ${OUTPUT_DIR_STAGE2} \ + --num_train_epochs 3 \ + --model_max_length 16384 \ + --per_device_train_batch_size 1 \ + --gradient_accumulation_steps 4 \ + --save_strategy "epoch" \ + --save_steps 760 \ + --save_total_limit 25 \ + --learning_rate 1e-5 \ + --warmup_ratio 0.1 \ + --weight_decay 0.1 \ + --logging_steps 5 \ + --lr_scheduler_type "cosine" \ + --report_to "wandb" \ + --gradient_checkpointing True \ + --deepspeed ../configs/ds_config.json \ + --bf16 True \ + --run_name "${MODEL_NAME}_stage2_conversational_lora" \ + --is_conversational_training \ + +if [ $? -ne 0 ]; then + echo "Error: Stage 2 training failed!" + exit 1 +fi + +echo "Stage 2 training completed successfully!" + +# Find final checkpoint and merge again +FINAL_CHECKPOINT=$(find_latest_checkpoint "$OUTPUT_DIR_STAGE2") +FINAL_MERGED_DIR="./output/final_merged_model" + +if [ ! -z "$FINAL_CHECKPOINT" ]; then + echo "Merging final LoRA weights..." + mkdir -p $FINAL_MERGED_DIR + merge_lora_weights "$FINAL_CHECKPOINT" "$FINAL_MERGED_DIR" + echo "Final merged model saved in: $FINAL_MERGED_DIR" +else + echo "Warning: No final checkpoint found, using stage 2 output directory" +fi + +echo "Both training stages completed successfully!" 
+echo "LoRA adapters saved in: $OUTPUT_DIR_STAGE2" +echo "Final merged model saved in: $FINAL_MERGED_DIR" \ No newline at end of file diff --git a/src/train/merge_lora.py b/src/train/merge_lora.py deleted file mode 100644 index f2c16dd..0000000 --- a/src/train/merge_lora.py +++ /dev/null @@ -1,16 +0,0 @@ -from peft import AutoPeftModelForCausalLM -from transformers import AutoTokenizer - -checkpoints = [] # add the paths to the checkpoints here - - -for lora_checkpoint in checkpoints[1:]: - peft_model = AutoPeftModelForCausalLM.from_pretrained(lora_checkpoint) - tokenizer = AutoTokenizer.from_pretrained(lora_checkpoint) - - merged_model = peft_model.merge_and_unload() - print(type(merged_model)) - output_path = lora_checkpoint + "-merged" - merged_model.save_pretrained(output_path) - tokenizer.save_pretrained(output_path) - print(f"Model saved at {output_path}") diff --git a/src/train/sft/run.sh b/src/train/sft/run.sh new file mode 100644 index 0000000..e534fe0 --- /dev/null +++ b/src/train/sft/run.sh @@ -0,0 +1,110 @@ +#!/bin/bash + +export MODEL_NAME="" +export DESC="" + +# Stage 1: Instruction Training +OUTPUT_DIR_STAGE1="./output/sft_stage1_instruction" +TRAIN_DATA_STAGE1="" +MODEL_PATH="" + +# Stage 2: Conversational Training +OUTPUT_DIR_STAGE2="./output/sft_stage2_conversational" +TRAIN_DATA_STAGE2="" + +find_latest_checkpoint() { + local output_dir=$1 + local latest_checkpoint=$(find "$output_dir" -name "checkpoint-*" -type d | sort -V | tail -1) + echo "$latest_checkpoint" +} + +echo "Starting Stage 1: Instruction Training..." 
+echo "Model: $MODEL_PATH" +echo "Training data: $TRAIN_DATA_STAGE1" +echo "Output directory: $OUTPUT_DIR_STAGE1" + +mkdir -p $OUTPUT_DIR_STAGE1 + +# Stage 1: Instruction Training +accelerate launch \ + --config_file=../configs/general_acc.yaml \ + sft.py \ + --model_name_or_path "$MODEL_PATH" \ + --train_data_path "$TRAIN_DATA_STAGE1" \ + --output_dir ${OUTPUT_DIR_STAGE1} \ + --num_train_epochs 3 \ + --model_max_length 16384 \ + --per_device_train_batch_size 1 \ + --gradient_accumulation_steps 4 \ + --save_strategy "epoch" \ + --save_steps 760 \ + --save_total_limit 25 \ + --learning_rate 1e-5 \ + --warmup_ratio 0.1 \ + --weight_decay 0.1 \ + --logging_steps 5 \ + --lr_scheduler_type "cosine" \ + --report_to "wandb" \ + --gradient_checkpointing True \ + --deepspeed ../configs/ds_config.json \ + --bf16 True \ + --run_name "${MODEL_NAME}_stage1_instruction" \ + +if [ $? -ne 0 ]; then + echo "Error: Stage 1 training failed!" + exit 1 +fi + +echo "Stage 1 completed successfully!" + +LATEST_CHECKPOINT=$(find_latest_checkpoint "$OUTPUT_DIR_STAGE1") + +if [ -z "$LATEST_CHECKPOINT" ]; then + echo "Error: No checkpoint found in $OUTPUT_DIR_STAGE1" + exit 1 +fi + +echo "Found latest checkpoint: $LATEST_CHECKPOINT" +echo "Starting Stage 2: Conversational Training..." 
+echo "Model: $LATEST_CHECKPOINT" +echo "Training data: $TRAIN_DATA_STAGE2" +echo "Output directory: $OUTPUT_DIR_STAGE2" + +mkdir -p $OUTPUT_DIR_STAGE2 + +# Stage 2: Conversational Training +accelerate launch \ + --config_file=../configs/general_acc.yaml \ + sft.py \ + --model_name_or_path "${LATEST_CHECKPOINT}" \ + --train_data_path "$TRAIN_DATA_STAGE2" \ + --output_dir ${OUTPUT_DIR_STAGE2} \ + --num_train_epochs 3 \ + --model_max_length 16384 \ + --per_device_train_batch_size 1 \ + --gradient_accumulation_steps 4 \ + --save_strategy "epoch" \ + --save_steps 760 \ + --save_total_limit 25 \ + --learning_rate 1e-5 \ + --warmup_ratio 0.1 \ + --weight_decay 0.1 \ + --logging_steps 5 \ + --lr_scheduler_type "cosine" \ + --report_to "wandb" \ + --gradient_checkpointing True \ + --deepspeed ../configs/ds_config.json \ + --bf16 True \ + --run_name "${MODEL_NAME}_stage2_conversational" \ + --is_conversational_training \ + + +# Check if stage 2 completed successfully +if [ $? -ne 0 ]; then + echo "Error: Stage 2 training failed!" + exit 1 +fi + +echo "Stage 2 training completed!" +echo "Both training stages completed successfully!" 
+echo "Final model saved in: $OUTPUT_DIR_STAGE2" \ No newline at end of file diff --git a/src/train/sft.py b/src/train/sft/sft.py similarity index 95% rename from src/train/sft.py rename to src/train/sft/sft.py index ebdd7e0..08a4abd 100644 --- a/src/train/sft.py +++ b/src/train/sft/sft.py @@ -62,9 +62,10 @@ def parse_args(): help="Whether to use bf16 mixed precision training") parser.add_argument("--run_name", type=str, default=None) parser.add_argument("--use_liger", type=bool, default=False) - parser.add_argument("--debug", type=bool, default=False) parser.add_argument("--packing", type=bool, default=True, help="Whether to use packing for training") + parser.add_argument("--is_conversational_training", action='store_true', + help="Whether to use conversational training format") args, _ = parser.parse_known_args() return args @@ -108,7 +109,6 @@ def __init__(self, flush_steps=None): self.flush_steps = flush_steps def on_step_end(self, args, state, control, model, processing_class , **kwargs): - # import sys; sys.exit(0) if state.global_step % self.flush_steps == 0: get_accelerator().empty_cache() if dist.is_initialized(): @@ -172,8 +172,10 @@ def main(): if last_checkpoint: print(f'Resuming from checkpoint: {last_checkpoint}') - # response_template = "#RESPONSE\n" - # collator = DataCollatorForCompletionOnlyLM(response_template=response_template, tokenizer=tokenizer) + collator = None + if args.is_conversational_training: + response_template = "#RESPONSE\n" + collator = DataCollatorForCompletionOnlyLM(response_template=response_template, tokenizer=tokenizer) # Initialize trainer trainer = SFTTrainer( @@ -182,7 +184,7 @@ def main(): train_dataset=dataset, args=training_config, callbacks=[Callback(flush_steps=1)], - # data_collator=collator, + data_collator=collator, ) # Start training