diff --git a/scripts/finetune_qwen32B.sh b/scripts/finetune_qwen32B.sh
new file mode 100644
index 0000000..6a4000c
--- /dev/null
+++ b/scripts/finetune_qwen32B.sh
@@ -0,0 +1,32 @@
+#!/bin/bash
+#SBATCH --job-name=Odesia-qwen32B
+#SBATCH --cpus-per-task=16
+#SBATCH --nodes=1
+#SBATCH --gres=gpu:8
+#SBATCH --mem=64G
+#SBATCH --output=.slurm/Odesia-qwen32B.out.txt
+#SBATCH --error=.slurm/Odesia-qwen32B.err.txt
+
+
+source /ikerlariak/igarcia945/envs/pytorch2/bin/activate
+
+
+export LC_ALL=en_US.UTF-8
+export LANG=en_US.UTF-8
+export LANGUAGE=en_US.UTF-8
+export TOKENIZERS_PARALLELISM=true
+export TRANSFORMERS_NO_ADVISORY_WARNINGS=true
+export WANDB_ENTITY=igarciaf
+export WANDB_PROJECT=Odesia
+export OMP_NUM_THREADS=16
+export WANDB__SERVICE_WAIT=300
+
+echo CUDA_VISIBLE_DEVICES "${CUDA_VISIBLE_DEVICES}"
+
+
+export PYTHONPATH="$PYTHONPATH:$PWD"
+accelerate launch --config_file train_configs/deepspeed_8.json src/train.py train_configs/qwen32B.yaml
+torchrun --standalone --master_port 37227 --nproc_per_node=1 src/evaluate.py --tasks all --quantization --model_name models/Qwen2.5-32B-Instruct --output_dir results/finetune/Qwen2.5-32B-Instruct
+torchrun --standalone --master_port 37227 --nproc_per_node=1 src/inference.py --tasks all --quantization --model_name models/Qwen2.5-32B-Instruct --output_dir results/finetune/Qwen2.5-32B-Instruct
+
+
diff --git a/scripts/finetune_qwen_72B_LoRA.sh b/scripts/finetune_qwen_72B_LoRA.sh
new file mode 100644
index 0000000..a82aaf4
--- /dev/null
+++ b/scripts/finetune_qwen_72B_LoRA.sh
@@ -0,0 +1,33 @@
+#!/bin/bash
+#SBATCH --job-name=Odesia-llama_LoRA
+#SBATCH --cpus-per-task=16
+#SBATCH --nodes=1
+#SBATCH --gres=gpu:4
+#SBATCH --mem=64G
+#SBATCH --output=.slurm/Odesia-llama_LoRA.out.txt
+#SBATCH --error=.slurm/Odesia-llama_LoRA.err.txt
+
+
+source /ikerlariak/igarcia945/envs/pytorch2/bin/activate
+
+
+export LC_ALL=en_US.UTF-8
+export LANG=en_US.UTF-8
+export LANGUAGE=en_US.UTF-8
+export TOKENIZERS_PARALLELISM=true
+export TRANSFORMERS_NO_ADVISORY_WARNINGS=true
+export WANDB_ENTITY=igarciaf
+export WANDB_PROJECT=Odesia
+export OMP_NUM_THREADS=16
+export WANDB__SERVICE_WAIT=300
+
+echo CUDA_VISIBLE_DEVICES "${CUDA_VISIBLE_DEVICES}"
+
+
+export PYTHONPATH="$PYTHONPATH:$PWD"
+accelerate launch --config_file train_configs/deepspeed.json src/train.py train_configs/qwen_72B_LoRA.yaml
+torchrun --standalone --master_port 37227 --nproc_per_node=1 src/evaluate.py --tasks all --quantization --model_name models/Qwen2.5-72B_LoRA --output_dir results/finetune/Qwen2.5-72B_LoRA
+torchrun --standalone --master_port 37227 --nproc_per_node=1 src/inference.py --tasks all --quantization --model_name models/Qwen2.5-72B_LoRA --output_dir results/finetune/Qwen2.5-72B_LoRA
+
+
+
diff --git a/src/train.py b/src/train.py
index 59c7702..d7761e5 100644
--- a/src/train.py
+++ b/src/train.py
@@ -39,6 +39,7 @@ def train(training_args: Seq2SeqTrainingArguments, model_args: ModelArguments):
         or training_args.fsdp_config is not None,
         max_memory_MB=model_args.max_memory_MB,
         rope_scaling_factor=model_args.rope_scaling_factor,
+        # use_liger_kernel=training_args.use_liger_kernel,
     )

     print(f"Model_max_length: {tokenizer.model_max_length}")
diff --git a/src/training/load_model.py b/src/training/load_model.py
index 2b0e470..4e661f7 100644
--- a/src/training/load_model.py
+++ b/src/training/load_model.py
@@ -21,6 +21,7 @@
 from transformers.utils import is_ipex_available

 from .model_utils import find_all_linear_names, get_trainable_parameters
+from liger_kernel.transformers import AutoLigerKernelForCausalLM


 def get_current_device() -> int:
@@ -172,6 +173,7 @@ def load_model(
     fsdp_training: bool = False,
     max_memory_MB: Optional[int] = None,
     rope_scaling_factor: Optional[float] = None,
+    use_liger_kernel: bool = False,
 ) -> Tuple[PreTrainedModel, PreTrainedTokenizerBase]:
     """
     Load any Decoder model for training.
@@ -402,8 +404,11 @@ def load_model(
         logging.warning(
             f"Model {model_weights_name_or_path} is an decoder-only model. We will load it as a CausalLM model."
         )
-
-        load_fn = AutoModelForCausalLM
+        if use_liger_kernel:
+            logging.warning("Loading model with Liger Kernel.")
+            load_fn = AutoLigerKernelForCausalLM
+        else:
+            load_fn = AutoModelForCausalLM
         tokenizer.padding_side = "left"
         model_type = "causal"

@@ -413,7 +418,11 @@ def load_model(
             f"MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES or MODEL_FOR_CAUSAL_LM_MAPPING_NAMES. "
             f"We will attempt load it as a CausalLM model."
         )
-        load_fn = AutoModelForCausalLM
+        if use_liger_kernel:
+            logging.warning("Loading model with Liger Kernel.")
+            load_fn = AutoLigerKernelForCausalLM
+        else:
+            load_fn = AutoModelForCausalLM
         tokenizer.padding_side = "left"
         model_type = "causal"

diff --git a/src/training/trainer.py b/src/training/trainer.py
index e49d0d2..b5db5de 100644
--- a/src/training/trainer.py
+++ b/src/training/trainer.py
@@ -196,7 +196,7 @@ def __init__(
         self.tokenizer = tokenizer

-    def compute_loss(self, model, inputs, return_outputs=False):
+    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
         """
         How the loss is computed by Trainer. By default, all models return the loss in the first element.

         Subclass and override for custom behavior.
diff --git a/train_configs/deepspeed.json b/train_configs/deepspeed_2.json
similarity index 95%
rename from train_configs/deepspeed.json
rename to train_configs/deepspeed_2.json
index 738145a..51ad738 100644
--- a/train_configs/deepspeed.json
+++ b/train_configs/deepspeed_2.json
@@ -11,7 +11,7 @@
     "machine_rank": 0,
     "main_training_function": "main",
     "num_machines": 1,
-    "num_processes": 4,
+    "num_processes": 2,
     "rdzv_backend": "static",
     "same_network": true,
     "tpu_env": [],
diff --git a/train_configs/deepspeed_8.json b/train_configs/deepspeed_8.json
new file mode 100644
index 0000000..dded576
--- /dev/null
+++ b/train_configs/deepspeed_8.json
@@ -0,0 +1,21 @@
+{
+    "compute_environment": "LOCAL_MACHINE",
+    "debug": false,
+    "deepspeed_config": {
+        "deepspeed_config_file": "train_configs/deepspeed_zero3.json",
+        "zero3_init_flag": false
+    },
+    "distributed_type": "DEEPSPEED",
+    "downcast_bf16": "no",
+    "enable_cpu_affinity": false,
+    "machine_rank": 0,
+    "main_training_function": "main",
+    "num_machines": 1,
+    "num_processes": 8,
+    "rdzv_backend": "static",
+    "same_network": true,
+    "tpu_env": [],
+    "tpu_use_cluster": false,
+    "tpu_use_sudo": false,
+    "use_cpu": false
+}
diff --git a/train_configs/llama8b_LoRA.yaml b/train_configs/llama8b_LoRA.yaml
index ce81d09..a9a08dd 100644
--- a/train_configs/llama8b_LoRA.yaml
+++ b/train_configs/llama8b_LoRA.yaml
@@ -23,9 +23,9 @@ do_eval: true
 do_predict: false
 evaluation_strategy: "epoch"

-per_device_train_batch_size: 2
+per_device_train_batch_size: 4
 per_device_eval_batch_size: 2
-gradient_accumulation_steps: 8
+gradient_accumulation_steps: 4

 # optimizer settings
 optim: adamw_torch
diff --git a/train_configs/qwen14B.yaml b/train_configs/qwen14B.yaml
new file mode 100644
index 0000000..da4b688
--- /dev/null
+++ b/train_configs/qwen14B.yaml
@@ -0,0 +1,65 @@
+#Training args
+model_name_or_path: Qwen/Qwen2.5-14B-Instruct
+torch_dtype: bfloat16
+use_lora: false
+quantization: null
+gradient_checkpointing: true
+force_auto_device_map: false
+use_flash_attention: true
+deepspeed: train_configs/deepspeed_zero3.json
+use_liger_kernel: true
+
+output_dir: models/Qwen2.5-14B-Instruct
+overwrite_output_dir: true
+load_best_model_at_end: false
+metric_for_best_model: eval_loss
+greater_is_better: false
+save_strategy: "no"
+save_only_model: true
+save_total_limit: 1
+
+# evaluation
+do_train: true
+do_eval: true
+do_predict: false
+evaluation_strategy: "epoch"
+
+per_device_train_batch_size: 1
+per_device_eval_batch_size: 1
+gradient_accumulation_steps: 16
+
+# optimizer settings
+optim: adamw_torch
+learning_rate: 0.000005
+weight_decay: 0.0
+num_train_epochs: 3
+lr_scheduler_type: cosine
+warmup_ratio: 0.1
+adam_beta1: 0.9
+adam_beta2: 0.95
+adam_epsilon: 1e-12
+
+# lora settings
+lora_r: 128
+lora_alpha: 256
+lora_dropout: 0.05
+lora_target_modules:
+  - all
+
+# reporting
+logging_strategy: steps
+logging_first_step: true
+logging_steps: 5
+report_to: wandb
+run_name: "Qwen2.5-14B-Instruct"
+disable_tqdm: false
+
+# hub settings
+push_to_hub: false
+resume_from_checkpoint: false
+
+# performance
+bf16: true
+fp16: false
+torch_compile: false
+ddp_find_unused_parameters: false
\ No newline at end of file
diff --git a/train_configs/qwen32B.yaml b/train_configs/qwen32B.yaml
new file mode 100644
index 0000000..85cf5ed
--- /dev/null
+++ b/train_configs/qwen32B.yaml
@@ -0,0 +1,65 @@
+#Training args
+model_name_or_path: Qwen/Qwen2.5-32B-Instruct
+torch_dtype: bfloat16
+use_lora: false
+quantization: null
+gradient_checkpointing: true
+force_auto_device_map: false
+use_flash_attention: true
+deepspeed: train_configs/deepspeed_zero3.json
+use_liger_kernel: true
+
+output_dir: models/Qwen2.5-32B-Instruct
+overwrite_output_dir: true
+load_best_model_at_end: false
+metric_for_best_model: eval_loss
+greater_is_better: false
+save_strategy: "no"
+save_only_model: true
+save_total_limit: 1
+
+# evaluation
+do_train: true
+do_eval: true
+do_predict: false
+evaluation_strategy: "epoch"
+
+per_device_train_batch_size: 2
+per_device_eval_batch_size: 1
+gradient_accumulation_steps: 8
+
+# optimizer settings
+optim: adamw_torch
+learning_rate: 0.000005
+weight_decay: 0.0
+num_train_epochs: 3
+lr_scheduler_type: cosine
+warmup_ratio: 0.1
+adam_beta1: 0.9
+adam_beta2: 0.95
+adam_epsilon: 1e-12
+
+# lora settings
+lora_r: 128
+lora_alpha: 256
+lora_dropout: 0.05
+lora_target_modules:
+  - all
+
+# reporting
+logging_strategy: steps
+logging_first_step: true
+logging_steps: 5
+report_to: wandb
+run_name: "Qwen2.5-32B-Instruct"
+disable_tqdm: false
+
+# hub settings
+push_to_hub: false
+resume_from_checkpoint: false
+
+# performance
+bf16: true
+fp16: false
+torch_compile: false
+ddp_find_unused_parameters: false
\ No newline at end of file
diff --git a/train_configs/qwen_72B_LoRA.yaml b/train_configs/qwen_72B_LoRA.yaml
new file mode 100644
index 0000000..f82ac34
--- /dev/null
+++ b/train_configs/qwen_72B_LoRA.yaml
@@ -0,0 +1,62 @@
+#Training args
+model_name_or_path: Qwen/Qwen2.5-72B
+torch_dtype: bfloat16
+use_lora: true
+quantization: 4
+gradient_checkpointing: true
+force_auto_device_map: false
+use_flash_attention: true
+deepspeed: train_configs/deepspeed_zero3.json
+
+output_dir: models/Qwen2.5-72B_LoRA
+overwrite_output_dir: true
+load_best_model_at_end: false
+metric_for_best_model: eval_loss
+greater_is_better: false
+save_strategy: "no"
+save_only_model: true
+save_total_limit: 1
+
+# evaluation
+do_train: true
+do_eval: true
+do_predict: false
+evaluation_strategy: "epoch"
+
+per_device_train_batch_size: 2
+per_device_eval_batch_size: 2
+gradient_accumulation_steps: 8
+
+# optimizer settings
+optim: adamw_torch
+learning_rate: 0.0003
+weight_decay: 0.001
+num_train_epochs: 3
+lr_scheduler_type: cosine
+warmup_ratio: 0.1
+adam_epsilon: 0.0000001
+
+# lora settings
+lora_r: 128
+lora_alpha: 256
+lora_dropout: 0.05
+lora_target_modules:
+  - all
+
+# reporting
+logging_strategy: steps
+logging_first_step: true
+logging_steps: 5
+report_to: wandb
+run_name: "Qwen2.5-72B_LoRA"
+disable_tqdm: false
+
+# hub settings
+push_to_hub: false
+resume_from_checkpoint: false
+
+# performance
+bf16: true
+fp16: false
+torch_compile: false
+ddp_find_unused_parameters: false
\ No newline at end of file
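
For context on the Liger Kernel wiring above: the change in src/training/load_model.py amounts to selecting the loader class at load time, gated by the new use_liger_kernel flag that qwen14B.yaml and qwen32B.yaml enable. A minimal standalone sketch of that switch follows; it assumes the liger-kernel package is installed, and the model id and dtype are illustrative values taken from train_configs/qwen14B.yaml rather than the repository's own load_model() call.

import torch
from transformers import AutoModelForCausalLM
from liger_kernel.transformers import AutoLigerKernelForCausalLM

use_liger_kernel = True  # mirrors the new load_model(..., use_liger_kernel=...) flag

# AutoLigerKernelForCausalLM.from_pretrained accepts the same kwargs as
# AutoModelForCausalLM.from_pretrained and patches supported architectures
# (Qwen2 among them) with Liger's fused Triton kernels before loading weights.
load_fn = AutoLigerKernelForCausalLM if use_liger_kernel else AutoModelForCausalLM

model = load_fn.from_pretrained(
    "Qwen/Qwen2.5-14B-Instruct",  # illustrative model id from qwen14B.yaml
    torch_dtype=torch.bfloat16,   # matches torch_dtype: bfloat16 in the YAML configs
)

The explicit else branch keeps the stock AutoModelForCausalLM path untouched when use_liger_kernel is false, so the rest of the loading logic (quantization, LoRA, padding side) is unaffected by the new flag.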