Commit: New configs

ikergarcia1996 committed Feb 2, 2025
1 parent 9048324 commit 894d2a1
Showing 11 changed files with 291 additions and 7 deletions.
32 changes: 32 additions & 0 deletions scripts/finetune_qwen32B.sh
@@ -0,0 +1,32 @@
#!/bin/bash
#SBATCH --job-name=Odesia-qwen32B
#SBATCH --cpus-per-task=16
#SBATCH --nodes=1
#SBATCH --gres=gpu:8
#SBATCH --mem=64G
#SBATCH --output=.slurm/Odesia-qwen32B.out.txt
#SBATCH --error=.slurm/Odesia-qwen32B.err.txt


source /ikerlariak/igarcia945/envs/pytorch2/bin/activate


export LC_ALL=en_US.UTF-8
export LANG=en_US.UTF-8
export LANGUAGE=en_US.UTF-8
export TOKENIZERS_PARALLELISM=true
export TRANSFORMERS_NO_ADVISORY_WARNINGS=true
export WANDB_ENTITY=igarciaf
export WANDB_PROJECT=Odesia
export OMP_NUM_THREADS=16
export WANDB__SERVICE_WAIT=300

echo CUDA_VISIBLE_DEVICES "${CUDA_VISIBLE_DEVICES}"


export PYTHONPATH="$PYTHONPATH:$PWD"
accelerate launch --config_file train_configs/deepspeed_8.json src/train.py train_configs/qwen32B.yaml
torchrun --standalone --master_port 37227 --nproc_per_node=1 src/evaluate.py --tasks all --quantization --model_name models/Qwen2.5-32B-Instruct --output_dir results/finetune/Qwen2.5-32B-Instruct
torchrun --standalone --master_port 37227 --nproc_per_node=1 src/inference.py --tasks all --quantization --model_name models/Qwen2.5-32B-Instruct --output_dir results/finetune/Qwen2.5-32B-Instruct


33 changes: 33 additions & 0 deletions scripts/finetune_qwen_72B_LoRA.sh
@@ -0,0 +1,33 @@
#!/bin/bash
#SBATCH --job-name=Odesia-qwen72B_LoRA
#SBATCH --cpus-per-task=16
#SBATCH --nodes=1
#SBATCH --gres=gpu:4
#SBATCH --mem=64G
#SBATCH --output=.slurm/Odesia-qwen72B_LoRA.out.txt
#SBATCH --error=.slurm/Odesia-qwen72B_LoRA.err.txt


source /ikerlariak/igarcia945/envs/pytorch2/bin/activate


export LC_ALL=en_US.UTF-8
export LANG=en_US.UTF-8
export LANGUAGE=en_US.UTF-8
export TOKENIZERS_PARALLELISM=true
export TRANSFORMERS_NO_ADVISORY_WARNINGS=true
export WANDB_ENTITY=igarciaf
export WANDB_PROJECT=Odesia
export OMP_NUM_THREADS=16
export WANDB__SERVICE_WAIT=300

echo CUDA_VISIBLE_DEVICES "${CUDA_VISIBLE_DEVICES}"


export PYTHONPATH="$PYTHONPATH:$PWD"
accelerate launch --config_file train_configs/deepspeed.json src/train.py train_configs/qwen_72B_LoRA.yaml
torchrun --standalone --master_port 37227 --nproc_per_node=1 src/evaluate.py --tasks all --quantization --model_name models/Qwen2.5-72B_LoRA --output_dir results/finetune/Qwen2.5-72B_LoRA
torchrun --standalone --master_port 37227 --nproc_per_node=1 src/inference.py --tasks all --quantization --model_name models/Qwen2.5-72B_LoRA --output_dir results/finetune/Qwen2.5-72B_LoRA



1 change: 1 addition & 0 deletions src/train.py
@@ -39,6 +39,7 @@ def train(training_args: Seq2SeqTrainingArguments, model_args: ModelArguments):
or training_args.fsdp_config is not None,
max_memory_MB=model_args.max_memory_MB,
rope_scaling_factor=model_args.rope_scaling_factor,
+#use_liger_kernel=training_args.use_liger_kernel,
)

print(f"Model_max_length: {tokenizer.model_max_length}")
11 changes: 8 additions & 3 deletions src/training/load_model.py
@@ -21,6 +21,7 @@
from transformers.utils import is_ipex_available

from .model_utils import find_all_linear_names, get_trainable_parameters
+from liger_kernel.transformers import AutoLigerKernelForCausalLM


def get_current_device() -> int:
@@ -172,6 +173,7 @@ def load_model(
fsdp_training: bool = False,
max_memory_MB: Optional[int] = None,
rope_scaling_factor: Optional[float] = None,
+use_liger_kernel: bool = False,
) -> Tuple[PreTrainedModel, PreTrainedTokenizerBase]:
"""
Load any Decoder model for training.
@@ -402,8 +404,11 @@
logging.warning(
f"Model {model_weights_name_or_path} is an decoder-only model. We will load it as a CausalLM model."
)
-
-load_fn = AutoModelForCausalLM
+if use_liger_kernel:
+    logging.warning("Loading model with Liger Kernel.")
+    load_fn = AutoLigerKernelForCausalLM
+else:
+    load_fn = AutoModelForCausalLM
tokenizer.padding_side = "left"
model_type = "causal"

@@ -413,7 +418,7 @@
f"MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES or MODEL_FOR_CAUSAL_LM_MAPPING_NAMES. "
f"We will attempt load it as a CausalLM model."
)
-load_fn = AutoModelForCausalLM
+load_fn = AutoLigerKernelForCausalLM
tokenizer.padding_side = "left"
model_type = "causal"
2 changes: 1 addition & 1 deletion src/training/trainer.py
@@ -196,7 +196,7 @@ def __init__(

self.tokenizer = tokenizer

-def compute_loss(self, model, inputs, return_outputs=False):
+def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
"""
How the loss is computed by Trainer. By default, all models return the loss in the first element.
Subclass and override for custom behavior.
2 changes: 1 addition & 1 deletion train_configs/deepspeed.json
@@ -11,7 +11,7 @@
"machine_rank": 0,
"main_training_function": "main",
"num_machines": 1,
-"num_processes": 4,
+"num_processes": 2,
"rdzv_backend": "static",
"same_network": true,
"tpu_env": [],
21 changes: 21 additions & 0 deletions train_configs/deepspeed_8.json
@@ -0,0 +1,21 @@
{
"compute_environment": "LOCAL_MACHINE",
"debug": false,
"deepspeed_config": {
"deepspeed_config_file": "train_configs/deepspeed_zero3.json",
"zero3_init_flag": false
},
"distributed_type": "DEEPSPEED",
"downcast_bf16": "no",
"enable_cpu_affinity": false,
"machine_rank": 0,
"main_training_function": "main",
"num_machines": 1,
"num_processes": 8,
"rdzv_backend": "static",
"same_network": true,
"tpu_env": [],
"tpu_use_cluster": false,
"tpu_use_sudo": false,
"use_cpu": false
}
4 changes: 2 additions & 2 deletions train_configs/llama8b_LoRA.yaml
@@ -23,9 +23,9 @@ do_eval: true
do_predict: false
evaluation_strategy: "epoch"

-per_device_train_batch_size: 2
+per_device_train_batch_size: 4
per_device_eval_batch_size: 2
-gradient_accumulation_steps: 8
+gradient_accumulation_steps: 4

# optimizer settings
optim: adamw_torch
65 changes: 65 additions & 0 deletions train_configs/qwen14B.yaml
@@ -0,0 +1,65 @@
#Training args
model_name_or_path: Qwen/Qwen2.5-14B-Instruct
torch_dtype: bfloat16
use_lora: false
quantization: null
gradient_checkpointing: true
force_auto_device_map: false
use_flash_attention: true
deepspeed: train_configs/deepspeed_zero3.json
use_liger_kernel: true

output_dir: models/Qwen2.5-14B-Instruct
overwrite_output_dir: true
load_best_model_at_end: false
metric_for_best_model: eval_loss
greater_is_better: false
save_strategy: "no"
save_only_model: true
save_total_limit: 1

# evaluation
do_train: true
do_eval: true
do_predict: false
evaluation_strategy: "epoch"

per_device_train_batch_size: 1
per_device_eval_batch_size: 1
gradient_accumulation_steps: 16

# optimizer settings
optim: adamw_torch
learning_rate: 0.000005
weight_decay: 0.0
num_train_epochs: 3
lr_scheduler_type: cosine
warmup_ratio: 0.1
adam_beta1: 0.9
adam_beta2: 0.95
adam_epsilon: 1e-12

# lora settings
lora_r: 128
lora_alpha: 256
lora_dropout: 0.05
lora_target_modules:
- all

# reporting
logging_strategy: steps
logging_first_step: true
logging_steps: 5
report_to: wandb
run_name: "Qwen2.5-14B-Instruct"
disable_tqdm: false

# hub settings
push_to_hub: false
resume_from_checkpoint: false

# performance
bf16: true
fp16: false
torch_compile: false
ddp_find_unused_parameters: false
65 changes: 65 additions & 0 deletions train_configs/qwen32B.yaml
@@ -0,0 +1,65 @@
#Training args
model_name_or_path: Qwen/Qwen2.5-32B-Instruct
torch_dtype: bfloat16
use_lora: false
quantization: null
gradient_checkpointing: true
force_auto_device_map: false
use_flash_attention: true
deepspeed: train_configs/deepspeed_zero3.json
use_liger_kernel: true

output_dir: models/Qwen2.5-32B-Instruct
overwrite_output_dir: true
load_best_model_at_end: false
metric_for_best_model: eval_loss
greater_is_better: false
save_strategy: "no"
save_only_model: true
save_total_limit: 1

# evaluation
do_train: true
do_eval: true
do_predict: false
evaluation_strategy: "epoch"

per_device_train_batch_size: 2
per_device_eval_batch_size: 1
gradient_accumulation_steps: 8

# optimizer settings
optim: adamw_torch
learning_rate: 0.000005
weight_decay: 0.0
num_train_epochs: 3
lr_scheduler_type: cosine
warmup_ratio: 0.1
adam_beta1: 0.9
adam_beta2: 0.95
adam_epsilon: 1e-12

# lora settings
lora_r: 128
lora_alpha: 256
lora_dropout: 0.05
lora_target_modules:
- all

# reporting
logging_strategy: steps
logging_first_step: true
logging_steps: 5
report_to: wandb
run_name: "Qwen2.5-32B-Instruct"
disable_tqdm: false

# hub settings
push_to_hub: false
resume_from_checkpoint: false

# performance
bf16: true
fp16: false
torch_compile: false
ddp_find_unused_parameters: false
62 changes: 62 additions & 0 deletions train_configs/qwen_72B_LoRA.yaml
@@ -0,0 +1,62 @@
#Training args
model_name_or_path: Qwen/Qwen2.5-72B
torch_dtype: bfloat16
use_lora: true
quantization: 4
gradient_checkpointing: true
force_auto_device_map: false
use_flash_attention: true
deepspeed: train_configs/deepspeed_zero3.json

output_dir: models/Qwen2.5-72B_LoRA
overwrite_output_dir: true
load_best_model_at_end: false
metric_for_best_model: eval_loss
greater_is_better: false
save_strategy: "no"
save_only_model: true
save_total_limit: 1

# evaluation
do_train: true
do_eval: true
do_predict: false
evaluation_strategy: "epoch"

per_device_train_batch_size: 2
per_device_eval_batch_size: 2
gradient_accumulation_steps: 8

# optimizer settings
optim: adamw_torch
learning_rate: 0.0003
weight_decay: 0.001
num_train_epochs: 3
lr_scheduler_type: cosine
warmup_ratio: 0.1
adam_epsilon: 0.0000001

# lora settings
lora_r: 128
lora_alpha: 256
lora_dropout: 0.05
lora_target_modules:
- all

# reporting
logging_strategy: steps
logging_first_step: true
logging_steps: 5
report_to: wandb
run_name: "Qwen2.5-72B_LoRA"
disable_tqdm: false

# hub settings
push_to_hub: false
resume_from_checkpoint: false

# performance
bf16: true
fp16: false
torch_compile: false
ddp_find_unused_parameters: false
