Commit: New configs

ikergarcia1996 committed Feb 2, 2025
1 parent 9048324 commit 894d2a1
Showing 11 changed files with 291 additions and 7 deletions.
32 changes: 32 additions & 0 deletions scripts/finetune_qwen32B.sh
@@ -0,0 +1,32 @@
#!/bin/bash
#SBATCH --job-name=Odesia-qwen32B
#SBATCH --cpus-per-task=16
#SBATCH --nodes=1
#SBATCH --gres=gpu:8
#SBATCH --mem=64G
#SBATCH --output=.slurm/Odesia-qwen32B.out.txt
#SBATCH --error=.slurm/Odesia-qwen32B.err.txt


source /ikerlariak/igarcia945/envs/pytorch2/bin/activate


export LC_ALL=en_US.UTF-8
export LANG=en_US.UTF-8
export LANGUAGE=en_US.UTF-8
export TOKENIZERS_PARALLELISM=true
export TRANSFORMERS_NO_ADVISORY_WARNINGS=true
export WANDB_ENTITY=igarciaf
export WANDB_PROJECT=Odesia
export OMP_NUM_THREADS=16
export WANDB__SERVICE_WAIT=300

echo CUDA_VISIBLE_DEVICES "${CUDA_VISIBLE_DEVICES}"


export PYTHONPATH="$PYTHONPATH:$PWD"
accelerate launch --config_file train_configs/deepspeed_8.json src/train.py train_configs/qwen32B.yaml
torchrun --standalone --master_port 37227 --nproc_per_node=1 src/evaluate.py --tasks all --quantization --model_name models/Qwen2.5-32B-Instruct --output_dir results/finetune/Qwen2.5-32B-Instruct
torchrun --standalone --master_port 37227 --nproc_per_node=1 src/inference.py --tasks all --quantization --model_name models/Qwen2.5-32B-Instruct --output_dir results/finetune/Qwen2.5-32B-Instruct


33 changes: 33 additions & 0 deletions scripts/finetune_qwen_72B_LoRA.sh
@@ -0,0 +1,33 @@
#!/bin/bash
#SBATCH --job-name=Odesia-qwen72B_LoRA
#SBATCH --cpus-per-task=16
#SBATCH --nodes=1
#SBATCH --gres=gpu:4
#SBATCH --mem=64G
#SBATCH --output=.slurm/Odesia-qwen72B_LoRA.out.txt
#SBATCH --error=.slurm/Odesia-qwen72B_LoRA.err.txt


source /ikerlariak/igarcia945/envs/pytorch2/bin/activate


export LC_ALL=en_US.UTF-8
export LANG=en_US.UTF-8
export LANGUAGE=en_US.UTF-8
export TOKENIZERS_PARALLELISM=true
export TRANSFORMERS_NO_ADVISORY_WARNINGS=true
export WANDB_ENTITY=igarciaf
export WANDB_PROJECT=Odesia
export OMP_NUM_THREADS=16
export WANDB__SERVICE_WAIT=300

echo CUDA_VISIBLE_DEVICES "${CUDA_VISIBLE_DEVICES}"


export PYTHONPATH="$PYTHONPATH:$PWD"
accelerate launch --config_file train_configs/deepspeed.json src/train.py train_configs/qwen_72B_LoRA.yaml
torchrun --standalone --master_port 37227 --nproc_per_node=1 src/evaluate.py --tasks all --quantization --model_name models/Qwen2.5-72B_LoRA --output_dir results/finetune/Qwen2.5-72B_LoRA
torchrun --standalone --master_port 37227 --nproc_per_node=1 src/inference.py --tasks all --quantization --model_name models/Qwen2.5-72B_LoRA --output_dir results/finetune/Qwen2.5-72B_LoRA



1 change: 1 addition & 0 deletions src/train.py
@@ -39,6 +39,7 @@ def train(training_args: Seq2SeqTrainingArguments, model_args: ModelArguments):
or training_args.fsdp_config is not None,
max_memory_MB=model_args.max_memory_MB,
rope_scaling_factor=model_args.rope_scaling_factor,
+#use_liger_kernel=training_args.use_liger_kernel,
)

print(f"Model_max_length: {tokenizer.model_max_length}")
11 changes: 8 additions & 3 deletions src/training/load_model.py
@@ -21,6 +21,7 @@
from transformers.utils import is_ipex_available

from .model_utils import find_all_linear_names, get_trainable_parameters
+from liger_kernel.transformers import AutoLigerKernelForCausalLM


def get_current_device() -> int:
@@ -172,6 +173,7 @@ def load_model(
fsdp_training: bool = False,
max_memory_MB: Optional[int] = None,
rope_scaling_factor: Optional[float] = None,
+use_liger_kernel: bool = False,
) -> Tuple[PreTrainedModel, PreTrainedTokenizerBase]:
"""
Load any Decoder model for training.
@@ -402,8 +404,11 @@
logging.warning(
f"Model {model_weights_name_or_path} is an decoder-only model. We will load it as a CausalLM model."
)
-
-load_fn = AutoModelForCausalLM
+if use_liger_kernel:
+    logging.warning("Loading model with Liger Kernel.")
+    load_fn = AutoLigerKernelForCausalLM
+else:
+    load_fn = AutoModelForCausalLM
tokenizer.padding_side = "left"
model_type = "causal"

@@ -413,7 +418,7 @@
f"MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES or MODEL_FOR_CAUSAL_LM_MAPPING_NAMES. "
f"We will attempt load it as a CausalLM model."
)
-load_fn = AutoModelForCausalLM
+load_fn = AutoLigerKernelForCausalLM
tokenizer.padding_side = "left"
model_type = "causal"
2 changes: 1 addition & 1 deletion src/training/trainer.py
@@ -196,7 +196,7 @@ def __init__(

self.tokenizer = tokenizer

-def compute_loss(self, model, inputs, return_outputs=False):
+def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
"""
How the loss is computed by Trainer. By default, all models return the loss in the first element.
Subclass and override for custom behavior.
2 changes: 1 addition & 1 deletion train_configs/deepspeed.json
@@ -11,7 +11,7 @@
"machine_rank": 0,
"main_training_function": "main",
"num_machines": 1,
-"num_processes": 4,
+"num_processes": 2,
"rdzv_backend": "static",
"same_network": true,
"tpu_env": [],
21 changes: 21 additions & 0 deletions train_configs/deepspeed_8.json
@@ -0,0 +1,21 @@
{
"compute_environment": "LOCAL_MACHINE",
"debug": false,
"deepspeed_config": {
"deepspeed_config_file": "train_configs/deepspeed_zero3.json",
"zero3_init_flag": false
},
"distributed_type": "DEEPSPEED",
"downcast_bf16": "no",
"enable_cpu_affinity": false,
"machine_rank": 0,
"main_training_function": "main",
"num_machines": 1,
"num_processes": 8,
"rdzv_backend": "static",
"same_network": true,
"tpu_env": [],
"tpu_use_cluster": false,
"tpu_use_sudo": false,
"use_cpu": false
}
4 changes: 2 additions & 2 deletions train_configs/llama8b_LoRA.yaml
@@ -23,9 +23,9 @@ do_eval: true
do_predict: false
evaluation_strategy: "epoch"

-per_device_train_batch_size: 2
+per_device_train_batch_size: 4
per_device_eval_batch_size: 2
-gradient_accumulation_steps: 8
+gradient_accumulation_steps: 4

# optimizer settings
optim: adamw_torch
65 changes: 65 additions & 0 deletions train_configs/qwen14B.yaml
@@ -0,0 +1,65 @@
#Training args
model_name_or_path: Qwen/Qwen2.5-14B-Instruct
torch_dtype: bfloat16
use_lora: false
quantization: null
gradient_checkpointing: true
force_auto_device_map: false
use_flash_attention: true
deepspeed: train_configs/deepspeed_zero3.json
use_liger_kernel: true

output_dir: models/Qwen2.5-14B-Instruct
overwrite_output_dir: true
load_best_model_at_end: false
metric_for_best_model: eval_loss
greater_is_better: false
save_strategy: "no"
save_only_model: true
save_total_limit: 1

# evaluation
do_train: true
do_eval: true
do_predict: false
evaluation_strategy: "epoch"

per_device_train_batch_size: 1
per_device_eval_batch_size: 1
gradient_accumulation_steps: 16

# optimizer settings
optim: adamw_torch
learning_rate: 0.000005
weight_decay: 0.0
num_train_epochs: 3
lr_scheduler_type: cosine
warmup_ratio: 0.1
adam_beta1: 0.9
adam_beta2: 0.95
adam_epsilon: 1e-12

# lora settings
lora_r: 128
lora_alpha: 256
lora_dropout: 0.05
lora_target_modules:
- all

# reporting
logging_strategy: steps
logging_first_step: true
logging_steps: 5
report_to: wandb
run_name: "Qwen2.5-14B-Instruct"
disable_tqdm: false

# hub settings
push_to_hub: false
resume_from_checkpoint: false

# performance
bf16: true
fp16: false
torch_compile: false
ddp_find_unused_parameters: false
65 changes: 65 additions & 0 deletions train_configs/qwen32B.yaml
@@ -0,0 +1,65 @@
#Training args
model_name_or_path: Qwen/Qwen2.5-32B-Instruct
torch_dtype: bfloat16
use_lora: false
quantization: null
gradient_checkpointing: true
force_auto_device_map: false
use_flash_attention: true
deepspeed: train_configs/deepspeed_zero3.json
use_liger_kernel: true

output_dir: models/Qwen2.5-32B-Instruct
overwrite_output_dir: true
load_best_model_at_end: false
metric_for_best_model: eval_loss
greater_is_better: false
save_strategy: "no"
save_only_model: true
save_total_limit: 1

# evaluation
do_train: true
do_eval: true
do_predict: false
evaluation_strategy: "epoch"

per_device_train_batch_size: 2
per_device_eval_batch_size: 1
gradient_accumulation_steps: 8

# optimizer settings
optim: adamw_torch
learning_rate: 0.000005
weight_decay: 0.0
num_train_epochs: 3
lr_scheduler_type: cosine
warmup_ratio: 0.1
adam_beta1: 0.9
adam_beta2: 0.95
adam_epsilon: 1e-12

# lora settings
lora_r: 128
lora_alpha: 256
lora_dropout: 0.05
lora_target_modules:
- all

# reporting
logging_strategy: steps
logging_first_step: true
logging_steps: 5
report_to: wandb
run_name: "Qwen2.5-32B-Instruct"
disable_tqdm: false

# hub settings
push_to_hub: false
resume_from_checkpoint: false

# performance
bf16: true
fp16: false
torch_compile: false
ddp_find_unused_parameters: false
62 changes: 62 additions & 0 deletions train_configs/qwen_72B_LoRA.yaml
@@ -0,0 +1,62 @@
#Training args
model_name_or_path: Qwen/Qwen2.5-72B
torch_dtype: bfloat16
use_lora: true
quantization: 4
gradient_checkpointing: true
force_auto_device_map: false
use_flash_attention: true
deepspeed: train_configs/deepspeed_zero3.json

output_dir: models/Qwen2.5-72B_LoRA
overwrite_output_dir: true
load_best_model_at_end: false
metric_for_best_model: eval_loss
greater_is_better: false
save_strategy: "no"
save_only_model: true
save_total_limit: 1

# evaluation
do_train: true
do_eval: true
do_predict: false
evaluation_strategy: "epoch"

per_device_train_batch_size: 2
per_device_eval_batch_size: 2
gradient_accumulation_steps: 8

# optimizer settings
optim: adamw_torch
learning_rate: 0.0003
weight_decay: 0.001
num_train_epochs: 3
lr_scheduler_type: cosine
warmup_ratio: 0.1
adam_epsilon: 0.0000001

# lora settings
lora_r: 128
lora_alpha: 256
lora_dropout: 0.05
lora_target_modules:
- all

# reporting
logging_strategy: steps
logging_first_step: true
logging_steps: 5
report_to: wandb
run_name: "Qwen2.5-72B_LoRA"
disable_tqdm: false

# hub settings
push_to_hub: false
resume_from_checkpoint: false

# performance
bf16: true
fp16: false
torch_compile: false
ddp_find_unused_parameters: false
