-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtrain_lola_peft.sh
55 lines (48 loc) · 1.46 KB
/
train_lola_peft.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
#!/bin/bash
# Launch (optionally DeepSpeed-accelerated) PEFT fine-tuning of the LOLA model
# via torch.distributed.run, using every GPU visible to the process.
# Requires: the venv below to exist, nvidia-smi on PATH, data_dir/train.json.
set -euo pipefail

# Activate the custom-PEFT virtual environment.
PEFT_VENV_DIR=./venv-lola-custom-peft
# shellcheck disable=SC1091 — venv is created outside this script
source "$PEFT_VENV_DIR/bin/activate"

## Specify CUDA device
#export CUDA_VISIBLE_DEVICES="0"
#export CUDA_VISIBLE_DEVICES="0,1"

# CUDA configuration: if CUDA_VISIBLE_DEVICES is unset, count all GPUs via
# nvidia-smi; otherwise count the comma-separated device IDs it lists.
if [[ -z "${CUDA_VISIBLE_DEVICES+x}" ]]; then
  GPU_COUNT=$(nvidia-smi --list-gpus | wc -l)
else
  GPU_COUNT=$(awk -F',' '{print NF}' <<<"$CUDA_VISIBLE_DEVICES")
fi
echo "Assigning ${GPU_COUNT} gpu(s) for training"

# Timestamped postfix keeps each run's output dir and run name unique.
postfix=$(date +"%d%m%y%H%M%S")
LAUNCHER="python"
prefix=""

# DeepSpeed configuration: set USE_DEEPSPEED=true to pass a DeepSpeed config.
USE_DEEPSPEED=false
EXTRA_PARAMS=()
if [[ "$USE_DEEPSPEED" == true ]]; then
  prefix="deepspeed-"
  EXTRA_PARAMS+=("--deepspeed" "deepspeed_config.json")
fi

# Model and task configuration
MODEL_HF_ID=dice-research/lola_v1
MODEL_NAME="${MODEL_HF_ID##*/}"   # strip the HF org prefix -> "lola_v1"
DATA_DIR=data_dir/
RUN_LABEL=${prefix}${MODEL_NAME}-${postfix}

# Training command with extensible parameters.
# NOTE: the original had `--output_dir trained_model/$RUN_LABEL\` with no space
# before the continuation backslash, which could fuse the next flag into the
# path; fixed below. The ${arr[@]+...} guard keeps an empty EXTRA_PARAMS from
# tripping `set -u` on bash < 4.4.
"$LAUNCHER" -m torch.distributed.run --nnodes=1 --nproc_per_node="$GPU_COUNT" --master_port=4550 train_custom_peft.py \
  --model_name_or_path "$MODEL_HF_ID" \
  --do_train \
  --train_data_path "${DATA_DIR}/train.json" \
  --fp16 True \
  --output_dir "trained_model/$RUN_LABEL" \
  --num_train_epochs 100 \
  --per_device_train_batch_size 8 \
  --save_strategy "steps" \
  --save_steps 100 \
  --save_total_limit 1 \
  --logging_steps 10 \
  --learning_rate 5e-5 \
  --weight_decay 0.01 \
  --warmup_ratio 0.03 \
  --run_name "$RUN_LABEL" \
  ${EXTRA_PARAMS[@]+"${EXTRA_PARAMS[@]}"}