#!/bin/bash
###### Sample usage ######
# noctua2 (default args): bash run-gpt3-moe-pretrain.sh --slurm
# noctua2 (custom args): bash run-gpt3-moe-pretrain.sh --slurm --nnodes=1 --runtime="50:00:00" --static_gate --model_size="1.3" --mb_size=8
# experiment servers (single server): bash run-gpt3-moe-pretrain.sh --nnodes=1 --gpus_per_node=2 --node_rank=0 --static_gate --model_size="1.3" --mb_size=24
# experiment servers (multi-server):
# Server1 (master node): bash run-gpt3-moe-pretrain.sh --nnodes=2 --gpus_per_node=2 --node_rank=0 --static_gate --model_size="1.3" --mb_size=24
# Server2 (worker node): bash run-gpt3-moe-pretrain.sh --nnodes=2 --gpus_per_node=2 --node_rank=1 --master_addr=<server1-hostname> --master_port=6005 --rdzv_id=<generated-id-from-server1> --static_gate --model_size="1.3" --mb_size=24
######
set -eu
# Default values for variables (overridable via the command line arguments parsed below)
export STATIC_GATE=false
export NNODES=24
export GPUS_PER_NODE=4
# Node rank is only needed when not running on SLURM. On SLURM, this value is extracted automatically from SLURM_PROCID.
export NODE_RANK=0
export DGX_NODE=false
export SLURM=false
export MASTER_ADDR=$HOSTNAME
export MASTER_PORT=6005
export RUNTIME="50:00:00"
export RDZV_ID=$RANDOM
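# Rendezvous id shared by all nodes of one multi-node run so they can join the
# same PyTorch job (printed below for the worker nodes); $RANDOM yields a
# value in 0..32767, which is assumed unique enough for this purpose.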
# Default (dense) model size (number of parameters, in billions)
## Supported values: 0.125, 0.35, 0.76, 1.3, 2.7, 6.7, 13 or 175
export MODEL_SIZE=1.3
# Default batch size per GPU
#export MICRO_BATCH_SIZE=6
export MICRO_BATCH_SIZE=8
# Number of tokens to train for
# export TRAIN_TOKENS=3000000000 # 3B
# export TRAIN_TOKENS=12000000000 # 12B
export TRAIN_TOKENS=1000000000000 # 1T
## Number of experts. EP_SIZE=1 means a dense model without MoE.
export EP_SIZE=16
## The NAME_ID variable is used to generate a unique name for the model; it acts as a model name prefix.
## If it matches a previously trained model (together with the other hyperparameters), training resumes automatically from the last checkpoint.
## Note that other hyperparameters also go into the unique name; see how "NAME" is formed in the gpt3-moe-pretrain.sh script.
export NAME_ID="gpt-normal-${EP_SIZE}moe"
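## Example: with the default EP_SIZE=16 this yields NAME_ID="gpt-normal-16moe",
## or "gpt-staticgate-moe" when --static_gate is passed (see below).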
POSITIONAL=()
# Process command line arguments
while [[ "$#" -gt 0 ]]; do
case $1 in
--static_gate|--static_gate=true) # Boolean static_gate
export STATIC_GATE=true
export NAME_ID="gpt-staticgate-moe"
shift
;;
--dgx_node|--dgx_node=true) # Boolean dgx_node
export DGX_NODE=true
shift
;;
--slurm|--slurm=true) # Boolean slurm
export SLURM=true
shift
;;
--nnodes=*) # Option with argument
export NNODES="${1#*=}"
shift
;;
--gpus_per_node=*) # Option with argument
export GPUS_PER_NODE="${1#*=}"
shift
;;
--node_rank=*) # Option with argument
export NODE_RANK="${1#*=}"
shift
;;
--master_addr=*) # Option with argument
export MASTER_ADDR="${1#*=}"
shift
;;
--master_port=*) # Option with argument
export MASTER_PORT="${1#*=}"
shift
;;
--rdzv_id=*) # Option with argument
export RDZV_ID="${1#*=}"
shift
;;
--train_tokens=*) # Option with argument
export TRAIN_TOKENS="${1#*=}"
shift
;;
--model_size=*) # Option with argument
export MODEL_SIZE="${1#*=}"
shift
;;
--mb_size=*) # Option with argument
export MICRO_BATCH_SIZE="${1#*=}"
shift
;;
--ep_size=*) # Option with argument
export EP_SIZE="${1#*=}"
shift
;;
--runtime=*) # Option with argument
export RUNTIME="${1#*=}"
shift
;;
--) # End of options, start of positional parameters
shift
break
;;
*) # Positional parameter
POSITIONAL+=("$1")
shift
;;
esac
done
export OUTPUT_DIR=$(pwd)
# output path
export OUTPUT_BASEPATH="${OUTPUT_DIR}/${NAME_ID}-output"
export NUM_GPUS=$((NNODES*GPUS_PER_NODE))
# Set the global batch size last, since it depends on the micro batch size
export GLOBAL_BATCH_SIZE=$((NNODES*GPUS_PER_NODE*MICRO_BATCH_SIZE))
export NAME_POSTFIX="${NAME_ID}_ms-${MODEL_SIZE}B_bs-${GLOBAL_BATCH_SIZE}_gpus-${NUM_GPUS}"
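# Worked example with the defaults above (a sketch, not enforced anywhere):
#   NUM_GPUS          = 24 nodes * 4 GPUs      = 96
#   GLOBAL_BATCH_SIZE = 96 GPUs * 8 micro-batch = 768
#   NAME_POSTFIX      = "gpt-normal-16moe_ms-1.3B_bs-768_gpus-96"
# Note this formula assumes one data-parallel replica per GPU with no gradient
# accumulation; the actual parallelism layout is configured in gpt3-moe-pretrain.sh.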
# Restore positional parameters (guarded: expanding an empty array errors under `set -u` on bash < 4.4)
if [[ ${#POSITIONAL[@]} -gt 0 ]]; then
    set -- "${POSITIONAL[@]}"
fi
mkdir -p "train_logs/"
# Submit the job via SLURM, or launch directly on the current node
if [[ "$SLURM" == "true" ]]; then
echo "Submitting job to SLURM."
# Set paths
export LIB_DIR=../../
export VOCAB_PATH=/scratch/hpc-prf-lola/data/misc/mgpt/mgpt_vocab.json
export MERGE_PATH=/scratch/hpc-prf-lola/data/misc/mgpt/mgpt_merges.txt
### data blend generation start ###
# # directory with all the data directories
# export DATA_DIR=/scratch/hpc-prf-lola/data/culturaX/mgpt-tokenized
# # call python script to fetch blend string
# export DATA_BLEND=$(python3 fetch_data_blend.py "$DATA_DIR")
# # echo "Data blend: ${DATA_BLEND}"
# if [[ -z $DATA_BLEND ]]; then
# echo "Error: Data blend path is empty" >&2
# exit 1
# fi
### data blend generation end ###
export DATA_BLEND=/scratch/hpc-prf-lola/data/culturaX/merged/mgpt-tokenized/meg-culturax
export VENV_PATH=~/virt-envs/venv-lola
export RUN_NAME="noctua2-${NAME_POSTFIX}"
echo "Run name: ${RUN_NAME}"
export WANDB_NAME=$RUN_NAME
EXTRA_PARAMS=""
if [[ "$DGX_NODE" == "true" ]]; then
EXTRA_PARAMS=" --partition=dgx --qos=devel "
fi
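# Note: USER_EMAIL is assumed to be exported by the calling environment;
# with `set -u` the sbatch command below aborts if it is unset.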
sbatch --reservation=lola_production_run --job-name=$RUN_NAME \
--nodes=$NNODES \
--ntasks-per-node=1 \
--cpus-per-task=128 \
--gres=gpu:a100:$GPUS_PER_NODE \
--time=$RUNTIME \
--output="train_logs/%x_slurm-%j.out" \
--mail-user="$USER_EMAIL" --mail-type=BEGIN,END,FAIL \
$EXTRA_PARAMS gpt3-moe-pretrain.sh
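# After submission, progress can be followed with standard SLURM tooling,
# e.g. `squeue -u $USER` for the queue state; stdout/stderr land in
# train_logs/<job-name>_slurm-<job-id>.out per the --output pattern above.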
else
echo -e "PyTorch rendezvous id: ${RDZV_ID}\nIf this is the master node, then provide the id above to the worker nodes: --rdzv_id=${RDZV_ID}"
echo "Starting process for node rank: ${NODE_RANK}"
# Set paths
export LIB_DIR=~/repos/LOLA-Megatron-DeepSpeed
export DATA_DIR=~/repos/LOLA-Megatron-DeepSpeed/lola_ws/gpt/mc4_4pt5m_data
export VENV_PATH=~/repos/LOLA-Megatron-DeepSpeed/venv-lola
export RUN_NAME="dice_exp-${NAME_POSTFIX}"
export WANDB_NAME=$RUN_NAME
bash gpt3-moe-pretrain.sh
fi