dist-sample.sh
#!/bin/bash
#SBATCH -J "Fast Setup - mini GPT2 train"
#SBATCH -N 1
#SBATCH --ntasks-per-node=1
#SBATCH --gres=gpu:a100:4
#SBATCH --partition=dgx
#SBATCH --qos=devel
#SBATCH -t 00:30:00
# Load modules
#module load mpi/OpenMPI/4.1.4-GCC-11.3.0
# CUDA already comes bundled with the PyTorch module
#module load system/CUDA/11.7.0
module load lib/NCCL/2.12.12-GCCcore-11.3.0-CUDA-11.7.0
module load ai/PyTorch/1.12.0-foss-2022a-CUDA-11.7.0
module load vis/torchvision/0.13.1-foss-2022a-CUDA-11.7.0
# Activate the Python virtual environment
source /scratch/hpc-prf-lola/lib_repo/custom-venvs/lola1/bin/activate
# Create a time-specific postfix for directory and file names
DATE_POSTFIX=$(date '+%Y-%m-%d_%H%M%S')
# MODEL_SIZE is only used to label the checkpoint directory; the actual architecture is set via NLAYERS/NHIDDEN/NHEADS below
MODEL_SIZE=0.35
CHECKPOINT_PATH=checkpoints/gpt2-${MODEL_SIZE}b-dist-${DATE_POSTFIX}
VOCAB_FILE=data/gpt2-vocab.json
MERGE_FILE=data/gpt2-merges.txt
DATA_PATH=data/meg-gpt2-oscar-en-10k_text_document
TENSORBOARD_PATH=output_dir/tensorboard-dist-$DATE_POSTFIX
# Rendezvous endpoint so the distributed processes know whom to talk to
MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)
MASTER_PORT=6005
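# Note: a fixed port can collide if another job on the same node also picked 6005.
# A purely optional alternative (assuming SLURM_JOB_ID is set, as it is inside
# sbatch jobs) is to derive the port from the job id instead:
# MASTER_PORT=$((10000 + SLURM_JOB_ID % 20000))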
GPUS_PER_NODE=$SLURM_GPUS_ON_NODE
NNODES=$SLURM_NNODES
MICRO_BATCH_SIZE=1
GLOBAL_BATCH_SIZE=$((SLURM_NNODES*SLURM_GPUS_ON_NODE*MICRO_BATCH_SIZE))
TP_SIZE=1
PP_SIZE=1
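# For reference, with the #SBATCH settings above these resolve to:
#   NNODES=1, GPUS_PER_NODE=4           -> 4 ranks in total
#   GLOBAL_BATCH_SIZE = 1 * 4 * 1 = 4   -> one micro-batch per rank and step
# Since TP_SIZE=1 and PP_SIZE=1, every rank is a plain data-parallel replica,
# so the data-parallel degree is also 4.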
# Parameter calculation formula for GPT2: https://github.com/bigscience-workshop/bigscience/blob/58d99c67f643d27b5765a73a2ee2d1ce0a4b2c6b/experiments/gpt2-utils.md
# MODEL_SIZE=0.35
NLAYERS=24
NHIDDEN=1024
NHEADS=16
#NLAYERS=50
#NHIDDEN=4096
#NHEADS=32
SEQ_LEN=1024
VOCAB_SIZE=50257
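# Back-of-the-envelope parameter count for the settings above (an approximation
# in the spirit of the linked gpt2-utils formula, not an exact quote of it):
#   embeddings: (VOCAB_SIZE + SEQ_LEN) * NHIDDEN = (50257 + 1024) * 1024 ~= 52.5M
#   per layer:  12 * NHIDDEN^2 + 13 * NHIDDEN ~= 12.6M, times NLAYERS=24 ~= 302.3M
#   total:      ~= 355M parameters, i.e. the 0.35b used in MODEL_SIZE above.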
SAVE_INTERVAL=500
EPOCH=10
TRAIN_SAMPLES=10000
# --rampup-batch-size 2 2 1_000 \
# --lr-decay-samples 12 \
# --lr-warmup-samples 5 \
GPT_ARGS=" \
--num-layers $NLAYERS \
--hidden-size $NHIDDEN \
--num-attention-heads $NHEADS \
--seq-length $SEQ_LEN \
--max-position-embeddings $SEQ_LEN \
--micro-batch-size $MICRO_BATCH_SIZE \
--global-batch-size $GLOBAL_BATCH_SIZE \
--train-samples $((TRAIN_SAMPLES*EPOCH)) \
--optimizer adam \
--adam-beta1 0.9 \
--adam-beta2 0.95 \
--adam-eps 1e-8 \
--lr 1e-4 \
--min-lr 1e-6 \
--lr-decay-style cosine \
--clip-grad 1.0 \
--weight-decay 1e-1 \
--fp16 \
--partition-activations \
--seed 42 \
--vocab-file $VOCAB_FILE \
--merge-file $MERGE_FILE \
"
OUTPUT_ARGS=" \
--exit-interval 1000 \
--log-interval 1 \
--save-interval $SAVE_INTERVAL \
--eval-interval 50 \
--eval-iters 10 \
--checkpoint-activations \
"
DATA_ARGS=" \
--save $CHECKPOINT_PATH \
--load $CHECKPOINT_PATH \
--data-path $DATA_PATH \
--tensorboard-dir $TENSORBOARD_PATH \
--tensorboard-queue-size 5 \
--log-timers-to-tensorboard \
--log-batch-size-to-tensorboard \
--log-validation-ppl-to-tensorboard \
"
ZERO_STAGE=2
config_json="./ds_config.json"
# gradient_accumulation_steps is intentionally omitted below: DeepSpeed derives it (GAS) dynamically from the dynamic global batch size (GBS) via set_train_batch_size()
cat <<EOT > $config_json
{
"train_micro_batch_size_per_gpu": $MICRO_BATCH_SIZE,
"train_batch_size": $GLOBAL_BATCH_SIZE,
"gradient_clipping": 1.0,
"zero_optimization": {
"stage": $ZERO_STAGE,
"contiguous_gradients": true,
"overlap_comm": true,
"reduce_scatter": true,
"reduce_bucket_size": 5e8,
"allgather_bucket_size": 5e8
},
"fp16": {
"enabled": true,
"loss_scale": 0,
"loss_scale_window": 500,
"hysteresis": 2,
"min_loss_scale": 1,
"initial_scale_power": 12
},
"steps_per_print": 2000,
"wall_clock_breakdown": false
}
EOT
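# Optional sanity check of the generated DeepSpeed config (assumes a python
# interpreter is available from the loaded modules); left commented out because
# it is purely illustrative:
# python -m json.tool "$config_json" > /dev/null && echo "ds_config.json parses as valid JSON"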
# --deepspeed_config ${config_json} \
DEEPSPEED_ARGS=" \
--deepspeed \
--no-pipeline-parallel \
--zero-stage ${ZERO_STAGE} \
--deepspeed_config ${config_json} \
--deepspeed-activation-checkpointing \
"
ALL_ARGS="$GPT_ARGS $OUTPUT_ARGS $DATA_ARGS $DEEPSPEED_ARGS"
export LAUNCHER="python -u -m torch.distributed.run \
--nproc_per_node $GPUS_PER_NODE \
--nnodes $NNODES \
--rdzv_id=$RANDOM \
--rdzv_endpoint $MASTER_ADDR:$MASTER_PORT \
--rdzv_backend c10d \
--max_restarts 0 \
--tee 3 \
"
export CMD=" \
pretrain_gpt.py \
--tensor-model-parallel-size $TP_SIZE \
--pipeline-model-parallel-size $PP_SIZE \
--distributed-backend nccl \
$ALL_ARGS \
"
# Do not remove: without this workaround the training will hang and nodes will be lost
export CUDA_LAUNCH_BLOCKING=1
# Hide duplicated errors using this hack (to be properly fixed in pt-1.12)
export TORCHELASTIC_ERROR_FILE='torch-elastic-error.json'
export NCCL_ASYNC_ERROR_HANDLING=1
# export NCCL_DEBUG=INFO  # valid levels are VERSION, WARN, INFO, TRACE
#echo $CMD
#$CMD
# --jobid $SLURM_JOBID
echo LAUNCHER: $LAUNCHER
echo CMD: $CMD
srun --wait=60 --kill-on-bad-exit=1 bash -c "NCCL_DEBUG=INFO $LAUNCHER --node_rank \$SLURM_PROCID $CMD" 2>&1 | tee -a gpt2-dist.log
# bash -c "NCCL_DEBUG=INFO $LAUNCHER --node_rank $SLURM_PROCID $CMD" 2>&1 | tee -a gpt2-dist.log
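# After a successful run, checkpoints land under $CHECKPOINT_PATH and the
# TensorBoard event files under $TENSORBOARD_PATH. An optional way to inspect
# them afterwards (assumes the tensorboard CLI is installed in the activated venv):
# tensorboard --logdir "$TENSORBOARD_PATH" --port 6006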