#!/bin/bash
# run_train_e2d2_cnn.sh: train an E2D2 encoder-decoder discrete denoiser on the
# cnn_dailymail summarization dataset with Composer.

# Set up the environment
cd ../ || exit # Go to the root directory of the repo
source setup_env.sh
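
# NUM_VISIBLE_DEVICES is assumed to be exported by setup_env.sh. As a hedge
# (not part of the original script), fall back to the number of visible GPUs
# if it is unset.
NUM_VISIBLE_DEVICES=${NUM_VISIBLE_DEVICES:-$(nvidia-smi --list-gpus | wc -l)}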
# Model architecture
BLOCK_SIZE=8
EVAL_BLOCK_SIZE=8
HIDDEN_SIZE=256
INTERMEDIATE_SIZE=768
N_ENCODER_LAYERS=20
N_DECODER_LAYERS=8
# Optimization hyperparameters
LR=3e-4
WARMUP_DURATION="1000ba"
BATCH_SIZE=128
MAX_DURATION="500000ba"
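
# Pretrained base model and run naming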
PRETRAINED_MODEL_NAME_OR_PATH=Qwen/Qwen3-0.6B-Base
TAG="e2d2"
ENC_LAYERS="enc${N_ENCODER_LAYERS}"
DEC_LAYERS="dec${N_DECODER_LAYERS}"
RUN_NAME=cnn_block${BLOCK_SIZE}_lr${LR}_bsz${BATCH_SIZE}_warm${WARMUP_DURATION}_${ENC_LAYERS}_${DEC_LAYERS}_hidden${HIDDEN_SIZE}_inter${INTERMEDIATE_SIZE}_${TAG}
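
# Choose a per-device micro-batch size based on the detected GPU model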
GPU_TYPE=$(nvidia-smi --query-gpu=name --format=csv,noheader | sed -E 's/.*(A[0-9]+|H100|A6000).*/\1/' | head -n 1)
if [[ "$GPU_TYPE" == "A100" || "$GPU_TYPE" == "H100" ]]; then
MICRO_BATCH_SIZE=8
elif [[ "$GPU_TYPE" == "A6000" ]]; then
MICRO_BATCH_SIZE=4
else
MICRO_BATCH_SIZE=2
fi
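
# Sanity check (added; an assumption about intended usage): grad_accum below
# must be a whole number, so the global batch size has to be divisible by
# (number of devices * micro-batch size).
if (( BATCH_SIZE % (NUM_VISIBLE_DEVICES * MICRO_BATCH_SIZE) != 0 )); then
    echo "BATCH_SIZE=${BATCH_SIZE} is not divisible by NUM_VISIBLE_DEVICES*MICRO_BATCH_SIZE" >&2
    exit 1
fi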
NUM_WORKERS=0 # Dataloader worker processes (0 = load data in the main process)
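
# Launch distributed training with Composer across NUM_VISIBLE_DEVICES GPUs.
# grad_accum is derived so that global batch = devices * micro-batch * grad_accum.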
composer -n ${NUM_VISIBLE_DEVICES} scripts/composer_scripts/train_discrete_denoiser.py \
run_name=${RUN_NAME} \
pretrained_model_name_or_path=${PRETRAINED_MODEL_NAME_OR_PATH} \
dataset@train_dataset=cnn_dailymail_train \
dataset@eval_dataset=cnn_dailymail_eval \
composer.optimizer.lr=${LR} \
composer.trainer.eval_interval="5000ba" \
composer.trainer.max_duration=${MAX_DURATION} \
composer.trainer.save_num_checkpoints_to_keep=1 \
composer/lr_scheduler=constant_with_warmup \
composer.lr_scheduler.t_warmup=${WARMUP_DURATION} \
model=e2d2 \
model.config.attn_backend="sdpa" \
training.compile_backbone=true \
model.config.length=1024 \
model/backbone_config@model.config.backbone_config=llm_as_encoder_decoder \
model.config.backbone_config.use_encoder_causal_mask=false \
model.config.backbone_config.num_encoder_layers=${N_ENCODER_LAYERS} \
model.config.backbone_config.num_decoder_layers=${N_DECODER_LAYERS} \
model.config.backbone_config.tie_encoder_decoder_weights=false \
model.config.backbone_config.reinit_decoder=true \
model.config.backbone_config.reinit_encoder=true \
model.config.backbone_config.keep_top_decoder_layers=false \
model.config.backbone_config.keep_top_encoder_layers=false \
+model.config.backbone_config.hidden_size=${HIDDEN_SIZE} \
+model.config.backbone_config.intermediate_size=${INTERMEDIATE_SIZE} \
training.global_batch_size=${BATCH_SIZE} \
training.grad_accum=$(( BATCH_SIZE / NUM_VISIBLE_DEVICES / MICRO_BATCH_SIZE )) \
block_size=${BLOCK_SIZE} \
eval_block_size=${EVAL_BLOCK_SIZE} \
training.antithetic_sampling=false \
hydra.run.dir=outputs/${RUN_NAME} \
composer.trainer.save_interval="1000ba" \
composer.loggers.name=${RUN_NAME} \
train_dataloader.num_workers=${NUM_WORKERS} \
composer.callbacks.hf_compatible_checkpointing.disable_hf=true