7b.yaml
system:
  tensor_model_parallel_size: 1
  pipeline_model_parallel_size: 1
  disable_bias_linear: True
  use_flash_attn: True
  use_distributed_optimizer: True
  precision:
    fp16: True
    initial_loss_scale: 522893
    min_loss_scale: 1.0
    attention_softmax_in_fp32: True
    accumulate_allreduce_grads_in_fp32: True
  logging:
    log_interval: 1
    tensorboard_log_interval: 1
    wandb_project: "train-aquila-7B"
    wandb_exp_name: "train-test-7B"
  checkpoint:
    save_interval: 2000
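
The precision block configures Megatron-style dynamic fp16 loss scaling: the scale starts at initial_loss_scale, is backed off when gradients overflow, and is never reduced below min_loss_scale. As a rough analogue only (this is not FlagScale's training loop; model, optimizer, loss_fn, and batch are hypothetical placeholders), PyTorch's GradScaler exposes the same mechanism:

import torch

# Stand-in for the fp16 settings above: the scaler multiplies the loss by a
# running scale factor, skips the optimizer step when gradients overflow, and
# backs the scale off -- the behaviour initial_loss_scale configures here.
# GradScaler has no minimum-scale knob, so min_loss_scale has no direct analogue.
scaler = torch.cuda.amp.GradScaler(init_scale=522893)

def train_step(model, optimizer, loss_fn, batch):  # hypothetical placeholders
    optimizer.zero_grad()
    with torch.autocast(device_type="cuda", dtype=torch.float16):
        loss = loss_fn(model(batch))
    scaler.scale(loss).backward()  # backward pass on the scaled loss
    scaler.step(optimizer)         # unscales grads; skips step on inf/nan
    scaler.update()                # grows or backs off the scale factor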

model:
  num_layers: 32
  hidden_size: 4096
  num_attention_heads: 32
  seq_length: 2048
  max_position_embeddings: 2048
  norm_epsilon: 1e-5
  use_rotary_position_embeddings: true
  no_position_embedding: true
  swiglu: true
  multiple_of: 256
  normalization: RMSNorm
  # rotary_interleaved_patch: true
  untie_embeddings_and_output_weights: true
  init_method_std: 0.02
  attention_dropout: 0.0
  hidden_dropout: 0.0
  weight_decay: 0.1
  clip_grad: 1.0
  train_samples: 1002539063
  eval_iters: 0
  micro_batch_size: 2
  global_batch_size: 1728
  seed: 1234

  optimizer:
    weight_decay: 0.1
    adam_beta1: 0.9
    adam_beta2: 0.95
    lr_scheduler:
      lr: 2.0e-5
      min_lr: 2.0e-6
      lr_warmup_samples: 3076172
      lr_decay_style: cosine
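
As a sanity check on the "7B" label, the model block above pins down an approximate parameter count. The sketch below assumes the LLaMA-style SwiGLU feed-forward sizing (two thirds of the classic 4x hidden width, rounded up to multiple_of); that formula is inferred from swiglu: true and multiple_of: 256 rather than taken from FlagScale source:

# Rough parameter count from the model block above (LLaMA-style assumptions;
# norms are omitted as negligible, and disable_bias_linear removes biases).
hidden, layers, vocab, multiple_of = 4096, 32, 100008, 256

# Assumed SwiGLU FFN width: 2/3 of 4*hidden, rounded up to multiple_of.
ffn = multiple_of * ((int(8 * hidden / 3) + multiple_of - 1) // multiple_of)
assert ffn == 11008

per_layer = (
    4 * hidden * hidden  # Q, K, V, and output projections
    + 3 * hidden * ffn   # SwiGLU: gate, up, and down projections
)
embeddings = 2 * vocab * hidden  # untie_embeddings_and_output_weights: true

total = layers * per_layer + embeddings
print(f"~{total / 1e9:.2f}B parameters")  # ~7.30B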

data:
  data_path: ${data_path:??}
  split: 1
  tokenizer:
    legacy_tokenizer: true
    tokenizer_type: AquilaTokenizerFS
    vocab_file: ./examples/aquila/tokenizer/vocab.json
    merge_file: ./examples/aquila/tokenizer/merges.txt
    special_tokens_file: ./examples/aquila/tokenizer/special_tokens.txt
    vocab_size: 100008
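
data_path is an interpolation placeholder rather than a literal path, which suggests the file is consumed by an OmegaConf/Hydra-style loader with the path supplied at launch time. A minimal sketch of reading the config with OmegaConf (the dataset path shown is hypothetical; overriding it sidesteps the unresolved placeholder):

from omegaconf import OmegaConf

cfg = OmegaConf.load("7b.yaml")

# Override the placeholder before anything tries to resolve it.
cfg.data.data_path = "/path/to/tokenized/dataset"  # hypothetical path

print(cfg.model.hidden_size)                # 4096
print(cfg.model.optimizer.lr_scheduler.lr)  # 2e-05

# Dotlist overrides merge over the file, much as launch-time overrides would.
cfg = OmegaConf.merge(cfg, OmegaConf.from_dotlist(["model.micro_batch_size=4"]))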