# 7b.yaml: LLaVA-OneVision 7B training config (from a fork of flagos-ai/FlagScale)

system:
  tensor_model_parallel_size: 2    # each model replica is split across 2 GPUs
  pipeline_model_parallel_size: 1  # no pipeline parallelism
  disable_bias_linear: True
  use_flash_attn: True
  use_distributed_optimizer: True  # shard optimizer state across data-parallel ranks
  use_mcore_models: True
  transformer_impl: transformer_engine
  recompute_method: "uniform"      # full activation recomputation, chunked per layer
  recompute_granularity: "full"
  recompute_num_layers: 1
  use_te: True
  precision:
    bf16: True
    attention_softmax_in_fp32: True
  logging:
    log_interval: 1
    tensorboard_log_interval: 1
    wandb_project: "train-llava-ov"
    wandb_exp_name: "train-llava-ov"
    log_params_norm: True
    log_num_zeros_in_grad: True
  checkpoint:
    save_interval: 3000
    pretrained_checkpoint: xxxx    # placeholder: path to the pretrained checkpoint
    dataloader_save: ${experiment.exp_dir}/checkpoints/dataloader
    use_dist_ckpt: False
    ckpt_format: torch
    async_save: False
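
# Batch/parallelism arithmetic (editorial note, not part of the original file):
# with tensor_model_parallel_size: 2 and pipeline_model_parallel_size: 1, each
# replica occupies 2 GPUs, so data_parallel_size = world_size / 2. Megatron then
# derives the number of gradient-accumulation microbatches as
#   global_batch_size / (micro_batch_size * data_parallel_size)
# e.g. on 16 GPUs: DP = 8, so 320 / (1 * 8) = 40 microbatches per iteration.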
model:
  num_layers: 28
  hidden_size: 3584
  ffn_hidden_size: 18944
  num_attention_heads: 28
  num_query_groups: 4              # grouped-query attention: 4 KV heads
  seq_length: 32768
  max_position_embeddings: 32768
  swiglu: True
  normalization: RMSNorm
  init_method_std: 0.014
  attention_dropout: 0.0
  hidden_dropout: 0.0
  clip_grad: 1.0
  train_iters: 625
  eval_iters: 0
  micro_batch_size: 1
  global_batch_size: 320
  allow_missing_vision_projection_checkpoint: True
  apply_layernorm_1p: True
  group_query_attention: True
  no_masked_softmax_fusion: True
  untie_embeddings_and_output_weights: True
  position_embedding_type: rope
  rotary_percent: 1.0
  rotary_base: 1000000
  eod_mask_loss: True
  freeze_LM: False                 # train the language model
  freeze_ViT: False                # train the vision encoder
  patch_dim: 14
  img_h: 384
  img_w: 384
  language_model_type: qwen2_7b
  vision_model_type: siglip
  disable_vision_class_token: True
  image_grid_pinpoints: '(1x1),...,(6x6)'  # anyres tile grids from 1x1 through 6x6
  image_aspect_ratio: anyres_max_9         # anyres tiling with at most 9 tiles
  mm_patch_merge_type: spatial_unpad
  seed: 42

  optimizer:
    weight_decay: 0.0
    adam_beta1: 0.9
    adam_beta2: 0.95
    lr_scheduler:
      lr: 1.0e-5
      lr_warmup_fraction: .03
      lr_decay_style: cosine
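
# Derived shape checks (editorial note, not part of the original file):
# head_dim = hidden_size / num_attention_heads = 3584 / 28 = 128; with
# group_query_attention and num_query_groups: 4, each of the 4 KV heads is
# shared by 28 / 4 = 7 query heads. The warmup spans roughly
# lr_warmup_fraction * train_iters = 0.03 * 625 ≈ 19 iterations before
# cosine decay begins.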
data:
  interleaved_dataset: True
  training_dataset_only: True
  data_path: xxxx                  # placeholder: path to the training data
  dataloader_type: external
  split: 100,0,0                   # train/valid/test split (%)
  tokenizer:
    tokenizer_type: Qwen2TokenizerFS
    tokenizer_path: xxxx           # placeholder: path to the tokenizer
    vocab_size: 152064 # 7b
    # vocab_size: 151936 # 1.5b
    make_vocab_size_divisible_by: 64
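
# Vocab padding check (editorial note, not part of the original file): Megatron
# pads the embedding table to a multiple of
# make_vocab_size_divisible_by * tensor_model_parallel_size = 64 * 2 = 128.
# Since 152064 = 128 * 1188, the 7b vocab needs no extra padding; the commented
# 1.5b value 151936 = 128 * 1187 is likewise already aligned.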