# 7b.yaml: LLaVA-OneVision 7B training config (from a fork of flagos-ai/FlagScale)

system:
  tensor_model_parallel_size: 2    # each model replica is split across 2 GPUs
  pipeline_model_parallel_size: 1  # no pipeline parallelism
  disable_bias_linear: True
  use_flash_attn: True
  use_distributed_optimizer: True  # shard optimizer state across data-parallel ranks
  use_mcore_models: True
  transformer_impl: transformer_engine
  recompute_method: "uniform"      # full activation recomputation, chunked per layer
  recompute_granularity: "full"
  recompute_num_layers: 1
  use_te: True
  precision:
    bf16: True
    attention_softmax_in_fp32: True
  logging:
    log_interval: 1
    tensorboard_log_interval: 1
    wandb_project: "train-llava-ov"
    wandb_exp_name: "train-llava-ov"
    log_params_norm: True
    log_num_zeros_in_grad: True
  checkpoint:
    save_interval: 3000
    pretrained_checkpoint: xxxx    # placeholder: path to the pretrained checkpoint
    dataloader_save: ${experiment.exp_dir}/checkpoints/dataloader
    use_dist_ckpt: False
    ckpt_format: torch
    async_save: False
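
# Batch/parallelism arithmetic (editorial note, not part of the original file):
# with tensor_model_parallel_size: 2 and pipeline_model_parallel_size: 1, each
# replica occupies 2 GPUs, so data_parallel_size = world_size / 2. Megatron then
# derives the number of gradient-accumulation microbatches as
#   global_batch_size / (micro_batch_size * data_parallel_size)
# e.g. on 16 GPUs: DP = 8, so 320 / (1 * 8) = 40 microbatches per iteration.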
model:
  num_layers: 28
  hidden_size: 3584
  ffn_hidden_size: 18944
  num_attention_heads: 28
  num_query_groups: 4              # grouped-query attention: 4 KV heads
  seq_length: 32768
  max_position_embeddings: 32768
  swiglu: True
  normalization: RMSNorm
  init_method_std: 0.014
  attention_dropout: 0.0
  hidden_dropout: 0.0
  clip_grad: 1.0
  train_iters: 625
  eval_iters: 0
  micro_batch_size: 1
  global_batch_size: 320
  allow_missing_vision_projection_checkpoint: True
  apply_layernorm_1p: True
  group_query_attention: True
  no_masked_softmax_fusion: True
  untie_embeddings_and_output_weights: True
  position_embedding_type: rope
  rotary_percent: 1.0
  rotary_base: 1000000
  eod_mask_loss: True
  freeze_LM: False                 # train the language model
  freeze_ViT: False                # train the vision encoder
  patch_dim: 14
  img_h: 384
  img_w: 384
  language_model_type: qwen2_7b
  vision_model_type: siglip
  disable_vision_class_token: True
  image_grid_pinpoints: '(1x1),...,(6x6)'  # anyres tile grids from 1x1 through 6x6
  image_aspect_ratio: anyres_max_9         # anyres tiling with at most 9 tiles
  mm_patch_merge_type: spatial_unpad
  seed: 42

  optimizer:
    weight_decay: 0.0
    adam_beta1: 0.9
    adam_beta2: 0.95
    lr_scheduler:
      lr: 1.0e-5
      lr_warmup_fraction: .03
      lr_decay_style: cosine
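
# Derived shape checks (editorial note, not part of the original file):
# head_dim = hidden_size / num_attention_heads = 3584 / 28 = 128; with
# group_query_attention and num_query_groups: 4, each of the 4 KV heads is
# shared by 28 / 4 = 7 query heads. The warmup spans roughly
# lr_warmup_fraction * train_iters = 0.03 * 625 ≈ 19 iterations before
# cosine decay begins.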
data:
  interleaved_dataset: True
  training_dataset_only: True
  data_path: xxxx                  # placeholder: path to the training data
  dataloader_type: external
  split: 100,0,0                   # train/valid/test split (%)
  tokenizer:
    tokenizer_type: Qwen2TokenizerFS
    tokenizer_path: xxxx           # placeholder: path to the tokenizer
    vocab_size: 152064 # 7b
    # vocab_size: 151936 # 1.5b
    make_vocab_size_divisible_by: 64
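
# Vocab padding check (editorial note, not part of the original file): Megatron
# pads the embedding table to a multiple of
# make_vocab_size_divisible_by * tensor_model_parallel_size = 64 * 2 = 128.
# Since 152064 = 128 * 1188, the 7b vocab needs no extra padding; the commented
# 1.5b value 151936 = 128 * 1187 is likewise already aligned.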