Skip to content

Commit b391461

Browse files
authored
Support RoboBrain-X0 Training on Huawei_Atlas800TA3 (flagos-ai#1000)
PR Category: Hardware | PR Types: Hardware | PR Description: Support RoboBrain-X0 Training on Huawei_Atlas800TA3
1 parent cf8f9e7 commit b391461

File tree

9 files changed

+2551
-0
lines changed

9 files changed

+2551
-0
lines changed

hardware/Huawei_Atlas800TA3/FlagScale/diff.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,5 +8,6 @@ contact: ''
88
device_type: Huawei_Atlas800TA3
99
models:
1010
- Qwen2.5-VL
11+
- X0
1112
task:
1213
- train
Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
diff --git a/examples/robobrain_x0/conf/train.yaml b/examples/robobrain_x0/conf/train.yaml
2+
new file mode 100644
3+
index 00000000..d507bdfa
4+
--- /dev/null
5+
+++ b/examples/robobrain_x0/conf/train.yaml
6+
@@ -0,0 +1,32 @@
7+
+defaults:
8+
+ - train: robobrain_x0
9+
+ - _self_
10+
+
11+
+experiment:
12+
+ exp_name: robobrain_x0
13+
+ exp_dir: ./${experiment.exp_name}
14+
+ task:
15+
+ type: train
16+
+ backend: megatron
17+
+ entrypoint: ./flagscale/train/train_robobrain_x0.py
18+
+
19+
+ runner:
20+
+ backend: torchrun
21+
+ nnodes: 1
22+
+ nproc_per_node: 4
23+
+ rdzv_backend: static
24+
+ cmds:
25+
+ before_start: ulimit -n 1048576 && source /root/miniconda3/bin/activate flagscale-train
26+
+ envs:
27+
+ ASCEND_RT_VISIBLE_DEVICES: 12,13,14,15
28+
+ CUDA_DEVICE_MAX_CONNECTIONS: 1
29+
+ NVTE_APPLY_QK_LAYER_SCALING: 0
30+
+ NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
31+
+ LD_PRELOAD: "/usr/local/python3.11.5/lib/python3.11/site-packages/scikit_learn.libs/libgomp-947d5fa1.so.1.0.0"
32+
+ HCCL_DETERMINISTIC: true
33+
+
34+
+action: run
35+
+
36+
+hydra:
37+
+ run:
38+
+ dir: ${experiment.exp_dir}/hydra
39+
Lines changed: 122 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,122 @@
1+
diff --git a/examples/robobrain_x0/conf/train/robobrain_x0.yaml b/examples/robobrain_x0/conf/train/robobrain_x0.yaml
2+
new file mode 100644
3+
index 00000000..6885e12f
4+
--- /dev/null
5+
+++ b/examples/robobrain_x0/conf/train/robobrain_x0.yaml
6+
@@ -0,0 +1,115 @@
7+
+system:
8+
+ vision_recompute_layer_steps: 16
9+
+ vision_ration: 0.1 # NOTE(review): likely a typo for "vision_ratio" — key name must match what the trainer reads; verify before renaming
10+
+ num_workers: 1
11+
+ calculate_per_token_loss: true
12+
+ tensor_model_parallel_size: 1
13+
+ pipeline_model_parallel_size: 4
14+
+ context_parallel_size: 1
15+
+ disable_bias_linear: True
16+
+ use_flash_attn: True
17+
+ use_distributed_optimizer: False
18+
+ sequence_parallel: False
19+
+ tp_comm_overlap: False
20+
+ overlap_grad_reduce: False # must be False when the batch contains text-only samples
21+
+ overlap_param_gather: False # must be False when the batch contains text-only samples
22+
+ use_mcore_models: True
23+
+ transformer_impl: local
24+
+ # recompute_method: "uniform"
25+
+ # recompute_granularity: "full"
26+
+ # recompute_num_layers: 18
27+
+ # use_te: True
28+
+ precision:
29+
+ bf16: True
30+
+ attention_softmax_in_fp32: True
31+
+ logging:
32+
+ timing_log_level: 1
33+
+ log_interval: 10
34+
+ tensorboard_log_interval: 10
35+
+ log_throughput: True
36+
+ wandb_project: ${experiment.exp_name}
37+
+ wandb_exp_name: ${experiment.exp_name}
38+
+ log_params_norm: True
39+
+ log_num_zeros_in_grad: True
40+
+ checkpoint:
41+
+ save_interval: 33
42+
+ # pretrained_checkpoint: path to the converted pretrained checkpoint
43+
+ dataloader_save: ${experiment.exp_dir}/checkpoints/dataloader
44+
+ use_dist_ckpt: False
45+
+ ckpt_format: torch
46+
+ async_save: False
47+
+
48+
+model:
49+
+ # attention_backend: flash # don't use "auto(nvte_flash_attn)"
50+
+ disable_bias_linear: True
51+
+ add_qkv_bias: True
52+
+ num_layers: 36
53+
+ hidden_size: 2048
54+
+ ffn_hidden_size: 11008
55+
+ num_attention_heads: 16
56+
+ num_query_groups: 2
57+
+ seq_length: 16384 # 16384 15360
58+
+ max_padding_length: 16384 # real seq_length
59+
+ # especial for qwen2.5-vl
60+
+ enable_variable_seq_lengths: True
61+
+ max_position_embeddings: 128000 # only useful for additional position embedding
62+
+ swiglu: True
63+
+ normalization: RMSNorm
64+
+ norm_epsilon: 1e-6
65+
+ init_method_std: 0.02
66+
+ attention_dropout: 0.0
67+
+ hidden_dropout: 0.0
68+
+ clip_grad: 1.0
69+
+ #######################
70+
+ train_iters: 2
71+
+ # production (full-run) train_iters:
72+
+ # train_iters: 262478 # 1 epoch
73+
+ eval_iters: 0 # no valid
74+
+ micro_batch_size: 1
75+
+ global_batch_size: 1
76+
+ allow_missing_vision_projection_checkpoint: False
77+
+ apply_layernorm_1p: False
78+
+ group_query_attention: True
79+
+ no_masked_softmax_fusion: True
80+
+ # untie_embeddings_and_output_weights: False
81+
+ untie_embeddings_and_output_weights: True
82+
+
83+
+ # position embedding
84+
+ position_embedding_type: mrope
85+
+ rotary_percent: 1.0
86+
+ rotary_base: 1000000
87+
+ rotary_seq_len_interpolation_factor: 1
88+
+ no_rope_fusion: False
89+
+ mrope_section: [16, 24, 24]
90+
+ eod_mask_loss: False
91+
+
92+
+ # vision model
93+
+ freeze_LM: False
94+
+ freeze_ViT: False
95+
+ disable_vision_class_token: True
96+
+ seed: 42
97+
+
98+
+ optimizer:
99+
+ weight_decay: 0.1
100+
+ adam_beta1: 0.9
101+
+ adam_beta2: 0.999
102+
+ lr_scheduler:
103+
+ lr: 5.0e-5
104+
+ min_lr: 0
105+
+ lr_warmup_iters: 0
106+
+ lr_decay_style: cosine
107+
+
108+
+data:
109+
+ data_path: /root/FlagScale/demo_0913_n2/wds-1
110+
+ # production (full-run) dataset:
111+
+ # data_path: /share/project/dumengfei/data/libero_data_0910_F1A1C10/wds-1
112+
+ vision_root: /
113+
+ dataloader_type: external
114+
+ split: 100,0,0
115+
+ shuffle-buffer-size: 1000
116+
+ tokenizer:
117+
+ tokenizer_type: Qwen2VLTokenizer
118+
+ tokenizer_path: /data/hcr/models/BAAI/RoboBrain-X0-Preview
119+
+ vocab_size: 151643 # action
120+
+ make_vocab_size_divisible_by: 64
121+
+ extra_vocab_size: 293
122+

0 commit comments

Comments (0)