Skip to content

Commit b391461

Browse files
authored
Support RoboBrain-X0 Training on Huawei_Atlas800TA3 (flagos-ai#1000)
PR Category: Hardware | PR Types: Hardware | PR Description: Support RoboBrain-X0 Training on Huawei_Atlas800TA3
1 parent cf8f9e7 commit b391461

File tree

9 files changed

+2551
-0
lines changed

9 files changed

+2551
-0
lines changed

hardware/Huawei_Atlas800TA3/FlagScale/diff.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,5 +8,6 @@ contact: ''
88
device_type: Huawei_Atlas800TA3
99
models:
1010
- Qwen2.5-VL
11+
- X0
1112
task:
1213
- train
Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
diff --git a/examples/robobrain_x0/conf/train.yaml b/examples/robobrain_x0/conf/train.yaml
2+
new file mode 100644
3+
index 00000000..d507bdfa
4+
--- /dev/null
5+
+++ b/examples/robobrain_x0/conf/train.yaml
6+
@@ -0,0 +1,32 @@
7+
+defaults:
8+
+ - train: robobrain_x0
9+
+ - _self_
10+
+
11+
+experiment:
12+
+ exp_name: robobrain_x0
13+
+ exp_dir: ./${experiment.exp_name}
14+
+ task:
15+
+ type: train
16+
+ backend: megatron
17+
+ entrypoint: ./flagscale/train/train_robobrain_x0.py
18+
+
19+
+ runner:
20+
+ backend: torchrun
21+
+ nnodes: 1
22+
+ nproc_per_node: 4
23+
+ rdzv_backend: static
24+
+ cmds:
25+
+ before_start: ulimit -n 1048576 && source /root/miniconda3/bin/activate flagscale-train
26+
+ envs:
27+
+ ASCEND_RT_VISIBLE_DEVICES: 12,13,14,15
28+
+ CUDA_DEVICE_MAX_CONNECTIONS: 1
29+
+ NVTE_APPLY_QK_LAYER_SCALING: 0
30+
+ NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
31+
+ LD_PRELOAD: "/usr/local/python3.11.5/lib/python3.11/site-packages/scikit_learn.libs/libgomp-947d5fa1.so.1.0.0"
32+
+ HCCL_DETERMINISTIC: true
33+
+
34+
+action: run
35+
+
36+
+hydra:
37+
+ run:
38+
+ dir: ${experiment.exp_dir}/hydra
39+
Lines changed: 122 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,122 @@
1+
diff --git a/examples/robobrain_x0/conf/train/robobrain_x0.yaml b/examples/robobrain_x0/conf/train/robobrain_x0.yaml
2+
new file mode 100644
3+
index 00000000..6885e12f
4+
--- /dev/null
5+
+++ b/examples/robobrain_x0/conf/train/robobrain_x0.yaml
6+
@@ -0,0 +1,115 @@
7+
+system:
8+
+ vision_recompute_layer_steps: 16
9+
+ vision_ration: 0.1 # NOTE(review): likely a typo for "vision_ratio" — key name must match what the trainer reads; verify before renaming
10+
+ num_workers: 1
11+
+ calculate_per_token_loss: true
12+
+ tensor_model_parallel_size: 1
13+
+ pipeline_model_parallel_size: 4
14+
+ context_parallel_size: 1
15+
+ disable_bias_linear: True
16+
+ use_flash_attn: True
17+
+ use_distributed_optimizer: False
18+
+ sequence_parallel: False
19+
+ tp_comm_overlap: False
20+
+ overlap_grad_reduce: False # must be False when the batch contains text-only samples
21+
+ overlap_param_gather: False # must be False when the batch contains text-only samples
22+
+ use_mcore_models: True
23+
+ transformer_impl: local
24+
+ # recompute_method: "uniform"
25+
+ # recompute_granularity: "full"
26+
+ # recompute_num_layers: 18
27+
+ # use_te: True
28+
+ precision:
29+
+ bf16: True
30+
+ attention_softmax_in_fp32: True
31+
+ logging:
32+
+ timing_log_level: 1
33+
+ log_interval: 10
34+
+ tensorboard_log_interval: 10
35+
+ log_throughput: True
36+
+ wandb_project: ${experiment.exp_name}
37+
+ wandb_exp_name: ${experiment.exp_name}
38+
+ log_params_norm: True
39+
+ log_num_zeros_in_grad: True
40+
+ checkpoint:
41+
+ save_interval: 33
42+
+ # pretrained_checkpoint: path to the converted pretrained checkpoint
43+
+ dataloader_save: ${experiment.exp_dir}/checkpoints/dataloader
44+
+ use_dist_ckpt: False
45+
+ ckpt_format: torch
46+
+ async_save: False
47+
+
48+
+model:
49+
+ # attention_backend: flash # don't use "auto(nvte_flash_attn)"
50+
+ disable_bias_linear: True
51+
+ add_qkv_bias: True
52+
+ num_layers: 36
53+
+ hidden_size: 2048
54+
+ ffn_hidden_size: 11008
55+
+ num_attention_heads: 16
56+
+ num_query_groups: 2
57+
+ seq_length: 16384 # 16384 15360
58+
+ max_padding_length: 16384 # real seq_length
59+
+ # especial for qwen2.5-vl
60+
+ enable_variable_seq_lengths: True
61+
+ max_position_embeddings: 128000 # only useful for additional position embedding
62+
+ swiglu: True
63+
+ normalization: RMSNorm
64+
+ norm_epsilon: 1e-6
65+
+ init_method_std: 0.02
66+
+ attention_dropout: 0.0
67+
+ hidden_dropout: 0.0
68+
+ clip_grad: 1.0
69+
+ #######################
70+
+ train_iters: 2
71+
+ # production (full-run) train_iters:
72+
+ # train_iters: 262478 # 1 epoch
73+
+ eval_iters: 0 # no valid
74+
+ micro_batch_size: 1
75+
+ global_batch_size: 1
76+
+ allow_missing_vision_projection_checkpoint: False
77+
+ apply_layernorm_1p: False
78+
+ group_query_attention: True
79+
+ no_masked_softmax_fusion: True
80+
+ # untie_embeddings_and_output_weights: False
81+
+ untie_embeddings_and_output_weights: True
82+
+
83+
+ # position embedding
84+
+ position_embedding_type: mrope
85+
+ rotary_percent: 1.0
86+
+ rotary_base: 1000000
87+
+ rotary_seq_len_interpolation_factor: 1
88+
+ no_rope_fusion: False
89+
+ mrope_section: [16, 24, 24]
90+
+ eod_mask_loss: False
91+
+
92+
+ # vision model
93+
+ freeze_LM: False
94+
+ freeze_ViT: False
95+
+ disable_vision_class_token: True
96+
+ seed: 42
97+
+
98+
+ optimizer:
99+
+ weight_decay: 0.1
100+
+ adam_beta1: 0.9
101+
+ adam_beta2: 0.999
102+
+ lr_scheduler:
103+
+ lr: 5.0e-5
104+
+ min_lr: 0
105+
+ lr_warmup_iters: 0
106+
+ lr_decay_style: cosine
107+
+
108+
+data:
109+
+ data_path: /root/FlagScale/demo_0913_n2/wds-1
110+
+ # production (full-run) dataset:
111+
+ # data_path: /share/project/dumengfei/data/libero_data_0910_F1A1C10/wds-1
112+
+ vision_root: /
113+
+ dataloader_type: external
114+
+ split: 100,0,0
115+
+ shuffle-buffer-size: 1000
116+
+ tokenizer:
117+
+ tokenizer_type: Qwen2VLTokenizer
118+
+ tokenizer_path: /data/hcr/models/BAAI/RoboBrain-X0-Preview
119+
+ vocab_size: 151643 # action
120+
+ make_vocab_size_divisible_by: 64
121+
+ extra_vocab_size: 293
122+

0 commit comments

Comments (0)