From b1ce09dc847472c9bab2068dbf5f3ea9f12e463c Mon Sep 17 00:00:00 2001 From: shuyq10 <13569752081@163.com> Date: Thu, 26 Feb 2026 15:37:28 +0800 Subject: [PATCH 1/3] Heterogeneous Training Quick Start Guide --- examples/qwen3/conf/QuickStart.md | 256 ++++++++++++++++++++++++++++++ 1 file changed, 256 insertions(+) create mode 100644 examples/qwen3/conf/QuickStart.md diff --git a/examples/qwen3/conf/QuickStart.md b/examples/qwen3/conf/QuickStart.md new file mode 100644 index 0000000000..f939892934 --- /dev/null +++ b/examples/qwen3/conf/QuickStart.md @@ -0,0 +1,256 @@ +## 1. Heterogeneous Training Environment and Code + +### 1.1 Docker Image Paths + +- NVIDIA A800: https://baai-flagscale.ks3-cn-beijing.ksyuncs.com/hetero_train/metax/nvidia_metax.tar + +- METAX C550: https://baai-flagscale.ks3-cn-beijing.ksyuncs.com/hetero_train/metax/metax_nvidia.tar + +You can directly download the images using the `wget` command on Linux. + +#### Configure SSH Port for Password-Free Multi-Machine Access + +```Plain +#Replace 22 with your custom password-free port +#Execute the following commands inside the Docker container + +sed -i 's/^Port .*/Port 22/' /etc/ssh/sshd_config + +service ssh restart +``` + +### 1.2 Install FlagScale + +#### 1.2.1 Download the Source Code + +```bash +git clone -b main-legacy https://github.com/flagos-ai/FlagScale.git +cd FlagScale/ +``` + +#### 1.2.2 Apply Submodule Patch Code + +```bash +# C550 +python3 tools/patch/unpatch.py --backend FlagScale Megatron-LM --device-type Metax_C550 --task train --commit 4e1b978fd626e8c23e3f894cc32ae09fe641401e + +# A800 +git reset --hard 05267318f750f694f61e547fa7a7b95876c72b5e +python3 tools/patch/unpatch.py --backend Megatron-LM +``` + +## 2. 
Start Heterogeneous Training (hetero_train) + +### 2.1 Prepare Dataset Demo + +We provide a small processed dataset ([bin](https://model.ks3-cn-beijing.ksyuncs.com/nlpdata/pile_wikipedia_demo.bin) and [idx](https://model.ks3-cn-beijing.ksyuncs.com/nlpdata/pile_wikipedia_demo.idx)) derived from the [Pile](https://pile.eleuther.ai/) dataset. + +```bash +mkdir -p /path/to/data && cd /path/to/data +wget https://model.ks3-cn-beijing.ksyuncs.com/nlpdata/pile_wikipedia_demo.idx +wget https://model.ks3-cn-beijing.ksyuncs.com/nlpdata/pile_wikipedia_demo.bin +``` + +### 2.2 Edit Configuration Files + +We use the qwen3-10b model as an example: + +#### File Path: examples/qwen3/conf/train_hetero_10b.yaml + +```yaml +defaults: + - _self_ + - train: 10b_hetero + +experiment: + exp_name: Qwen3-10b_muxi + seed: 42 + save_steps: 1000 + load: null + exp_dir: ./${experiment.exp_name} + ckpt_format: torch + task: + type: train + backend: megatron + entrypoint: flagscale/train/train_gpt.py + runner: + backend: torchrun + per_node_task: false + no_shared_fs: false + ssh_port: xxx Replace with Docker SSH port + nnodes: 1 + nproc_per_node: 8 + rdzv_backend: static + hostfile: ./muxi_hostfile + cmds: + before_start: source /root/miniconda3/bin/activate flagscale-train + envs: + FLAGCX_ENABLE_TOPO_DETECT: TRUE + FLAGCX_DEBUG: TRACE + FLAGCX_IB_HCA: mlx5 + CUDA_VISIBLE_DEVICES: 0,1,2,3,4,5,6,7 + CUDA_DEVICE_MAX_CONNECTIONS: 1 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + device_type_specific: + C550: + LOGLEVEL: "INFO" + CUCC_PATH: "/opt/maca/tools/cu-bridge" + CUDA_PATH: "/opt/maca/tools/cu-bridge" + DEVINFO_ROOT: "/opt/maca" + LD_LIBRARY_PATH: "/opt/maca/lib:/opt/maca/mxgpu_llvm/lib:/opt/mxdriver/lib:/opt/maca/ompi/lib:/opt/maca/ucx/lib:/opt/mxdriver/lib" + MACA_CLANG: "/opt/maca/mxgpu_llvm" + MACA_CLANG_PATH: "/opt/maca/mxgpu_llvm/bin" + MACA_PATH: "/opt/maca" + PATH: 
"/opt/conda/bin:/opt/conda/condabin:/opt/maca/tools/cu-bridge:/opt/maca/bin:/opt/maca/mxgpu_llvm/bin:/opt/conda/bin:/opt/maca/bin:/opt/maca/mxgpu_llvm/bin:/opt/maca/ompi/bin:/opt/maca/ucx/bin:/opt/mxdriver/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin" + MCCL_LIMIT_RING_LL_THREADTHRESHOLDS: 1 + SET_DEVICE_NUMA_PREFERRED: 1 + PYTORCH_ENABLE_SAME_RAND_A100: 1 + NVTE_FLASH_ATTN: 1 + NVTE_FUSED_ATTN: 0 + MACA_SMALL_PAGESIZE_ENABLE: 1 + MCCL_MAX_NCHANNELS: 18 + MCCL_P2P_LEVEL: SYS + +device_type_specific: + C550: + build_dir: FlagScale/build/Metax_C550/FlagScale +action: run + +hydra: + run: + dir: ${experiment.exp_dir}/hydra +``` + +#### File Path: examples/qwen3/conf/train/10b_hetero.yaml + +```yaml +system: + distributed_backend: flagcx + no_shared_fs: ${experiment.runner.no_shared_fs} + ... + ... + checkpoint: + save_interval: ${experiment.save_steps} + load: ${experiment.load} + ckpt_format: ${experiment.ckpt_format} + + hetero: + enable_hetero: True + hetero_use_cpu_communication: False + use_partial_reduce_for_shared_embedding: True + mesh format [tp1,cp1,ep1,dp1,pp1,(tp2,cp2...)] + hetero_pipeline_layer_split: [28,28] + hetero_process_meshes: [1,1,1,8,1,1,1,1,8,1] + hetero_device_types: ["A800","C550"] + + standalone_embedding_stage: False + hetero_current_device_type: "A800" + + ... + ... + +data: + data_path: /path/pile_wikipedia_demo + split: 1 + no_mmap_bin_files: true + tokenizer: + legacy_tokenizer: true + tokenizer_type: QwenTokenizerFS + tokenizer_path: xxx + vocab_size: 151851 + padded_vocab_size: 151936 + make_vocab_size_divisible_by: 64 +``` + +#### File Path: ./muxi_hostfile + +``` +ip slots=8 type=A800 +ip slots=8 type=C550 +``` + +### 2.3 Start Training + +```bash +python run.py --config-path ./examples/qwen3conf --config-name train_hetero_10b action=run +``` + +### 2.4 Stop Training + +```bash +python run.py --config-path ./examples/qwen3conf --config-name train_hetero_10b action=stop +``` + +## 3. 
Convert Checkpoint to Hugging Face Format + +### 3.1 Navigate to the Checkpoint Tool Directory + +```bash +cd ./tools/checkpoint/ +``` + +### 3.2 Modify the Code Files + +1. Edit `loader_mcore.py` (Line 220) + + ``` + fake_etp_group = _ConverterFakeProcessGroup(size=margs.expert_tensor_parallel_size) + if margs.expert_tensor_parallel_size is None: + margs.expert_tensor_parallel_size = 1 + ``` + +2. Edit `qwen3/model.py` (Line 33) + + ``` + def get_mg_model(dtype, pre_process, post_process): + from flagscale.train.train_gpt import model_provider, gpt_builder + + s_time = time.time() + model = model_provider(gpt_builder, pre_process, post_process).to(dtype) + ``` + +3. Edit `FlagScale/third_party/Megatron-LM/megatron/training/checkpointing.py` (Line 1149) + + ```` + state_dict = torch.load(checkpoint_name, map_location='cpu', weights_only=False) + ```` + +### 3.3 Edit the Conversion Script + +Create/Edit the `run_qwen3.sh` script: + +```bash +vi run_qwen3.sh +``` + +Paste the following content into the script: + +```bash +python convert.py \ + --model-type qwen3 \ + --loader mcore \ + --saver transformers \ + --target-tensor-parallel-size 1 \ + --target-pipeline-parallel-size 1 \ + --target-expert-parallel-size 1 \ + --max-queue-size 50 \ + --target-params-dtype bf16 \ + --true-vocab-size 151851 \ + --megatron-path ../../third_party/Megatron-LM \ + --load-dir xxx/ckpt \ + --save-dir xxx/ckpt_hf \ +``` + +### 3.4 Execute the Conversion Script + +```bash +bash run_qwen3.sh +``` + +#### ps: + +#### --load-dir: Path to the trained checkpoint in Torch format. + +#### --save-dir: Path to save the converted checkpoint in Hugging Face format. 
+ + From 5d0934c3a7c6a9e78034e0eaa0dd18eeca2a2697 Mon Sep 17 00:00:00 2001 From: shuyq10 <13569752081@163.com> Date: Thu, 5 Mar 2026 10:58:00 +0800 Subject: [PATCH 2/3] modify path --- examples/qwen3/conf/QuickStart.md | 76 +------------------------------ 1 file changed, 2 insertions(+), 74 deletions(-) diff --git a/examples/qwen3/conf/QuickStart.md b/examples/qwen3/conf/QuickStart.md index f939892934..c19799acb7 100644 --- a/examples/qwen3/conf/QuickStart.md +++ b/examples/qwen3/conf/QuickStart.md @@ -78,7 +78,7 @@ experiment: per_node_task: false no_shared_fs: false ssh_port: xxx Replace with Docker SSH port - nnodes: 1 + nnodes: 2 nproc_per_node: 8 rdzv_backend: static hostfile: ./muxi_hostfile @@ -113,7 +113,7 @@ experiment: device_type_specific: C550: - build_dir: FlagScale/build/Metax_C550/FlagScale + build_dir: /path/to/FlagScale/build/Metax_C550/FlagScale action: run hydra: @@ -181,76 +181,4 @@ python run.py --config-path ./examples/qwen3conf --config-name train_hetero_10b python run.py --config-path ./examples/qwen3conf --config-name train_hetero_10b action=stop ``` -## 3. Convert Checkpoint to Hugging Face Format - -### 3.1 Navigate to the Checkpoint Tool Directory - -```bash -cd ./tools/checkpoint/ -``` - -### 3.2 Modify the Code Files - -1. Edit `loader_mcore.py` (Line 220) - - ``` - fake_etp_group = _ConverterFakeProcessGroup(size=margs.expert_tensor_parallel_size) - if margs.expert_tensor_parallel_size is None: - margs.expert_tensor_parallel_size = 1 - ``` - -2. Edit `qwen3/model.py` (Line 33) - - ``` - def get_mg_model(dtype, pre_process, post_process): - from flagscale.train.train_gpt import model_provider, gpt_builder - - s_time = time.time() - model = model_provider(gpt_builder, pre_process, post_process).to(dtype) - ``` - -3. 
Edit `FlagScale/third_party/Megatron-LM/megatron/training/checkpointing.py` (Line 1149) - - ```` - state_dict = torch.load(checkpoint_name, map_location='cpu', weights_only=False) - ```` - -### 3.3 Edit the Conversion Script - -Create/Edit the `run_qwen3.sh` script: - -```bash -vi run_qwen3.sh -``` - -Paste the following content into the script: - -```bash -python convert.py \ - --model-type qwen3 \ - --loader mcore \ - --saver transformers \ - --target-tensor-parallel-size 1 \ - --target-pipeline-parallel-size 1 \ - --target-expert-parallel-size 1 \ - --max-queue-size 50 \ - --target-params-dtype bf16 \ - --true-vocab-size 151851 \ - --megatron-path ../../third_party/Megatron-LM \ - --load-dir xxx/ckpt \ - --save-dir xxx/ckpt_hf \ -``` - -### 3.4 Execute the Conversion Script - -```bash -bash run_qwen3.sh -``` - -#### ps: - -#### --load-dir: Path to the trained checkpoint in Torch format. - -#### --save-dir: Path to save the converted checkpoint in Hugging Face format. - From 6b278078cb67693cb5868bfe8d94fe310ca75f35 Mon Sep 17 00:00:00 2001 From: shuyq10 <13569752081@163.com> Date: Tue, 31 Mar 2026 01:49:49 +0000 Subject: [PATCH 3/3] modify some questions --- examples/qwen3/conf/QuickStart.md | 92 ++++++++++++++++++++++++++----- 1 file changed, 79 insertions(+), 13 deletions(-) diff --git a/examples/qwen3/conf/QuickStart.md b/examples/qwen3/conf/QuickStart.md index c19799acb7..3651d778bc 100644 --- a/examples/qwen3/conf/QuickStart.md +++ b/examples/qwen3/conf/QuickStart.md @@ -63,7 +63,7 @@ defaults: - train: 10b_hetero experiment: - exp_name: Qwen3-10b_muxi + exp_name: Qwen3-10b_hetero seed: 42 save_steps: 1000 load: null @@ -81,16 +81,17 @@ experiment: nnodes: 2 nproc_per_node: 8 rdzv_backend: static - hostfile: ./muxi_hostfile + hostfile: ./hetero_hostfile cmds: before_start: source /root/miniconda3/bin/activate flagscale-train envs: - FLAGCX_ENABLE_TOPO_DETECT: TRUE - FLAGCX_DEBUG: TRACE + #FLAGCX_ENABLE_TOPO_DETECT: TRUE + #FLAGCX_DEBUG: TRACE FLAGCX_IB_HCA: 
mlx5 + FLAGCX_IB_GID_INDEX: 3 CUDA_VISIBLE_DEVICES: 0,1,2,3,4,5,6,7 CUDA_DEVICE_MAX_CONNECTIONS: 1 - NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 + #NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0 device_type_specific: C550: LOGLEVEL: "INFO" @@ -127,8 +128,33 @@ hydra: system: distributed_backend: flagcx no_shared_fs: ${experiment.runner.no_shared_fs} - ... - ... + num_workers: 16 + tensor_model_parallel_size: 1 + pipeline_model_parallel_size: 2 + context_parallel_size: 1 + disable_bias_linear: true + reset_position_ids: True + reset_attention_mask: True + qk_layernorm: true + sequence_parallel: true + use_distributed_optimizer: true + overlap_grad_reduce: true + overlap_param_gather: true + precision: + bf16: true + attention_softmax_in_fp32: true + accumulate_allreduce_grads_in_fp32: true + logging: + log_interval: 1 + tensorboard_log_interval: 1 + wandb_project: ${experiment.exp_name} + wandb_exp_name: ${experiment.exp_name} + log_timers_to_tensorboard: true + log_validation_ppl_to_tensorboard: true + log_throughput: true + log_params_norm: false + log_num_zeros_in_grad: true + log_memory_to_tensorboard: true checkpoint: save_interval: ${experiment.save_steps} load: ${experiment.load} @@ -146,8 +172,47 @@ system: standalone_embedding_stage: False hetero_current_device_type: "A800" - ... - ... 
+model: + transformer_impl: transformer_engine + num_layers: 56 + hidden_size: 2560 + ffn_hidden_size: 19456 + kv_channels: 128 + group_query_attention: true + num_attention_heads: 32 + num_query_groups: 8 # num_key_value_heads + seq_length: 4096 + max_position_embeddings: 32768 + norm_epsilon: 1e-6 + use_rotary_position_embeddings: true + rotary_base: 1000000 + swiglu: true + normalization: RMSNorm + init_method_std: 6e-3 + attention_dropout: 0.0 + hidden_dropout: 0.0 + clip_grad: 1.0 + position_embedding_type: rope + untie_embeddings_and_output_weights: true + no_position_embedding: true + no_rope_fusion: true + attention_backend: flash + # training + seed: ${experiment.seed} + # finetune: false + micro_batch_size: 1 + global_batch_size: 2048 + eval_iters: 0 + train_samples: 244142080 # ~1T tokens at seq_length 4096 (use 29297664 for ~120B tokens) + optimizer: + weight_decay: 0.1 + adam_beta1: 0.9 + adam_beta2: 0.95 + lr_scheduler: + lr: 1.0e-3 + min_lr: 1.0e-4 + lr_warmup_samples: 2048000 + lr_decay_style: cosine data: data_path: /path/pile_wikipedia_demo @@ -156,13 +221,14 @@ data: tokenizer: legacy_tokenizer: true tokenizer_type: QwenTokenizerFS - tokenizer_path: xxx + tokenizer_path: xxx # Download the official Qwen3-0.6B model from ModelScope to get tokenizer-related files vocab_size: 151851 padded_vocab_size: 151936 make_vocab_size_divisible_by: 64 + ``` -#### File Path: ./muxi_hostfile +#### File Path: ./hetero_hostfile ``` ip slots=8 type=A800 @@ -172,13 +238,13 @@ ip slots=8 type=C550 ### 2.3 Start Training ```bash -python run.py --config-path ./examples/qwen3conf --config-name train_hetero_10b action=run +python run.py --config-path ./examples/qwen3/conf --config-name train_hetero_10b action=run ``` ### 2.4 Stop Training ```bash -python run.py --config-path ./examples/qwen3conf --config-name train_hetero_10b action=stop +python run.py --config-path ./examples/qwen3/conf --config-name train_hetero_10b action=stop ```