flagos-ai · Darryl233 · Apr 2, 2026 · Mar 9, 2026 · Mar 10, 2026 · Mar 10, 2026
diff --git a/.github/configs/ascend.yml b/.github/configs/ascend.yml
@@ -0,0 +1,48 @@
+# Huawei Ascend NPU configuration
+#
+# The first group of keys (ci_image, runner_labels, container_volumes,
+# container_options, device_types) is what
+# .github/workflows/all_tests_common.yml reads. The legacy keys at the bottom
+# (image, labels, docker_options) are kept unchanged for any consumer that
+# still follows the older layout.
+
+ci_image: ascend-infer:ubuntu18.04
+
+runner_labels:
+  - npu
+  - ascend
+
+# Host driver directories mounted into the container
+container_volumes:
+  - /usr/local/Ascend/driver:/usr/local/Ascend/driver
+  - /usr/local/Ascend/add-ons:/usr/local/Ascend/add-ons
+
+# NPU device nodes exposed to the container (folded into a single line)
+container_options: >-
+  --device /dev/davinci0
+  --device /dev/davinci1
+  --device /dev/davinci2
+  --device /dev/davinci3
+  --device /dev/davinci_manager
+  --device /dev/devmm_svm
+  --device /dev/hisi_hdc
+
+# TODO(review): placeholder matrix entry; replace with the actual chip model
+device_types:
+  - ascend
+
+# Legacy keys (same values as above, original layout)
+image: ascend-infer:ubuntu18.04
+labels:
+  - npu
+  - ascend
+docker_options: |
+  --device /dev/davinci0
+  --device /dev/davinci1
+  --device /dev/davinci2
+  --device /dev/davinci3
+  --device /dev/davinci_manager
+  --device /dev/devmm_svm
+  --device /dev/hisi_hdc
+  --volume /usr/local/Ascend/driver:/usr/local/Ascend/driver
+  --volume /usr/local/Ascend/add-ons:/usr/local/Ascend/add-ons
diff --git a/.github/configs/cuda.yml b/.github/configs/cuda.yml
@@ -0,0 +1,64 @@
+# CUDA Hardware Configuration for TransformerEngine-FL
+# Refactored for BAAI DGX A100 Nodes
+# This file defines environment variables, volumes, and test filters for TE tests.
+
+hardware_name: cuda
+display_name: "NVIDIA CUDA (A100)"
+
+ci_image: harbor.baai.ac.cn/flagscale/cuda12.8.1-torch2.7.1-python3.10-te2.9:20260209
+
+# Runner labels for self-hosted A100 node
+runner_labels:
+  - self-hosted
+  - Linux
+  - X64
+  - nvidia
+  - gpu-8
+
+# Container volumes
+container_volumes:
+  - .:/opt/transformerengine
+  - ./ci_logs:/logs
+  - /home/flagscale_cicd/data:/opt/data
+
+# Container options (folded into one space-separated line; keep the lines free
+# of trailing whitespace, which would otherwise become part of the value)
+container_options: >-
+  --privileged
+  --gpus all
+  --shm-size=500g
+  --ipc=host
+  --ulimit memlock=-1
+  --ulimit stack=67108864
+  --user root
+
+# Device types
+device_types:
+  - a100
+
+# Environment variables (quoted so every value stays a string)
+env_vars:
+  NVTE_FRAMEWORK: pytorch
+  TE_WITH_NCCL: "1"
+  NVTE_PROJECT_BUILDING: "1"
+  TE_FL_SKIP_CUDA: "0"
+
+# Test matrix configuration
+# NOTE(review): .github/workflows/all_tests_common.yml reads
+# test_matrix.unit.ignored_tests, not test_matrix.l0_pytorch.ignored_tests;
+# confirm that whatever runs l0_pytorch reads this list itself.
+test_matrix:
+  l0_pytorch:
+    path: "qa/L0_pytorch_unittest/test.sh"
+    ignored_tests:
+      - test_sanity_layernorm_mlp
+      - test_sanity_gpt
+      - test_sanity_bert
+      - test_sanity_T5
+      - test_sanity_amp_and_nvfuser
+      - test_sanity_drop_path
+      - test_layernorm_mlp_accuracy
+      - test_grouped_linear_accuracy
+      - test_gpt_accuracy
+      - test_basic_linear
+      - test_layer_norm
diff --git a/.github/configs/metax.yml b/.github/configs/metax.yml
@@ -0,0 +1,86 @@
+# Metax Hardware Configuration for Megatron-LM-FL
+# This file defines CI/CD settings for Metax-based testing
+# Test configurations are defined in tests/test_utils/config/platforms/cuda.yaml
+# NOTE(review): the path above names the CUDA platform file; confirm whether a
+# Metax-specific file should be referenced instead.
+
+hardware_name: metax
+display_name: 'Metax Tests'
+
+# Docker image for this hardware
+# ci_image: cr.metax-tech.com/public-ai-release/maca/megatron-lm:0.12.0-maca.ai3.3.0.11-torch2.6-py312-ubuntu22.04-amd64
+ci_image: localhost:5000/megatron-lm-with-te:v1
+
+# Runner labels for this hardware
+runner_labels:
+  - self-hosted
+  - Linux
+  - X64
+  - metax
+  # - gpu-8
+  - dev
+
+# Container volumes (hardware-specific paths)
+container_volumes:
+  # - /home/flagscale_cicd/flask/static:/workspace/report
+  # - /home/flagscale_cicd/flask/config:/workspace/config
+  # - /home/flagscale_cicd/docker/docker_build/docker_data:/home/gitlab-runner/data
+  # - /home/flagscale_cicd/docker/docker_build/docker_tokenizers:/home/gitlab-runner/tokenizers
+  # - /home/flagscale_cicd/docker/docker_build/docker_data/Megatron-LM/datasets:/opt/data/datasets
+  # - /home/flagscale_cicd/docker/docker_build/docker_tokenizers/Megatron-LM/tokenizers:/opt/data/tokenizers
+  # --- Added: paths dedicated to Transformer Engine development ---
+  - /home/muxiuser/jinglong/TransformerEngine-FL:/workspace/TransformerEngine-FL  # development repository
+  - /home/muxiuser/jinglong:/opt/te_packages  # holds the built TE packages that the tests install
+  - /usr/local/maca:/usr/local/maca:ro  # [key] mount the host MACA driver libraries (read-only) so operators can run
+
+# Container options (hardware-specific settings)
+container_options: '--privileged --shm-size=500g --hostname megatron_cicd --user root --ulimit nofile=65535:65535'
+
+# Device types to run tests on
+device_types:
+  - C500
+
+# Test matrix configuration
+test_matrix:
+  unit:
+    devices:
+      - C500
+    # Ignored test files for unit tests
+    # These files will be skipped when running pytest
+    ignored_tests:
+      - tests/unit_tests/data/test_preprocess_data.py
+      - tests/unit_tests/dist_checkpointing/test_global_metadata_reuse.py
+      - tests/unit_tests/dist_checkpointing/test_optimizer.py
+      - tests/unit_tests/dist_checkpointing/test_nonpersistent.py
+      - tests/unit_tests/dist_checkpointing/test_safe_globals.py
+      - tests/unit_tests/dist_checkpointing/models/test_moe_experts.py
+      - tests/unit_tests/distributed/test_grad_sync_with_expert_parallel.py
+      - tests/unit_tests/distributed/test_mcore_fully_sharded_data_parallel.py
+      - tests/unit_tests/export/trtllm/test_distributed_fp8.py
+      - tests/unit_tests/export/trtllm/test_single_device_fp8.py
+      - tests/unit_tests/transformer/moe/test_a2a_token_dispatcher.py
+      - tests/unit_tests/test_inference.py
+      - tests/unit_tests/test_rl_utils.py
+      - tests/unit_tests/models/test_gpt_model.py
+      - tests/unit_tests/models/test_mamba_model.py
+      - tests/unit_tests/post_training/test_modelopt_module_spec.py
+      - tests/unit_tests/transformer/moe/test_aux_loss.py
+      - tests/unit_tests/transformer/moe/test_moe_layer_discrepancy.py
+      - tests/unit_tests/transformer/moe/test_routers.py
+      - tests/unit_tests/transformer/test_attention.py
+      - tests/unit_tests/transformer/test_attention_packed_seq.py
+      - tests/unit_tests/transformer/test_cuda_graphs.py
+      - tests/unit_tests/transformer/test_full_cuda_graph.py
+      - tests/unit_tests/transformer/test_multi_latent_attention.py
+      - tests/unit_tests/transformer/test_multi_token_prediction.py
+      - tests/unit_tests/transformer/test_retro_attention.py
+      - tests/unit_tests/transformer/test_transformer_block.py
+      - tests/unit_tests/transformer/test_transformer_block_custom_pgs.py
+      - tests/unit_tests/dist_checkpointing/test_local.py
+
+  # functional:
+  #   train:
+  #     - device: C500
+  #       task: train
+  #       model: gpt
+  #       case: all
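diff --git a/.github/configs/template.yml b/.github/configs/template.yml
@@ -0,0 +1,28 @@
+# Configuration Template
+# This file describes the structure for hardware-specific configurations.
+#
+# Fields read by .github/workflows/all_tests_common.yml:
+# - ci_image: Docker image to use for the runner
+# - runner_labels: List of labels for the runner
+# - container_volumes: List of volumes to mount into the container
+# - container_options: Additional Docker options (devices, limits, etc.)
+# - device_types: List of device names; one unit-test job runs per entry
+# - test_matrix.unit.ignored_tests: Optional list of tests to skip
+#
+# Legacy field names (still present in ascend.yml):
+# - image: Docker image to use for the runner
+# - labels: List of labels for the runner
+# - docker_options: Additional Docker options for mounting devices, volumes, etc.
+#
+# Example:
+# ci_image: <docker_image>
+# runner_labels:
+#   - <label1>
+#   - <label2>
+# container_volumes:
+#   - <host_path>:<container_path>
+# container_options: >-
+#   --option1 value1
+#   --option2 value2
+# device_types:
+#   - <device1>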
diff --git a/.github/workflows/all_tests_ascend.yml b/.github/workflows/all_tests_ascend.yml
@@ -0,0 +1,41 @@
+# Entry-point workflow for the Ascend platform: delegates to the shared
+# all_tests_common.yml workflow and reports one aggregate status.
+name: ascend_tests
+
+on:
+  push:
+    branches:
+      - main
+  pull_request:
+    branches:
+      - main
+  workflow_dispatch:
+
+# A newer run in the same group cancels the one still in progress.
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-${{ github.actor }}
+  cancel-in-progress: true
+
+jobs:
+  run_tests:
+    # The reusable workflow loads its settings from .github/configs/ascend.yml
+    uses: ./.github/workflows/all_tests_common.yml
+    with:
+      platform: ascend
+
+  all_tests:
+    # Runs even when run_tests did not succeed, so the failure is reported here.
+    if: always()
+    needs: run_tests
+    runs-on: ubuntu-latest
+    steps:
+      - name: Verify workflow status
+        env:
+          RUN_TESTS_RESULT: ${{ needs.run_tests.result }}
+        run: |
+          if [ "$RUN_TESTS_RESULT" = "success" ]; then
+            echo "✅ All tests passed!"
+          else
+            echo "❌ Tests workflow failed"
+            exit 1
+          fi
diff --git a/.github/workflows/all_tests_common.yml b/.github/workflows/all_tests_common.yml
@@ -0,0 +1,155 @@
+# Reusable workflow: loads .github/configs/<platform>.yml, fans unit tests out
+# over the configured device types, and reports one aggregate result.
+name: Common All Tests
+
+on:
+  workflow_call:
+    inputs:
+      platform:
+        required: true
+        type: string
+        description: Platform name matching a file in .github/configs (e.g., cuda, ascend)
+
+jobs:
+  checkout_and_config:
+    defaults:
+      run:
+        shell: bash
+    runs-on: ubuntu-latest
+    outputs:
+      ci_image: ${{ steps.config.outputs.ci_image }}
+      runs_on: ${{ steps.config.outputs.runs_on }}
+      container_volumes: ${{ steps.config.outputs.container_volumes }}
+      container_options: ${{ steps.config.outputs.container_options }}
+      device_types: ${{ steps.config.outputs.device_types }}
+      train_test_matrix: ${{ steps.config.outputs.train_test_matrix }}
+      ignored_tests: ${{ steps.config.outputs.ignored_tests }}
+    steps:
+      - name: Checkout source code
+        uses: actions/checkout@v4
+
+      - name: Load platform configuration
+        id: config
+        env:
+          # Passed through the environment rather than interpolated into the
+          # script text, so the value is never parsed as shell code.
+          PLATFORM: ${{ inputs.platform }}
+        run: |
+          set -euo pipefail
+
+          CONFIG_FILE=".github/configs/${PLATFORM}.yml"
+          if [ ! -f "$CONFIG_FILE" ]; then
+            echo "::error::Configuration file $CONFIG_FILE not found"
+            exit 1
+          fi
+
+          # Install mikefarah/yq (v4) for YAML parsing
+          sudo wget -qO /usr/local/bin/yq https://github.com/mikefarah/yq/releases/download/v4.45.1/yq_linux_amd64
+          sudo chmod +x /usr/local/bin/yq
+          /usr/local/bin/yq --version
+          echo "Loading configuration from $CONFIG_FILE"
+
+          # Fail early with a clear message when a key the downstream jobs
+          # depend on is absent (yq -e exits non-zero on null or no match)
+          for key in ci_image runner_labels device_types; do
+            if ! yq -e ".${key}" "$CONFIG_FILE" > /dev/null 2>&1; then
+              echo "::error file=${CONFIG_FILE}::Missing required key '${key}'"
+              exit 1
+            fi
+          done
+
+          # Read CI image
+          CI_IMAGE=$(yq '.ci_image' "$CONFIG_FILE")
+          echo "ci_image=$CI_IMAGE" >> "$GITHUB_OUTPUT"
+
+          # Read runner labels and format as JSON array
+          RUNS_ON=$(yq '.runner_labels | tojson(0)' "$CONFIG_FILE")
+          echo "runs_on=$RUNS_ON" >> "$GITHUB_OUTPUT"
+
+          # Read container volumes and format as JSON array (empty array if not defined)
+          VOLUMES=$(yq '.container_volumes // [] | tojson(0)' "$CONFIG_FILE")
+          echo "container_volumes=$VOLUMES" >> "$GITHUB_OUTPUT"
+
+          # Read container options as a single line (empty if not defined);
+          # a multi-line value would corrupt the key=value output format
+          OPTIONS=$(yq '.container_options // ""' "$CONFIG_FILE" | tr '\n' ' ' | sed 's/[[:space:]]*$//')
+          echo "container_options=$OPTIONS" >> "$GITHUB_OUTPUT"
+
+          # Read device types
+          DEVICE_TYPES=$(yq '.device_types | tojson(0)' "$CONFIG_FILE")
+          echo "device_types=$DEVICE_TYPES" >> "$GITHUB_OUTPUT"
+
+          # Read test matrix for training (default to empty array if not defined)
+          TRAIN_MATRIX=$(yq '.test_matrix.functional.train // [] | tojson(0)' "$CONFIG_FILE")
+          echo "train_test_matrix=$TRAIN_MATRIX" >> "$GITHUB_OUTPUT"
+
+          # Read ignored tests list from test_matrix.unit (default to empty array if not defined)
+          IGNORED_TESTS=$(yq '.test_matrix.unit.ignored_tests // [] | tojson(0)' "$CONFIG_FILE")
+          echo "ignored_tests=$IGNORED_TESTS" >> "$GITHUB_OUTPUT"
+
+  unit_tests:
+    needs: checkout_and_config
+    strategy:
+      fail-fast: false
+      matrix:
+        device: ${{ fromJson(needs.checkout_and_config.outputs.device_types) }}
+    uses: ./.github/workflows/unit_tests_common.yml
+    name: unit_tests
+    with:
+      platform: ${{ inputs.platform }}
+      device: ${{ matrix.device }}
+      image: ${{ needs.checkout_and_config.outputs.ci_image }}
+      runs_on: ${{ needs.checkout_and_config.outputs.runs_on }}
+      container_volumes: ${{ needs.checkout_and_config.outputs.container_volumes }}
+      container_options: ${{ needs.checkout_and_config.outputs.container_options }}
+      ignored_tests: ${{ needs.checkout_and_config.outputs.ignored_tests }}
+
+  # arguments.py not compatible with megatron-core-fl
+  # functional_tests_train:
+  #   needs:
+  #     - checkout_and_config
+  #     - unit_tests
+  #   if: fromJson(needs.checkout_and_config.outputs.train_test_matrix)[0] != null
+  #   uses: ./.github/workflows/functional_tests_train.yml
+  #   with:
+  #     platform: ${{ inputs.platform }}
+  #     test_matrix: ${{ needs.checkout_and_config.outputs.train_test_matrix }}
+  #     image: ${{ needs.checkout_and_config.outputs.ci_image }}
+  #     runs_on: ${{ needs.checkout_and_config.outputs.runs_on }}
+  #     container_volumes: ${{ needs.checkout_and_config.outputs.container_volumes }}
+  #     container_options: ${{ needs.checkout_and_config.outputs.container_options }}
+
+
+  all_tests_complete:
+    defaults:
+      run:
+        shell: bash
+    needs:
+      - checkout_and_config
+      - unit_tests
+      # - functional_tests_train
+    runs-on: ubuntu-latest
+    if: always()
+    steps:
+      - name: Verify all tests passed
+        run: |
+          # Any unit_tests result other than "success" fails this job
+          failed=false
+
+          if [ "${{ needs.unit_tests.result }}" != "success" ]; then
+            echo "❌ Unit tests failed"
+            failed=true
+          fi
+
+          # # Only check functional tests if they ran
+          # if [ "${{ needs.functional_tests_train.result }}" != "success" ] && \
+          #    [ "${{ needs.functional_tests_train.result }}" != "skipped" ]; then
+          #   echo "❌ Training functional tests failed"
+          #   failed=true
+          # fi
+
+          if [ "$failed" = "true" ]; then
+            exit 1
+          fi
+
+          echo "✅ All tests completed successfully!"
diff --git a/.github/workflows/all_tests_cuda.yml b/.github/workflows/all_tests_cuda.yml
@@ -0,0 +1,41 @@
+# Entry-point workflow for the CUDA platform: delegates to the shared
+# all_tests_common.yml workflow and reports one aggregate status.
+name: cuda_tests
+
+on:
+  push:
+    branches:
+      - main
+  pull_request:
+    branches:
+      - main
+  workflow_dispatch:
+
+# A newer run in the same group cancels the one still in progress.
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-${{ github.actor }}
+  cancel-in-progress: true
+
+jobs:
+  run_tests:
+    # The reusable workflow loads its settings from .github/configs/cuda.yml
+    uses: ./.github/workflows/all_tests_common.yml
+    with:
+      platform: cuda
+
+  all_tests:
+    # Runs even when run_tests did not succeed, so the failure is reported here.
+    if: always()
+    needs: run_tests
+    runs-on: ubuntu-latest
+    steps:
+      - name: Verify workflow status
+        env:
+          RUN_TESTS_RESULT: ${{ needs.run_tests.result }}
+        run: |
+          if [ "$RUN_TESTS_RESULT" = "success" ]; then
+            echo "✅ All tests passed!"
+          else
+            echo "❌ Tests workflow failed"
+            exit 1
+          fi
+
no result