diff --git a/.github/configs/metax.yml b/.github/configs/metax.yml
new file mode 100644
index 00000000000..e26dd83bd05
--- /dev/null
+++ b/.github/configs/metax.yml
@@ -0,0 +1,100 @@
+# MetaX Hardware Configuration for Megatron-LM-FL
+# This file defines CI/CD settings for MetaX-based testing
+# Test configurations are defined in tests/test_utils/config/platforms/cuda.yaml
+# NOTE(review): the path above still points at cuda.yaml - confirm this is intended for MetaX.
+
+hardware_name: metax
+display_name: 'Metax Tests'
+
+# Docker image for this hardware
+ci_image: localhost:5000/cr.metax-tech.com/public-ai-release/maca/megatron-lm:0.12.0-maca.ai3.3.0.11-torch2.6-py312-ubuntu22.04-amd64
+
+# Runner labels for this hardware
+runner_labels:
+  - self-hosted
+  - Linux
+  - X64
+  - metax
+  - dev
+
+# Container volumes (hardware-specific paths)
+container_volumes:
+  - /home/flagscale_cicd/flask/static:/workspace/report
+  - /home/flagscale_cicd/flask/config:/workspace/config
+  - /home/flagscale_cicd/docker/docker_build/docker_data:/home/gitlab-runner/data
+  - /home/flagscale_cicd/docker/docker_build/docker_tokenizers:/home/gitlab-runner/tokenizers
+  - /home/flagscale_cicd/docker/docker_build/docker_data/Megatron-LM/datasets:/opt/data/datasets
+  - /home/flagscale_cicd/docker/docker_build/docker_tokenizers/Megatron-LM/tokenizers:/opt/data/tokenizers
+
+  # ===== To be adjusted 3: MetaX device paths =====
+  # These paths must exist, otherwise container startup will fail
+  - /dev/dri:/dev/dri  # MetaX GPU device file
+  - /dev/mxcd:/dev/mxcd  # MetaX GPU device file
+  - /opt/maca:/opt/maca  # MACA software stack
+  - /opt/mxdriver:/opt/mxdriver  # MetaX driver path
+
+# Container options (hardware-specific settings)
+# container_options: '--privileged --gpus all --shm-size=500g --hostname megatron_cicd --user root --ulimit nofile=65535:65535'
+# MetaX container options
+# ===== To be adjusted 4: container options =====
+# NOTE(review): MUSA_VISIBLE_DEVICES is normally associated with the Moore Threads (MUSA) stack;
+# confirm that the MetaX/MACA runtime in this image actually reads it.
+container_options: >-
+  --privileged
+  --shm-size=500g
+  --hostname megatron_cicd
+  --user root
+  --ulimit nofile=65535:65535
+  --device=/dev/dri
+  --device=/dev/mxcd
+  --group-add video
+  -e PLATFORM=metax
+  -e TORCH_DISTRIBUTED_BACKEND=mccl
+  -e MUSA_VISIBLE_DEVICES=all
+device_types:
+  - C500
+
+# Test matrix configuration
+test_matrix:
+  unit:
+    devices:
+      - C500
+    # Ignored test files for unit tests
+    # These files will be skipped when running pytest
+    ignored_tests:
+      - tests/unit_tests/data/test_preprocess_data.py
+      - tests/unit_tests/dist_checkpointing/test_global_metadata_reuse.py
+      - tests/unit_tests/dist_checkpointing/test_optimizer.py
+      - tests/unit_tests/dist_checkpointing/test_nonpersistent.py
+      - tests/unit_tests/dist_checkpointing/test_safe_globals.py
+      - tests/unit_tests/dist_checkpointing/models/test_moe_experts.py
+      - tests/unit_tests/distributed/test_grad_sync_with_expert_parallel.py
+      - tests/unit_tests/distributed/test_mcore_fully_sharded_data_parallel.py
+      - tests/unit_tests/export/trtllm/test_distributed_fp8.py
+      - tests/unit_tests/export/trtllm/test_single_device_fp8.py
+      - tests/unit_tests/transformer/moe/test_a2a_token_dispatcher.py
+      - tests/unit_tests/test_inference.py
+      - tests/unit_tests/test_rl_utils.py
+      - tests/unit_tests/models/test_gpt_model.py
+      - tests/unit_tests/models/test_mamba_model.py
+      - tests/unit_tests/post_training/test_modelopt_module_spec.py
+      - tests/unit_tests/transformer/moe/test_aux_loss.py
+      - tests/unit_tests/transformer/moe/test_moe_layer_discrepancy.py
+      - tests/unit_tests/transformer/moe/test_routers.py
+      - tests/unit_tests/transformer/test_attention.py
+      - tests/unit_tests/transformer/test_attention_packed_seq.py
+      - tests/unit_tests/transformer/test_cuda_graphs.py
+      - tests/unit_tests/transformer/test_full_cuda_graph.py
+      - tests/unit_tests/transformer/test_multi_latent_attention.py
+      - tests/unit_tests/transformer/test_multi_token_prediction.py
+      - tests/unit_tests/transformer/test_retro_attention.py
+      - tests/unit_tests/transformer/test_transformer_block.py
+      - tests/unit_tests/transformer/test_transformer_block_custom_pgs.py
+      - tests/unit_tests/dist_checkpointing/test_local.py
+
+  # functional:
+  #   train:
+  #     - device: C500
+  #       task: train
+  #       model: gpt
+  #       case: all