---
# NOTE(review): the lines "Skip to content" / "Add workflows #1" in the pasted
# source were GitHub web-UI residue, not YAML; preserved here as a comment.
# Original page title: "Add workflows #1".
#
# QA L1 - Comprehensive Integration Tests
# Runs the L1 QA suites (C++ distributed, PyTorch thunder/distributed/ONNX)
# whenever the relevant test trees or this workflow file change.
name: QA L1 - Comprehensive Integration Tests

on:
  push:
    branches:
      - addWorkflows26014
    paths:
      - '.github/workflows/qa-l1-te-cpp-pytorch-tests.yml'
      - 'qa/L1_cpp_distributed/**'
      - 'tests/cpp_distributed/**'
      - 'qa/L1_pytorch_thunder_integration/**'
      - 'qa/L1_pytorch_distributed_unittest/**'
      - 'tests/pytorch/distributed/**'
      - 'tests/pytorch/attention/**'
      - 'qa/L1_pytorch_onnx_unittest/**'
      - 'tests/pytorch/test_onnx_export.py'
  pull_request:
    branches:
      - main
    paths:
      - '.github/workflows/qa-l1-te-cpp-pytorch-tests.yml'
      - 'qa/L1_cpp_distributed/**'
      - 'tests/cpp_distributed/**'
      - 'qa/L1_pytorch_thunder_integration/**'
      - 'qa/L1_pytorch_distributed_unittest/**'
      - 'tests/pytorch/distributed/**'
      - 'tests/pytorch/attention/**'
      - 'qa/L1_pytorch_onnx_unittest/**'
      - 'tests/pytorch/test_onnx_export.py'

# Cancel in-flight runs superseded by a newer push to the same PR/ref by the
# same actor, so only the latest commit's run survives.
concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-${{ github.actor }}
  cancel-in-progress: true
jobs:
  run-qa-l1-comprehensive-tests:
    # Self-hosted 8-GPU Linux x64 runner; all steps execute inside the
    # container below.
    runs-on: [ self-hosted, Linux, X64, nvidia, gpu-8 ]
    defaults:
      run:
        shell: bash
    container:
      image: harbor.baai.ac.cn/flagscale/cuda12.8.1-torch2.7.1-python3.10-te2.9:20260209
      ports:
        # Quoted: unquoted colon-separated digits can hit YAML 1.1's
        # base-60 integer parsing; a string is always safe for port maps.
        - "80:80"
      options: >-
        --gpus all
        --shm-size=500g
        --privileged
        --ipc=host
        --ulimit memlock=-1
        --ulimit stack=67108864
        --ulimit nofile=65535:65535
        --user root
        --pull always
    steps:
      - name: Checkout Code
        # NOTE(review): the action reference was mangled to an obfuscated
        # e-mail address ("actions/[email protected]") in the pasted source;
        # restored to the standard checkout action — confirm the intended
        # pinned version against the repository history.
        uses: actions/checkout@v4
        with:
          # For pull_request events, check out the PR head branch from the
          # (possibly forked) head repository; both expressions are empty on
          # push events, falling back to the action's defaults.
          repository: ${{ github.event.pull_request.head.repo.full_name }}
          ref: ${{ github.event.pull_request.head.ref }}
          ssh-strict: true
          ssh-user: git
          persist-credentials: true
          clean: true
          sparse-checkout-cone-mode: true
          fetch-tags: false
          show-progress: true
          lfs: false
          submodules: recursive
          set-safe-directory: true

      - name: Install Dependencies & Build Transformer Engine
        # timeout-minutes: 40
        env:
          NVTE_FRAMEWORK: pytorch
          # Quoted: env values are strings; an unquoted 1 is a YAML integer.
          TE_WITH_NCCL: "1"
        run: |
          # Activate conda environment
          echo "=== Activating Conda Environment ==="
          source /opt/miniconda3/etc/profile.d/conda.sh
          conda activate flagscale-train
          # Install MPI (OpenMPI and MPICH)
          apt update
          apt install -y libopenmpi-dev openmpi-bin openmpi-common
          apt install -y libmpich-dev mpich
          # Print the MPI include directories reported by the compiler wrapper
          mpicxx -show | awk '{for(i=1;i<=NF;i++) if($i ~ /-I/) print substr($i,3)}'
          # Verify whether the MPI C++ environment is ready
          # 1. Verify whether the MPI C++ compiler (mpicxx) exists
          mpicxx --version
          # 2. Verify if the MPI library file exists
          ls /usr/lib/x86_64-linux-gnu/libmpi_cxx.so
          # Install Python build/test dependencies
          pip install optree looseversion opt_einsum lightning_utilities
          # Clone lightning-thunder (used by the thunder integration suite)
          git clone --recurse-submodules https://github.com/Lightning-AI/lightning-thunder.git
          echo "Install transformer_engine"
          pip install --no-build-isolation -vvv . --no-deps
          # Verify installation
          python3 tests/pytorch/test_sanity_import.py

      - name: Verify GPU Availability & Health
        run: |
          # Execute GPU check
          echo "=== Checking GPU Status ==="
          source .github/workflows/scripts/gpu_check.sh
          wait_for_gpu

      # The steps below are intentionally disabled; kept for future re-enable.
      # - name: Run L1 PyTorch Thunder Integration Tests
      #   env:
      #     XML_LOG_DIR: "/logs/pytorch/thunder"
      #     THUNDER_PATH: "lightning-thunder"
      #     TE_PATH: .
      #   run: |
      #     # Activate conda environment
      #     source /opt/miniconda3/etc/profile.d/conda.sh
      #     conda activate flagscale-train
      #     # Run thunder integration tests
      #     echo "=== Running L1 PyTorch Thunder Integration Tests ==="
      #     bash ./qa/L1_pytorch_thunder_integration/test.sh
      #   # timeout-minutes: 5
      # - name: Run L1 PyTorch Distributed Unit Tests
      #   continue-on-error: true
      #   env:
      #     XML_LOG_DIR: "/logs/pytorch/distributed"
      #     TE_PATH: .
      #   run: |
      #     # Activate conda environment
      #     source /opt/miniconda3/etc/profile.d/conda.sh
      #     conda activate flagscale-train
      #     # Run distributed unit tests
      #     echo "=== Running L1 PyTorch Distributed Unit Tests ==="
      #     bash ./qa/L1_pytorch_distributed_unittest/test.sh
      #   # timeout-minutes: 5
      # - name: Run L1 PyTorch ONNX Unit Tests
      #   env:
      #     XML_LOG_DIR: "/logs/pytorch/onnx"
      #     TE_PATH: .
      #   run: |
      #     # Activate conda environment
      #     source /opt/miniconda3/etc/profile.d/conda.sh
      #     conda activate flagscale-train
      #     # Run ONNX unit tests
      #     echo "=== Running L1 PyTorch ONNX Unit Tests ==="
      #     bash ./qa/L1_pytorch_onnx_unittest/test.sh
      #   # timeout-minutes: 30