# --- GitHub web-UI residue from the original capture, preserved as comments ---
# Skip to content
#
# Merge branch 'flagos-ai:main' into test-ye #6
#
# Merge branch 'flagos-ai:main' into test-ye
#
# Merge branch 'flagos-ai:main' into test-ye #6
#
# Workflow file for this run
name: QA L1 Tests

# Run on pushes to the working branch and on PRs targeting main.
on:
  push:
    branches:
      - test-ye
  pull_request:
    branches:
      - main

# One in-flight run per workflow/PR/actor; newer runs cancel older ones.
concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-${{ github.actor }}
  cancel-in-progress: true
jobs:
  qa-l1-test:
    # Self-hosted GPU runner labeled for transformer_engine / FlagScale CI.
    runs-on: [self-hosted, TE_FL]
    defaults:
      run:
        shell: bash
    container:
      image: localhost:5000/flagscale-cuda12.8.1-torch2.7.1-python3.10-te2.9:20260209
      ports:
        # Quoted: an unquoted 80:80 is a YAML 1.1 sexagesimal integer (4880),
        # not the string "host:container" that Docker expects.
        - "80:80"
      options: >-
        --gpus all
        --shm-size=500g
        --privileged
        --ipc=host
        --ulimit memlock=-1
        --ulimit stack=67108864
        --ulimit nofile=65535:65535
        --user root
        --pull always
steps:
- name: Checkout Code
uses: actions/[email protected]
with:
repository: ${{ github.event.pull_request.head.repo.full_name }}
ref: ${{ github.event.pull_request.head.ref }}
ssh-strict: true
ssh-user: git
persist-credentials: true
clean: true
sparse-checkout-cone-mode: true
fetch-tags: false
show-progress: true
lfs: false
submodules: recursive
set-safe-directory: true
- name: Install dependencies and build transformer_engine
# timeout-minutes: 30
env:
NVTE_FRAMEWORK: pytorch
TE_WITH_NCCL: 1
run: |
source /opt/miniconda3/etc/profile.d/conda.sh
conda activate flagscale-train
# Install MPI
apt update
apt install -y libopenmpi-dev openmpi-bin openmpi-common
apt install -y libmpich-dev mpich
# Verify the MPI header file
mpicxx -show | awk '{for(i=1;i<=NF;i++) if($i ~ /-I/) print substr($i,3)}'
# Verify whether the MPI C++ environment is ready
# 1. Verify whether the MPI C++ compiler (mpicxx) exists
mpicxx --version
# 2. Verify if the MPI library file exists
ls /usr/lib/x86_64-linux-gnu/libmpi_cxx.so
# Install dependencies
pip install optree looseversion opt_einsum lightning_utilities
# Clone lightning-thunder
git clone --recurse-submodules https://github.com/Lightning-AI/lightning-thunder.git
echo "Install transformer_engine"
pip install --no-build-isolation -vvv . --no-deps
# Verify installation
python3 tests/pytorch/test_sanity_import.py
- name: GPU Usage Check / Verification
run: |
source .github/workflows/scripts/gpu_check.sh
wait_for_gpu
- name: L1 CPP Distributed
id: L1_cpp_distributed
# timeout-minutes: 10
env:
TE_PATH: .
run: |
TE_LIB_PATH=$(pip3 show transformer-engine | grep -E "Location:|Editable project location:" | tail -n 1 | awk '{print $NF}')
TE_CPP_LIB_PATH="${TE_LIB_PATH}/transformer_engine"
export CMAKE_PREFIX_PATH="${TE_CPP_LIB_PATH}:${CMAKE_PREFIX_PATH}"
export LD_LIBRARY_PATH="${TE_CPP_LIB_PATH}:${LD_LIBRARY_PATH}"
source /opt/miniconda3/etc/profile.d/conda.sh
conda activate flagscale-train
bash ./qa/L1_cpp_distributed/test.sh
- name: L1 Pytorch Thunder Integration
id: L1_pytorch_thunder_integration
env:
XML_LOG_DIR: "/logs/pytorch/thunder"
THUNDER_PATH: "lightning-thunder"
run: |
source /opt/miniconda3/etc/profile.d/conda.sh
conda activate flagscale-train
bash ./qa/L1_pytorch_thunder_integration/test.sh
- name: L1 Pytorch Distributed Unittest
id: L1_pytorch_distributed_unittest
continue-on-error: true
env:
XML_LOG_DIR: "/logs/pytorch/distributed"
TE_PATH: .
run: |
ignore_files=("tests/pytorch/distributed/test_sanity.py" "tests/pytorch/distributed/test_comm_gemm_overlap.py"
"tests/pytorch/distributed/test_fusible_ops_with_userbuffers.py" "tests/pytorch/attention/test_attention_with_cp.py"
"tests/pytorch/debug/test_distributed.py" "tests/pytorch/distributed/test_numerics.py"
)
for file in "${ignore_files[@]}"; do
escaped_file=$(echo "$file" | sed 's/\//\\\//g')
sed -i "s/^.*\($escaped_file\).*$/#&/" ./qa/L1_pytorch_distributed_unittest/test.sh
done
cat ./qa/L1_pytorch_distributed_unittest/test.sh
source /opt/miniconda3/etc/profile.d/conda.sh
conda activate flagscale-train
bash ./qa/L1_pytorch_distributed_unittest/test.sh
- name: L1 Pytorch Onnx Unittest
id: L1_pytorch_onnx_unittest
env:
XML_LOG_DIR: "/logs/pytorch/onnx"
TE_PATH: .
run: |
source /opt/miniconda3/etc/profile.d/conda.sh
conda activate flagscale-train
# Test items that need to be skipped
SKIP_TESTS=(
"test_trt_integration[None]"
)
# Execution file path
TARGET_FILE="./qa/L1_pytorch_onnx_unittest/test.sh"
OLD_CMD="python3 -m pytest --tb=auto --junitxml=\$XML_LOG_DIR/test_onnx_export.xml \$TE_PATH/tests/pytorch/test_onnx_export.py"
SKIP_PARAM=""
if [ ${#SKIP_TESTS[@]} -gt 0 ]; then
SKIP_EXPR=""
for test in "${SKIP_TESTS[@]}"; do
if [ -z "$SKIP_EXPR" ]; then
SKIP_EXPR="not $test"
else
SKIP_EXPR+=" and not $test"
fi
done
SKIP_PARAM="-k \"${SKIP_EXPR}\""
fi
CMD="${OLD_CMD} ${SKIP_PARAM}"
sed -i "s|${OLD_CMD}|${CMD}|g" "${TARGET_FILE}"
echo ""
cat ./qa/L1_pytorch_onnx_unittest/test.sh
echo ""
bash ./qa/L1_pytorch_onnx_unittest/test.sh