# Skip to content
#
# Update qa_l1_test.yml #9
#
# Update qa_l1_test.yml
#
# Update qa_l1_test.yml #9
#
# Workflow file for this run

# QA L0 Tests
#
# Builds transformer_engine from the checked-out sources inside a GPU-enabled
# container on a self-hosted runner, then runs the L0 PyTorch unit-test script.
# Several further QA steps (C++ unit tests, lint, debug unit tests) are kept
# below in commented-out form.
name: QA L0 Tests

# Triggers: pushes to the `test-ye` branch and pull requests targeting `main`.
on:
  push:
    branches:
      - test-ye
  pull_request:
    branches:
      - main

# One concurrency group per workflow + PR number (or git ref when there is no
# PR) + actor; a newer run in the same group cancels the one still in progress.
concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-${{ github.actor }}
  cancel-in-progress: true

jobs:
  qa-l0-test:
    # NOTE(review): on `pull_request` this job checks out and executes code
    # from the PR head repository inside a `--privileged` container on a
    # self-hosted runner — confirm that runs for fork PRs are gated.
    runs-on: [self-hosted, TE_FL]
    defaults:
      run:
        shell: bash
    container:
      # Image is served from a registry on the runner host (localhost:5000).
      image: localhost:5000/flagscale-cuda12.8.1-torch2.7.1-python3.10-te2.9:20260209
      ports:
        # Quoted so the mapping is always read as a string.
        - "80:80"
      # Extra container-creation flags, folded into a single line (`>-`).
      # Do not put comments inside the scalar: they would become part of
      # the option string.
      options: >-
        --gpus all
        --shm-size=500g
        --privileged
        --ipc=host
        --ulimit memlock=-1
        --ulimit stack=67108864
        --ulimit nofile=65535:65535
        --user root
        --pull always
    steps:
      - name: Checkout Code
        # NOTE(review): the original version tag was destroyed by e-mail
        # obfuscation when this file was captured ("actions/[email protected]").
        # v4 documents every input used below; replace it with the exact
        # release (ideally a full commit SHA) the project intends to pin.
        uses: actions/checkout@v4
        with:
          # Both expressions are only populated on `pull_request` events; on
          # `push` they evaluate to empty strings.
          # NOTE(review): presumably checkout then falls back to its defaults
          # (the triggering repository and ref) — confirm.
          repository: ${{ github.event.pull_request.head.repo.full_name }}
          ref: ${{ github.event.pull_request.head.ref }}
          ssh-strict: true
          ssh-user: git
          persist-credentials: true
          clean: true
          sparse-checkout-cone-mode: true
          fetch-tags: false
          show-progress: true
          lfs: false
          submodules: recursive
          set-safe-directory: true

      - name: Install dependencies and build transformer_engine
        timeout-minutes: 60
        env:
          NVTE_FRAMEWORK: pytorch
          TE_WITH_NCCL: "1"
        run: |
          echo "Install transformer_engine"
          source /opt/miniconda3/etc/profile.d/conda.sh
          conda activate flagscale-train
          pip install transformers expecttest
          # Install the checked-out project itself, without build isolation
          # and without resolving its dependencies.
          pip install --no-build-isolation -vvv . --no-deps
          # Verify installation
          python3 tests/pytorch/test_sanity_import.py

      - name: GPU Usage Check / Verification
        # Sources the repository's helper script and calls the
        # `wait_for_gpu` function it is expected to define.
        run: |
          source .github/workflows/scripts/gpu_check.sh
          wait_for_gpu

      # ---- Disabled steps (kept for reference; uncomment to re-enable) ----
      # - name: L0 CPP Unittest
      #   timeout-minutes: 80
      #   env:
      #     TE_PATH: .
      #   run: |
      #     source /opt/miniconda3/etc/profile.d/conda.sh
      #     conda activate flagscale-train
      #     TE_LIB_PATH=$(pip3 show transformer-engine | grep -E "Location:|Editable project location:" | tail -n 1 | awk '{print $NF}')
      #     TE_CPP_LIB_PATH="${TE_LIB_PATH}/transformer_engine"
      #     export CMAKE_PREFIX_PATH="${TE_CPP_LIB_PATH}:${CMAKE_PREFIX_PATH}"
      #     export LD_LIBRARY_PATH="${TE_CPP_LIB_PATH}:${LD_LIBRARY_PATH}"
      #     NUM_PHYSICAL_CORES=$(nproc)
      #     NUM_PARALLEL_JOBS=4
      #     cd $TE_PATH/tests/cpp
      #     cmake -GNinja -Bbuild . -DTE_LIB_PATH="${TE_CPP_LIB_PATH}"
      #     cmake --build build
      #     export OMP_NUM_THREADS=$((NUM_PHYSICAL_CORES / NUM_PARALLEL_JOBS))
      #     ctest --test-dir build -j$NUM_PARALLEL_JOBS
      # - name: PyTorch C++ Lint
      #   timeout-minutes: 5
      #   env:
      #     CPP_ONLY: 1
      #     TE_PATH: .
      #   run: |
      #     source /opt/miniconda3/etc/profile.d/conda.sh
      #     conda activate flagscale-train
      #     bash ./qa/L0_pytorch_lint/test.sh || true
      #     echo ""
      #     echo "-----------------------------------------------------"
      #     echo "Note: Pylint check ignores errors C0411 (incorrect import position) and W0611 (unused import), which can be achieved by adding the parameter --disable=C0411,W0611"
      #     echo "-----------------------------------------------------"
      #   continue-on-error: true
      # - name: PyTorch Python Lint
      #   timeout-minutes: 5
      #   env:
      #     PYTHON_ONLY: 1
      #     TE_PATH: .
      #   run: |
      #     source /opt/miniconda3/etc/profile.d/conda.sh
      #     conda activate flagscale-train
      #     bash ./qa/L0_pytorch_lint/test.sh || true
      #     echo ""
      #     echo "-----------------------------------------------------"
      #     echo "Note: Pylint check ignores errors C0411 (incorrect import position) and W0611 (unused import), which can be achieved by adding the parameter --disable=C0411,W0611"
      #     echo "-----------------------------------------------------"
      #   continue-on-error: true
      # - name: L0 Pytorch Debug Unittest
      #   # timeout-minutes: 10
      #   env:
      #     TE_PATH: .
      #   run: |
      #     source /opt/miniconda3/etc/profile.d/conda.sh
      #     conda activate flagscale-train
      #     bash ./qa/L0_pytorch_debug_unittest/test.sh

      - name: L0 Pytorch Unittest
        # No step-level timeout is set (the line below is disabled), so only
        # the job-level limit applies.
        # timeout-minutes: 20
        env:
          TE_PATH: .
        run: |
          source /opt/miniconda3/etc/profile.d/conda.sh
          conda activate flagscale-train
          bash ./qa/L0_pytorch_unittest/test.sh