# Revision: "Update qa_l1_test.yml #9".
# NOTE: the hosting page flagged this file as containing hidden or bidirectional
# Unicode text that may be interpreted or compiled differently than it appears.
# Open the file in an editor that reveals hidden Unicode characters and review
# (or remove) any that are found before relying on this workflow.
# QA L0 test workflow: builds transformer_engine inside a GPU container on a
# self-hosted runner and runs the L0 PyTorch unit tests.
name: QA L0 Tests

on:
  push:
    branches:
      - test-ye
  pull_request:
    branches:
      - main

# One active run per workflow + pull request (or ref, for pushes) + actor.
# A newer run in the same group cancels the one still in progress.
concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-${{ github.actor }}
  cancel-in-progress: true

jobs:
  qa-l0-test:
    runs-on: [self-hosted, TE_FL]
    defaults:
      run:
        shell: bash
    container:
      image: localhost:5000/flagscale-cuda12.8.1-torch2.7.1-python3.10-te2.9:20260209
      ports:
        # Quoted so the mapping is always read as a string, never as a number.
        - "80:80"
      # Folded scalar: the lines below are joined into one option string that
      # is passed to the container runtime when the job container is created.
      options: >-
        --gpus all
        --shm-size=500g
        --privileged
        --ipc=host
        --ulimit memlock=-1
        --ulimit stack=67108864
        --ulimit nofile=65535:65535
        --user root
        --pull always
    steps:
      - name: Checkout Code
        # TODO(review): the original version pin was destroyed by e-mail
        # obfuscation in the scraped source ("actions/[email protected]").
        # v4 supports every input used below; restore the exact pin the
        # repository intended once it is known.
        uses: actions/checkout@v4
        with:
          # On pull_request events this checks out the PR's head repository and
          # branch. On push events both expressions are empty and the action
          # falls back to its defaults (this repository at the pushed commit).
          repository: ${{ github.event.pull_request.head.repo.full_name }}
          ref: ${{ github.event.pull_request.head.ref }}
          ssh-strict: true
          ssh-user: git
          # Do not leave the job token in .git/config: later steps build and
          # run pull-request code in a privileged root container, and none of
          # them needs git credentials (submodules are fetched by this step).
          persist-credentials: false
          clean: true
          sparse-checkout-cone-mode: true
          fetch-tags: false
          show-progress: true
          lfs: false
          submodules: recursive
          set-safe-directory: true

      - name: Install dependencies and build transformer_engine
        timeout-minutes: 60
        env:
          NVTE_FRAMEWORK: pytorch
          # Environment values are strings; quoted to make that explicit.
          TE_WITH_NCCL: "1"
        run: |
          echo "Install transformer_engine"
          source /opt/miniconda3/etc/profile.d/conda.sh
          conda activate flagscale-train
          pip install transformers expecttest
          pip install --no-build-isolation -vvv . --no-deps
          # Verify installation
          python3 tests/pytorch/test_sanity_import.py

      - name: GPU Usage Check / Verification
        # Sources the repository's helper script and calls wait_for_gpu from it.
        # NOTE(review): presumably blocks until the GPUs are free; confirm in
        # .github/workflows/scripts/gpu_check.sh.
        run: |
          source .github/workflows/scripts/gpu_check.sh
          wait_for_gpu

      # ----------------------------------------------------------------------
      # Disabled steps, kept commented out for reference.
      # ----------------------------------------------------------------------
      # - name: L0 CPP Unittest
      #   timeout-minutes: 80
      #   env:
      #     TE_PATH: .
      #   run: |
      #     source /opt/miniconda3/etc/profile.d/conda.sh
      #     conda activate flagscale-train
      #     TE_LIB_PATH=$(pip3 show transformer-engine | grep -E "Location:|Editable project location:" | tail -n 1 | awk '{print $NF}')
      #     TE_CPP_LIB_PATH="${TE_LIB_PATH}/transformer_engine"
      #     export CMAKE_PREFIX_PATH="${TE_CPP_LIB_PATH}:${CMAKE_PREFIX_PATH}"
      #     export LD_LIBRARY_PATH="${TE_CPP_LIB_PATH}:${LD_LIBRARY_PATH}"
      #     NUM_PHYSICAL_CORES=$(nproc)
      #     NUM_PARALLEL_JOBS=4
      #     cd $TE_PATH/tests/cpp
      #     cmake -GNinja -Bbuild . -DTE_LIB_PATH="${TE_CPP_LIB_PATH}"
      #     cmake --build build
      #     export OMP_NUM_THREADS=$((NUM_PHYSICAL_CORES / NUM_PARALLEL_JOBS))
      #     ctest --test-dir build -j$NUM_PARALLEL_JOBS
      # - name: PyTorch C++ Lint
      #   timeout-minutes: 5
      #   env:
      #     CPP_ONLY: 1
      #     TE_PATH: .
      #   run: |
      #     source /opt/miniconda3/etc/profile.d/conda.sh
      #     conda activate flagscale-train
      #     bash ./qa/L0_pytorch_lint/test.sh || true
      #     echo ""
      #     echo "-----------------------------------------------------"
      #     echo "Note: Pylint check ignores errors C0411 (incorrect import position) and W0611 (unused import), which can be achieved by adding the parameter --disable=C0411,W0611"
      #     echo "-----------------------------------------------------"
      #   continue-on-error: true
      # - name: PyTorch Python Lint
      #   timeout-minutes: 5
      #   env:
      #     PYTHON_ONLY: 1
      #     TE_PATH: .
      #   run: |
      #     source /opt/miniconda3/etc/profile.d/conda.sh
      #     conda activate flagscale-train
      #     bash ./qa/L0_pytorch_lint/test.sh || true
      #     echo ""
      #     echo "-----------------------------------------------------"
      #     echo "Note: Pylint check ignores errors C0411 (incorrect import position) and W0611 (unused import), which can be achieved by adding the parameter --disable=C0411,W0611"
      #     echo "-----------------------------------------------------"
      #   continue-on-error: true
      # - name: L0 Pytorch Debug Unittest
      #   # timeout-minutes: 10
      #   env:
      #     TE_PATH: .
      #   run: |
      #     source /opt/miniconda3/etc/profile.d/conda.sh
      #     conda activate flagscale-train
      #     bash ./qa/L0_pytorch_debug_unittest/test.sh

      - name: L0 Pytorch Unittest
        # NOTE(review): no step timeout is set (the earlier 20-minute value is
        # commented out), so only the job-level default limit bounds a hung
        # test run on the GPU runner. Consider restoring a realistic value.
        # timeout-minutes: 20
        env:
          TE_PATH: .
        run: |
          source /opt/miniconda3/etc/profile.d/conda.sh
          conda activate flagscale-train
          bash ./qa/L0_pytorch_unittest/test.sh