CI Level 3 #1036
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # Copyright (c) 2024-2026, Advanced Micro Devices, Inc. All rights reserved. | |
| # | |
| # See LICENSE for license information. | |
# Workflow identity. The run title shows the effective test level:
# '3' for push events, otherwise the `test_level` input, falling back to '1'.
name: Build and Test Branch
run-name: CI Level ${{ (github.event_name == 'push' && '3') || inputs.test_level || '1' }}

# Triggers: pushes to the listed branches, calls from other workflows, and manual dispatch.
# The workflow_call and workflow_dispatch input sets are intentionally kept parallel.
on:
  push:
    branches:
      - 'dev'
      - 'release_v2.*_rocm'
  workflow_call:
    inputs:
      test_level:
        description: 'Test Level (1-3)'
        required: false
        default: '1'
        type: string
      docker_image_override:
        description: 'Manual Docker Image (Leave empty to use config file value)'
        required: false
        type: string
      test_config_from_source:
        description: 'DEBUG: Use config.json from current source branch instead of dev'
        required: false
        default: false
        type: boolean
  workflow_dispatch:
    inputs:
      test_level:
        description: 'Test Level (1-3)'
        required: true
        default: '1'
        # Declared explicitly so the input reads the same as its workflow_call twin.
        type: string
      docker_image_override:
        description: 'Manual Docker Image (Leave empty to use config file value)'
        required: false
        type: string
      test_config_from_source:
        description: 'DEBUG: Use config.json from current source branch instead of dev'
        required: false
        default: false
        type: boolean
# Runs are grouped by workflow name and git ref; starting a new run in a group
# cancels the one already in progress.
# NOTE(review): when this file is entered through workflow_call, the `github`
# context presumably reflects the calling run - confirm the group key is the intended one.
concurrency:
  group: "${{ github.workflow }}-${{ github.ref }}"
  cancel-in-progress: true

# Effective test level, visible to every job: '3' on push, otherwise the
# `test_level` input, otherwise '1'.
env:
  TEST_LEVEL: "${{ (github.event_name == 'push' && '3') || inputs.test_level || '1' }}"

jobs:
| select_image: | |
| name: Select Docker Image | |
| runs-on: ubuntu-latest | |
| timeout-minutes: 10 | |
| outputs: | |
| image-tag: ${{ steps.select-image.outputs.image-tag }} | |
| steps: | |
| - name: Checkout repository | |
| uses: actions/checkout@v6 | |
| with: | |
| ref: ${{ inputs.test_config_from_source && github.ref_name || github.event.repository.default_branch || 'dev' }} | |
| sparse-checkout: ci/ci_config.json | |
| sparse-checkout-cone-mode: false | |
| - name: Select Docker Image Tag | |
| id: select-image | |
| run: | | |
| if [[ "${{ inputs.test_config_from_source }}" == "true" ]]; then | |
| echo "::notice::Debugging mode: Using ci/ci_config.json from ${{ github.ref_name }}" | |
| else | |
| echo "::notice::Using ci/ci_config.json from ${{ github.event.repository.default_branch || 'dev' }}" | |
| fi | |
| if [[ ! -f "ci/ci_config.json" ]]; then | |
| echo "::error::Config file not found in checkout." | |
| exit 1 | |
| fi | |
| BRANCH_NAME="${{ github.base_ref || github.ref_name }}" | |
| echo "Determining image for branch: $BRANCH_NAME" | |
| VERSION_KEY="$BRANCH_NAME" | |
| if jq -e --arg key "$VERSION_KEY" '.docker_images[$key]' ci/ci_config.json > /dev/null; then | |
| JSON_KEY="$VERSION_KEY" | |
| else | |
| JSON_KEY="default" | |
| fi | |
| echo "Selected config key: $JSON_KEY" | |
| IMAGE_TO_USE=$(jq -r --arg key "$JSON_KEY" '.docker_images[$key]' ci/ci_config.json) | |
| MANUAL_OVERRIDE="${{ inputs.docker_image_override }}" | |
| if [[ -n "$MANUAL_OVERRIDE" ]]; then | |
| echo "::notice::Manual override detected: $MANUAL_OVERRIDE" | |
| IMAGE_TO_USE="$MANUAL_OVERRIDE" | |
| fi | |
| echo "Selected image: $IMAGE_TO_USE" | |
| echo "image-tag=$IMAGE_TO_USE" >> $GITHUB_OUTPUT | |
| build: | |
| # Delegate wheel building to the reusable workflow on dev. It produces a core .whl plus framework .tar.gz sdists under artifact name `te-rocm-wheels`. | |
| uses: ./.github/workflows/rocm-wheels-build.yml | |
| secrets: inherit | |
| sgpu_tests: | |
| name: sGPU Tests (${{ matrix.arch_label }}) | |
| needs: [select_image, build] | |
| timeout-minutes: 360 | |
| runs-on: ${{ matrix.arch_label == 'mi30x' && 'linux-te-mi30x-4' || 'linux-te-mi35x-4' }} | |
| strategy: | |
| fail-fast: false | |
| matrix: | |
| arch_label: [mi30x, mi35x] | |
| steps: | |
| - name: Checkout repository | |
| uses: actions/checkout@v6 | |
| - name: Initialize required submodules | |
| run: | | |
| git submodule update --init --recursive --depth 1 \ | |
| 3rdparty/googletest \ | |
| 3rdparty/hipify_torch | |
| - name: Download build artifacts | |
| uses: actions/download-artifact@v4 | |
| with: | |
| name: te-rocm-wheels | |
| path: dist/ | |
| - name: Host Diagnostics | |
| run: | | |
| echo "::group::Host Diagnostics" | |
| echo ">>> GPU info:" | |
| ls -l /dev/dri | |
| ls -l /dev/kfd | |
| rocm-smi | |
| echo "::endgroup::" | |
| - name: Pull Docker Image | |
| run: | | |
| docker pull ${{ needs.select_image.outputs.image-tag }} | |
| - name: Run Container | |
| run: | | |
| docker run -dt \ | |
| --rm \ | |
| --name te-runner \ | |
| --network=host \ | |
| --device=/dev/dri --device=/dev/kfd \ | |
| --shm-size=16G \ | |
| --pid=host \ | |
| --group-add $(getent group render | cut -d: -f3) \ | |
| --group-add $(getent group video | cut -d: -f3) \ | |
| -v "${{ github.workspace }}:/workspace" \ | |
| -w /workspace \ | |
| ${{ needs.select_image.outputs.image-tag }} | |
| - name: Install packages | |
| run: | | |
| docker exec te-runner bash -c "$(cat <<'EOF' | |
| set -ex | |
| # core (cpp) tests build via cmake inside the repo; allow git ops in-tree. | |
| git config --global --add safe.directory '*' | |
| TE_CORE_PKG=$(find /workspace/dist -type f -name 'transformer_engine_rocm[0-9]*.whl' | sort | head -n 1) | |
| TE_TORCH_PKG=$(find /workspace/dist -type f -name 'transformer_engine_rocm_torch*.tar.gz' | sort | head -n 1) | |
| TE_JAX_PKG=$(find /workspace/dist -type f -name 'transformer_engine_rocm_jax*.tar.gz' | sort | head -n 1) | |
| test -n "$TE_CORE_PKG" && test -n "$TE_TORCH_PKG" && test -n "$TE_JAX_PKG" | |
| pip install --no-deps "$TE_CORE_PKG" | |
| pip install ninja pybind11[global] | |
| pip install --upgrade hypothesis setuptools | |
| pip install --no-build-isolation --no-deps "$TE_TORCH_PKG" | |
| pip install --no-build-isolation --no-deps "$TE_JAX_PKG" | |
| EOF | |
| )" | |
| - name: Run sGPU tests in parallel (pytorch, jax, examples, core) | |
| id: run-tests | |
| env: | |
| HF_TOKEN: ${{ secrets.HF_TOKEN }} | |
| run: | | |
| rm -f FAIL_* | |
| docker exec \ | |
| -e TEST_SGPU=1 \ | |
| -e TEST_LEVEL=${{ env.TEST_LEVEL }} \ | |
| -e HF_TOKEN="$HF_TOKEN" \ | |
| te-runner bash -c "$(cat <<'EOF' | |
| #!/usr/bin/bash | |
| set -x -o pipefail | |
| ulimit -c 0 # Disable core dumps | |
| HIP_VISIBLE_DEVICES=0 ci/pytorch.sh > /workspace/torch.log 2>&1 & | |
| TORCH_PID=$! | |
| HIP_VISIBLE_DEVICES=1 ci/jax.sh > /workspace/jax.log 2>&1 & | |
| JAX_PID=$! | |
| ( | |
| set -e | |
| python -c "import os; print('HF_TOKEN set:', bool(os.environ.get('HF_TOKEN')))" | |
| JAX_CONSTRAINTS=/tmp/jax-constraints.txt | |
| pip freeze | grep -iE '^(jax|jaxlib|jax[_-]rocm|jax[_-]plugins)[=@]' > "$JAX_CONSTRAINTS" || true | |
| export HIP_VISIBLE_DEVICES=2 | |
| cd /workspace/examples/pytorch/mnist | |
| python main.py | |
| python main.py --use-te | |
| python main.py --use-fp8 | |
| cd /workspace/examples/jax/mnist | |
| pip3 install -c "$JAX_CONSTRAINTS" -r requirements.txt | |
| python test_single_gpu_mnist.py | |
| python test_single_gpu_mnist.py --use-te | |
| python test_single_gpu_mnist.py --use-fp8 | |
| cd /workspace/examples/jax/encoder | |
| pip3 install -c "$JAX_CONSTRAINTS" -r requirements.txt | |
| python test_single_gpu_encoder.py | |
| python test_single_gpu_encoder.py --use-fp8 | |
| ) > /workspace/examples.log 2>&1 & | |
| EXAMPLES_PID=$! | |
| HIP_VISIBLE_DEVICES=3 ci/core.sh > /workspace/core.log 2>&1 & | |
| CORE_PID=$! | |
| wait $TORCH_PID; torch_rc=$? | |
| wait $JAX_PID; jax_rc=$? | |
| wait $EXAMPLES_PID; examples_rc=$? | |
| wait $CORE_PID; core_rc=$? | |
| if [ $torch_rc -ne 0 ]; then | |
| echo "::group::[FAILED] PyTorch Log" | |
| cat /workspace/torch.log | |
| echo "::endgroup::" | |
| echo "::error::PyTorch tests FAILED." | |
| touch /workspace/FAIL_TORCH | |
| fi | |
| if [ $jax_rc -ne 0 ]; then | |
| echo "::group::[FAILED] JAX Log" | |
| cat /workspace/jax.log | |
| echo "::endgroup::" | |
| echo "::error::JAX tests FAILED." | |
| touch /workspace/FAIL_JAX | |
| fi | |
| if [ $examples_rc -ne 0 ]; then | |
| echo "::group::[FAILED] Examples Log" | |
| cat /workspace/examples.log | |
| echo "::endgroup::" | |
| echo "::error::Examples FAILED." | |
| touch /workspace/FAIL_EXAMPLES | |
| fi | |
| if [ $core_rc -ne 0 ]; then | |
| echo "::group::[FAILED] Core Log" | |
| cat /workspace/core.log | |
| echo "::endgroup::" | |
| echo "::error::Core tests FAILED." | |
| touch /workspace/FAIL_CORE | |
| fi | |
| test $torch_rc -eq 0 -a $jax_rc -eq 0 -a $examples_rc -eq 0 -a $core_rc -eq 0 | |
| EOF | |
| )" | |
| - name: Check suite failure status | |
| if: always() | |
| run: | | |
| EXIT_STATUS=0 | |
| if [[ -f FAIL_TORCH ]]; then | |
| echo "::error::PyTorch tests failed." | |
| EXIT_STATUS=1 | |
| fi | |
| if [[ -f FAIL_JAX ]]; then | |
| echo "::error::JAX tests failed." | |
| EXIT_STATUS=1 | |
| fi | |
| if [[ -f FAIL_EXAMPLES ]]; then | |
| echo "::error::Examples failed." | |
| EXIT_STATUS=1 | |
| fi | |
| if [[ -f FAIL_CORE ]]; then | |
| echo "::error::Core tests failed." | |
| EXIT_STATUS=1 | |
| fi | |
| exit $EXIT_STATUS | |
| - name: Upload logs | |
| if: always() | |
| uses: actions/upload-artifact@v4 | |
| with: | |
| name: logs-sgpu-${{ matrix.arch_label }} | |
| path: | | |
| *.log | |
| if-no-files-found: ignore | |
| retention-days: 5 | |
| - name: Cleanup container | |
| if: always() | |
| run: docker rm -f te-runner || true | |
| mgpu_tests: | |
| name: mGPU ${{ matrix.framework == 'pytorch' && 'Torch' || 'JAX' }} (${{ matrix.arch_label }}) | |
| needs: [select_image, build] | |
| timeout-minutes: 360 | |
| runs-on: ${{ matrix.arch_label == 'mi30x' && 'linux-te-mi30x-8' || 'linux-te-mi35x-8' }} | |
| strategy: | |
| fail-fast: false | |
| matrix: | |
| arch_label: [mi30x, mi35x] | |
| framework: [pytorch, jax] | |
| steps: | |
| - name: Checkout repository | |
| uses: actions/checkout@v6 | |
| - name: Download build artifacts | |
| uses: actions/download-artifact@v4 | |
| with: | |
| name: te-rocm-wheels | |
| path: dist/ | |
| - name: Host Diagnostics | |
| run: | | |
| echo "::group::Host Diagnostics" | |
| echo ">>> GPU info:" | |
| ls -l /dev/dri | |
| ls -l /dev/kfd | |
| rocm-smi | |
| echo "::endgroup::" | |
| - name: Pull Docker Image | |
| run: | | |
| docker pull ${{ needs.select_image.outputs.image-tag }} | |
| - name: Run Container | |
| run: | | |
| docker run -dt \ | |
| --rm \ | |
| --name te-runner \ | |
| --network=host \ | |
| --device=/dev/dri --device=/dev/kfd \ | |
| --shm-size=16G \ | |
| --pid=host \ | |
| --group-add $(getent group render | cut -d: -f3) \ | |
| --group-add $(getent group video | cut -d: -f3) \ | |
| -v "${{ github.workspace }}:/workspace" \ | |
| -w /workspace \ | |
| ${{ needs.select_image.outputs.image-tag }} | |
| - name: Install packages | |
| env: | |
| FRAMEWORK: ${{ matrix.framework }} | |
| run: | | |
| docker exec -e FRAMEWORK="$FRAMEWORK" te-runner bash -c "$(cat <<'EOF' | |
| set -ex | |
| TE_CORE_PKG=$(find /workspace/dist -type f -name 'transformer_engine_rocm[0-9]*.whl' | sort | head -n 1) | |
| if [ "$FRAMEWORK" = "pytorch" ]; then | |
| TE_FW_PKG=$(find /workspace/dist -type f -name 'transformer_engine_rocm_torch*.tar.gz' | sort | head -n 1) | |
| else | |
| TE_FW_PKG=$(find /workspace/dist -type f -name 'transformer_engine_rocm_jax*.tar.gz' | sort | head -n 1) | |
| fi | |
| test -n "$TE_CORE_PKG" && test -n "$TE_FW_PKG" | |
| pip install --no-deps "$TE_CORE_PKG" | |
| pip install ninja pybind11[global] | |
| pip install --upgrade hypothesis setuptools | |
| pip install --no-build-isolation --no-deps "$TE_FW_PKG" | |
| EOF | |
| )" | |
| - name: Run mGPU tests | |
| id: mgpu-tests | |
| env: | |
| HF_TOKEN: ${{ secrets.HF_TOKEN }} | |
| run: | | |
| case "${{ matrix.framework }}" in | |
| pytorch) TEST_SCRIPT=ci/pytorch.sh; LOG_FILE=/workspace/torch_mgpu.log; SUITE_NAME=PyTorch ;; | |
| jax) TEST_SCRIPT=ci/jax.sh; LOG_FILE=/workspace/jax_mgpu.log; SUITE_NAME=JAX ;; | |
| *) echo "::error::Unknown framework: ${{ matrix.framework }}"; exit 1 ;; | |
| esac | |
| docker exec \ | |
| -e TEST_MGPU=1 \ | |
| -e TEST_LEVEL=${{ env.TEST_LEVEL }} \ | |
| -e TEST_SCRIPT=$TEST_SCRIPT \ | |
| -e LOG_FILE=$LOG_FILE \ | |
| -e SUITE_NAME=$SUITE_NAME \ | |
| -e NVTE_FRAMEWORK=${{ matrix.framework }} \ | |
| -e HF_TOKEN="$HF_TOKEN" \ | |
| te-runner bash -c "$(cat <<'EOF' | |
| #!/usr/bin/bash | |
| set -x -o pipefail | |
| ulimit -c 0 # Disable core dumps | |
| "$TEST_SCRIPT" > "$LOG_FILE" 2>&1 | |
| test_rc=$? | |
| if [ $test_rc -ne 0 ]; then | |
| echo "::group::[FAILED] ${SUITE_NAME} mGPU Log" | |
| cat "$LOG_FILE" | |
| echo "::endgroup::" | |
| echo "::error::${SUITE_NAME} mGPU tests FAILED." | |
| fi | |
| exit $test_rc | |
| EOF | |
| )" | |
| - name: Upload logs | |
| if: always() | |
| uses: actions/upload-artifact@v4 | |
| with: | |
| name: logs-mgpu-${{ matrix.arch_label }}-${{ matrix.framework }} | |
| path: | | |
| *.log | |
| if-no-files-found: ignore | |
| retention-days: 5 | |
| - name: Cleanup container | |
| if: always() | |
| run: docker rm -f te-runner || true |