Debugging #98
Workflow file for this run
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
name: TransformerEngine CI

# Runs on pushes to, and pull requests targeting, the development branch and
# the ROCm release branches. Pushes to personal `leo/*` branches also trigger
# it, and it can be started manually from the Actions tab.
on:
  push:
    branches:
      - 'dev'
      - 'leo/*'
      - 'release_v1.*_rocm'
      - 'release_v2.*_rocm'
  pull_request:
    branches:
      - 'dev'
      # Same single-star globs as the push filter above: `*` does not match
      # `/`, so only flat branch names such as release_v1.13_rocm qualify.
      - 'release_v1.*_rocm'
      - 'release_v2.*_rocm'
  # The config below allows running this workflow manually from the Actions tab
  workflow_dispatch:
    inputs:
      # Exported to the test job as TEST_LEVEL.
      test_level:
        description: 'Test Level (1-3)'
        required: true
        default: '1'
      # Label of the self-hosted runner the tests should be scheduled on.
      node_label:
        description: 'Runner label for tests'
        required: true
        default: 'te-gfx942-4gpu'
| jobs: | |
# Picks the CI container image that matches the branch being built and
# exposes it to later jobs as the `docker-image` output.
setup:
  name: Select Docker Image
  runs-on: ubuntu-latest
  outputs:
    docker-image: ${{ steps.select-image.outputs.image-tag }}
  steps:
    - name: 📜 Checkout repository
      uses: actions/checkout@v4
    - name: 🖼️ Select Docker Image
      id: select-image
      # The script relies on bash-only syntax ([[ =~ ]], BASH_REMATCH, (( ))).
      shell: bash
      env:
        # Use the target branch for PRs, otherwise use the branch name.
        # Passed through the environment instead of being spliced into the
        # script text, so a crafted branch name cannot inject shell commands.
        BRANCH_NAME: ${{ github.base_ref || github.ref_name }}
      run: |
        echo "Determining image for branch: $BRANCH_NAME"

        DEV_DOCKER_IMAGE="compute-artifactory.amd.com:5000/rocm-plus-docker/framework/private/te-ci:rocm-6.4_0_ubuntu22_py310_torch25_jax0435qa_fa273"
        REL613_DOCKER_IMAGE="compute-artifactory.amd.com:5000/rocm-plus-docker/framework/private/te-ci:rocm-6.4_0_ubuntu22_py310_torch25_jax0435qa_fa273"
        LEGACY_DOCKER_IMAGE="compute-artifactory.amd.com:5000/rocm-plus-docker/framework/private/te-ci:rocm-6.3_66_ubuntu22_py310_torch25_jax0431_fa263_hipblaslt10patch.2"

        # Default: every branch that is not a recognised 1.x release
        # (dev, leo/*, release_v2.*, ...) uses the dev image.
        IMAGE_TO_USE="$DEV_DOCKER_IMAGE"

        if [[ $BRANCH_NAME =~ ^release_v([0-9]+)\.([0-9]+)_rocm$ ]]; then
          # Force base 10 so a zero-padded component (e.g. "09") is not
          # rejected as an invalid octal literal by bash arithmetic.
          MAJOR_VERSION=$((10#${BASH_REMATCH[1]}))
          MINOR_VERSION=$((10#${BASH_REMATCH[2]}))
          if (( MAJOR_VERSION == 1 )); then
            if (( MINOR_VERSION >= 9 && MINOR_VERSION <= 12 )); then
              IMAGE_TO_USE="$LEGACY_DOCKER_IMAGE"
            elif (( MINOR_VERSION == 13 || MINOR_VERSION == 14 )); then
              IMAGE_TO_USE="$REL613_DOCKER_IMAGE"
            fi
          fi
        fi

        echo "Selected image: $IMAGE_TO_USE"
        echo "image-tag=$IMAGE_TO_USE" >> "$GITHUB_OUTPUT"
# Builds the TransformerEngine wheel once and publishes it as the
# `transformer-engine-wheel` artifact for the test job to install.
build:
  name: Build TransformerEngine
  # Required so that `needs.setup.outputs.docker-image` below resolves;
  # without it the `needs` context is empty and no image is selected.
  needs: setup
  # Compile-only job: no GPU devices are passed to the container (see
  # `options`), although it is scheduled on the te-gfx942-4gpu runner label.
  runs-on: [te-gfx942-4gpu]
  container:
    image: ${{ needs.setup.outputs.docker-image }}
    # Registry credentials must be supplied here: the image is pulled before
    # any step runs, so a login step inside the job would come too late.
    credentials:
      username: ${{ secrets.ARTIFACTORY_USER }}
      password: ${{ secrets.ARTIFACTORY_PAT }}
    options: --shm-size=16g --user root
  steps:
    - name: 📜 Checkout repository
      uses: actions/checkout@v4
      with:
        submodules: 'recursive'
        fetch-depth: 0  # Needed to get all branches for version detection
    - name: 📦 Build wheel inside container
      # The container already runs as root (--user root), so no sudo wrapper.
      shell: bash
      run: |
        set -ex

        # Ensure core build tools are present
        apt-get update
        apt-get install -y build-essential

        # Upgrade pip and install the ninja build backend
        pip install --upgrade pip ninja

        # Install custom CMake version
        CMAKE_LINK="https://github.com/Kitware/CMake/releases/download/v3.30.0/cmake-3.30.0-linux-x86_64.sh"
        wget -q -P /opt "${CMAKE_LINK}"
        chmod +x /opt/cmake-*.sh
        mkdir -p /opt/cmake
        bash /opt/cmake-*.sh --skip-license --prefix=/opt/cmake
        ln -sf /opt/cmake/bin/* /usr/local/bin/
        cmake --version

        # Build a wheel file into the workspace root (instead of installing
        # the package) so the upload step below has a ./*.whl to publish.
        ROCM_PATH="/opt/rocm" \
        PATH="/opt/rocm/bin:${PATH}" \
        LD_LIBRARY_PATH="/opt/rocm/lib" \
        CXX="/opt/rocm/bin/hipcc" \
        CC="/opt/rocm/bin/hipcc" \
        PYTORCH_ROCM_ARCH="gfx942" \
        NVTE_ROCM_ARCH="gfx942" \
        AMDGPU_TARGETS="gfx942" \
        pip wheel -v --no-deps --wheel-dir . .
    - name: ⬆️ Upload wheel artifact
      uses: actions/upload-artifact@v4
      with:
        name: transformer-engine-wheel
        path: ./*.whl
        if-no-files-found: error
# Installs the wheel produced by the build job and runs one test suite per
# matrix entry inside the selected container.
test:
  name: Test - ${{ matrix.framework }} (${{ matrix.gpu_mode }})
  needs: [setup, build]
  # Honour the `node_label` input of manual runs; every other trigger uses
  # the default runner label.
  runs-on: ${{ github.event.inputs.node_label || 'te-gfx942-4gpu' }}
  # NOTE: job-level `continue-on-error` is deliberately not set. It would
  # report a failed suite as successful to dependent jobs, so the
  # report-status job could never fail. `fail-fast: false` already lets the
  # remaining matrix entries finish after one of them fails.
  strategy:
    fail-fast: false
    matrix:
      # Define main test combinations
      framework: ['PyTorch', 'JAX']
      gpu_mode: ['Single-GPU', 'Multi-GPU']
      # Add special, non-standard combinations
      include:
        - framework: 'Core'
          gpu_mode: 'Single-GPU'
        - framework: 'Examples'
          gpu_mode: 'Single-GPU'  # Runs on a single GPU
  # Dynamically generate job name, log name, and the test script from the matrix
  env:
    TEST_NAME: ${{ matrix.framework }} (${{ matrix.gpu_mode }})
    LOG_NAME: ${{ matrix.framework }}_${{ matrix.gpu_mode == 'Single-GPU' && 'sgpu' || 'mgpu' }}
    # Manual runs use the requested level; otherwise level 3 on `dev` and
    # level 1 everywhere else.
    TEST_LEVEL: ${{ github.event.inputs.test_level || (github.ref_name == 'dev' && '3' || '1') }}
    TEST_SGPU: ${{ contains(matrix.gpu_mode, 'Single-GPU') }}
    TEST_MGPU: ${{ contains(matrix.gpu_mode, 'Multi-GPU') }}
  container:
    image: ${{ needs.setup.outputs.docker-image }}
    options: --device=/dev/kfd --device=/dev/dri --group-add=video --shm-size=16g --pid=host --user root
  steps:
    - name: 📜 Checkout repository
      uses: actions/checkout@v4
      with:
        submodules: 'recursive'
    - name: ⬇️ Download wheel artifact
      uses: actions/download-artifact@v4
      with:
        name: transformer-engine-wheel
    - name: 🐍 Install dependencies and wheel
      run: |
        pip install --upgrade pip
        pip install ./*.whl
    - name: 🧪 Run tests
      run: |
        set -ex

        # Enhanced debug output
        ls -d /opt/rocm*
        python --version
        pip list | grep -E "transformer_e|torch|jax|numpy|ml_dtypes|typing_ext"

        # Dynamically select and run the correct test script.
        # Each single-GPU suite is pinned to a different device index via
        # HIP_VISIBLE_DEVICES.
        if [ "${{ matrix.framework }}" = "PyTorch" ]; then
          if [ "${{ matrix.gpu_mode }}" = "Single-GPU" ]; then
            HIP_VISIBLE_DEVICES=1 ci/pytorch.sh
          else
            ci/pytorch.sh
          fi
        elif [ "${{ matrix.framework }}" = "JAX" ]; then
          if [ "${{ matrix.gpu_mode }}" = "Single-GPU" ]; then
            HIP_VISIBLE_DEVICES=2 ci/jax.sh
          else
            ci/jax.sh
          fi
        elif [ "${{ matrix.framework }}" = "Core" ]; then
          HIP_VISIBLE_DEVICES=3 ci/core.sh
        elif [ "${{ matrix.framework }}" = "Examples" ]; then
          # Fully expanded examples test suite
          pip3 install -r examples/jax/mnist/requirements.txt
          pip3 install -r examples/jax/encoder/requirements.txt

          # PyTorch MNIST
          python examples/pytorch/mnist/main.py
          python examples/pytorch/mnist/main.py --use-te
          python examples/pytorch/mnist/main.py --use-fp8

          # JAX MNIST
          python examples/jax/mnist/test_single_gpu_mnist.py
          python examples/jax/mnist/test_single_gpu_mnist.py --use-te
          python examples/jax/mnist/test_single_gpu_mnist.py --use-fp8

          # JAX Encoder
          python examples/jax/encoder/test_single_gpu_encoder.py
          python examples/jax/encoder/test_single_gpu_encoder.py --use-fp8
        fi
    - name: Normalize LOG_NAME
      # Must also run after a failed test step, because that is exactly when
      # the upload step below consumes the lower-cased name.
      if: always()
      run: |
        echo "LOG_NAME=$(echo "$LOG_NAME" | tr '[:upper:]' '[:lower:]')" >> "$GITHUB_ENV"
    - name: ⬆️ Upload logs on failure
      if: failure()
      uses: actions/upload-artifact@v4
      with:
        name: ${{ env.LOG_NAME }}-logs
        path: |
          *.log
# Final gate: collapses the outcome of the whole test matrix into a single
# pass/fail job.
report-status:
  name: Report Final Status
  runs-on: ubuntu-latest
  needs: [test]
  # Run even when the test job failed, was cancelled, or was skipped.
  if: always()
  steps:
    # Any aggregated result other than 'success' fails this job.
    - name: Check test results
      if: needs.test.result != 'success'
      run: |
        echo "One or more test suites failed."
        exit 1
    - name: Report success
      if: needs.test.result == 'success'
      run: |
        echo "All test suites passed!"