ROCm · leo-automation · Sep 17, 2025 · Sep 26, 2025 · Sep 26, 2025 · Sep 26, 2025
@@ -0,0 +1,293 @@
+# Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+#
+# See LICENSE for license information.
+
+name: TransformerEngine CI
+
+# Runs on pushes to, and pull requests against, dev and the ROCm release
+# branches, and on manual dispatch.
+on:
+  push:
+    branches:
+      - 'dev'
+      - 'release_v1.*_rocm'
+      - 'release_v2.*_rocm'
+  pull_request:
+    # Same patterns as the push filter so both triggers cover the same branches.
+    branches:
+      - 'dev'
+      - 'release_v1.*_rocm'
+      - 'release_v2.*_rocm'
+  workflow_dispatch:
+    inputs:
+      # NOTE(review): no step in this workflow references inputs.test_level;
+      # confirm whether it should be forwarded to the ci/*.sh scripts.
+      test_level:
+        description: 'Test Level (1-3)'
+        type: string
+        required: true
+        default: '1'
+      skip_dev_merge:
+        description: 'Skip merging dev branch'
+        type: boolean
+        default: false
+
+# One active run per workflow and ref; a newer run cancels the one in progress.
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  build_and_test:
+    name: Build and Test on GPU
+    # Upper bound for the whole job: 720 minutes (12 hours).
+    timeout-minutes: 720
+    runs-on: linux-mi325-4
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+        with:
+          submodules: 'recursive'
+          # Fetch full history. The default depth-1 clone has no merge base with
+          # origin/dev, so a later `git merge origin/dev` would be rejected with
+          # "refusing to merge unrelated histories".
+          fetch-depth: 0
+
+      - name: Merge origin/dev
+        # Runs for PRs targeting dev, and for manual runs on dev unless
+        # skip_dev_merge was ticked. `inputs.skip_dev_merge` is a real boolean in
+        # the `inputs` context, so it is negated directly: comparing it with the
+        # string 'true' never matches and the flag would be ignored.
+        if: |
+          (github.event_name == 'pull_request' && github.base_ref == 'dev') ||
+          (github.event_name == 'workflow_dispatch' && !inputs.skip_dev_merge && github.ref == 'refs/heads/dev')
+        run: |
+          echo "Attempting to merge origin/dev..."
+          # git needs an identity to create the merge commit
+          git config --global user.email "amd@amd.com"
+          git config --global user.name "AMD CI"
+
+          # Fetch dev specifically
+          git fetch origin dev
+
+          # A conflicting merge exits non-zero, which fails this step and the job
+          git merge origin/dev
+          echo "Merge successful."
+
+      - name: Select Docker Image Tag
+        id: select-image
+        env:
+          DEV_IMAGE: ${{ vars.DEV_DOCKER_IMAGE }}
+          REL_IMAGE: ${{ vars.REL613_DOCKER_IMAGE }}
+          # Handed over through the environment instead of being pasted into the
+          # script text, so the branch name is never parsed as shell code.
+          # For pull requests this is the target branch, otherwise the pushed ref.
+          BRANCH_NAME: ${{ github.base_ref || github.ref_name }}
+        run: |
+          echo "Determining image for branch: $BRANCH_NAME"
+
+          # The dev image is the default; only release_v1.13_rocm and
+          # release_v1.14_rocm switch to the release image.
+          IMAGE_TO_USE="$DEV_IMAGE"
+          if [[ $BRANCH_NAME =~ ^release_v([0-9]+)\.([0-9]+)_rocm$ ]]; then
+            MAJOR_VERSION=${BASH_REMATCH[1]}
+            MINOR_VERSION=${BASH_REMATCH[2]}
+            if (( MAJOR_VERSION == 1 )) && (( MINOR_VERSION == 13 || MINOR_VERSION == 14 )); then
+              IMAGE_TO_USE="$REL_IMAGE"
+            fi
+          fi
+
+          # Fail here with a clear message rather than later in `docker pull`
+          if [[ -z "$IMAGE_TO_USE" ]]; then
+            echo "::error::No Docker image is configured for branch '$BRANCH_NAME' (check the DEV_DOCKER_IMAGE / REL613_DOCKER_IMAGE variables)."
+            exit 1
+          fi
+
+          echo "Selected image: $IMAGE_TO_USE"
+          echo "image-tag=$IMAGE_TO_USE" >> "$GITHUB_OUTPUT"
+
+      - name: Pull Docker Image
+        # Download the image chosen by the select-image step.
+        run: docker pull ${{ steps.select-image.outputs.image-tag }}
+
+      - name: Run Container
+        run: |
+          # A container left behind by an interrupted job on this runner would make
+          # `docker run --name te-runner` fail, so drop any leftover first.
+          docker rm -f te-runner >/dev/null 2>&1 || true
+
+          # Start a detached container that later steps reach via `docker exec`.
+          # The checkout is bind-mounted at /workspace, and the host's render and
+          # video group ids are added alongside the /dev/dri and /dev/kfd devices.
+          docker run -dt \
+            --name te-runner \
+            --network=host \
+            --device=/dev/dri --device=/dev/kfd \
+            --shm-size=16G \
+            --pid=host \
+            --group-add "$(getent group render | cut -d: -f3)" \
+            --group-add "$(getent group video | cut -d: -f3)" \
+            -v "${{ github.workspace }}:/workspace" \
+            -w /workspace \
+            "${{ steps.select-image.outputs.image-tag }}"
+
+      - name: Determine GPU Architecture via rocminfo
+        id: gpu-arch
+        run: |
+          # Take the first gfx* token that rocminfo prints inside the container.
+          # `|| true` is required: run steps execute under `bash -e`, so a grep
+          # that matches nothing would abort the script on this line and the
+          # diagnostics below would never be printed.
+          ARCH=$(docker exec te-runner bash -c "rocminfo | grep -m 1 -oP 'gfx[0-9a-fA-F]+'" || true)
+          if [ -z "$ARCH" ]; then
+            echo "::error::Could not determine GPU architecture using rocminfo inside the container."
+            # Print full rocminfo output for debugging; its own failure must not hide the error above
+            docker exec te-runner rocminfo || true
+            exit 1
+          fi
+          echo "Detected GPU Arch: $ARCH"
+          echo "arch=$ARCH" >> "$GITHUB_OUTPUT"
+
+      - name: Build Project
+        # Installs the checked-out project inside the container, passing the
+        # architecture found by the gpu-arch step as GPU_ARCH.
+        run: |
+          docker exec -e GPU_ARCH=${{ steps.gpu-arch.outputs.arch }} te-runner bash -c "$(cat <<'EOF'
+          set -ex
+
+          # Hand the detected architecture to the build
+          export PYTORCH_ROCM_ARCH=$GPU_ARCH
+          export NVTE_ROCM_ARCH=$GPU_ARCH
+
+          # NOTE(review): HIP_PATH is exported empty on purpose; the reason is not visible here
+          export HIP_PATH=""
+          export NVTE_AITER_PREBUILT_BASE_URL=https://compute-artifactory.amd.com:5000/artifactory/rocm-generic-local/te-ci/aiter-prebuilts
+
+          pip install ninja
+          pip install -v . 2>&1
+          EOF
+          )"
+
+      - name: Run sGPU tests
+        id: sgpu-tests
+        # A failure is recorded in the step outcome and turned into a job failure
+        # by the "Check Test Failure Status" step, so later test steps still run.
+        continue-on-error: true
+        run: |
+          docker exec te-runner bash -c "$(cat <<'EOF'
+          #!/usr/bin/bash
+          set -x -o pipefail
+          ulimit -c 0 # Disable core dumps
+
+          # debug output
+          ls -d /opt/rocm*
+          python --version
+          # grep -E instead of egrep, which newer GNU grep reports as obsolescent
+          pip list | grep -E "transformer_e|torch|jax|numpy|ml_dtypes|typing_ext"
+
+          # Start the three suites in parallel, each with a different
+          # HIP_VISIBLE_DEVICES value and its own log file under /workspace.
+          HIP_VISIBLE_DEVICES=1 ci/pytorch.sh > /workspace/torch_sgpu.log 2>&1 &
+          torch_pid=$!; echo Pytorch test pid $!
+
+          HIP_VISIBLE_DEVICES=2 ci/jax.sh > /workspace/jax_sgpu.log 2>&1 &
+          jax_pid=$!; echo JAX test pid $!
+
+          HIP_VISIBLE_DEVICES=3 ci/core.sh > /workspace/core_sgpu.log 2>&1 &
+          core_pid=$!; echo Core test pid $!
+
+          # Collect every exit code before reporting, so one failure does not hide another
+          wait $core_pid; core_rc=$?
+          wait $jax_pid; jax_rc=$?
+          wait $torch_pid; torch_rc=$?
+
+          # Logs are printed only for suites that failed
+          # Check PyTorch
+          if [ "$torch_rc" -ne 0 ]; then
+            echo "::group::[FAILED] PyTorch sGPU Log"
+            cat /workspace/torch_sgpu.log
+            echo "::endgroup::"
+            echo "::error::Pytorch sGPU test FAILED."
+          fi
+
+          # Check JAX
+          if [ "$jax_rc" -ne 0 ]; then
+            echo "::group::[FAILED] JAX sGPU Log"
+            cat /workspace/jax_sgpu.log
+            echo "::endgroup::"
+            echo "::error::JAX sGPU test FAILED."
+          fi
+
+          # Check Core
+          if [ "$core_rc" -ne 0 ]; then
+            echo "::group::[FAILED] Core sGPU Log"
+            cat /workspace/core_sgpu.log
+            echo "::endgroup::"
+            echo "::error::Core sGPU test FAILED."
+          fi
+
+          # Exit status of the script: success only if all three suites passed.
+          # Chained [ ] replaces the obsolescent `test ... -a ...` form.
+          [ "$torch_rc" -eq 0 ] && [ "$jax_rc" -eq 0 ] && [ "$core_rc" -eq 0 ]
+          EOF
+          )"
+
+      - name: Run mGPU tests
+        id: mgpu-tests
+        # The outcome is evaluated later by "Check Test Failure Status"
+        continue-on-error: true
+        run: |
+          docker exec te-runner bash -c "$(cat <<'EOF'
+          #!/usr/bin/bash
+          set -x -o pipefail
+          ulimit -c 0 # Disable core dumps
+
+          # The suites run one after the other, without the HIP_VISIBLE_DEVICES
+          # setting used by the sGPU step. Both always run; exit codes are
+          # kept and checked afterwards.
+          torch_rc=0
+          ci/pytorch.sh > /workspace/torch_mgpu.log 2>&1 || torch_rc=$?
+
+          jax_rc=0
+          ci/jax.sh > /workspace/jax_mgpu.log 2>&1 || jax_rc=$?
+
+          # Print the log of each suite that failed
+          if [ "$torch_rc" -ne 0 ]; then
+            echo "::group::[FAILED] PyTorch mGPU Log"
+            cat /workspace/torch_mgpu.log
+            echo "::endgroup::"
+            echo "::error::Pytorch mGPU test FAILED."
+          fi
+
+          if [ "$jax_rc" -ne 0 ]; then
+            echo "::group::[FAILED] JAX mGPU Log"
+            cat /workspace/jax_mgpu.log
+            echo "::endgroup::"
+            echo "::error::JAX mGPU test FAILED."
+          fi
+
+          # Succeed only when both suites passed
+          [ "$torch_rc" -eq 0 ] && [ "$jax_rc" -eq 0 ]
+          EOF
+          )"
+
+      - name: Run Examples
+        id: examples-tests
+        # The outcome is evaluated later by "Check Test Failure Status"
+        continue-on-error: true
+        run: |
+          docker exec te-runner bash -c "$(cat <<'EOF'
+          #!/usr/bin/bash
+          set -ex -o pipefail
+          ulimit -c 0 # Disable core dumps
+
+          # All example output is collected in one log; the first run creates it,
+          # every later run appends. With -e and pipefail the first failing
+          # example stops the script.
+          log=/workspace/examples.log
+
+          # PyTorch MNIST: plain run, then with each extra option
+          cd /workspace/examples/pytorch/mnist
+          python main.py 2>&1 | tee "$log"
+          for opt in --use-te --use-fp8; do
+            python main.py "$opt" 2>&1 | tee -a "$log"
+          done
+
+          # JAX MNIST
+          cd /workspace/examples/jax/mnist
+          pip3 install -r requirements.txt
+          python test_single_gpu_mnist.py 2>&1 | tee -a "$log"
+          for opt in --use-te --use-fp8; do
+            python test_single_gpu_mnist.py "$opt" 2>&1 | tee -a "$log"
+          done
+
+          # JAX encoder
+          cd /workspace/examples/jax/encoder
+          pip3 install -r requirements.txt
+          python test_single_gpu_encoder.py 2>&1 | tee -a "$log"
+          python test_single_gpu_encoder.py --use-fp8 2>&1 | tee -a "$log"
+          EOF
+          )"
+
+      - name: Check Test Failure Status
+        if: always()
+        run: |
+          # The test steps use continue-on-error, so the job would otherwise stay
+          # green. Their "outcome" still reads 'failure' in that case; turn any
+          # such outcome into an error annotation and a failing exit code.
+          failed=0
+
+          # $1 = outcome of a test step, $2 = text of the error annotation
+          flag_failure() {
+            if [[ "$1" == "failure" ]]; then
+              echo "::error::$2"
+              failed=1
+            fi
+          }
+
+          flag_failure "${{ steps.sgpu-tests.outcome }}" "sGPU Tests Failed."
+          flag_failure "${{ steps.mgpu-tests.outcome }}" "mGPU Tests Failed."
+          flag_failure "${{ steps.examples-tests.outcome }}" "Example Tests Failed."
+
+          # 0 when nothing failed, 1 otherwise
+          exit "$failed"
+
+      - name: Copy logs and reports from container
+        if: always()
+        run: |
+          # Best effort: a log that was never written (step skipped or failed
+          # early) must not fail this step, hence `|| true` on every copy.
+          for log_name in torch_sgpu jax_sgpu core_sgpu torch_mgpu jax_mgpu; do
+            docker cp "te-runner:/workspace/${log_name}.log" "./${log_name}.log" || true
+          done
+
+      - name: Upload logs and test reports
+        # Runs even after failures so the logs of a broken run are kept.
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: logs-and-reports
+          # Every *.log file in the workspace root; no error if there are none
+          path: '*.log'
+          if-no-files-found: ignore
+          retention-days: 5
+
+      - name: Cleanup container
+        # Always force-remove the test container; an error (for example when it
+        # was never started) is ignored.
+        if: always()
+        run: |
+          docker rm -f te-runner || true
@@ -264,15 +264,15 @@ Note that when using `THD` format tensors with CK Fused Attention, one should pa
 to indicate that there is no padding between sequences. Otherwise, passing proper tensors will indicate padding between sequences. This is the case
 for both the `FusedAttention` and `DotProductAttention` modules.
 
-FA v3 Kernels in CK Backend
+AITER FA v3 Kernels
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-ROCm TE provides experimental support for flash-attention v3 fwd/bwd kernels using the ck backend for limited fused attention configs.
-To enable FA v3 kernels, the following environment variables can be used:
+ROCm TE supports flash-attention v3 fwd/bwd kernels on gfx942 and gfx950 using the AITER backend.
+This functionality can be controlled by the following environment variables:
 
-* NVTE_CK_USES_FWD_V3 - by default 0, if set to 1, some cases will call the fwd v3 kernel, only applicable to the gfx942 architecture;
-* NVTE_CK_USES_BWD_V3 - by default 0, if set to 1, some cases will call the bwd v3 dqdkdv kernel;
-* NVTE_CK_IS_V3_ATOMIC_FP32 - by default 1, if set to 0 will use atomic fp16/bf16(w/o convert_dq kernel) in bwd pass when NVTE_CK_USES_BWD_V3 is set to 1;
-* NVTE_CK_HOW_V3_BF16_CVT - by default 1, float to bf16 convert type when bwd_v3 is set to 1, 0:RTNE; 1:RTNA; 2:RTZ, only applicable to the gfx942 architecture.
+* NVTE_CK_USES_FWD_V3 - by default 1, if set to 0, v3 kernels will not be used for fwd pass;
+* NVTE_CK_USES_BWD_V3 - by default 1, if set to 0, v3 kernels will not be used for bwd pass;
+* NVTE_CK_IS_V3_ATOMIC_FP32 - by default 1, if set to 0, the bwd pass will use atomic fp16/bf16 (without the convert_dq kernel) when v3 is enabled;
+* NVTE_CK_HOW_V3_BF16_CVT - by default 1, float to bf16 convert type when v3 is enabled, 0:RTNE; 1:RTNA; 2:RTZ, only applicable to the gfx942 architecture.
 
 Float to BFloat16 Conversion in CK Backend (gfx942 only)
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^