Enable gfx950 CI on dev branch #150
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
#
# See LICENSE for license information.

name: TransformerEngine CI

on:
  # Pushes to the development branch and the ROCm release branches.
  push:
    branches:
      - "dev"
      - "release_v1.*_rocm"
      - "release_v2.*_rocm"
  # Pull requests targeting the development branch or a ROCm release branch.
  pull_request:
    branches:
      - "dev"
      - "release_v1.**_rocm"
      - "release_v2.**_rocm"
  # Manual runs, with knobs for test depth, image selection and config source.
  workflow_dispatch:
    inputs:
      test_level:
        description: "Test Level (1-3)"
        required: true
        default: "1"
      skip_dev_merge:
        description: "Skip merging dev branch"
        type: boolean
        default: false
      docker_image_override:
        description: "Manual Docker Image (Leave empty to use config file value)"
        required: false
        type: string
      test_config_from_source:
        description: "DEBUG: Use config.json from current source branch instead of dev"
        type: boolean
        default: false

# One active run per workflow and ref; a newer run cancels the one in progress.
concurrency:
  group: "${{ github.workflow }}-${{ github.ref }}"
  cancel-in-progress: true
jobs:
  # Single job: build inside a container on the GPU runner, then run all test suites.
  build_and_test:
    name: "Build and Test on GPU"
    # 720 minutes = 12 hours.
    timeout-minutes: 720
    runs-on: "linux-mi325-8"
    steps:
# Check out the repository together with all of its submodules.
- name: Checkout repository
  uses: actions/checkout@v4
  with:
    fetch-depth: 0
    submodules: recursive
# Prints host-side diagnostics and computes TEST_LEVEL, which is exported to
# later steps through $GITHUB_ENV.
- name: Host Diagnostics & Environment Setup
  id: host-setup
  # Workflow expressions are handed to the script through the environment
  # instead of being pasted into the script text, so their values can never be
  # parsed as shell code.
  env:
    REQUESTED_TEST_LEVEL: ${{ inputs.test_level || '1' }}
    EVENT_NAME: ${{ github.event_name }}
    REF_NAME: ${{ github.ref_name }}
    BASE_REF: ${{ github.base_ref }}
  run: |
    # Host Activity Checks
    echo "::group::Host Diagnostics"
    echo ">>> Active Containers:"
    docker ps -a
    echo ">>> ROCm Installation:"
    ls -d /opt/rocm* || echo "No /opt/rocm found"
    echo ">>> GPU info:"
    ls -l /dev/dri
    ls -l /dev/kfd
    rocm-smi
    echo ">>> Kernel Command Line:"
    cat /proc/cmdline
    echo "::endgroup::"

    # Calculate Test Level
    # Default to the requested level ('1' if the input is missing/null)
    CALC_LEVEL="$REQUESTED_TEST_LEVEL"
    # Only force Level 3 if this is a direct PUSH to dev or a release branch
    if [[ "$EVENT_NAME" == "push" ]]; then
      if [[ "$REF_NAME" == "dev" || "$REF_NAME" =~ ^release_v.*_rocm$ ]]; then
        echo "::notice::Push to monitored branch ($REF_NAME) detected. Forcing Level 3."
        CALC_LEVEL="3"
      fi
    fi
    echo "TEST_LEVEL=$CALC_LEVEL" >> "$GITHUB_ENV"

    # Print Final Environment
    echo "::group::Environment & Parameters"
    echo "Final Test Level: $CALC_LEVEL"
    echo "Event Name: $EVENT_NAME"
    echo "Ref Name: $REF_NAME"
    echo "Base Ref: $BASE_REF"
    env | sort
    echo "::endgroup::"
# Resolves the Docker image for this run and publishes it as the `image-tag`
# step output. The image comes from ci/ci_config.json (fetched from 'dev', or
# from the current branch in debug mode, with the checked-out copy as a
# fallback) unless a manual override is supplied.
- name: Select Docker Image Tag
  id: select-image
  # Expressions are passed through the environment rather than interpolated
  # into the script, so free-text inputs are never parsed as shell code.
  env:
    USE_SOURCE_CONFIG: ${{ inputs.test_config_from_source }}
    REF_NAME: ${{ github.ref_name }}
    BRANCH_NAME: ${{ github.base_ref || github.ref_name }}
    MANUAL_OVERRIDE: ${{ inputs.docker_image_override }}
  run: |
    # Determine config source
    # Default we are fetching from 'dev' branch
    CONFIG_BRANCH="dev"
    # If manual run requesting source config, switch branch
    if [[ "$USE_SOURCE_CONFIG" == "true" ]]; then
      CONFIG_BRANCH="$REF_NAME"
      echo "::notice::Debugging mode: Fetching config from current branch ($CONFIG_BRANCH)"
    fi

    # Download config
    CONFIG_URL="https://raw.githubusercontent.com/ROCm/TransformerEngine/${CONFIG_BRANCH}/ci/ci_config.json"
    echo "Attempting to fetch image config from: $CONFIG_URL"
    if curl -s -f -o docker_config.json "$CONFIG_URL"; then
      echo "Successfully downloaded config from $CONFIG_BRANCH."
    else
      echo "::warning::Failed to fetch config from $CONFIG_BRANCH (File might not exist yet)."
      # Fallback: Check source branch file
      if [[ -f "ci/ci_config.json" ]]; then
        echo "::notice::Falling back to local 'ci/ci_config.json' from checkout."
        cp ci/ci_config.json docker_config.json
      else
        echo "::error::Config file not found in $CONFIG_BRANCH OR locally."
        exit 1
      fi
    fi

    # Determine image key
    echo "Determining image for branch: $BRANCH_NAME"
    # Logic: Check if branch matches "release_vX.X_rocm".
    # If so, look for the "release_vX.X" key in JSON. Otherwise default.
    JSON_KEY="default"
    if [[ $BRANCH_NAME =~ ^release_v([0-9]+\.[0-9]+)_rocm$ ]]; then
      VERSION_KEY="release_v${BASH_REMATCH[1]}"
      # Check if this specific version key exists in the JSON
      if [[ $(jq --arg key "$VERSION_KEY" '.docker_images | has($key)' docker_config.json) == "true" ]]; then
        JSON_KEY="$VERSION_KEY"
      fi
    fi
    echo "Selected config key: $JSON_KEY"

    # Extract image name from json ('// empty' turns a missing key into an
    # empty string instead of the literal text "null")
    IMAGE_TO_USE=$(jq -r --arg key "$JSON_KEY" '.docker_images[$key] // empty' docker_config.json)

    # Check input from workflow_dispatch overriding the image
    if [[ -n "$MANUAL_OVERRIDE" ]]; then
      echo "::notice::Manual override detected: $MANUAL_OVERRIDE"
      IMAGE_TO_USE="$MANUAL_OVERRIDE"
    fi

    # Fail here with a clear message rather than later in 'docker pull'
    if [[ -z "$IMAGE_TO_USE" || "$IMAGE_TO_USE" == "null" ]]; then
      echo "::error::No Docker image found for config key '$JSON_KEY' and no manual override was given."
      exit 1
    fi

    echo "Selected image: $IMAGE_TO_USE"
    echo "image-tag=$IMAGE_TO_USE" >> "$GITHUB_OUTPUT"
# Pulls the image chosen by the 'select-image' step.
- name: Pull Docker Image
  # The tag may originate from a free-text dispatch input, so it is passed via
  # the environment and quoted instead of being interpolated into the script.
  env:
    IMAGE_TAG: ${{ steps.select-image.outputs.image-tag }}
  run: |
    docker pull "$IMAGE_TAG"
# Starts the long-lived 'te-runner' container that all later steps exec into.
# The checkout is bind-mounted at /workspace, which is also the working dir.
- name: Run Container
  # The tag may originate from a free-text dispatch input, so it is passed via
  # the environment and quoted instead of being interpolated into the script.
  env:
    IMAGE_TAG: ${{ steps.select-image.outputs.image-tag }}
  run: |
    # --device/--group-add: expose /dev/dri and /dev/kfd and add the host's
    # 'render' and 'video' group IDs to the container user.
    docker run -dt \
      --name te-runner \
      --network=host \
      --device=/dev/dri --device=/dev/kfd \
      --shm-size=16G \
      --pid=host \
      --group-add $(getent group render | cut -d: -f3) \
      --group-add $(getent group video | cut -d: -f3) \
      -v "${{ github.workspace }}:/workspace" \
      -w /workspace \
      "$IMAGE_TAG"
# Prints container-side diagnostics and detects the GPU architecture, which is
# published as the `arch` step output.
- name: Container Diagnostics & GPU Setup
  id: container-diag
  run: |
    echo "::group::Container Configuration"
    # Check Shared Memory Size inside container
    echo ">>> /dev/shm size:"
    docker exec te-runner df -h /dev/shm
    # Check OS/Kernel inside container
    echo ">>> Container OS:"
    docker exec te-runner cat /etc/os-release | grep PRETTY_NAME
    echo "::endgroup::"

    echo "::group::ROCm Diagnostics (Host vs Container)"
    echo ">>> CONTAINER rocm-smi:"
    docker exec te-runner rocm-smi || true
    echo "::endgroup::"

    # Determine Architecture
    # Run rocminfo inside the container and capture the first gfx* token.
    # '|| true' is required: the step script runs under 'bash -e', so a failed
    # grep (no match) would otherwise abort here and skip the diagnostic
    # branch below.
    ARCH=$(docker exec te-runner bash -c "rocminfo | grep -m 1 -oP 'gfx[0-9a-fA-F]+'" || true)
    if [ -z "$ARCH" ]; then
      echo "::error::Could not determine GPU architecture using rocminfo inside the container."
      # Best-effort dump of the full rocminfo output to help debugging
      docker exec te-runner rocminfo || true
      exit 1
    fi
    echo "Detected GPU Arch: $ARCH"
    echo "arch=$ARCH" >> "$GITHUB_OUTPUT"
# Builds and installs the project inside the container for the GPU
# architecture detected by the 'container-diag' step.
- name: Build Project
  run: |
    # The quoted heredoc delimiter keeps the script literal on the host;
    # $GPU_ARCH is expanded by the bash running inside the container.
    docker exec \
      --env GPU_ARCH=${{ steps.container-diag.outputs.arch }} \
      te-runner bash -c "$(cat <<'EOF'
    set -e -x
    export HIP_PATH=""
    export PYTORCH_ROCM_ARCH="$GPU_ARCH"
    export NVTE_ROCM_ARCH="$GPU_ARCH"
    export NVTE_AITER_PREBUILT_BASE_URL="https://compute-artifactory.amd.com:5000/artifactory/rocm-generic-local/te-ci/aiter-prebuilts"
    pip install ninja
    git config --global --add safe.directory '*'
    pip install --no-build-isolation --verbose . 2>&1
    EOF
    )"
# Runs the PyTorch, JAX and core single-GPU suites in parallel inside the
# container, each pinned to its own device via HIP_VISIBLE_DEVICES. A failing
# suite prints its log and drops a /workspace/FAIL_*_SGPU marker file.
# Step outputs on failure: torch=fail, jax=fail, core=fail.
- name: Run sGPU tests
  id: sgpu-tests
  continue-on-error: true
  run: |
    # Cleanup previous failure markers if any. Don't actually do anything on k8s pods
    rm -f FAIL_*

    # The step script runs under 'bash -e': without capturing the exit code a
    # failing 'docker exec' would end the script right there and the output
    # export at the bottom would never run in the only case where it matters.
    tests_rc=0
    docker exec \
      -e TEST_SGPU=1 \
      -e TEST_LEVEL=${{ env.TEST_LEVEL }} \
      te-runner bash -c "$(cat <<'EOF'
    #!/usr/bin/bash
    set -x -o pipefail
    ulimit -c 0 # Disable core dumps

    # debug output
    ls -d /opt/rocm*
    python --version
    pip list | egrep "transformer_e|torch|jax|numpy|ml_dtypes|typing_ext"

    HIP_VISIBLE_DEVICES=1 ci/pytorch.sh > /workspace/torch_sgpu.log 2>&1 &
    torch_pid=$!; echo Pytorch test pid $!

    HIP_VISIBLE_DEVICES=2 ci/jax.sh > /workspace/jax_sgpu.log 2>&1 &
    jax_pid=$!; echo JAX test pid $!

    HIP_VISIBLE_DEVICES=3 ci/core.sh > /workspace/core_sgpu.log 2>&1 &
    core_pid=$!; echo Core test pid $!

    wait $core_pid; core_rc=$?
    wait $jax_pid; jax_rc=$?
    wait $torch_pid; torch_rc=$?

    # /workspace/FAIL_* files are for failure markers we can extract to the host runner and process later

    # Check PyTorch
    if [ $torch_rc -ne 0 ]; then
      echo "::group::[FAILED] PyTorch sGPU Log"
      cat /workspace/torch_sgpu.log
      echo "::endgroup::"
      echo "::error::Pytorch sGPU test FAILED."
      touch /workspace/FAIL_TORCH_SGPU
    fi

    # Check JAX
    if [ $jax_rc -ne 0 ]; then
      echo "::group::[FAILED] JAX sGPU Log"
      cat /workspace/jax_sgpu.log
      echo "::endgroup::"
      echo "::error::JAX sGPU test FAILED."
      touch /workspace/FAIL_JAX_SGPU
    fi

    # Check Core
    if [ $core_rc -ne 0 ]; then
      echo "::group::[FAILED] Core sGPU Log"
      cat /workspace/core_sgpu.log
      echo "::endgroup::"
      echo "::error::Core sGPU test FAILED."
      touch /workspace/FAIL_CORE_SGPU
    fi

    test $torch_rc -eq 0 -a $jax_rc -eq 0 -a $core_rc -eq 0
    EOF
    )" || tests_rc=$?

    # Export failed tests statuses to host runner
    if [ -f FAIL_TORCH_SGPU ]; then echo "torch=fail" >> "$GITHUB_OUTPUT"; fi
    if [ -f FAIL_JAX_SGPU ]; then echo "jax=fail" >> "$GITHUB_OUTPUT"; fi
    if [ -f FAIL_CORE_SGPU ]; then echo "core=fail" >> "$GITHUB_OUTPUT"; fi

    # Re-raise the test result so the step outcome still reflects failures
    exit "$tests_rc"
# Runs the PyTorch and JAX multi-GPU suites one after the other inside the
# container. A failing suite prints its log and drops a
# /workspace/FAIL_*_MGPU marker file.
# Step outputs on failure: torch=fail, jax=fail.
- name: Run mGPU tests
  id: mgpu-tests
  continue-on-error: true
  run: |
    # The step script runs under 'bash -e': without capturing the exit code a
    # failing 'docker exec' would end the script right there and the output
    # export at the bottom would never run in the only case where it matters.
    tests_rc=0
    docker exec \
      -e TEST_MGPU=1 \
      -e TEST_LEVEL=${{ env.TEST_LEVEL }} \
      te-runner bash -c "$(cat <<'EOF'
    #!/usr/bin/bash
    set -x -o pipefail
    ulimit -c 0 # Disable core dumps

    # Run PyTorch
    ci/pytorch.sh > /workspace/torch_mgpu.log 2>&1
    torch_rc=$?

    # Run JAX
    ci/jax.sh > /workspace/jax_mgpu.log 2>&1
    jax_rc=$?

    # /workspace/FAIL_* files are for failure markers we can extract to the host runner and process later

    if [ $torch_rc -ne 0 ]; then
      echo "::group::[FAILED] PyTorch mGPU Log"
      cat /workspace/torch_mgpu.log
      echo "::endgroup::"
      echo "::error::Pytorch mGPU test FAILED."
      touch /workspace/FAIL_TORCH_MGPU
    fi

    if [ $jax_rc -ne 0 ]; then
      echo "::group::[FAILED] JAX mGPU Log"
      cat /workspace/jax_mgpu.log
      echo "::endgroup::"
      echo "::error::JAX mGPU test FAILED."
      touch /workspace/FAIL_JAX_MGPU
    fi

    test $torch_rc -eq 0 -a $jax_rc -eq 0
    EOF
    )" || tests_rc=$?

    # Export failed tests statuses to host runner
    if [ -f FAIL_TORCH_MGPU ]; then echo "torch=fail" >> "$GITHUB_OUTPUT"; fi
    if [ -f FAIL_JAX_MGPU ]; then echo "jax=fail" >> "$GITHUB_OUTPUT"; fi

    # Re-raise the test result so the step outcome still reflects failures
    exit "$tests_rc"
# Runs the PyTorch MNIST, JAX MNIST and JAX encoder examples inside the
# container, appending all output to /workspace/examples.log. The script runs
# with 'set -e' and 'pipefail', so the first failing example fails the step.
- name: Run Examples
  id: examples-tests
  continue-on-error: true
  run: |
    docker exec te-runner bash -c "$(cat <<'EOF'
    #!/usr/bin/bash
    set -e -x -o pipefail
    ulimit -c 0 # Disable core dumps

    # PyTorch MNIST (the first run creates the log, later runs append)
    cd /workspace/examples/pytorch/mnist
    python main.py 2>&1 | tee /workspace/examples.log
    python main.py --use-te 2>&1 | tee -a /workspace/examples.log
    python main.py --use-fp8 2>&1 | tee -a /workspace/examples.log

    # JAX MNIST
    cd /workspace/examples/jax/mnist
    pip3 install -r requirements.txt
    python test_single_gpu_mnist.py 2>&1 | tee -a /workspace/examples.log
    python test_single_gpu_mnist.py --use-te 2>&1 | tee -a /workspace/examples.log
    python test_single_gpu_mnist.py --use-fp8 2>&1 | tee -a /workspace/examples.log

    # JAX encoder
    cd /workspace/examples/jax/encoder
    pip3 install -r requirements.txt
    python test_single_gpu_encoder.py 2>&1 | tee -a /workspace/examples.log
    python test_single_gpu_encoder.py --use-fp8 2>&1 | tee -a /workspace/examples.log
    EOF
    )"
# Turns the results of the 'continue-on-error' test steps into the final job
# status. Fails the job if any suite left a failure marker, or if any test
# step failed for another reason.
- name: Check Test Failure Status
  if: always()
  # "outcome" is 'failure' even when continue-on-error was true
  env:
    SGPU_OUTCOME: ${{ steps.sgpu-tests.outcome }}
    MGPU_OUTCOME: ${{ steps.mgpu-tests.outcome }}
    EXAMPLES_OUTCOME: ${{ steps.examples-tests.outcome }}
  run: |
    EXIT_STATUS=0
    SGPU_MARKED=0
    MGPU_MARKED=0

    # sGPU CHECKS
    # Marker files are checked directly (not step outputs) so the per-suite
    # result does not depend on the test step reaching its output-export lines.
    if [[ -f FAIL_CORE_SGPU ]]; then
      echo "::error::Core sGPU Tests Failed."
      SGPU_MARKED=1
      EXIT_STATUS=1
    fi
    if [[ -f FAIL_TORCH_SGPU ]]; then
      echo "::error::PyTorch sGPU Tests Failed."
      SGPU_MARKED=1
      EXIT_STATUS=1
    fi
    if [[ -f FAIL_JAX_SGPU ]]; then
      echo "::error::JAX sGPU Tests Failed."
      SGPU_MARKED=1
      EXIT_STATUS=1
    fi
    # A failed step without any marker means the tests never got to report
    # (e.g. 'docker exec' itself failed); that must not pass silently.
    if [[ "$SGPU_OUTCOME" == "failure" && "$SGPU_MARKED" == "0" ]]; then
      echo "::error::sGPU test step failed without reporting a failed suite."
      EXIT_STATUS=1
    fi

    # mGPU CHECKS
    if [[ -f FAIL_TORCH_MGPU ]]; then
      echo "::error::PyTorch mGPU Tests Failed."
      MGPU_MARKED=1
      EXIT_STATUS=1
    fi
    if [[ -f FAIL_JAX_MGPU ]]; then
      echo "::error::JAX mGPU Tests Failed."
      MGPU_MARKED=1
      EXIT_STATUS=1
    fi
    if [[ "$MGPU_OUTCOME" == "failure" && "$MGPU_MARKED" == "0" ]]; then
      echo "::error::mGPU test step failed without reporting a failed suite."
      EXIT_STATUS=1
    fi

    # EXAMPLES CHECK
    # Examples script does not use marker files, so we rely on step outcome
    if [[ "$EXAMPLES_OUTCOME" == "failure" ]]; then
      echo "::error::Example Tests Failed."
      EXIT_STATUS=1
    fi

    # Fail the job if any errors were detected
    if [[ "$EXIT_STATUS" == "1" ]]; then
      exit 1
    fi
# Best-effort copy of the sGPU/mGPU test logs out of the container; a log that
# is missing (suite never ran) is skipped without failing the step.
- name: Copy logs and reports from container
  if: always()
  run: |
    for log_name in torch_sgpu jax_sgpu core_sgpu torch_mgpu jax_mgpu; do
      docker cp "te-runner:/workspace/${log_name}.log" "./${log_name}.log" || true
    done
# Publishes every *.log file in the working directory as a build artifact;
# having no logs at all is not treated as an error.
- name: Upload logs and test reports
  if: always()
  uses: actions/upload-artifact@v4
  with:
    name: logs-and-reports
    path: "*.log"
    if-no-files-found: ignore
    retention-days: 5
# Always remove the test container, even after a failure; errors from the
# removal itself are ignored.
- name: Cleanup container
  if: always()
  run: |
    docker rm --force te-runner || true