Skip to content

Enable gfx950 CI on dev branch #150

Enable gfx950 CI on dev branch

Enable gfx950 CI on dev branch #150

Workflow file for this run

# Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
#
# See LICENSE for license information.
name: TransformerEngine CI

# Triggers:
#   push              - direct pushes to dev and the ROCm release branches
#   pull_request      - PRs targeting those same branches
#   workflow_dispatch - manual runs, tunable through the inputs below
on:
  push:
    branches:
      - 'dev'
      - 'release_v1.*_rocm'
      - 'release_v2.*_rocm'
  pull_request:
    branches:
      # Same glob form as the `push` filter so both events select the same
      # set of branches (`*` does not match '/', `**` would).
      - 'dev'
      - 'release_v1.*_rocm'
      - 'release_v2.*_rocm'
  workflow_dispatch:
    inputs:
      test_level:
        description: 'Test Level (1-3)'
        required: true
        # Explicit type; dispatch inputs without a type are strings anyway.
        type: string
        default: '1'
      skip_dev_merge:
        # NOTE(review): no step in the visible part of this workflow reads
        # this input - confirm whether it is still needed.
        description: 'Skip merging dev branch'
        type: boolean
        default: false
      docker_image_override:
        description: 'Manual Docker Image (Leave empty to use config file value)'
        required: false
        type: string
      test_config_from_source:
        description: 'DEBUG: Use config.json from current source branch instead of dev'
        type: boolean
        default: false
# At most one active run per workflow + ref: starting a new run in the same
# group cancels the run that is still in progress.
concurrency:
  group: "${{ github.workflow }}-${{ github.ref }}"
  cancel-in-progress: true
# Single job: builds TransformerEngine inside a Docker container on a GPU
# runner and then runs the single-GPU, multi-GPU and example test suites.
jobs:
build_and_test:
name: Build and Test on GPU
# Upper bound for the whole job: 720 minutes (12 hours).
timeout-minutes: 720
# Runner label. NOTE(review): presumably a self-hosted 8-GPU MI325 machine - confirm.
runs-on: linux-mi325-8
steps:
# Clone the repository with full history (fetch-depth 0) and every submodule,
# recursively.
- name: Checkout repository
  uses: actions/checkout@v4
  with:
    fetch-depth: 0
    submodules: recursive
# Prints host-side diagnostics and computes TEST_LEVEL for the later steps
# (exported through GITHUB_ENV).
- name: Host Diagnostics & Environment Setup
  id: host-setup
  env:
    # The dispatch input travels through the environment instead of being
    # expanded into the script text, so its content is never parsed as shell.
    INPUT_TEST_LEVEL: "${{ inputs.test_level || '1' }}"
  run: |
    # Host Activity Checks
    echo "::group::Host Diagnostics"
    echo ">>> Active Containers:"
    docker ps -a
    echo ">>> ROCm Installation:"
    ls -d /opt/rocm* || echo "No /opt/rocm found"
    echo ">>> GPU info:"
    ls -l /dev/dri
    ls -l /dev/kfd
    rocm-smi
    echo ">>> Kernel Command Line:"
    cat /proc/cmdline
    echo "::endgroup::"

    # Calculate Test Level
    # Default to the dispatch input ('1' when the input is missing/null)
    CALC_LEVEL="$INPUT_TEST_LEVEL"
    # Reject anything that is not a plain number before it is written to
    # GITHUB_ENV and handed to the test scripts.
    if [[ ! "$CALC_LEVEL" =~ ^[0-9]+$ ]]; then
      echo "::error::Invalid test level '$CALC_LEVEL' (expected a number, e.g. 1-3)."
      exit 1
    fi
    # Only force Level 3 if this is a direct PUSH to dev or a release branch
    if [[ "$GITHUB_EVENT_NAME" == "push" ]]; then
      if [[ "$GITHUB_REF_NAME" == "dev" || "$GITHUB_REF_NAME" =~ ^release_v.*_rocm$ ]]; then
        echo "::notice::Push to monitored branch ($GITHUB_REF_NAME) detected. Forcing Level 3."
        CALC_LEVEL="3"
      fi
    fi
    echo "TEST_LEVEL=$CALC_LEVEL" >> "$GITHUB_ENV"

    # Print Final Environment
    echo "::group::Environment & Parameters"
    echo "Final Test Level: $CALC_LEVEL"
    echo "Event Name: $GITHUB_EVENT_NAME"
    echo "Ref Name: $GITHUB_REF_NAME"
    echo "Base Ref: ${GITHUB_BASE_REF:-}"
    env | sort
    echo "::endgroup::"
# Resolves the Docker image for this run and publishes it as the step output
# `image-tag`. Resolution order:
#   1. ci/ci_config.json fetched from the config branch (dev by default),
#      falling back to the checked-out copy when the download fails;
#   2. the `release_vX.Y` key for release branches when present, else `default`;
#   3. the manual `docker_image_override` input, which wins when non-empty.
- name: Select Docker Image Tag
  id: select-image
  env:
    # Inputs are passed via the environment so they are never expanded into
    # the script text.
    USE_SOURCE_CONFIG: "${{ inputs.test_config_from_source }}"
    MANUAL_OVERRIDE: "${{ inputs.docker_image_override }}"
  run: |
    # Determine config source
    # Default we are fetching from 'dev' branch
    CONFIG_BRANCH="dev"
    # If manual run requesting source config, switch branch
    if [[ "$USE_SOURCE_CONFIG" == "true" ]]; then
      CONFIG_BRANCH="$GITHUB_REF_NAME"
      echo "::notice::Debugging mode: Fetching config from current branch ($CONFIG_BRANCH)"
    fi

    # Download config
    CONFIG_URL="https://raw.githubusercontent.com/ROCm/TransformerEngine/${CONFIG_BRANCH}/ci/ci_config.json"
    echo "Attempting to fetch image config from: $CONFIG_URL"
    if curl -s -f -o docker_config.json "$CONFIG_URL"; then
      echo "Successfully downloaded config from $CONFIG_BRANCH."
    else
      echo "::warning::Failed to fetch config from $CONFIG_BRANCH (File might not exist yet)."
      # Fallback: Check source branch file
      if [[ -f "ci/ci_config.json" ]]; then
        echo "::notice::Falling back to local 'ci/ci_config.json' from checkout."
        cp ci/ci_config.json docker_config.json
      else
        echo "::error::Config file not found in $CONFIG_BRANCH OR locally."
        exit 1
      fi
    fi

    # Determine image key: the PR target branch when there is one, otherwise
    # the branch the workflow runs on.
    BRANCH_NAME="${GITHUB_BASE_REF:-$GITHUB_REF_NAME}"
    echo "Determining image for branch: $BRANCH_NAME"
    # Logic: Check if branch matches "release_vX.X".
    # If so, look for that key in JSON. Otherwise default.
    JSON_KEY="default"
    if [[ $BRANCH_NAME =~ ^release_v([0-9]+\.[0-9]+)_rocm$ ]]; then
      VERSION_KEY="release_v${BASH_REMATCH[1]}"
      # Check if this specific version key exists in the JSON.
      # The key is handed to jq as data (--arg), not spliced into the filter.
      if [[ $(jq --arg key "$VERSION_KEY" '.docker_images | has($key)' docker_config.json) == "true" ]]; then
        JSON_KEY="$VERSION_KEY"
      fi
    fi
    echo "Selected config key: $JSON_KEY"

    # Extract image name from json; `// empty` turns a missing key into an
    # empty string instead of the literal text "null".
    IMAGE_TO_USE=$(jq -r --arg key "$JSON_KEY" '.docker_images[$key] // empty' docker_config.json)

    # Check input from workflow_dispatch overriding the image
    if [[ -n "$MANUAL_OVERRIDE" ]]; then
      echo "::notice::Manual override detected: $MANUAL_OVERRIDE"
      IMAGE_TO_USE="$MANUAL_OVERRIDE"
    fi

    # Fail here with a clear message rather than later in `docker pull`.
    if [[ -z "$IMAGE_TO_USE" ]]; then
      echo "::error::No Docker image found for key '$JSON_KEY' in the config and no manual override given."
      exit 1
    fi

    echo "Selected image: $IMAGE_TO_USE"
    echo "image-tag=$IMAGE_TO_USE" >> "$GITHUB_OUTPUT"
# Pulls the image chosen by the `select-image` step.
- name: Pull Docker Image
  env:
    # Passed through the environment and quoted below: the value may come
    # from a manual input and must reach docker as exactly one argument.
    IMAGE_TAG: "${{ steps.select-image.outputs.image-tag }}"
  run: |
    docker pull "$IMAGE_TAG"
# Starts the long-lived `te-runner` container that the later steps
# `docker exec` into. The checkout is bind-mounted at /workspace.
- name: Run Container
  env:
    # Passed through the environment and quoted below so the image reference
    # is always a single argument and never parsed as shell.
    IMAGE_TAG: "${{ steps.select-image.outputs.image-tag }}"
  run: |
    # /dev/dri and /dev/kfd expose the GPUs; the numeric ids of the host's
    # render and video groups are added as supplementary groups.
    docker run -dt \
      --name te-runner \
      --network=host \
      --device=/dev/dri --device=/dev/kfd \
      --shm-size=16G \
      --pid=host \
      --group-add "$(getent group render | cut -d: -f3)" \
      --group-add "$(getent group video | cut -d: -f3)" \
      -v "${{ github.workspace }}:/workspace" \
      -w /workspace \
      "$IMAGE_TAG"
# Prints container-side diagnostics and detects the GPU architecture, which
# is published as the step output `arch` (first gfx* token from rocminfo).
- name: Container Diagnostics & GPU Setup
  id: container-diag
  run: |
    echo "::group::Container Configuration"
    # Check Shared Memory Size inside container
    echo ">>> /dev/shm size:"
    docker exec te-runner df -h /dev/shm
    # Check OS/Kernel inside container (purely informational, so a missing
    # PRETTY_NAME line must not fail the step)
    echo ">>> Container OS:"
    docker exec te-runner cat /etc/os-release | grep PRETTY_NAME || true
    echo "::endgroup::"

    echo "::group::ROCm Diagnostics (Host vs Container)"
    echo ">>> CONTAINER rocm-smi:"
    docker exec te-runner rocm-smi || true
    echo "::endgroup::"

    # Determine Architecture
    # Run rocminfo inside the container and capture the output.
    # `|| true`: the default run shell is `bash -e`, which would otherwise
    # abort right here when grep finds nothing and skip the diagnostics below.
    ARCH=$(docker exec te-runner bash -c "rocminfo | grep -m 1 -oP 'gfx[0-9a-fA-F]+'" || true)
    if [ -z "$ARCH" ]; then
      echo "::error::Could not determine GPU architecture using rocminfo inside the container."
      # Dump the raw rocminfo output to help debugging; never mask the exit 1.
      docker exec te-runner rocminfo || true
      exit 1
    fi
    echo "Detected GPU Arch: $ARCH"
    echo "arch=$ARCH" >> "$GITHUB_OUTPUT"
# Builds and installs the project from /workspace inside the te-runner
# container, targeting the GPU architecture detected by `container-diag`.
- name: Build Project
  run: |
    # The build script is kept in a quoted heredoc so nothing in it is
    # expanded on the host; it runs verbatim inside the container.
    BUILD_SCRIPT=$(cat <<'EOF'
    set -ex
    export HIP_PATH=""
    export PYTORCH_ROCM_ARCH=$GPU_ARCH
    export NVTE_ROCM_ARCH=$GPU_ARCH
    export NVTE_AITER_PREBUILT_BASE_URL=https://compute-artifactory.amd.com:5000/artifactory/rocm-generic-local/te-ci/aiter-prebuilts
    pip install ninja
    git config --global --add safe.directory '*'
    pip install --no-build-isolation -v . 2>&1
    EOF
    )
    docker exec \
      -e GPU_ARCH=${{ steps.container-diag.outputs.arch }} \
      te-runner bash -c "$BUILD_SCRIPT"
# Runs the PyTorch, JAX and core single-GPU suites in parallel inside the
# container, one suite per GPU (HIP_VISIBLE_DEVICES 1, 2 and 3).
# Each failing suite leaves a /workspace/FAIL_*_SGPU marker file and is
# reported through the step outputs `torch`, `jax` and `core` (value "fail").
- name: Run sGPU tests
  id: sgpu-tests
  continue-on-error: true
  run: |
    # Cleanup previous failure markers if any. Don't actually do anything on k8s pods
    rm -f FAIL_*

    # Remember the container exit code instead of letting the default
    # `bash -e` shell abort here: the step outputs below must still be
    # written when a suite fails.
    rc=0
    docker exec \
      -e TEST_SGPU=1 \
      -e "TEST_LEVEL=${{ env.TEST_LEVEL }}" \
      te-runner bash -c "$(cat <<'EOF'
    #!/usr/bin/bash
    set -x -o pipefail
    ulimit -c 0 # Disable core dumps

    # debug output
    ls -d /opt/rocm*
    python --version
    pip list | grep -E "transformer_e|torch|jax|numpy|ml_dtypes|typing_ext"

    HIP_VISIBLE_DEVICES=1 ci/pytorch.sh > /workspace/torch_sgpu.log 2>&1 &
    torch_pid=$!; echo Pytorch test pid $!
    HIP_VISIBLE_DEVICES=2 ci/jax.sh > /workspace/jax_sgpu.log 2>&1 &
    jax_pid=$!; echo JAX test pid $!
    HIP_VISIBLE_DEVICES=3 ci/core.sh > /workspace/core_sgpu.log 2>&1 &
    core_pid=$!; echo Core test pid $!

    wait $core_pid; core_rc=$?
    wait $jax_pid; jax_rc=$?
    wait $torch_pid; torch_rc=$?

    # /workspace/FAIL_* files are for failure markers we can extract to the host runner and process later
    # Check PyTorch
    if [ $torch_rc -ne 0 ]; then
      echo "::group::[FAILED] PyTorch sGPU Log"
      cat /workspace/torch_sgpu.log
      echo "::endgroup::"
      echo "::error::Pytorch sGPU test FAILED."
      touch /workspace/FAIL_TORCH_SGPU
    fi
    # Check JAX
    if [ $jax_rc -ne 0 ]; then
      echo "::group::[FAILED] JAX sGPU Log"
      cat /workspace/jax_sgpu.log
      echo "::endgroup::"
      echo "::error::JAX sGPU test FAILED."
      touch /workspace/FAIL_JAX_SGPU
    fi
    # Check Core
    if [ $core_rc -ne 0 ]; then
      echo "::group::[FAILED] Core sGPU Log"
      cat /workspace/core_sgpu.log
      echo "::endgroup::"
      echo "::error::Core sGPU test FAILED."
      touch /workspace/FAIL_CORE_SGPU
    fi

    # Exit status of the container script: success only if all suites passed.
    [ "$torch_rc" -eq 0 ] && [ "$jax_rc" -eq 0 ] && [ "$core_rc" -eq 0 ]
    EOF
    )" || rc=$?

    # Export failed tests statuses to host runner
    if [ -f FAIL_TORCH_SGPU ]; then echo "torch=fail" >> "$GITHUB_OUTPUT"; fi
    if [ -f FAIL_JAX_SGPU ]; then echo "jax=fail" >> "$GITHUB_OUTPUT"; fi
    if [ -f FAIL_CORE_SGPU ]; then echo "core=fail" >> "$GITHUB_OUTPUT"; fi

    # Propagate the container result so the step outcome still reflects it.
    exit "$rc"
# Runs the PyTorch and JAX multi-GPU suites one after the other inside the
# container. Each failing suite leaves a /workspace/FAIL_*_MGPU marker file
# and is reported through the step outputs `torch` and `jax` (value "fail").
- name: Run mGPU tests
  id: mgpu-tests
  continue-on-error: true
  run: |
    # Remember the container exit code instead of letting the default
    # `bash -e` shell abort here: the step outputs below must still be
    # written when a suite fails.
    rc=0
    docker exec \
      -e TEST_MGPU=1 \
      -e "TEST_LEVEL=${{ env.TEST_LEVEL }}" \
      te-runner bash -c "$(cat <<'EOF'
    #!/usr/bin/bash
    set -x -o pipefail
    ulimit -c 0 # Disable core dumps

    # Run PyTorch
    ci/pytorch.sh > /workspace/torch_mgpu.log 2>&1
    torch_rc=$?
    # Run JAX
    ci/jax.sh > /workspace/jax_mgpu.log 2>&1
    jax_rc=$?

    # /workspace/FAIL_* files are for failure markers we can extract to the host runner and process later
    if [ $torch_rc -ne 0 ]; then
      echo "::group::[FAILED] PyTorch mGPU Log"
      cat /workspace/torch_mgpu.log
      echo "::endgroup::"
      echo "::error::Pytorch mGPU test FAILED."
      touch /workspace/FAIL_TORCH_MGPU
    fi
    if [ $jax_rc -ne 0 ]; then
      echo "::group::[FAILED] JAX mGPU Log"
      cat /workspace/jax_mgpu.log
      echo "::endgroup::"
      echo "::error::JAX mGPU test FAILED."
      touch /workspace/FAIL_JAX_MGPU
    fi

    # Exit status of the container script: success only if both suites passed.
    [ "$torch_rc" -eq 0 ] && [ "$jax_rc" -eq 0 ]
    EOF
    )" || rc=$?

    # Export failed tests statuses to host runner
    if [ -f FAIL_TORCH_MGPU ]; then echo "torch=fail" >> "$GITHUB_OUTPUT"; fi
    if [ -f FAIL_JAX_MGPU ]; then echo "jax=fail" >> "$GITHUB_OUTPUT"; fi

    # Propagate the container result so the step outcome still reflects it.
    exit "$rc"
# Runs the PyTorch and JAX example programs inside the container. Output is
# shown live and collected in /workspace/examples.log; the script stops at
# the first failing command (set -e with pipefail).
- name: Run Examples
  id: examples-tests
  continue-on-error: true
  run: |
    EXAMPLES_SCRIPT=$(cat <<'EOF'
    #!/usr/bin/bash
    set -ex -o pipefail
    ulimit -c 0 # Disable core dumps
    LOG=/workspace/examples.log

    # PyTorch MNIST: plain run first (this one starts the log afresh),
    # then once per feature flag.
    cd /workspace/examples/pytorch/mnist
    python main.py 2>&1 | tee "$LOG"
    for flag in --use-te --use-fp8; do
      python main.py "$flag" 2>&1 | tee -a "$LOG"
    done

    # JAX MNIST: same sequence of variants.
    cd /workspace/examples/jax/mnist
    pip3 install -r requirements.txt
    python test_single_gpu_mnist.py 2>&1 | tee -a "$LOG"
    for flag in --use-te --use-fp8; do
      python test_single_gpu_mnist.py "$flag" 2>&1 | tee -a "$LOG"
    done

    # JAX encoder: plain and FP8 runs.
    cd /workspace/examples/jax/encoder
    pip3 install -r requirements.txt
    python test_single_gpu_encoder.py 2>&1 | tee -a "$LOG"
    python test_single_gpu_encoder.py --use-fp8 2>&1 | tee -a "$LOG"
    EOF
    )
    docker exec te-runner bash -c "$EXAMPLES_SCRIPT"
# Turns the results of the three continue-on-error test steps into the final
# job status: the job fails if any marker file exists or any of those steps
# ended with outcome "failure".
- name: Check Test Failure Status
  if: always()
  env:
    # "outcome" is the step result before continue-on-error is applied, so
    # it is 'failure' even though the job was allowed to carry on.
    SGPU_OUTCOME: "${{ steps.sgpu-tests.outcome }}"
    MGPU_OUTCOME: "${{ steps.mgpu-tests.outcome }}"
    EXAMPLES_OUTCOME: "${{ steps.examples-tests.outcome }}"
  run: |
    EXIT_STATUS=0

    # report_marker FILE MESSAGE
    # Flags a failure when the marker FILE written by a test step exists.
    report_marker() {
      if [[ -f "$1" ]]; then
        echo "::error::$2"
        EXIT_STATUS=1
      fi
    }

    # report_outcome OUTCOME MESSAGE
    # Flags a failure when a step outcome is "failure". This catches steps
    # that failed without leaving a marker file (for example when
    # `docker exec` itself could not run).
    report_outcome() {
      if [[ "$1" == "failure" ]]; then
        echo "::error::$2"
        EXIT_STATUS=1
      fi
    }

    # sGPU CHECKS
    report_marker FAIL_CORE_SGPU "Core sGPU Tests Failed."
    report_marker FAIL_TORCH_SGPU "PyTorch sGPU Tests Failed."
    report_marker FAIL_JAX_SGPU "JAX sGPU Tests Failed."
    report_outcome "$SGPU_OUTCOME" "sGPU test step failed."

    # mGPU CHECKS
    report_marker FAIL_TORCH_MGPU "PyTorch mGPU Tests Failed."
    report_marker FAIL_JAX_MGPU "JAX mGPU Tests Failed."
    report_outcome "$MGPU_OUTCOME" "mGPU test step failed."

    # EXAMPLES CHECK
    # Examples script does not use marker files, so we rely on step outcome
    report_outcome "$EXAMPLES_OUTCOME" "Example Tests Failed."

    # Fail the job if any errors were detected
    if [[ "$EXIT_STATUS" == "1" ]]; then
      exit 1
    fi
# Best-effort copy of the sGPU/mGPU suite logs out of the container into the
# current directory; a log that does not exist is simply skipped.
- name: Copy logs and reports from container
  if: always()
  run: |
    for suite in torch_sgpu jax_sgpu core_sgpu torch_mgpu jax_mgpu; do
      docker cp "te-runner:/workspace/${suite}.log" "./${suite}.log" || true
    done
# Publishes every *.log file found in the working directory as the
# `logs-and-reports` artifact (kept for 5 days); having no logs at all is
# not treated as an error.
- name: Upload logs and test reports
  if: always()
  uses: actions/upload-artifact@v4
  with:
    name: logs-and-reports
    if-no-files-found: ignore
    retention-days: 5
    path: |
      *.log
# Always force-remove the te-runner container; a failure to remove it
# (for example because it was never created) is ignored.
- name: Cleanup container
  if: always()
  run: |
    docker rm -f te-runner || true