# Skip to content
#
# Debugging
#
# Debugging #98

name: TransformerEngine CI

# This workflow runs on pushes and PRs to specific branches.
# In branch filters `*` matches any run of characters except `/`, so
# 'release_v1.*_rocm' covers names such as release_v1.13_rocm.
on:
  push:
    branches:
      - 'dev'
      - 'leo/*'
      - 'release_v1.*_rocm'
      - 'release_v2.*_rocm'
  pull_request:
    # For pull requests the filter is applied to the target (base) branch.
    branches:
      - 'dev'
      - 'release_v1.*_rocm'
      - 'release_v2.*_rocm'
  # The config below allows running this workflow manually from the Actions tab
  workflow_dispatch:
    inputs:
      test_level:
        description: 'Test Level (1-3)'
        required: true
        default: '1'
      node_label:
        description: 'Runner label for tests'
        required: true
        default: 'te-gfx942-4gpu'
jobs:
  # Chooses the CI container image for the branch under test and exposes it
  # as the `docker-image` job output.
  setup:
    name: Select Docker Image
    runs-on: ubuntu-latest
    outputs:
      docker-image: ${{ steps.select-image.outputs.image-tag }}
    steps:
      - name: 📜 Checkout repository
        uses: actions/checkout@v4
      - name: 🖼️ Select Docker Image
        id: select-image
        env:
          # Use the target branch for PRs, otherwise use the branch name.
          # The value is handed over through the environment so that the ref
          # text is never spliced into the script source.
          BRANCH_NAME: ${{ github.base_ref || github.ref_name }}
        run: |
          echo "Determining image for branch: $BRANCH_NAME"
          DEV_DOCKER_IMAGE="compute-artifactory.amd.com:5000/rocm-plus-docker/framework/private/te-ci:rocm-6.4_0_ubuntu22_py310_torch25_jax0435qa_fa273"
          REL613_DOCKER_IMAGE="compute-artifactory.amd.com:5000/rocm-plus-docker/framework/private/te-ci:rocm-6.4_0_ubuntu22_py310_torch25_jax0435qa_fa273"
          LEGACY_DOCKER_IMAGE="compute-artifactory.amd.com:5000/rocm-plus-docker/framework/private/te-ci:rocm-6.3_66_ubuntu22_py310_torch25_jax0431_fa263_hipblaslt10patch.2"
          IMAGE_TO_USE="$DEV_DOCKER_IMAGE" # Default
          # Only release_v1.9 .. release_v1.14 branches override the default.
          if [[ $BRANCH_NAME =~ ^release_v([0-9]+)\.([0-9]+)_rocm$ ]]; then
            MAJOR_VERSION=${BASH_REMATCH[1]}
            MINOR_VERSION=${BASH_REMATCH[2]}
            if (( MAJOR_VERSION == 1 )); then
              if (( MINOR_VERSION >= 9 && MINOR_VERSION <= 12 )); then
                IMAGE_TO_USE="$LEGACY_DOCKER_IMAGE"
              elif (( MINOR_VERSION == 13 || MINOR_VERSION == 14 )); then
                IMAGE_TO_USE="$REL613_DOCKER_IMAGE"
              fi
            fi
          fi
          echo "Selected image: $IMAGE_TO_USE"
          echo "image-tag=$IMAGE_TO_USE" >> "$GITHUB_OUTPUT"

  # Builds the TransformerEngine wheel inside the selected image and publishes
  # it as the `transformer-engine-wheel` artifact.
  build:
    name: Build TransformerEngine
    # Required so that this job starts after `setup` and so that the
    # `needs.setup.outputs` expression below can be resolved.
    needs: setup
    # This is a build job, doesn't need GPUs
    # NOTE(review): it is nevertheless pinned to the GPU runner label;
    # presumably that runner is the one able to pull the image - confirm.
    runs-on: [te-gfx942-4gpu]
    container:
      image: ${{ needs.setup.outputs.docker-image }}
      options: --shm-size=16g --user root
    steps:
      - name: 📜 Checkout repository
        uses: actions/checkout@v4
        with:
          submodules: 'recursive'
          fetch-depth: 0 # Needed to get all branches for version detection
      # NOTE(review): no `registry:` input is given, so the action logs in to
      # its default registry rather than the Artifactory host named in the
      # image tags - confirm this is intended.
      - name: 🔐 Log in to Docker registry
        uses: docker/login-action@v3
        with:
          username: ${{ secrets.ARTIFACTORY_USER }}
          password: ${{ secrets.ARTIFACTORY_PAT }}
      - name: 📦 Build wheel inside container
        # The heredoc terminator must stay at the script's base indentation.
        run: |
          sudo bash <<'EOF'
          set -ex
          # Ensure core build tools are present
          apt-get update
          apt-get install -y build-essential
          # Upgrade pip and install ninja
          pip install --upgrade pip ninja
          # Install custom CMake version
          CMAKE_LINK="https://github.com/Kitware/CMake/releases/download/v3.30.0/cmake-3.30.0-linux-x86_64.sh"
          wget -q -P /opt ${CMAKE_LINK}
          chmod +x /opt/cmake-*.sh
          mkdir -p /opt/cmake
          bash /opt/cmake-*.sh --skip-license --prefix=/opt/cmake
          ln -sf /opt/cmake/bin/* /usr/local/bin/
          cmake --version
          # Write the wheel into the workspace root so the upload step below
          # finds ./*.whl; --no-deps keeps dependency wheels out of the
          # artifact.
          ROCM_PATH="/opt/rocm" \
          PATH="/opt/rocm/bin:${PATH}" \
          LD_LIBRARY_PATH="/opt/rocm/lib" \
          CXX="/opt/rocm/bin/hipcc" \
          CC="/opt/rocm/bin/hipcc" \
          PYTORCH_ROCM_ARCH="gfx942" \
          NVTE_ROCM_ARCH="gfx942" \
          AMDGPU_TARGETS="gfx942" \
          pip wheel -v --no-deps --wheel-dir . .
          EOF
      - name: ⬆️ Upload wheel artifact
        uses: actions/upload-artifact@v4
        with:
          name: transformer-engine-wheel
          path: ./*.whl
          if-no-files-found: error

  # Installs the built wheel and runs one test suite per matrix entry.
  test:
    name: Test - ${{ matrix.framework }} (${{ matrix.gpu_mode }})
    needs: [setup, build]
    # Manual runs may choose the runner through the `node_label` input; every
    # other trigger uses the default label.
    runs-on: ${{ github.event.inputs.node_label || 'te-gfx942-4gpu' }}
    # Job-level `continue-on-error` is intentionally not set: it lets the run
    # pass when a leg fails, which conflicts with the report-status job whose
    # purpose is to fail the run. `fail-fast: false` already keeps the
    # remaining matrix legs running after a failure.
    strategy:
      fail-fast: false
      matrix:
        # Define main test combinations
        framework: ['PyTorch', 'JAX']
        gpu_mode: ['Single-GPU', 'Multi-GPU']
        # Add special, non-standard combinations
        include:
          - framework: 'Core'
            gpu_mode: 'Single-GPU'
          - framework: 'Examples'
            gpu_mode: 'Single-GPU' # Runs on a single GPU
    # Dynamically generate job name, log name, and the test script from the matrix
    env:
      TEST_NAME: ${{ matrix.framework }} (${{ matrix.gpu_mode }})
      LOG_NAME: ${{ matrix.framework }}_${{ matrix.gpu_mode == 'Single-GPU' && 'sgpu' || 'mgpu' }}
      # Manual input wins; otherwise level 3 on `dev` and level 1 elsewhere.
      TEST_LEVEL: ${{ github.event.inputs.test_level || (github.ref_name == 'dev' && '3' || '1') }}
      TEST_SGPU: ${{ contains(matrix.gpu_mode, 'Single-GPU') }}
      TEST_MGPU: ${{ contains(matrix.gpu_mode, 'Multi-GPU') }}
    container:
      image: ${{ needs.setup.outputs.docker-image }}
      options: --device=/dev/kfd --device=/dev/dri --group-add=video --shm-size=16g --pid=host --user root
    steps:
      - name: 📜 Checkout repository
        uses: actions/checkout@v4
        with:
          submodules: 'recursive'
      - name: ⬇️ Download wheel artifact
        uses: actions/download-artifact@v4
        with:
          name: transformer-engine-wheel
      - name: 🐍 Install dependencies and wheel
        run: |
          pip install --upgrade pip
          pip install ./*.whl
      - name: 🧪 Run tests
        run: |
          set -ex
          # Enhanced debug output
          ls -d /opt/rocm*
          python --version
          pip list | grep -E "transformer_e|torch|jax|numpy|ml_dtypes|typing_ext"
          # Dynamically select and run the correct test script.
          # Each single-GPU suite is pinned to its own device index.
          if [ "${{ matrix.framework }}" = "PyTorch" ]; then
            if [ "${{ matrix.gpu_mode }}" = "Single-GPU" ]; then
              HIP_VISIBLE_DEVICES=1 ci/pytorch.sh
            else
              ci/pytorch.sh
            fi
          elif [ "${{ matrix.framework }}" = "JAX" ]; then
            if [ "${{ matrix.gpu_mode }}" = "Single-GPU" ]; then
              HIP_VISIBLE_DEVICES=2 ci/jax.sh
            else
              ci/jax.sh
            fi
          elif [ "${{ matrix.framework }}" = "Core" ]; then
            HIP_VISIBLE_DEVICES=3 ci/core.sh
          elif [ "${{ matrix.framework }}" = "Examples" ]; then
            # Fully expanded examples test suite
            pip3 install -r examples/jax/mnist/requirements.txt
            pip3 install -r examples/jax/encoder/requirements.txt
            # PyTorch MNIST
            python examples/pytorch/mnist/main.py
            python examples/pytorch/mnist/main.py --use-te
            python examples/pytorch/mnist/main.py --use-fp8
            # JAX MNIST
            python examples/jax/mnist/test_single_gpu_mnist.py
            python examples/jax/mnist/test_single_gpu_mnist.py --use-te
            python examples/jax/mnist/test_single_gpu_mnist.py --use-fp8
            # JAX Encoder
            python examples/jax/encoder/test_single_gpu_encoder.py
            python examples/jax/encoder/test_single_gpu_encoder.py --use-fp8
          fi
      - name: Normalize LOG_NAME
        # Must also run after a failed test step, because the lower-cased
        # name is only consumed by the failure-only upload below.
        if: ${{ !cancelled() }}
        run: |
          echo "LOG_NAME=$(echo '${{ env.LOG_NAME }}' | tr '[:upper:]' '[:lower:]')" >> "$GITHUB_ENV"
      - name: ⬆️ Upload logs on failure
        if: failure()
        uses: actions/upload-artifact@v4
        with:
          name: ${{ env.LOG_NAME }}-logs
          path: |
            *.log

  # Runs even when `test` did not succeed and turns its aggregate result into
  # a single pass/fail job.
  report-status:
    name: Report Final Status
    needs: test
    runs-on: ubuntu-latest
    if: always()
    steps:
      - name: Check test results
        if: ${{ needs.test.result != 'success' }}
        run: |
          echo "One or more test suites failed."
          exit 1
      - name: Report success
        if: ${{ needs.test.result == 'success' }}
        run: echo "All test suites passed!"