Skip to content

CI Level 3

CI Level 3 #1036

Workflow file for this run

# Copyright (c) 2024-2026, Advanced Micro Devices, Inc. All rights reserved.
#
# See LICENSE for license information.
name: Build and Test Branch
# Displayed run title: level 3 for push events, otherwise the requested
# test_level input, falling back to 1 when no input is provided.
run-name: CI Level ${{ (github.event_name == 'push' && '3') || inputs.test_level || '1' }}
on:
  # Pushes to dev and to branches matching the release pattern.
  push:
    branches:
      - 'dev'
      - 'release_v2.*_rocm'
  # Entry point used when another workflow calls this one.
  workflow_call:
    inputs:
      test_level:
        type: string
        description: 'Test Level (1-3)'
        default: '1'
        required: false
      docker_image_override:
        type: string
        description: 'Manual Docker Image (Leave empty to use config file value)'
        required: false
      test_config_from_source:
        type: boolean
        description: 'DEBUG: Use config.json from current source branch instead of dev'
        default: false
        required: false
  # Manual runs; the same three inputs as workflow_call.
  workflow_dispatch:
    inputs:
      test_level:
        description: 'Test Level (1-3)'
        default: '1'
        required: true
      docker_image_override:
        type: string
        description: 'Manual Docker Image (Leave empty to use config file value)'
        required: false
      test_config_from_source:
        type: boolean
        description: 'DEBUG: Use config.json from current source branch instead of dev'
        default: false
# Runs are grouped by workflow name and ref; starting a new run in a group
# cancels the run that is still in progress.
concurrency:
  cancel-in-progress: true
  group: ${{ github.workflow }}-${{ github.ref }}
# Workflow-wide environment: TEST_LEVEL is 3 on push, otherwise the
# test_level input, otherwise 1.
env:
  TEST_LEVEL: ${{ (github.event_name == 'push' && '3') || inputs.test_level || '1' }}
jobs:
# Resolves the Docker image used by the test jobs and exposes it as the
# `image-tag` job output. The image is looked up in ci/ci_config.json under
# `.docker_images`, keyed by branch name with a fallback to `default`; a
# non-empty docker_image_override input replaces the looked-up value.
select_image:
  name: Select Docker Image
  runs-on: ubuntu-latest
  timeout-minutes: 10
  outputs:
    image-tag: ${{ steps.select-image.outputs.image-tag }}
  steps:
    # Only ci/ci_config.json is checked out. It comes from the triggering ref
    # when test_config_from_source is set, otherwise from the repository
    # default branch (or 'dev' when that is not available in the event).
    - name: Checkout repository
      uses: actions/checkout@v6
      with:
        ref: ${{ inputs.test_config_from_source && github.ref_name || github.event.repository.default_branch || 'dev' }}
        sparse-checkout: ci/ci_config.json
        sparse-checkout-cone-mode: false
    - name: Select Docker Image Tag
      id: select-image
      # Context values are handed to the script through environment variables
      # instead of being expanded into the script text, so branch names or
      # inputs containing shell metacharacters cannot inject commands.
      env:
        USE_SOURCE_CONFIG: ${{ inputs.test_config_from_source }}
        SOURCE_REF: ${{ github.ref_name }}
        DEFAULT_REF: ${{ github.event.repository.default_branch || 'dev' }}
        BRANCH_NAME: ${{ github.base_ref || github.ref_name }}
        MANUAL_OVERRIDE: ${{ inputs.docker_image_override }}
      run: |
        if [[ "$USE_SOURCE_CONFIG" == "true" ]]; then
          echo "::notice::Debugging mode: Using ci/ci_config.json from $SOURCE_REF"
        else
          echo "::notice::Using ci/ci_config.json from $DEFAULT_REF"
        fi
        if [[ ! -f "ci/ci_config.json" ]]; then
          echo "::error::Config file not found in checkout."
          exit 1
        fi
        echo "Determining image for branch: $BRANCH_NAME"
        # Use the branch-specific entry when present (jq -e fails on null/false),
        # otherwise fall back to the "default" entry.
        if jq -e --arg key "$BRANCH_NAME" '.docker_images[$key]' ci/ci_config.json > /dev/null; then
          JSON_KEY="$BRANCH_NAME"
        else
          JSON_KEY="default"
        fi
        echo "Selected config key: $JSON_KEY"
        # `// empty` yields an empty string instead of the literal "null" when
        # the key is missing, so a missing entry can be detected below.
        IMAGE_TO_USE=$(jq -r --arg key "$JSON_KEY" '.docker_images[$key] // empty' ci/ci_config.json)
        if [[ -n "$MANUAL_OVERRIDE" ]]; then
          echo "::notice::Manual override detected: $MANUAL_OVERRIDE"
          IMAGE_TO_USE="$MANUAL_OVERRIDE"
        fi
        if [[ -z "$IMAGE_TO_USE" ]]; then
          echo "::error::No Docker image configured for key '$JSON_KEY' in ci/ci_config.json."
          exit 1
        fi
        echo "Selected image: $IMAGE_TO_USE"
        echo "image-tag=$IMAGE_TO_USE" >> "$GITHUB_OUTPUT"
# Wheel building is delegated to the reusable workflow on dev. That workflow
# produces a core .whl plus framework .tar.gz sdists, published under the
# artifact name `te-rocm-wheels`.
build:
  secrets: inherit
  uses: ./.github/workflows/rocm-wheels-build.yml
# Single-GPU test job, run once per architecture label in the matrix.
# It installs the packages from the `te-rocm-wheels` artifact inside a
# container started from the selected image, then runs the PyTorch, JAX,
# examples and core suites concurrently, each with its own HIP_VISIBLE_DEVICES.
sgpu_tests:
  name: sGPU Tests (${{ matrix.arch_label }})
  needs: [select_image, build]
  timeout-minutes: 360
  runs-on: ${{ matrix.arch_label == 'mi30x' && 'linux-te-mi30x-4' || 'linux-te-mi35x-4' }}
  strategy:
    fail-fast: false
    matrix:
      arch_label: [mi30x, mi35x]
  steps:
    - name: Checkout repository
      uses: actions/checkout@v6
    - name: Initialize required submodules
      run: |
        git submodule update --init --recursive --depth 1 \
          3rdparty/googletest \
          3rdparty/hipify_torch
    - name: Download build artifacts
      uses: actions/download-artifact@v4
      with:
        name: te-rocm-wheels
        path: dist/
    - name: Host Diagnostics
      run: |
        echo "::group::Host Diagnostics"
        echo ">>> GPU info:"
        ls -l /dev/dri
        ls -l /dev/kfd
        rocm-smi
        echo "::endgroup::"
    - name: Pull Docker Image
      # The image tag can originate from a user-supplied override, so it is
      # passed through the environment and quoted rather than expanded into
      # the script text.
      env:
        IMAGE_TAG: ${{ needs.select_image.outputs.image-tag }}
      run: |
        docker pull "$IMAGE_TAG"
    - name: Run Container
      env:
        IMAGE_TAG: ${{ needs.select_image.outputs.image-tag }}
      run: |
        # Remove a container left behind by an interrupted earlier job so the
        # fixed container name is free; a missing container is not an error.
        docker rm -f te-runner > /dev/null 2>&1 || true
        docker run -dt \
          --rm \
          --name te-runner \
          --network=host \
          --device=/dev/dri --device=/dev/kfd \
          --shm-size=16G \
          --pid=host \
          --group-add $(getent group render | cut -d: -f3) \
          --group-add $(getent group video | cut -d: -f3) \
          -v "${{ github.workspace }}:/workspace" \
          -w /workspace \
          "$IMAGE_TAG"
    - name: Install packages
      run: |
        docker exec te-runner bash -c "$(cat <<'EOF'
        set -ex
        # core (cpp) tests build via cmake inside the repo; allow git ops in-tree.
        git config --global --add safe.directory '*'
        TE_CORE_PKG=$(find /workspace/dist -type f -name 'transformer_engine_rocm[0-9]*.whl' | sort | head -n 1)
        TE_TORCH_PKG=$(find /workspace/dist -type f -name 'transformer_engine_rocm_torch*.tar.gz' | sort | head -n 1)
        TE_JAX_PKG=$(find /workspace/dist -type f -name 'transformer_engine_rocm_jax*.tar.gz' | sort | head -n 1)
        # Fail early if any of the three packages is missing from the artifact.
        test -n "$TE_CORE_PKG" && test -n "$TE_TORCH_PKG" && test -n "$TE_JAX_PKG"
        pip install --no-deps "$TE_CORE_PKG"
        pip install ninja pybind11[global]
        pip install --upgrade hypothesis setuptools
        pip install --no-build-isolation --no-deps "$TE_TORCH_PKG"
        pip install --no-build-isolation --no-deps "$TE_JAX_PKG"
        EOF
        )"
    - name: Run sGPU tests in parallel (pytorch, jax, examples, core)
      id: run-tests
      env:
        HF_TOKEN: ${{ secrets.HF_TOKEN }}
      # TEST_LEVEL is read from the workflow-level environment and forwarded
      # quoted, because it can carry the user-supplied test_level input.
      run: |
        rm -f FAIL_*
        docker exec \
          -e TEST_SGPU=1 \
          -e TEST_LEVEL="$TEST_LEVEL" \
          -e HF_TOKEN="$HF_TOKEN" \
          te-runner bash -c "$(cat <<'EOF'
        #!/usr/bin/bash
        set -x -o pipefail
        ulimit -c 0 # Disable core dumps
        # Each suite runs in the background with its own device and log file.
        HIP_VISIBLE_DEVICES=0 ci/pytorch.sh > /workspace/torch.log 2>&1 &
        TORCH_PID=$!
        HIP_VISIBLE_DEVICES=1 ci/jax.sh > /workspace/jax.log 2>&1 &
        JAX_PID=$!
        (
          set -e
          python -c "import os; print('HF_TOKEN set:', bool(os.environ.get('HF_TOKEN')))"
          # Record the installed jax-related packages and use them as pip
          # constraints for the example requirements installs below.
          JAX_CONSTRAINTS=/tmp/jax-constraints.txt
          pip freeze | grep -iE '^(jax|jaxlib|jax[_-]rocm|jax[_-]plugins)[=@]' > "$JAX_CONSTRAINTS" || true
          export HIP_VISIBLE_DEVICES=2
          cd /workspace/examples/pytorch/mnist
          python main.py
          python main.py --use-te
          python main.py --use-fp8
          cd /workspace/examples/jax/mnist
          pip3 install -c "$JAX_CONSTRAINTS" -r requirements.txt
          python test_single_gpu_mnist.py
          python test_single_gpu_mnist.py --use-te
          python test_single_gpu_mnist.py --use-fp8
          cd /workspace/examples/jax/encoder
          pip3 install -c "$JAX_CONSTRAINTS" -r requirements.txt
          python test_single_gpu_encoder.py
          python test_single_gpu_encoder.py --use-fp8
        ) > /workspace/examples.log 2>&1 &
        EXAMPLES_PID=$!
        HIP_VISIBLE_DEVICES=3 ci/core.sh > /workspace/core.log 2>&1 &
        CORE_PID=$!
        # Collect every suite's exit status before reporting anything.
        wait $TORCH_PID; torch_rc=$?
        wait $JAX_PID; jax_rc=$?
        wait $EXAMPLES_PID; examples_rc=$?
        wait $CORE_PID; core_rc=$?
        # For each failed suite: print its log, raise an error annotation and
        # drop a FAIL_* marker file that the next step checks.
        if [ $torch_rc -ne 0 ]; then
          echo "::group::[FAILED] PyTorch Log"
          cat /workspace/torch.log
          echo "::endgroup::"
          echo "::error::PyTorch tests FAILED."
          touch /workspace/FAIL_TORCH
        fi
        if [ $jax_rc -ne 0 ]; then
          echo "::group::[FAILED] JAX Log"
          cat /workspace/jax.log
          echo "::endgroup::"
          echo "::error::JAX tests FAILED."
          touch /workspace/FAIL_JAX
        fi
        if [ $examples_rc -ne 0 ]; then
          echo "::group::[FAILED] Examples Log"
          cat /workspace/examples.log
          echo "::endgroup::"
          echo "::error::Examples FAILED."
          touch /workspace/FAIL_EXAMPLES
        fi
        if [ $core_rc -ne 0 ]; then
          echo "::group::[FAILED] Core Log"
          cat /workspace/core.log
          echo "::endgroup::"
          echo "::error::Core tests FAILED."
          touch /workspace/FAIL_CORE
        fi
        test $torch_rc -eq 0 -a $jax_rc -eq 0 -a $examples_rc -eq 0 -a $core_rc -eq 0
        EOF
        )"
    - name: Check suite failure status
      if: always()
      run: |
        # Fail this step if any FAIL_* marker file is present in the workspace.
        EXIT_STATUS=0
        if [[ -f FAIL_TORCH ]]; then
          echo "::error::PyTorch tests failed."
          EXIT_STATUS=1
        fi
        if [[ -f FAIL_JAX ]]; then
          echo "::error::JAX tests failed."
          EXIT_STATUS=1
        fi
        if [[ -f FAIL_EXAMPLES ]]; then
          echo "::error::Examples failed."
          EXIT_STATUS=1
        fi
        if [[ -f FAIL_CORE ]]; then
          echo "::error::Core tests failed."
          EXIT_STATUS=1
        fi
        exit $EXIT_STATUS
    - name: Upload logs
      if: always()
      uses: actions/upload-artifact@v4
      with:
        name: logs-sgpu-${{ matrix.arch_label }}
        path: |
          *.log
        if-no-files-found: ignore
        retention-days: 5
    - name: Cleanup container
      if: always()
      run: docker rm -f te-runner || true
# Multi-GPU test job, run once per (architecture label, framework) pair.
# It installs the core package plus the one framework package selected by the
# matrix inside a container started from the selected image, then runs that
# framework's CI script with TEST_MGPU=1.
mgpu_tests:
  name: mGPU ${{ matrix.framework == 'pytorch' && 'Torch' || 'JAX' }} (${{ matrix.arch_label }})
  needs: [select_image, build]
  timeout-minutes: 360
  runs-on: ${{ matrix.arch_label == 'mi30x' && 'linux-te-mi30x-8' || 'linux-te-mi35x-8' }}
  strategy:
    fail-fast: false
    matrix:
      arch_label: [mi30x, mi35x]
      framework: [pytorch, jax]
  steps:
    - name: Checkout repository
      uses: actions/checkout@v6
    - name: Download build artifacts
      uses: actions/download-artifact@v4
      with:
        name: te-rocm-wheels
        path: dist/
    - name: Host Diagnostics
      run: |
        echo "::group::Host Diagnostics"
        echo ">>> GPU info:"
        ls -l /dev/dri
        ls -l /dev/kfd
        rocm-smi
        echo "::endgroup::"
    - name: Pull Docker Image
      # The image tag can originate from a user-supplied override, so it is
      # passed through the environment and quoted rather than expanded into
      # the script text.
      env:
        IMAGE_TAG: ${{ needs.select_image.outputs.image-tag }}
      run: |
        docker pull "$IMAGE_TAG"
    - name: Run Container
      env:
        IMAGE_TAG: ${{ needs.select_image.outputs.image-tag }}
      run: |
        # Remove a container left behind by an interrupted earlier job so the
        # fixed container name is free; a missing container is not an error.
        docker rm -f te-runner > /dev/null 2>&1 || true
        docker run -dt \
          --rm \
          --name te-runner \
          --network=host \
          --device=/dev/dri --device=/dev/kfd \
          --shm-size=16G \
          --pid=host \
          --group-add $(getent group render | cut -d: -f3) \
          --group-add $(getent group video | cut -d: -f3) \
          -v "${{ github.workspace }}:/workspace" \
          -w /workspace \
          "$IMAGE_TAG"
    - name: Install packages
      env:
        FRAMEWORK: ${{ matrix.framework }}
      run: |
        docker exec -e FRAMEWORK="$FRAMEWORK" te-runner bash -c "$(cat <<'EOF'
        set -ex
        TE_CORE_PKG=$(find /workspace/dist -type f -name 'transformer_engine_rocm[0-9]*.whl' | sort | head -n 1)
        # Pick the framework sdist that matches the matrix entry.
        if [ "$FRAMEWORK" = "pytorch" ]; then
          TE_FW_PKG=$(find /workspace/dist -type f -name 'transformer_engine_rocm_torch*.tar.gz' | sort | head -n 1)
        else
          TE_FW_PKG=$(find /workspace/dist -type f -name 'transformer_engine_rocm_jax*.tar.gz' | sort | head -n 1)
        fi
        # Fail early if either package is missing from the artifact.
        test -n "$TE_CORE_PKG" && test -n "$TE_FW_PKG"
        pip install --no-deps "$TE_CORE_PKG"
        pip install ninja pybind11[global]
        pip install --upgrade hypothesis setuptools
        pip install --no-build-isolation --no-deps "$TE_FW_PKG"
        EOF
        )"
    - name: Run mGPU tests
      id: mgpu-tests
      # FRAMEWORK mirrors the Install step; TEST_LEVEL is read from the
      # workflow-level environment. Both are forwarded quoted instead of being
      # expanded into the script text.
      env:
        HF_TOKEN: ${{ secrets.HF_TOKEN }}
        FRAMEWORK: ${{ matrix.framework }}
      run: |
        # Map the framework to its CI script, log file and display name.
        case "$FRAMEWORK" in
          pytorch) TEST_SCRIPT=ci/pytorch.sh; LOG_FILE=/workspace/torch_mgpu.log; SUITE_NAME=PyTorch ;;
          jax) TEST_SCRIPT=ci/jax.sh; LOG_FILE=/workspace/jax_mgpu.log; SUITE_NAME=JAX ;;
          *) echo "::error::Unknown framework: $FRAMEWORK"; exit 1 ;;
        esac
        docker exec \
          -e TEST_MGPU=1 \
          -e TEST_LEVEL="$TEST_LEVEL" \
          -e TEST_SCRIPT="$TEST_SCRIPT" \
          -e LOG_FILE="$LOG_FILE" \
          -e SUITE_NAME="$SUITE_NAME" \
          -e NVTE_FRAMEWORK="$FRAMEWORK" \
          -e HF_TOKEN="$HF_TOKEN" \
          te-runner bash -c "$(cat <<'EOF'
        #!/usr/bin/bash
        set -x -o pipefail
        ulimit -c 0 # Disable core dumps
        "$TEST_SCRIPT" > "$LOG_FILE" 2>&1
        test_rc=$?
        # On failure, print the captured log and raise an error annotation;
        # the step exits with the suite's own status either way.
        if [ $test_rc -ne 0 ]; then
          echo "::group::[FAILED] ${SUITE_NAME} mGPU Log"
          cat "$LOG_FILE"
          echo "::endgroup::"
          echo "::error::${SUITE_NAME} mGPU tests FAILED."
        fi
        exit $test_rc
        EOF
        )"
    - name: Upload logs
      if: always()
      uses: actions/upload-artifact@v4
      with:
        name: logs-mgpu-${{ matrix.arch_label }}-${{ matrix.framework }}
        path: |
          *.log
        if-no-files-found: ignore
        retention-days: 5
    - name: Cleanup container
      if: always()
      run: docker rm -f te-runner || true