Enable gfx950 CI on dev branch #150

Workflow file for this run

.github/workflows/rocm-ci.yml at a602c3e

	# Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
	#
	# See LICENSE for license information.

	name: TransformerEngine CI

	# Triggers:
	#   * push / pull_request on 'dev' and the ROCm release branches
	#   * manual dispatch with optional overrides (test level, Docker image, config source)
	on:
	  push:
	    branches:
	      - 'dev'
	      - 'release_v1.*_rocm'
	      - 'release_v2.*_rocm'
	  pull_request:
	    # Same branch filters as 'push' so both triggers track the same set of branches.
	    branches:
	      - 'dev'
	      - 'release_v1.*_rocm'
	      - 'release_v2.*_rocm'
	  workflow_dispatch:
	    inputs:
	      test_level:
	        description: 'Test Level (1-3)'
	        required: true
	        default: '1'
	        type: string
	      skip_dev_merge:
	        description: 'Skip merging dev branch'
	        type: boolean
	        default: false
	      docker_image_override:
	        description: 'Manual Docker Image (Leave empty to use config file value)'
	        required: false
	        type: string
	      test_config_from_source:
	        description: 'DEBUG: Use config.json from current source branch instead of dev'
	        type: boolean
	        default: false

	# Least privilege: the job only needs to read the repository contents.
	permissions:
	  contents: read

	# One active run per workflow and ref; a newer run cancels the one in progress.
	concurrency:
	  group: ${{ github.workflow }}-${{ github.ref }}
	  cancel-in-progress: true

	# Single job: check out the sources, then build and test them on a GPU runner.
	jobs:
	build_and_test:
	name: Build and Test on GPU
	# 720 minutes = 12 hours upper bound for the whole job.
	timeout-minutes: 720
	# NOTE(review): presumably the label of a self-hosted multi-GPU MI325 runner - confirm.
	runs-on: linux-mi325-8
	steps:
	# Full-history checkout (fetch-depth: 0) with all submodules, recursively.
	- name: Checkout repository
	uses: actions/checkout@v4
	with:
	submodules: 'recursive'
	fetch-depth: 0

	- name: Host Diagnostics & Environment Setup
	  id: host-setup
	  # Context values are handed to the script as environment variables instead of
	  # being expanded into the script text, so a ref name or dispatch input is
	  # always treated as data by the shell.
	  env:
	    EVENT_NAME: ${{ github.event_name }}
	    REF_NAME: ${{ github.ref_name }}
	    BASE_REF: ${{ github.base_ref }}
	    REQUESTED_LEVEL: ${{ inputs.test_level || '1' }}
	  run: |
	    # Host Activity Checks
	    echo "::group::Host Diagnostics"

	    echo ">>> Active Containers:"
	    docker ps -a

	    echo ">>> ROCm Installation:"
	    ls -d /opt/rocm* || echo "No /opt/rocm found"
	    echo ">>> GPU info:"
	    ls -l /dev/dri
	    ls -l /dev/kfd
	    rocm-smi

	    echo ">>> Kernel Command Line:"
	    cat /proc/cmdline
	    echo "::endgroup::"

	    # Calculate Test Level
	    # Start from the dispatch input ('1' when the input is missing/null)
	    CALC_LEVEL="$REQUESTED_LEVEL"

	    # Only force Level 3 if this is a direct PUSH to dev or a release branch
	    if [[ "$EVENT_NAME" == "push" ]]; then
	      if [[ "$REF_NAME" == "dev" || "$REF_NAME" =~ ^release_v.*_rocm$ ]]; then
	        echo "::notice::Push to monitored branch ($REF_NAME) detected. Forcing Level 3."
	        CALC_LEVEL="3"
	      fi
	    fi

	    # Publish the level to all later steps of this job as $TEST_LEVEL
	    echo "TEST_LEVEL=$CALC_LEVEL" >> "$GITHUB_ENV"

	    # Print Final Environment
	    echo "::group::Environment & Parameters"
	    echo "Final Test Level: $CALC_LEVEL"
	    echo "Event Name: $EVENT_NAME"
	    echo "Ref Name: $REF_NAME"
	    echo "Base Ref: $BASE_REF"
	    env | sort
	    echo "::endgroup::"

	- name: Select Docker Image Tag
	  id: select-image
	  # Branch names and dispatch inputs reach the script as environment variables
	  # rather than being expanded into the script text, so they are always treated
	  # as data by the shell.
	  env:
	    REF_NAME: ${{ github.ref_name }}
	    TARGET_BRANCH: ${{ github.base_ref || github.ref_name }}
	    USE_SOURCE_CONFIG: ${{ inputs.test_config_from_source }}
	    MANUAL_OVERRIDE: ${{ inputs.docker_image_override }}
	  run: |
	    # Determine config source
	    # By default the config is fetched from the 'dev' branch
	    CONFIG_BRANCH="dev"

	    # If a manual run requests the source config, switch branch
	    if [[ "$USE_SOURCE_CONFIG" == "true" ]]; then
	      CONFIG_BRANCH="$REF_NAME"
	      echo "::notice::Debugging mode: Fetching config from current branch ($CONFIG_BRANCH)"
	    fi

	    # Download config
	    CONFIG_URL="https://raw.githubusercontent.com/ROCm/TransformerEngine/${CONFIG_BRANCH}/ci/ci_config.json"
	    echo "Attempting to fetch image config from: $CONFIG_URL"

	    if curl -s -f -o docker_config.json "$CONFIG_URL"; then
	      echo "Successfully downloaded config from $CONFIG_BRANCH."
	    else
	      echo "::warning::Failed to fetch config from $CONFIG_BRANCH (File might not exist yet)."

	      # Fallback: Check source branch file
	      if [[ -f "ci/ci_config.json" ]]; then
	        echo "::notice::Falling back to local 'ci/ci_config.json' from checkout."
	        cp ci/ci_config.json docker_config.json
	      else
	        echo "::error::Config file not found in $CONFIG_BRANCH OR locally."
	        exit 1
	      fi
	    fi

	    # Determine image key: PR target branch if there is one, else the pushed ref
	    BRANCH_NAME="$TARGET_BRANCH"
	    echo "Determining image for branch: $BRANCH_NAME"

	    # Logic: Check if branch matches "release_vX.X".
	    # If so, look for that key in JSON. Otherwise default.
	    JSON_KEY="default"

	    if [[ $BRANCH_NAME =~ ^release_v([0-9]+\.[0-9]+)_rocm$ ]]; then
	      VERSION_KEY="release_v${BASH_REMATCH[1]}"
	      # Check if this specific version key exists in the JSON.
	      # The key is passed with --arg so it is never parsed as jq syntax.
	      if [[ $(jq --arg key "$VERSION_KEY" '.docker_images | has($key)' docker_config.json) == "true" ]]; then
	        JSON_KEY="$VERSION_KEY"
	      fi
	    fi

	    echo "Selected config key: $JSON_KEY"

	    # Extract image name from json; a missing or null entry yields an empty string
	    IMAGE_TO_USE=$(jq -r --arg key "$JSON_KEY" '.docker_images[$key] // empty' docker_config.json)

	    # Check input from workflow_dispatch overriding the image
	    if [[ -n "$MANUAL_OVERRIDE" ]]; then
	      echo "::notice::Manual override detected: $MANUAL_OVERRIDE"
	      IMAGE_TO_USE="$MANUAL_OVERRIDE"
	    fi

	    # Fail here with a clear message instead of letting 'docker pull' fail later
	    if [[ -z "$IMAGE_TO_USE" ]]; then
	      echo "::error::No Docker image found for key '$JSON_KEY' in the CI config and no override was given."
	      exit 1
	    fi

	    echo "Selected image: $IMAGE_TO_USE"
	    echo "image-tag=$IMAGE_TO_USE" >> "$GITHUB_OUTPUT"

	- name: Pull Docker Image
	  # The image reference may come from a free-text dispatch input, so it is
	  # passed through the environment and quoted rather than expanded into the script.
	  env:
	    IMAGE_TAG: ${{ steps.select-image.outputs.image-tag }}
	  run: |
	    docker pull "$IMAGE_TAG"

	- name: Run Container
	  # The image reference is passed through the environment and quoted so it is
	  # always a single argument to docker.
	  env:
	    IMAGE_TAG: ${{ steps.select-image.outputs.image-tag }}
	  run: |
	    # A container left behind by an interrupted run would make
	    # 'docker run --name te-runner' fail; remove it first (no-op when absent).
	    docker rm -f te-runner > /dev/null 2>&1 || true

	    # Start a detached container with the workspace mounted at /workspace and
	    # the GPU device nodes plus the host's render/video group ids passed through.
	    docker run -dt \
	      --name te-runner \
	      --network=host \
	      --device=/dev/dri --device=/dev/kfd \
	      --shm-size=16G \
	      --pid=host \
	      --group-add "$(getent group render | cut -d: -f3)" \
	      --group-add "$(getent group video | cut -d: -f3)" \
	      -v "${{ github.workspace }}:/workspace" \
	      -w /workspace \
	      "$IMAGE_TAG"

	- name: Container Diagnostics & GPU Setup
	  id: container-diag
	  run: |
	    echo "::group::Container Configuration"
	    # Check Shared Memory Size inside container
	    echo ">>> /dev/shm size:"
	    docker exec te-runner df -h /dev/shm

	    # Check OS/Kernel inside container
	    echo ">>> Container OS:"
	    docker exec te-runner cat /etc/os-release | grep PRETTY_NAME
	    echo "::endgroup::"

	    echo "::group::ROCm Diagnostics (Host vs Container)"
	    echo ">>> CONTAINER rocm-smi:"
	    docker exec te-runner rocm-smi || true
	    echo "::endgroup::"

	    # Determine Architecture
	    # Run rocminfo inside the container and capture the first gfx* token.
	    # '|| true' keeps a failed lookup from aborting the step right here (the
	    # step shell runs with -e), so the empty-result check below can report it.
	    ARCH=$(docker exec te-runner bash -c "rocminfo | grep -m 1 -oP 'gfx[0-9a-fA-F]+'" || true)

	    if [ -z "$ARCH" ]; then
	      echo "::error::Could not determine GPU architecture using rocminfo inside the container."
	      # Best-effort dump of the raw rocminfo output for debugging
	      docker exec te-runner rocminfo || true
	      exit 1
	    fi

	    echo "Detected GPU Arch: $ARCH"
	    # Exposed to later steps as steps.container-diag.outputs.arch
	    echo "arch=$ARCH" >> "$GITHUB_OUTPUT"

	- name: Build Project
	  run: |
	    # Build and install the project inside the running container, targeting
	    # the GPU architecture detected by the container-diag step.
	    docker exec \
	      --env GPU_ARCH=${{ steps.container-diag.outputs.arch }} \
	      te-runner bash -c "$(cat <<'BUILD_SCRIPT'
	      set -ex

	      # Both the PyTorch extension build and the TE build get the same arch
	      export NVTE_ROCM_ARCH=$GPU_ARCH
	      export PYTORCH_ROCM_ARCH=$GPU_ARCH
	      export HIP_PATH=""
	      export NVTE_AITER_PREBUILT_BASE_URL=https://compute-artifactory.amd.com:5000/artifactory/rocm-generic-local/te-ci/aiter-prebuilts

	      pip install ninja
	      # Tell git to trust every directory (the sources live on a mounted volume)
	      git config --global --add safe.directory '*'
	      pip install --no-build-isolation -v . 2>&1
	    BUILD_SCRIPT
	    )"

	- name: Run sGPU tests
	  id: sgpu-tests
	  continue-on-error: true
	  run: |
	    # Cleanup previous failure markers if any. Don't actually do anything on k8s pods
	    rm -f FAIL_*

	    # The step shell runs with -e: record the exit code instead of letting a
	    # failing 'docker exec' abort the script, so the step outputs below are
	    # still written. The recorded code is re-raised at the end.
	    sgpu_rc=0
	    docker exec \
	      -e TEST_SGPU=1 \
	      -e TEST_LEVEL="$TEST_LEVEL" \
	      te-runner bash -c "$(cat <<'EOF'
	      #!/usr/bin/bash
	      set -x -o pipefail
	      ulimit -c 0 # Disable core dumps

	      # debug output
	      ls -d /opt/rocm*
	      python --version
	      pip list | grep -E "transformer_e|torch|jax|numpy|ml_dtypes|typing_ext"

	      # Run the three suites in parallel, each pinned to its own GPU
	      HIP_VISIBLE_DEVICES=1 ci/pytorch.sh > /workspace/torch_sgpu.log 2>&1 &
	      torch_pid=$!; echo Pytorch test pid $!

	      HIP_VISIBLE_DEVICES=2 ci/jax.sh > /workspace/jax_sgpu.log 2>&1 &
	      jax_pid=$!; echo JAX test pid $!

	      HIP_VISIBLE_DEVICES=3 ci/core.sh > /workspace/core_sgpu.log 2>&1 &
	      core_pid=$!; echo Core test pid $!

	      wait $core_pid; core_rc=$?
	      wait $jax_pid; jax_rc=$?
	      wait $torch_pid; torch_rc=$?

	      # /workspace/FAIL_* files are for failure markers we can extract to the host runner and process later
	      # Check PyTorch
	      if [ $torch_rc -ne 0 ]; then
	        echo "::group::[FAILED] PyTorch sGPU Log"
	        cat /workspace/torch_sgpu.log
	        echo "::endgroup::"
	        echo "::error::Pytorch sGPU test FAILED."
	        touch /workspace/FAIL_TORCH_SGPU
	      fi

	      # Check JAX
	      if [ $jax_rc -ne 0 ]; then
	        echo "::group::[FAILED] JAX sGPU Log"
	        cat /workspace/jax_sgpu.log
	        echo "::endgroup::"
	        echo "::error::JAX sGPU test FAILED."
	        touch /workspace/FAIL_JAX_SGPU
	      fi

	      # Check Core
	      if [ $core_rc -ne 0 ]; then
	        echo "::group::[FAILED] Core sGPU Log"
	        cat /workspace/core_sgpu.log
	        echo "::endgroup::"
	        echo "::error::Core sGPU test FAILED."
	        touch /workspace/FAIL_CORE_SGPU
	      fi

	      # Script exit status: success only when all three suites passed
	      [ "$torch_rc" -eq 0 ] && [ "$jax_rc" -eq 0 ] && [ "$core_rc" -eq 0 ]
	    EOF
	    )" || sgpu_rc=$?

	    # Export failed tests statuses to host runner
	    if [ -f FAIL_TORCH_SGPU ]; then echo "torch=fail" >> "$GITHUB_OUTPUT"; fi
	    if [ -f FAIL_JAX_SGPU ]; then echo "jax=fail" >> "$GITHUB_OUTPUT"; fi
	    if [ -f FAIL_CORE_SGPU ]; then echo "core=fail" >> "$GITHUB_OUTPUT"; fi

	    # Preserve the failing step outcome after the outputs have been written
	    exit "$sgpu_rc"

	- name: Run mGPU tests
	  id: mgpu-tests
	  continue-on-error: true
	  run: |
	    # The step shell runs with -e: record the exit code instead of letting a
	    # failing 'docker exec' abort the script, so the step outputs below are
	    # still written. The recorded code is re-raised at the end.
	    mgpu_rc=0
	    docker exec \
	      -e TEST_MGPU=1 \
	      -e TEST_LEVEL="$TEST_LEVEL" \
	      te-runner bash -c "$(cat <<'EOF'
	      #!/usr/bin/bash
	      set -x -o pipefail
	      ulimit -c 0 # Disable core dumps

	      # Run PyTorch
	      ci/pytorch.sh > /workspace/torch_mgpu.log 2>&1
	      torch_rc=$?

	      # Run JAX
	      ci/jax.sh > /workspace/jax_mgpu.log 2>&1
	      jax_rc=$?

	      # /workspace/FAIL_* files are for failure markers we can extract to the host runner and process later
	      if [ $torch_rc -ne 0 ]; then
	        echo "::group::[FAILED] PyTorch mGPU Log"
	        cat /workspace/torch_mgpu.log
	        echo "::endgroup::"
	        echo "::error::Pytorch mGPU test FAILED."
	        touch /workspace/FAIL_TORCH_MGPU
	      fi

	      if [ $jax_rc -ne 0 ]; then
	        echo "::group::[FAILED] JAX mGPU Log"
	        cat /workspace/jax_mgpu.log
	        echo "::endgroup::"
	        echo "::error::JAX mGPU test FAILED."
	        touch /workspace/FAIL_JAX_MGPU
	      fi

	      # Script exit status: success only when both suites passed
	      [ "$torch_rc" -eq 0 ] && [ "$jax_rc" -eq 0 ]
	    EOF
	    )" || mgpu_rc=$?

	    # Export failed tests statuses to host runner
	    if [ -f FAIL_TORCH_MGPU ]; then echo "torch=fail" >> "$GITHUB_OUTPUT"; fi
	    if [ -f FAIL_JAX_MGPU ]; then echo "jax=fail" >> "$GITHUB_OUTPUT"; fi

	    # Preserve the failing step outcome after the outputs have been written
	    exit "$mgpu_rc"

	- name: Run Examples
	  id: examples-tests
	  continue-on-error: true
	  run: |
	    # Run the example programs inside the container. The script stops at the
	    # first failing command (set -e with pipefail), and all output is collected
	    # in one log file: created by the first command, appended to by the rest.
	    docker exec te-runner bash -c "$(cat <<'EXAMPLES_SCRIPT'
	      #!/usr/bin/bash
	      set -ex -o pipefail
	      ulimit -c 0 # Disable core dumps

	      examples_log=/workspace/examples.log

	      # PyTorch MNIST: baseline, TE and FP8 variants
	      cd /workspace/examples/pytorch/mnist
	      python main.py 2>&1 | tee "$examples_log"
	      python main.py --use-te 2>&1 | tee -a "$examples_log"
	      python main.py --use-fp8 2>&1 | tee -a "$examples_log"

	      # JAX MNIST: baseline, TE and FP8 variants
	      cd /workspace/examples/jax/mnist
	      pip3 install -r requirements.txt
	      python test_single_gpu_mnist.py 2>&1 | tee -a "$examples_log"
	      python test_single_gpu_mnist.py --use-te 2>&1 | tee -a "$examples_log"
	      python test_single_gpu_mnist.py --use-fp8 2>&1 | tee -a "$examples_log"

	      # JAX encoder: baseline and FP8 variants
	      cd /workspace/examples/jax/encoder
	      pip3 install -r requirements.txt
	      python test_single_gpu_encoder.py 2>&1 | tee -a "$examples_log"
	      python test_single_gpu_encoder.py --use-fp8 2>&1 | tee -a "$examples_log"
	    EXAMPLES_SCRIPT
	    )"

	- name: Check Test Failure Status
	  if: always()
	  # "outcome" is the step result before continue-on-error is applied, so it is
	  # 'failure' even though the test steps let the job continue.
	  env:
	    SGPU_OUTCOME: ${{ steps.sgpu-tests.outcome }}
	    MGPU_OUTCOME: ${{ steps.mgpu-tests.outcome }}
	    EXAMPLES_OUTCOME: ${{ steps.examples-tests.outcome }}
	  run: |
	    EXIT_STATUS=0
	    SGPU_REPORTED=0
	    MGPU_REPORTED=0

	    # sGPU CHECKS
	    # Marker files written by the test script name the suite that failed.
	    if [[ -f FAIL_CORE_SGPU ]]; then
	      echo "::error::Core sGPU Tests Failed."
	      SGPU_REPORTED=1
	      EXIT_STATUS=1
	    fi
	    if [[ -f FAIL_TORCH_SGPU ]]; then
	      echo "::error::PyTorch sGPU Tests Failed."
	      SGPU_REPORTED=1
	      EXIT_STATUS=1
	    fi
	    if [[ -f FAIL_JAX_SGPU ]]; then
	      echo "::error::JAX sGPU Tests Failed."
	      SGPU_REPORTED=1
	      EXIT_STATUS=1
	    fi
	    # A step that failed before any marker was written must still fail the job.
	    if [[ "$SGPU_OUTCOME" == "failure" && "$SGPU_REPORTED" == "0" ]]; then
	      echo "::error::sGPU test step failed before any failure marker was written."
	      EXIT_STATUS=1
	    fi

	    # mGPU CHECKS
	    if [[ -f FAIL_TORCH_MGPU ]]; then
	      echo "::error::PyTorch mGPU Tests Failed."
	      MGPU_REPORTED=1
	      EXIT_STATUS=1
	    fi
	    if [[ -f FAIL_JAX_MGPU ]]; then
	      echo "::error::JAX mGPU Tests Failed."
	      MGPU_REPORTED=1
	      EXIT_STATUS=1
	    fi
	    if [[ "$MGPU_OUTCOME" == "failure" && "$MGPU_REPORTED" == "0" ]]; then
	      echo "::error::mGPU test step failed before any failure marker was written."
	      EXIT_STATUS=1
	    fi

	    # EXAMPLES CHECK
	    # Examples script does not use marker files, so we rely on step outcome
	    if [[ "$EXAMPLES_OUTCOME" == "failure" ]]; then
	      echo "::error::Example Tests Failed."
	      EXIT_STATUS=1
	    fi

	    # Fail the job if any errors were detected
	    if [[ "$EXIT_STATUS" == "1" ]]; then
	      exit 1
	    fi

	- name: Copy logs and reports from container
	  if: always()
	  run: |
	    # Best effort: a log that does not exist is skipped without failing the step.
	    for log_name in torch_sgpu jax_sgpu core_sgpu torch_mgpu jax_mgpu; do
	      docker cp "te-runner:/workspace/${log_name}.log" "./${log_name}.log" || true
	    done

	# Publish every *.log in the workspace as one artifact, whatever the job result;
	# having no log files at all is not treated as an error.
	- name: Upload logs and test reports
	  uses: actions/upload-artifact@v4
	  if: always()
	  with:
	    name: logs-and-reports
	    retention-days: 5
	    if-no-files-found: ignore
	    path: |
	      *.log

	- name: Cleanup container
	  if: always()
	  run: |
	    # Force-remove the test container; never fail the job over cleanup.
	    docker rm -f te-runner || true

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Enable gfx950 CI on dev branch #150

Workflow file

Enable gfx950 CI on dev branch #150

Uh oh!

Workflow file for this run