Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
48 changes: 45 additions & 3 deletions .github/workflows/rocm-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -179,24 +179,36 @@ jobs:

- name: Run sGPU tests in parallel (pytorch, jax, examples, core)
id: run-tests
# Below the job's timeout-minutes so an overrun kills only this step;
# the `if: always()` report + upload steps still run (artifacts survive).
timeout-minutes: 330
env:
HF_TOKEN: ${{ secrets.HF_TOKEN }}
run: |
rm -f FAIL_*
# One subdir per suite so PyTorch/JAX/Core get independent reports and
# never collide on identically-named XML (e.g. test_sanity_import.auto.xml
# exists in both the torch and jax suites and runs in parallel).
rm -rf test-results && mkdir -p test-results/torch test-results/jax test-results/core

docker exec \
-e TEST_SGPU=1 \
-e TEST_LEVEL=${{ env.TEST_LEVEL }} \
-e JUNITXML_PREFIX=/workspace/test-results/ \
-e JUNITXML_SUFFIX=.xml \
-e HF_TOKEN="$HF_TOKEN" \
te-runner bash -c "$(cat <<'EOF'
#!/usr/bin/bash
set -x -o pipefail
ulimit -c 0 # Disable core dumps

HIP_VISIBLE_DEVICES=0 ci/pytorch.sh > /workspace/torch.log 2>&1 &
# Per-suite JUNITXML_PREFIX overrides the docker-exec default so each
# suite writes into its own subdir (inline VAR=val applies only to that
# backgrounded subprocess; the sourced _utils.sh reads it from the env).
HIP_VISIBLE_DEVICES=0 JUNITXML_PREFIX=/workspace/test-results/torch/ ci/pytorch.sh > /workspace/torch.log 2>&1 &
TORCH_PID=$!

HIP_VISIBLE_DEVICES=1 ci/jax.sh > /workspace/jax.log 2>&1 &
HIP_VISIBLE_DEVICES=1 JUNITXML_PREFIX=/workspace/test-results/jax/ ci/jax.sh > /workspace/jax.log 2>&1 &
JAX_PID=$!

(
Expand Down Expand Up @@ -226,7 +238,7 @@ jobs:
) > /workspace/examples.log 2>&1 &
EXAMPLES_PID=$!

HIP_VISIBLE_DEVICES=3 ci/core.sh > /workspace/core.log 2>&1 &
HIP_VISIBLE_DEVICES=3 JUNITXML_PREFIX=/workspace/test-results/core/ ci/core.sh > /workspace/core.log 2>&1 &
CORE_PID=$!

wait $TORCH_PID; torch_rc=$?
Expand Down Expand Up @@ -270,6 +282,20 @@ jobs:
EOF
)"

# Publish one JUnit summary per sGPU suite (PyTorch, JAX, Core).
# Runs even when the test step failed or timed out (`if: always()`).
- name: Generate test report
  if: always()
  run: |
    command -v python3 >/dev/null 2>&1 || { echo "python3 not available; skipping report"; exit 0; }
    # One report per suite; each appends its own section to the job summary.
    # An empty subdir (suite crashed before writing XML) yields an explicit
    # "no results" warning attributable to that suite.
    #
    # The step shell runs with `-e`, so a bare failing command would abort the
    # step and silently drop the reports for the suites that follow. Capture
    # each status instead, always attempt all three reports, and surface the
    # failure only once every suite has been reported.
    rc=0
    python3 ci/junit_report.py test-results/torch \
      --title "sGPU PyTorch (${{ matrix.arch_label }})" || rc=$?
    python3 ci/junit_report.py test-results/jax \
      --title "sGPU JAX (${{ matrix.arch_label }})" || rc=$?
    python3 ci/junit_report.py test-results/core \
      --title "sGPU Core (${{ matrix.arch_label }})" || rc=$?
    exit "$rc"

- name: Check suite failure status
if: always()
run: |
Expand Down Expand Up @@ -299,6 +325,7 @@ jobs:
name: logs-sgpu-${{ matrix.arch_label }}
path: |
*.log
test-results/**/*.xml
if-no-files-found: ignore
retention-days: 5

Expand Down Expand Up @@ -378,6 +405,9 @@ jobs:

- name: Run mGPU tests
id: mgpu-tests
# Below the job's timeout-minutes so an overrun kills only this step;
# the `if: always()` report + upload steps still run (artifacts survive).
timeout-minutes: 330
env:
HF_TOKEN: ${{ secrets.HF_TOKEN }}
run: |
Expand All @@ -387,13 +417,17 @@ jobs:
*) echo "::error::Unknown framework: ${{ matrix.framework }}"; exit 1 ;;
esac

rm -rf test-results && mkdir -p test-results

docker exec \
-e TEST_MGPU=1 \
-e TEST_LEVEL=${{ env.TEST_LEVEL }} \
-e TEST_SCRIPT=$TEST_SCRIPT \
-e LOG_FILE=$LOG_FILE \
-e SUITE_NAME=$SUITE_NAME \
-e NVTE_FRAMEWORK=${{ matrix.framework }} \
-e JUNITXML_PREFIX=/workspace/test-results/ \
-e JUNITXML_SUFFIX=.xml \
-e HF_TOKEN="$HF_TOKEN" \
te-runner bash -c "$(cat <<'EOF'
#!/usr/bin/bash
Expand All @@ -414,13 +448,21 @@ jobs:
EOF
)"

# Summarize the JUnit XML found under test-results/ for this matrix entry.
# `if: always()` keeps the report running when the mGPU test step failed or
# hit its step timeout. The step exits 0 without reporting when python3 is
# not on PATH. The title reads "Torch" when matrix.framework is 'pytorch'
# and "JAX" for any other value.
- name: Generate test report
if: always()
run: |
command -v python3 >/dev/null 2>&1 || { echo "python3 not available; skipping report"; exit 0; }
python3 ci/junit_report.py test-results \
--title "mGPU ${{ matrix.framework == 'pytorch' && 'Torch' || 'JAX' }} (${{ matrix.arch_label }})"

- name: Upload logs
if: always()
uses: actions/upload-artifact@v4
with:
name: logs-mgpu-${{ matrix.arch_label }}-${{ matrix.framework }}
path: |
*.log
test-results/*.xml
if-no-files-found: ignore
retention-days: 5

Expand Down
2 changes: 1 addition & 1 deletion ci/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ The scripts return 0 in case of test success, and other values for testing error
The scripts can be controlled by environment variables:
* `TEST_LEVEL` specifies testing thoroughness. Levels 1 and 3 are currently defined and are intended for feature-branch and main-branch runs, respectively. Default=99 (maximal thoroughness)
* `TEST_SGPU` and `TEST_MGPU` instruct the scripts to run only single-GPU tests or only multi-GPU tests, respectively. This can be used to run several sGPU test suites in parallel on a multi-GPU configuration.
* `JUNITXML_PREFIX` and `JUNITXML_SUFFIX` enable pytest (pytorch and jax) junitxml logging if set. Each test will generate a junitxml log with the full filename `JUNITXML_PREFIX<test_name>.<test_config>JUNITXML_SUFFIX`.
* `JUNITXML_PREFIX` and `JUNITXML_SUFFIX` enable JUnit XML logging if set, for both pytest (pytorch and jax) and ctest (core). Each test run generates a JUnit XML log with the full filename `JUNITXML_PREFIX<test_name>.<test_config>JUNITXML_SUFFIX` (for core, `<test_name>.<test_config>` is `core.gemm` / `core.nongemm`).
If `JUNITXML_PREFIX` contains a path component, it is the caller's responsibility to create the necessary directories.
If `JUNITXML_PREFIX` contains only a directory (no filename prefix), it should end with `/`.
Test scripts do not add any extension to the log filename so it is advised to end `JUNITXML_SUFFIX` with `.xml`.
Expand Down
28 changes: 27 additions & 1 deletion ci/_utils.sh
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,21 @@ TEST_START_TS=`date +%s`
#To disable some logs trimming
export CI=1

# Defaults that make crashes and hangs visible and bounded.
#
#   PYTHONFAULTHANDLER     always set to 1: Python prints a traceback when it
#                          dies on a fatal signal such as a segfault.
#   PYTEST_TIMEOUT         seconds allowed per pytest item (per parametrization),
#                          so one hung test is recorded as a failure with a
#                          traceback instead of stalling the whole CI job.
#   PYTEST_TIMEOUT_METHOD  how pytest enforces that limit; 'thread' unsticks a
#                          hung main thread but only bounds the pytest process.
#   CTEST_TIMEOUT          seconds allowed per C++ test.
#
# Tests that spawn torchrun/mpirun children (tests/pytorch/distributed) are
# reaped separately by tests/pytorch/distributed/conftest.py, which reads
# PYTEST_TIMEOUT to bound each child below the outer limit -- that is why the
# values are exported rather than kept shell-local.
#
# The three timeout settings keep any non-empty value already present in the
# environment; the defaults below apply only when they are unset or empty.
: "${PYTEST_TIMEOUT:=300}"
: "${PYTEST_TIMEOUT_METHOD:=thread}"
: "${CTEST_TIMEOUT:=300}"
PYTHONFAULTHANDLER=1
export PYTHONFAULTHANDLER PYTEST_TIMEOUT PYTEST_TIMEOUT_METHOD CTEST_TIMEOUT

_script_error_count=0
_run_error_count=0
_ignored_error_count=0
Expand Down Expand Up @@ -213,6 +228,12 @@ get_pytest_junitxml() {
fi
}

# Print the ctest option that enables JUnit XML output, or nothing at all.
# Arguments: $1 - report name placed between JUNITXML_PREFIX and JUNITXML_SUFFIX
# Globals:   JUNITXML_PREFIX, JUNITXML_SUFFIX (read)
# Outputs:   "--output-junit <prefix><name><suffix>" on stdout when either
#            variable is non-empty; no output otherwise.
# The result is meant to be expanded unquoted by the caller so that the option
# and its path become two separate arguments (and disappear when empty).
get_ctest_junitxml() {

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please also update README to indicate that core tests honor JUNITXML* envs

@Micky774 Micky774 Jun 5, 2026

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Updated

# Either variable being non-empty is enough to switch JUnit output on.
if [ -n "$JUNITXML_PREFIX$JUNITXML_SUFFIX" ]; then
echo "--output-junit ${JUNITXML_PREFIX}$1${JUNITXML_SUFFIX}"
fi
}

check_test_filter() {
test -z "$TEST_FILTER" && return 0
for _tf in $TEST_FILTER; do
Expand Down Expand Up @@ -266,7 +287,12 @@ pytest_run() {
check_test_filter $_test_name_tag || return
_start_ts=`date +%s`
echo "Run [$_test_variant_tag] $@ at `time_elapsed $TEST_START_TS`"
python3 -m pytest -v -rfEs `get_pytest_junitxml $_test_name_tag` $TEST_PYTEST_ARGS "$TEST_DIR/$@"
# A per-test timeout is applied to every item. Callers may still append their
# own --timeout/--timeout-method (e.g. distributed tests); since argparse
# takes the last value, a caller-supplied override wins over these defaults.
python3 -m pytest -v -rfEs \
--timeout=$PYTEST_TIMEOUT --timeout-method=$PYTEST_TIMEOUT_METHOD \
`get_pytest_junitxml $_test_name_tag` $TEST_PYTEST_ARGS "$TEST_DIR/$@"
test $? -eq 0 || test_run_error "[$_test_variant_tag] $1"
echo "Done [$_test_variant_tag] $1 in `time_elapsed $_start_ts`"
}
6 changes: 4 additions & 2 deletions ci/core.sh
Original file line number Diff line number Diff line change
Expand Up @@ -31,14 +31,16 @@ fi
# Non-GEMM pass: runs only when check_test_filter accepts the "nongemm" tag.
check_test_filter "nongemm"
if [ $? -eq 0 ]; then
echo ===== Run non GEMM tests =====
# NOTE(review): this one-line ctest call and the two-line call after it look
# like the before/after sides of the rendered diff -- confirm that only the
# second form (with --timeout and JUnit output) is meant to run.
ctest --test-dir build -j"$n_parallel_jobs" -V --output-on-failure -E "GEMMTestSuite"
# -E excludes tests whose names match "GEMMTestSuite". --timeout applies
# CTEST_TIMEOUT as the default per-test limit. get_ctest_junitxml is expanded
# unquoted on purpose so "--output-junit <path>" splits into two arguments.
ctest --test-dir build -j"$n_parallel_jobs" -V --output-on-failure \
--timeout $CTEST_TIMEOUT $(get_ctest_junitxml core.nongemm) -E "GEMMTestSuite"
# A non-zero ctest status is reported through test_run_error.
test $? -eq 0 || test_run_error "non-GEMM"
fi

# GEMM pass: same shape as above, but -R selects only tests whose names match
# "GEMMTestSuite", and the JUnit report is named core.gemm.
check_test_filter "gemm"
if [ $? -eq 0 ]; then
echo ===== Run GEMM tests =====
# NOTE(review): as above, this line appears to be the pre-change side of the
# diff -- confirm it is not intended to run alongside the next command.
ctest --test-dir build -j"$n_parallel_jobs" -V --output-on-failure -R "GEMMTestSuite"
ctest --test-dir build -j"$n_parallel_jobs" -V --output-on-failure \
--timeout $CTEST_TIMEOUT $(get_ctest_junitxml core.gemm) -R "GEMMTestSuite"
test $? -eq 0 || test_run_error "GEMM"
fi

Expand Down
Loading
Loading