From c38e2ba2df7e34dd5f333f5b53fbe321003157c4 Mon Sep 17 00:00:00 2001 From: gasoonjia Date: Mon, 1 Jun 2026 13:59:16 -0700 Subject: [PATCH] [ci][cuda] Consolidate A100 CUDA CI jobs into shared batch runners Summary: Collapse the four A100 runners (per-model export + e2e for the two A100-only models) into two shared single-runner batch jobs driven by a shared config file. Each model is processed independently with per-model pass/fail reporting; a job is green only if all its models pass, and one model's failure never blocks another. - Add .ci/scripts/cuda_a100_models.txt: shared source of truth for which models need an A100 runner (one line per model). - Add .ci/scripts/export_model_artifact_batch.sh: loops the config, delegates to export_model_artifact.sh per model under set +e, writes per-model _status files + logs, prints a summary section (stdout + GITHUB_STEP_SUMMARY), emits ::error:: annotations, exits non-zero if any model failed. - Add .ci/scripts/test_model_e2e_batch.sh: runs install_requirements.sh once, reads each model's export status (skips with surfaced error if export failed), delegates to test_model_e2e.sh per model, prints the same style of summary, exits non-zero unless every model fully passed. - test_model_e2e.sh: add backward-compatible SKIP_INSTALL_REQUIREMENTS guard so the batch can run runtime setup once. - cuda.yml: remove the two SocialLocalMobile A100 models (and their exclude rows) from the per-cell export/e2e matrices and simplify those runners back to plain A10G; add new export-models-cuda-a100 and test-models-cuda-a100-e2e jobs on linux.aws.a100 (timeout 150). The e2e job uses if: always() so exported models are still tested on a partial export failure. Test Plan: - bash -n on all three scripts; cuda.yml has no editor diagnostics. 
- Local smoke tests of both batch orchestrators with stubbed per-model scripts: confirmed loop continues past failures, correct _status files, summaries, ::error:: + step-summary output, SKIP_INSTALL_REQUIREMENTS propagation, export-failed->skipped path, and exit codes (0 all-pass, 1 on any failure / empty / missing config). --- .ci/scripts/cuda_a100_models.txt | 23 ++ .ci/scripts/export_model_artifact_batch.sh | 209 +++++++++++++++++ .ci/scripts/test_model_e2e.sh | 16 +- .ci/scripts/test_model_e2e_batch.sh | 255 +++++++++++++++++++++ .github/workflows/cuda.yml | 113 +++++---- 5 files changed, 566 insertions(+), 50 deletions(-) create mode 100644 .ci/scripts/cuda_a100_models.txt create mode 100755 .ci/scripts/export_model_artifact_batch.sh create mode 100755 .ci/scripts/test_model_e2e_batch.sh diff --git a/.ci/scripts/cuda_a100_models.txt b/.ci/scripts/cuda_a100_models.txt new file mode 100644 index 00000000000..426702c52e7 --- /dev/null +++ b/.ci/scripts/cuda_a100_models.txt @@ -0,0 +1,23 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. +# +# Shared source of truth for the CUDA models that require an A100 runner. +# +# These models are exported and inference-tested on a single A100 runner each +# (one shared export job + one shared e2e job) by: +# .ci/scripts/export_model_artifact_batch.sh +# .ci/scripts/test_model_e2e_batch.sh +# both driven from .github/workflows/cuda.yml. +# +# Format: one model per line, whitespace-separated: +# <hf_org>/<hf_model_name> <quant> +# Blank lines and lines starting with '#' are ignored. +# +# To onboard a new A100 model, add a single line below. Its HF model id must +# already be handled by both .ci/scripts/export_model_artifact.sh and +# .ci/scripts/test_model_e2e.sh (their `case "$HF_MODEL"` blocks).
+SocialLocalMobile/Qwen3.5-35B-A3B-HQQ-INT4 quantized-int4-tile-packed
+SocialLocalMobile/gemma-4-31B-it-HQQ-INT4 quantized-int4-tile-packed
diff --git a/.ci/scripts/export_model_artifact_batch.sh b/.ci/scripts/export_model_artifact_batch.sh
new file mode 100755
index 00000000000..c1b1650f9e0
--- /dev/null
+++ b/.ci/scripts/export_model_artifact_batch.sh
@@ -0,0 +1,209 @@
+#!/bin/bash
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+#
+# Batch export orchestrator for the CUDA models that share a single A100 runner.
+#
+# Loops over a shared config file (see .ci/scripts/cuda_a100_models.txt) and
+# exports each model by delegating to the existing per-model script
+# .ci/scripts/export_model_artifact.sh. Each model is exported independently:
+# a failure in one model never aborts the others. Per-model outcomes are
+# recorded under <output_root>/_status/ so the downstream e2e batch job can
+# decide which models to run, and a summary section is printed at the end (and
+# mirrored to $GITHUB_STEP_SUMMARY when available).
+#
+# This script exits non-zero if ANY model failed to export, so the CI job is
+# green only when every model exported successfully.
+
+show_help() {
+  cat << EOF
+Usage: export_model_artifact_batch.sh <device> <config_file> <output_root>
+
+Export every model listed in <config_file> to <device> format, writing each
+model's artifacts into <output_root>/<model_safe>/ and per-model status into
+<output_root>/_status/.
+
+Arguments:
+  device       cuda, metal, or xnnpack (required) — passed through to
+               export_model_artifact.sh.
+  config_file  Path to the shared model list (required). One model per line:
+                 <hf_org>/<hf_model_name> <quant>
+               Blank lines and lines starting with '#' are ignored.
+  output_root  Root directory for artifacts + status (required). Typically
+               \${RUNNER_ARTIFACT_DIR} in CI.
+
+Example:
+  export_model_artifact_batch.sh cuda .ci/scripts/cuda_a100_models.txt "\${RUNNER_ARTIFACT_DIR}"
+EOF
+}
+
+if [ "${1:-}" = "-h" ] || [ "${1:-}" = "--help" ]; then
+  show_help
+  exit 0
+fi
+
+DEVICE="${1:-}"
+CONFIG_FILE="${2:-}"
+OUTPUT_ROOT="${3:-}"
+
+if [ -z "$DEVICE" ] || [ -z "$CONFIG_FILE" ] || [ -z "$OUTPUT_ROOT" ]; then
+  echo "Error: missing required argument(s)"
+  show_help
+  exit 1
+fi
+
+if [ ! -f "$CONFIG_FILE" ]; then
+  echo "Error: config file not found: $CONFIG_FILE"
+  exit 1
+fi
+
+# Locate the per-model export script next to this one.
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+EXPORT_SCRIPT="$SCRIPT_DIR/export_model_artifact.sh"
+if [ ! -f "$EXPORT_SCRIPT" ]; then
+  echo "Error: export_model_artifact.sh not found at $EXPORT_SCRIPT"
+  exit 1
+fi
+
+STATUS_DIR="$OUTPUT_ROOT/_status"
+mkdir -p "$STATUS_DIR"
+
+# Track per-model results (parallel arrays; bash 3.2 compatible).
+MODELS=()
+QUANTS=()
+STATUSES=()
+
+OVERALL_RC=0
+
+while IFS= read -r raw_line || [ -n "$raw_line" ]; do
+  # Strip leading/trailing whitespace.
+  line="$(echo "$raw_line" | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//')"
+  # Skip blanks and comments.
+  case "$line" in
+    ''|\#*) continue ;;
+  esac
+
+  # First field = hf_model, second field = quant (default non-quantized).
+  HF_MODEL="$(echo "$line" | awk '{print $1}')"
+  QUANT="$(echo "$line" | awk '{print $2}')"
+  if [ -z "$QUANT" ]; then
+    QUANT="non-quantized"
+  fi
+
+  MODEL_SAFE="$(echo "$HF_MODEL" | tr '/' '_')"
+  MODEL_OUT="$OUTPUT_ROOT/$MODEL_SAFE"
+  LOG_FILE="$STATUS_DIR/$MODEL_SAFE.export.log"
+  STATUS_FILE="$STATUS_DIR/$MODEL_SAFE.export.status"
+  mkdir -p "$MODEL_OUT"
+
+  echo "::group::Export $HF_MODEL ($QUANT)"
+  # Run the existing per-model script as a subprocess so its `set -e`, traps,
+  # and cwd changes are isolated and a failure never aborts this loop.
+  set +e # stdin below is /dev/null so the child cannot consume the config lines this loop is reading
+  bash "$EXPORT_SCRIPT" "$DEVICE" "$HF_MODEL" "$QUANT" "$MODEL_OUT" < /dev/null 2>&1 | tee "$LOG_FILE"
+  RC=${PIPESTATUS[0]}
+  set -e
+  echo "::endgroup::"
+
+  MODELS+=("$HF_MODEL")
+  QUANTS+=("$QUANT")
+  if [ "$RC" -eq 0 ]; then
+    echo "success" > "$STATUS_FILE"
+    STATUSES+=("success")
+    echo "Export succeeded: $HF_MODEL ($QUANT)"
+  else
+    echo "failed" > "$STATUS_FILE"
+    STATUSES+=("failed")
+    OVERALL_RC=1
+    echo "Export FAILED: $HF_MODEL ($QUANT) (exit $RC)"
+  fi
+done < "$CONFIG_FILE"
+
+if [ "${#MODELS[@]}" -eq 0 ]; then
+  echo "Error: no models found in config file: $CONFIG_FILE"
+  exit 1
+fi
+
+# ----------------------------------------------------------------------------
+# Summary section
+# ----------------------------------------------------------------------------
+emit_summary() {
+  # $1 = output sink: "stdout" or "github"
+  local sink="$1"
+  local i status_icon
+  if [ "$sink" = "github" ]; then
+    echo "## A100 export summary"
+    echo ""
+    echo "| Model | Quant | Export |"
+    echo "| --- | --- | --- |"
+    for i in "${!MODELS[@]}"; do
+      if [ "${STATUSES[$i]}" = "success" ]; then status_icon="✅ success"; else status_icon="❌ failed"; fi
+      echo "| ${MODELS[$i]} | ${QUANTS[$i]} | ${status_icon} |"
+    done
+    echo ""
+  else
+    echo "============================================================"
+    echo "A100 export summary"
+    echo "============================================================"
+    for i in "${!MODELS[@]}"; do
+      printf ' %-45s %-28s %s\n' "${MODELS[$i]}" "${QUANTS[$i]}" "${STATUSES[$i]}"
+    done
+    echo "============================================================"
+  fi
+}
+
+echo ""
+emit_summary stdout
+
+# Print the error tail for each failed model and emit GitHub annotations.
+for i in "${!MODELS[@]}"; do
+  if [ "${STATUSES[$i]}" != "success" ]; then
+    MODEL_SAFE="$(echo "${MODELS[$i]}" | tr '/' '_')"
+    LOG_FILE="$STATUS_DIR/$MODEL_SAFE.export.log"
+    echo ""
+    echo "----- Error tail for ${MODELS[$i]} (${QUANTS[$i]}) -----"
+    if [ -f "$LOG_FILE" ]; then
+      tail -n 40 "$LOG_FILE"
+    fi
+    # One-line annotation for the CI UI; `|| true` keeps a no-match grep from aborting if the sourcing shell has pipefail on.
+    ERR_LINE=""
+    if [ -f "$LOG_FILE" ]; then
+      ERR_LINE="$(grep -E -i 'error|fail' "$LOG_FILE" | tail -n 1 || true)"
+      if [ -z "$ERR_LINE" ]; then
+        ERR_LINE="$(tail -n 1 "$LOG_FILE")"
+      fi
+    fi
+    echo "::error::Export failed for ${MODELS[$i]} (${QUANTS[$i]}): ${ERR_LINE}"
+  fi
+done
+
+# Mirror summary to the GitHub job summary panel when available.
+if [ -n "${GITHUB_STEP_SUMMARY:-}" ]; then
+  {
+    emit_summary github
+    for i in "${!MODELS[@]}"; do
+      if [ "${STATUSES[$i]}" != "success" ]; then
+        echo "
<details><summary>Error tail: ${MODELS[$i]} (${QUANTS[$i]})</summary>"
+        echo "" # blank line so the fenced block below is rendered as markdown inside <details>
+        echo '```'
+        MODEL_SAFE="$(echo "${MODELS[$i]}" | tr '/' '_')"
+        LOG_FILE="$STATUS_DIR/$MODEL_SAFE.export.log"
+        if [ -f "$LOG_FILE" ]; then tail -n 40 "$LOG_FILE"; fi
+        echo '```'
+        echo "</details>
" + echo "" + fi + done + } >> "$GITHUB_STEP_SUMMARY" +fi + +if [ "$OVERALL_RC" -ne 0 ]; then + echo "One or more models failed to export." +else + echo "All models exported successfully." +fi + +exit "$OVERALL_RC" diff --git a/.ci/scripts/test_model_e2e.sh b/.ci/scripts/test_model_e2e.sh index e1ba976b0cc..83e0fe563bd 100755 --- a/.ci/scripts/test_model_e2e.sh +++ b/.ci/scripts/test_model_e2e.sh @@ -247,10 +247,18 @@ case "$HF_MODEL" in ;; esac -echo "::group::Setup ExecuTorch Requirements" -./install_requirements.sh -pip list -echo "::endgroup::" +# Allow callers (e.g. the A100 batch orchestrator) to run ./install_requirements.sh +# once up front and skip the per-model install here. Default behavior is unchanged. +if [ -n "${SKIP_INSTALL_REQUIREMENTS:-}" ]; then + echo "::group::Setup ExecuTorch Requirements (skipped: SKIP_INSTALL_REQUIREMENTS set)" + pip list + echo "::endgroup::" +else + echo "::group::Setup ExecuTorch Requirements" + ./install_requirements.sh + pip list + echo "::endgroup::" +fi echo "::group::Prepare $MODEL_NAME Artifacts" diff --git a/.ci/scripts/test_model_e2e_batch.sh b/.ci/scripts/test_model_e2e_batch.sh new file mode 100755 index 00000000000..a1d04f7d9c6 --- /dev/null +++ b/.ci/scripts/test_model_e2e_batch.sh @@ -0,0 +1,255 @@ +#!/bin/bash +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. +# +# Batch end-to-end inference orchestrator for the CUDA models that share a +# single A100 runner. +# +# Loops over a shared config file (see .ci/scripts/cuda_a100_models.txt) and +# runs each model's e2e test by delegating to the existing per-model script +# .ci/scripts/test_model_e2e.sh. The shared runtime setup +# (./install_requirements.sh) runs once up front; each per-model invocation is +# told to skip it via SKIP_INSTALL_REQUIREMENTS. 
+#
+# Models are tested independently: a failure (or a missing/failed export) in
+# one model never aborts the others. For each model we first consult the export
+# status written by export_model_artifact_batch.sh under
+# <artifact_root>/_status/<model_safe>.export.status:
+#   - if export did not succeed, the model is recorded as "skipped: export
+#     failed" (its export error is surfaced) and the run continues;
+#   - otherwise the per-model e2e test runs.
+#
+# A summary section is printed at the end (and mirrored to $GITHUB_STEP_SUMMARY
+# when available). This script exits non-zero unless EVERY model both exported
+# and passed its e2e test, so the CI job is green only when all models pass.
+
+show_help() {
+  cat << EOF
+Usage: test_model_e2e_batch.sh <device> <config_file> <artifact_root>
+
+Run end-to-end inference tests for every model listed in <config_file>, reading
+each model's artifacts from <artifact_root>/<model_safe>/ and its export status
+from <artifact_root>/_status/.
+
+Arguments:
+  device         cuda, metal, or xnnpack (required) — passed through to
+                 test_model_e2e.sh.
+  config_file    Path to the shared model list (required). One model per line:
+                   <hf_org>/<hf_model_name> <quant>
+                 Blank lines and lines starting with '#' are ignored.
+  artifact_root  Root directory containing per-model artifacts + _status/
+                 (required). Typically \${RUNNER_ARTIFACT_DIR} in CI.
+
+Example:
+  test_model_e2e_batch.sh cuda .ci/scripts/cuda_a100_models.txt "\${RUNNER_ARTIFACT_DIR}"
+EOF
+}
+
+if [ "${1:-}" = "-h" ] || [ "${1:-}" = "--help" ]; then
+  show_help
+  exit 0
+fi
+
+DEVICE="${1:-}"
+CONFIG_FILE="${2:-}"
+ARTIFACT_ROOT="${3:-}"
+
+if [ -z "$DEVICE" ] || [ -z "$CONFIG_FILE" ] || [ -z "$ARTIFACT_ROOT" ]; then
+  echo "Error: missing required argument(s)"
+  show_help
+  exit 1
+fi
+
+if [ ! -f "$CONFIG_FILE" ]; then
+  echo "Error: config file not found: $CONFIG_FILE"
+  exit 1
+fi
+
+# Locate the per-model e2e script and the repo root from this script's location.
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+E2E_SCRIPT="$SCRIPT_DIR/test_model_e2e.sh"
+EXECUTORCH_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
+if [ ! -f "$E2E_SCRIPT" ]; then
+  echo "Error: test_model_e2e.sh not found at $E2E_SCRIPT"
+  exit 1
+fi
+
+STATUS_DIR="$ARTIFACT_ROOT/_status"
+
+# Shared runtime setup — run once for all models; abort explicitly if it fails, since every model skips its own install.
+echo "::group::Setup ExecuTorch Requirements (shared)"
+pushd "$EXECUTORCH_ROOT" > /dev/null || { echo "Error: cannot enter repo root: $EXECUTORCH_ROOT"; exit 1; }
+./install_requirements.sh || { echo "Error: install_requirements.sh failed"; exit 1; }
+pip list
+popd > /dev/null
+echo "::endgroup::"
+
+# Track per-model results (parallel arrays; bash 3.2 compatible).
+MODELS=()
+QUANTS=()
+EXPORT_STATUSES=()
+E2E_STATUSES=()
+
+OVERALL_RC=0
+
+while IFS= read -r raw_line || [ -n "$raw_line" ]; do
+  line="$(echo "$raw_line" | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//')"
+  case "$line" in
+    ''|\#*) continue ;;
+  esac
+
+  HF_MODEL="$(echo "$line" | awk '{print $1}')"
+  QUANT="$(echo "$line" | awk '{print $2}')"
+  if [ -z "$QUANT" ]; then
+    QUANT="non-quantized"
+  fi
+
+  MODEL_SAFE="$(echo "$HF_MODEL" | tr '/' '_')"
+  MODEL_DIR="$ARTIFACT_ROOT/$MODEL_SAFE"
+  EXPORT_STATUS_FILE="$STATUS_DIR/$MODEL_SAFE.export.status"
+  E2E_LOG_FILE="$STATUS_DIR/$MODEL_SAFE.e2e.log"
+
+  EXPORT_STATUS="missing"
+  if [ -f "$EXPORT_STATUS_FILE" ]; then
+    EXPORT_STATUS="$(cat "$EXPORT_STATUS_FILE")"
+  fi
+
+  MODELS+=("$HF_MODEL")
+  QUANTS+=("$QUANT")
+  EXPORT_STATUSES+=("$EXPORT_STATUS")
+
+  if [ "$EXPORT_STATUS" != "success" ]; then
+    echo "::group::Skip $HF_MODEL ($QUANT) — export status: $EXPORT_STATUS"
+    echo "Skipping e2e for $HF_MODEL: export did not succeed (status: $EXPORT_STATUS)"
+    EXPORT_LOG_FILE="$STATUS_DIR/$MODEL_SAFE.export.log"
+    if [ -f "$EXPORT_LOG_FILE" ]; then
+      echo "----- Export error tail -----"
+      tail -n 40 "$EXPORT_LOG_FILE"
+    fi
+    echo "::endgroup::"
+    E2E_STATUSES+=("skipped: export failed")
+    OVERALL_RC=1
+    continue
+  fi
+
+  echo "::group::E2E $HF_MODEL ($QUANT)"
+  # Run the existing per-model script as a subprocess so its `set -e`, pushd,
+  # and cwd changes are isolated and a failure never aborts this loop. The
+  # shared runtime setup already ran, so skip the per-model install.
+  set +e # stdin below is /dev/null so the child cannot consume the config lines this loop is reading
+  SKIP_INSTALL_REQUIREMENTS=1 bash "$E2E_SCRIPT" "$DEVICE" "$HF_MODEL" "$QUANT" "$MODEL_DIR" < /dev/null 2>&1 | tee "$E2E_LOG_FILE"
+  RC=${PIPESTATUS[0]}
+  set -e
+  echo "::endgroup::"
+
+  if [ "$RC" -eq 0 ]; then
+    E2E_STATUSES+=("success")
+    echo "E2E succeeded: $HF_MODEL ($QUANT)"
+  else
+    E2E_STATUSES+=("failed")
+    OVERALL_RC=1
+    echo "E2E FAILED: $HF_MODEL ($QUANT) (exit $RC)"
+  fi
+done < "$CONFIG_FILE"
+
+if [ "${#MODELS[@]}" -eq 0 ]; then
+  echo "Error: no models found in config file: $CONFIG_FILE"
+  exit 1
+fi
+
+# ----------------------------------------------------------------------------
+# Summary section
+# ----------------------------------------------------------------------------
+emit_summary() {
+  # $1 = output sink: "stdout" or "github"
+  local sink="$1"
+  local i export_icon e2e_label
+  if [ "$sink" = "github" ]; then
+    echo "## A100 e2e summary"
+    echo ""
+    echo "| Model | Quant | Export | E2E |"
+    echo "| --- | --- | --- | --- |"
+    for i in "${!MODELS[@]}"; do
+      if [ "${EXPORT_STATUSES[$i]}" = "success" ]; then export_icon="✅"; else export_icon="❌ ${EXPORT_STATUSES[$i]}"; fi
+      case "${E2E_STATUSES[$i]}" in
+        success) e2e_label="✅ success" ;;
+        *) e2e_label="❌ ${E2E_STATUSES[$i]}" ;;
+      esac
+      echo "| ${MODELS[$i]} | ${QUANTS[$i]} | ${export_icon} | ${e2e_label} |"
+    done
+    echo ""
+  else
+    echo "============================================================"
+    echo "A100 e2e summary"
+    echo "============================================================"
+    for i in "${!MODELS[@]}"; do
+      printf ' %-40s %-26s export=%-8s e2e=%s\n' \
+        "${MODELS[$i]}" "${QUANTS[$i]}" "${EXPORT_STATUSES[$i]}" "${E2E_STATUSES[$i]}"
+    done
+    echo "============================================================"
+  fi
+}
+
+echo ""
+emit_summary stdout
+
+# Print error tails and emit GitHub annotations for any non-success model.
+for i in "${!MODELS[@]}"; do
+  if [ "${E2E_STATUSES[$i]}" = "success" ]; then
+    continue
+  fi
+  MODEL_SAFE="$(echo "${MODELS[$i]}" | tr '/' '_')"
+  echo ""
+  echo "----- Error tail for ${MODELS[$i]} (${QUANTS[$i]}) -----"
+  if [ "${EXPORT_STATUSES[$i]}" != "success" ]; then
+    # Export failed/missing: surface the export log.
+    EXPORT_LOG_FILE="$STATUS_DIR/$MODEL_SAFE.export.log"
+    if [ -f "$EXPORT_LOG_FILE" ]; then tail -n 40 "$EXPORT_LOG_FILE"; fi
+    echo "::error::E2E skipped for ${MODELS[$i]} (${QUANTS[$i]}): export ${EXPORT_STATUSES[$i]}"
+  else
+    # Export succeeded but e2e failed: surface the e2e log (`|| true` keeps a no-match grep from aborting if pipefail is on).
+    E2E_LOG_FILE="$STATUS_DIR/$MODEL_SAFE.e2e.log"
+    ERR_LINE=""
+    if [ -f "$E2E_LOG_FILE" ]; then
+      tail -n 40 "$E2E_LOG_FILE"
+      ERR_LINE="$(grep -E -i 'error|fail|expected' "$E2E_LOG_FILE" | tail -n 1 || true)"
+      if [ -z "$ERR_LINE" ]; then ERR_LINE="$(tail -n 1 "$E2E_LOG_FILE")"; fi
+    fi
+    echo "::error::E2E failed for ${MODELS[$i]} (${QUANTS[$i]}): ${ERR_LINE}"
+  fi
+done
+
+# Mirror summary to the GitHub job summary panel when available.
+if [ -n "${GITHUB_STEP_SUMMARY:-}" ]; then
+  {
+    emit_summary github
+    for i in "${!MODELS[@]}"; do
+      if [ "${E2E_STATUSES[$i]}" = "success" ]; then continue; fi
+      MODEL_SAFE="$(echo "${MODELS[$i]}" | tr '/' '_')"
+      echo "
<details><summary>Error tail: ${MODELS[$i]} (${QUANTS[$i]})</summary>"
+      echo "" # blank line so the fenced block below is rendered as markdown inside <details>
+      echo '```'
+      if [ "${EXPORT_STATUSES[$i]}" != "success" ]; then
+        EXPORT_LOG_FILE="$STATUS_DIR/$MODEL_SAFE.export.log"
+        if [ -f "$EXPORT_LOG_FILE" ]; then tail -n 40 "$EXPORT_LOG_FILE"; fi
+      else
+        E2E_LOG_FILE="$STATUS_DIR/$MODEL_SAFE.e2e.log"
+        if [ -f "$E2E_LOG_FILE" ]; then tail -n 40 "$E2E_LOG_FILE"; fi
+      fi
+      echo '```'
+      echo "</details>
" + echo "" + done + } >> "$GITHUB_STEP_SUMMARY" +fi + +if [ "$OVERALL_RC" -ne 0 ]; then + echo "One or more models failed (export or e2e)." +else + echo "All models passed export and e2e." +fi + +exit "$OVERALL_RC" diff --git a/.github/workflows/cuda.yml b/.github/workflows/cuda.yml index f19b937994f..a2b1df058da 100644 --- a/.github/workflows/cuda.yml +++ b/.github/workflows/cuda.yml @@ -183,10 +183,6 @@ jobs: name: "parakeet-tdt" - repo: "facebook" name: "dinov2-small-imagenet1k-1-layer" - - repo: "SocialLocalMobile" - name: "Qwen3.5-35B-A3B-HQQ-INT4" - - repo: "SocialLocalMobile" - name: "gemma-4-31B-it-HQQ-INT4" quant: - "non-quantized" - "quantized-int4-tile-packed" @@ -197,24 +193,6 @@ jobs: repo: "google" name: "gemma-3-4b-it" quant: "quantized-int4-weight-only" - # Qwen3.5 MoE uses a prequantized checkpoint, only tile-packed - - model: - repo: "SocialLocalMobile" - name: "Qwen3.5-35B-A3B-HQQ-INT4" - quant: "non-quantized" - - model: - repo: "SocialLocalMobile" - name: "Qwen3.5-35B-A3B-HQQ-INT4" - quant: "quantized-int4-weight-only" - # Gemma 4 31B uses a prequantized checkpoint, only tile-packed - - model: - repo: "SocialLocalMobile" - name: "gemma-4-31B-it-HQQ-INT4" - quant: "non-quantized" - - model: - repo: "SocialLocalMobile" - name: "gemma-4-31B-it-HQQ-INT4" - quant: "quantized-int4-weight-only" # Voxtral Realtime only supports int4-tile-packed on CUDA - model: repo: "mistralai" @@ -269,7 +247,7 @@ jobs: with: timeout: 90 secrets-env: EXECUTORCH_HF_TOKEN - runner: ${{ (matrix.model.name == 'Qwen3.5-35B-A3B-HQQ-INT4' || matrix.model.name == 'gemma-4-31B-it-HQQ-INT4') && 'linux.aws.a100' || 'linux.g5.4xlarge.nvidia.gpu' }} + runner: linux.g5.4xlarge.nvidia.gpu gpu-arch-type: cuda gpu-arch-version: "13.0" use-custom-docker-registry: false @@ -325,10 +303,6 @@ jobs: name: "parakeet-tdt" - repo: "facebook" name: "dinov2-small-imagenet1k-1-layer" - - repo: "SocialLocalMobile" - name: "Qwen3.5-35B-A3B-HQQ-INT4" - - repo: "SocialLocalMobile" - name: 
"gemma-4-31B-it-HQQ-INT4" quant: - "non-quantized" - "quantized-int4-tile-packed" @@ -339,24 +313,6 @@ jobs: repo: "google" name: "gemma-3-4b-it" quant: "quantized-int4-weight-only" - # Qwen3.5 MoE uses a prequantized checkpoint, only tile-packed - - model: - repo: "SocialLocalMobile" - name: "Qwen3.5-35B-A3B-HQQ-INT4" - quant: "non-quantized" - - model: - repo: "SocialLocalMobile" - name: "Qwen3.5-35B-A3B-HQQ-INT4" - quant: "quantized-int4-weight-only" - # Gemma 4 31B uses a prequantized checkpoint, only tile-packed - - model: - repo: "SocialLocalMobile" - name: "gemma-4-31B-it-HQQ-INT4" - quant: "non-quantized" - - model: - repo: "SocialLocalMobile" - name: "gemma-4-31B-it-HQQ-INT4" - quant: "quantized-int4-weight-only" # Voxtral Realtime only supports int4-tile-packed on CUDA - model: repo: "mistralai" @@ -405,7 +361,7 @@ jobs: quant: "non-quantized" with: timeout: 90 - runner: ${{ (matrix.model.name == 'Qwen3.5-35B-A3B-HQQ-INT4' || matrix.model.name == 'gemma-4-31B-it-HQQ-INT4') && 'linux.aws.a100' || 'linux.g5.4xlarge.nvidia.gpu' }} + runner: linux.g5.4xlarge.nvidia.gpu gpu-arch-type: cuda gpu-arch-version: "13.0" use-custom-docker-registry: false @@ -415,6 +371,71 @@ jobs: script: | source .ci/scripts/test_model_e2e.sh cuda "${{ matrix.model.repo }}/${{ matrix.model.name }}" "${{ matrix.quant }}" "${RUNNER_ARTIFACT_DIR}" + # Consolidated A100 export job: sharing initialization stage for better resource utilization. 
+ export-models-cuda-a100: + name: export-models-cuda-a100 + # Skip this job if the pull request is from a fork (HuggingFace secrets are not available) + if: github.event.pull_request.head.repo.full_name == github.repository || github.event_name != 'pull_request' + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main + permissions: + id-token: write + contents: read + secrets: inherit + with: + timeout: 150 + secrets-env: EXECUTORCH_HF_TOKEN + runner: linux.aws.a100 + gpu-arch-type: cuda + gpu-arch-version: "13.0" + use-custom-docker-registry: false + submodules: recursive + upload-artifact: cuda-a100-models + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + script: | + set -eux + + echo "::group::Setup ExecuTorch" + # Disable MKL to avoid duplicate target error when conda has multiple MKL installations + export USE_MKL=OFF + ./install_executorch.sh + echo "::endgroup::" + + echo "::group::Setup Huggingface" + pip install -U "huggingface_hub[cli]>=1.2.1,<2.0" accelerate "optimum~=2.0.0" "transformers==5.0.0rc1" + HF_AUTH_TOKEN="$(printf '%s' "$SECRET_EXECUTORCH_HF_TOKEN" | tr -d '\r\n')" + hf auth login --token "$HF_AUTH_TOKEN" + OPTIMUM_ET_VERSION=$(cat .ci/docker/ci_commit_pins/optimum-executorch.txt) + pip install --no-deps git+https://github.com/huggingface/optimum-executorch.git@${OPTIMUM_ET_VERSION} + echo "::endgroup::" + + # Export every A100 model listed in the shared config on this one runner. + source .ci/scripts/export_model_artifact_batch.sh cuda .ci/scripts/cuda_a100_models.txt "${RUNNER_ARTIFACT_DIR}" + + # Consolidated A100 runtime jobs: sharing initialization stage for better resource utilization. + test-models-cuda-a100-e2e: + name: test-models-cuda-a100-e2e + needs: export-models-cuda-a100 + # Run even when export partially failed (so exported models still get tested), + # but keep the same fork guard so we don't run on forks where export is skipped. 
+ if: ${{ always() && (github.event.pull_request.head.repo.full_name == github.repository || github.event_name != 'pull_request') }} + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main + permissions: + id-token: write + contents: read + with: + timeout: 150 + runner: linux.aws.a100 + gpu-arch-type: cuda + gpu-arch-version: "13.0" + use-custom-docker-registry: false + submodules: recursive + download-artifact: cuda-a100-models + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + script: | + # Run inference for every A100 model listed in the shared config on this + # one runner; reads per-model export status from the downloaded artifact. + source .ci/scripts/test_model_e2e_batch.sh cuda .ci/scripts/cuda_a100_models.txt "${RUNNER_ARTIFACT_DIR}" + test-cuda-pybind: name: test-cuda-pybind needs: export-model-cuda-artifact