From c38e2ba2df7e34dd5f333f5b53fbe321003157c4 Mon Sep 17 00:00:00 2001
From: gasoonjia <gasoonjia@fb.com>
Date: Mon, 1 Jun 2026 13:59:16 -0700
Subject: [PATCH] [ci][cuda] Consolidate A100 CUDA CI jobs into shared batch
 runners

Summary:
Collapse the four A100 runners (per-model export + e2e for the two
A100-only models) into two shared single-runner batch jobs driven by a
shared config file. Each model is processed independently with per-model
pass/fail reporting; a job is green only if all its models pass, and one
model's failure never blocks another.

- Add .ci/scripts/cuda_a100_models.txt: shared source of truth for which
  models need an A100 runner (one line per model).
- Add .ci/scripts/export_model_artifact_batch.sh: loops the config,
  delegates to export_model_artifact.sh per model under set +e, writes
  per-model _status files + logs, prints a summary section (stdout +
  GITHUB_STEP_SUMMARY), emits ::error:: annotations, exits non-zero if any
  model failed.
- Add .ci/scripts/test_model_e2e_batch.sh: runs install_requirements.sh
  once, reads each model's export status (skips with surfaced error if
  export failed), delegates to test_model_e2e.sh per model, prints the same
  style of summary, exits non-zero unless every model fully passed.
- test_model_e2e.sh: add backward-compatible SKIP_INSTALL_REQUIREMENTS
  guard so the batch can run runtime setup once.
- cuda.yml: remove the two SocialLocalMobile A100 models (and their exclude
  rows) from the per-cell export/e2e matrices and simplify those runners
  back to plain A10G; add new export-models-cuda-a100 and
  test-models-cuda-a100-e2e jobs on linux.aws.a100 (timeout 150). The e2e
  job uses if: always() so exported models are still tested on a partial
  export failure.

Test Plan:
- bash -n on all three scripts; cuda.yml has no editor diagnostics.
- Local smoke tests of both batch orchestrators with stubbed per-model
  scripts: confirmed loop continues past failures, correct _status files,
  summaries, ::error:: + step-summary output, SKIP_INSTALL_REQUIREMENTS
  propagation, export-failed->skipped path, and exit codes (0 all-pass,
  1 on any failure / empty / missing config).
---
 .ci/scripts/cuda_a100_models.txt           |  23 ++
 .ci/scripts/export_model_artifact_batch.sh | 209 +++++++++++++++++
 .ci/scripts/test_model_e2e.sh              |  16 +-
 .ci/scripts/test_model_e2e_batch.sh        | 255 +++++++++++++++++++++
 .github/workflows/cuda.yml                 | 113 +++++----
 5 files changed, 566 insertions(+), 50 deletions(-)
 create mode 100644 .ci/scripts/cuda_a100_models.txt
 create mode 100755 .ci/scripts/export_model_artifact_batch.sh
 create mode 100755 .ci/scripts/test_model_e2e_batch.sh
diff --git a/.ci/scripts/cuda_a100_models.txt b/.ci/scripts/cuda_a100_models.txt
new file mode 100644
index 00000000000..426702c52e7
--- /dev/null
+++ b/.ci/scripts/cuda_a100_models.txt
@@ -0,0 +1,23 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+#
+# Shared source of truth for the CUDA models that require an A100 runner.
+#
+# All models listed here are exported together in one shared A100 job and
+# inference-tested together in a second shared A100 job, by:
+#   .ci/scripts/export_model_artifact_batch.sh
+#   .ci/scripts/test_model_e2e_batch.sh
+# both driven from .github/workflows/cuda.yml.
+#
+# Format: one model per line, whitespace-separated:
+#   <hf_repo>/<hf_name> <quant>
+# Blank lines and lines starting with '#' are ignored.
+#
+# To onboard a new A100 model, add a single line below. Its HF model id must
+# already be handled by both .ci/scripts/export_model_artifact.sh and
+# .ci/scripts/test_model_e2e.sh (their `case "$HF_MODEL"` blocks).
+SocialLocalMobile/Qwen3.5-35B-A3B-HQQ-INT4 quantized-int4-tile-packed
+SocialLocalMobile/gemma-4-31B-it-HQQ-INT4 quantized-int4-tile-packed
diff --git a/.ci/scripts/export_model_artifact_batch.sh b/.ci/scripts/export_model_artifact_batch.sh
new file mode 100755
index 00000000000..c1b1650f9e0
--- /dev/null
+++ b/.ci/scripts/export_model_artifact_batch.sh
@@ -0,0 +1,209 @@
+#!/bin/bash
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+#
+# Batch export orchestrator for the CUDA models that share a single A100 runner.
+#
+# Loops over a shared config file (see .ci/scripts/cuda_a100_models.txt) and
+# exports each model by delegating to the existing per-model script
+# .ci/scripts/export_model_artifact.sh. Each model is exported independently:
+# a failure in one model never aborts the others. Per-model outcomes are
+# recorded under <output_root>/_status/ so the downstream e2e batch job can
+# decide which models to run, and a summary section is printed at the end (and
+# mirrored to $GITHUB_STEP_SUMMARY when available).
+#
+# This script exits non-zero if ANY model failed to export, so the CI job is
+# green only when every model exported successfully.
+
+show_help() {
+  cat << EOF
+Usage: export_model_artifact_batch.sh <device> <config_file> <output_root>
+
+Export every model listed in <config_file> to <device> format, writing each
+model's artifacts into <output_root>/<model_safe>/ and per-model status into
+<output_root>/_status/.
+
+Arguments:
+  device        cuda, metal, or xnnpack (required) — passed through to
+                export_model_artifact.sh.
+  config_file   Path to the shared model list (required). One model per line:
+                  <hf_repo>/<hf_name> <quant>
+                Blank lines and lines starting with '#' are ignored.
+  output_root   Root directory for artifacts + status (required). Typically
+                \${RUNNER_ARTIFACT_DIR} in CI.
+
+Example:
+  export_model_artifact_batch.sh cuda .ci/scripts/cuda_a100_models.txt "\${RUNNER_ARTIFACT_DIR}"
+EOF
+}
+
+if [ "${1:-}" = "-h" ] || [ "${1:-}" = "--help" ]; then
+  show_help
+  exit 0
+fi
+
+DEVICE="${1:-}"
+CONFIG_FILE="${2:-}"
+OUTPUT_ROOT="${3:-}"
+
+if [ -z "$DEVICE" ] || [ -z "$CONFIG_FILE" ] || [ -z "$OUTPUT_ROOT" ]; then
+  echo "Error: missing required argument(s)"
+  show_help
+  exit 1
+fi
+
+if [ ! -f "$CONFIG_FILE" ]; then
+  echo "Error: config file not found: $CONFIG_FILE"
+  exit 1
+fi
+
+# Locate the per-model export script next to this one.
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+EXPORT_SCRIPT="$SCRIPT_DIR/export_model_artifact.sh"
+if [ ! -f "$EXPORT_SCRIPT" ]; then
+  echo "Error: export_model_artifact.sh not found at $EXPORT_SCRIPT"
+  exit 1
+fi
+
+STATUS_DIR="$OUTPUT_ROOT/_status"
+mkdir -p "$STATUS_DIR"
+
+# Track per-model results (parallel arrays; bash 3.2 compatible).
+MODELS=()
+QUANTS=()
+STATUSES=()
+
+OVERALL_RC=0
+
+while IFS= read -r raw_line || [ -n "$raw_line" ]; do
+  # Strip leading/trailing whitespace.
+  line="$(echo "$raw_line" | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//')"
+  # Skip blanks and comments.
+  case "$line" in
+    ''|\#*) continue ;;
+  esac
+
+  # First field = hf_model, second field = quant (default non-quantized).
+  HF_MODEL="$(echo "$line" | awk '{print $1}')"
+  QUANT="$(echo "$line" | awk '{print $2}')"
+  if [ -z "$QUANT" ]; then
+    QUANT="non-quantized"
+  fi
+
+  MODEL_SAFE="$(echo "$HF_MODEL" | tr '/' '_')"
+  MODEL_OUT="$OUTPUT_ROOT/$MODEL_SAFE"
+  LOG_FILE="$STATUS_DIR/$MODEL_SAFE.export.log"
+  STATUS_FILE="$STATUS_DIR/$MODEL_SAFE.export.status"
+  mkdir -p "$MODEL_OUT"
+
+  echo "::group::Export $HF_MODEL ($QUANT)"
+  # Run the per-model script as a subprocess so its `set -e`, traps, and cwd
+  # changes are isolated; stdin is /dev/null so it cannot eat the config lines.
+  set +e
+  bash "$EXPORT_SCRIPT" "$DEVICE" "$HF_MODEL" "$QUANT" "$MODEL_OUT" < /dev/null 2>&1 | tee "$LOG_FILE"
+  RC=${PIPESTATUS[0]}
+  set -e
+  echo "::endgroup::"
+
+  MODELS+=("$HF_MODEL")
+  QUANTS+=("$QUANT")
+  if [ "$RC" -eq 0 ]; then
+    echo "success" > "$STATUS_FILE"
+    STATUSES+=("success")
+    echo "Export succeeded: $HF_MODEL ($QUANT)"
+  else
+    echo "failed" > "$STATUS_FILE"
+    STATUSES+=("failed")
+    OVERALL_RC=1
+    echo "Export FAILED: $HF_MODEL ($QUANT) (exit $RC)"
+  fi
+done < "$CONFIG_FILE"
+
+if [ "${#MODELS[@]}" -eq 0 ]; then
+  echo "Error: no models found in config file: $CONFIG_FILE"
+  exit 1
+fi
+
+# ----------------------------------------------------------------------------
+# Summary section
+# ----------------------------------------------------------------------------
+emit_summary() {
+  # $1 = format: "github" -> Markdown table, anything else -> plain text (always written to stdout).
+  local sink="$1"
+  local i status_icon
+  if [ "$sink" = "github" ]; then
+    echo "## A100 export summary"
+    echo ""
+    echo "| Model | Quant | Export |"
+    echo "| --- | --- | --- |"
+    for i in "${!MODELS[@]}"; do
+      if [ "${STATUSES[$i]}" = "success" ]; then status_icon="✅ success"; else status_icon="❌ failed"; fi
+      echo "| ${MODELS[$i]} | ${QUANTS[$i]} | ${status_icon} |"
+    done
+    echo ""
+  else
+    echo "============================================================"
+    echo "A100 export summary"
+    echo "============================================================"
+    for i in "${!MODELS[@]}"; do
+      printf '  %-45s %-28s %s\n' "${MODELS[$i]}" "${QUANTS[$i]}" "${STATUSES[$i]}"
+    done
+    echo "============================================================"
+  fi
+}
+
+echo ""
+emit_summary stdout
+
+# Print the error tail for each failed model and emit GitHub annotations.
+for i in "${!MODELS[@]}"; do
+  if [ "${STATUSES[$i]}" != "success" ]; then
+    MODEL_SAFE="$(echo "${MODELS[$i]}" | tr '/' '_')"
+    LOG_FILE="$STATUS_DIR/$MODEL_SAFE.export.log"
+    echo ""
+    echo "----- Error tail for ${MODELS[$i]} (${QUANTS[$i]}) -----"
+    # One-line annotation (last meaningful log line) for the CI UI.
+    ERR_LINE=""
+    if [ -f "$LOG_FILE" ]; then
+      tail -n 40 "$LOG_FILE"
+      # grep exits 1 when nothing matches; `|| true` keeps that from aborting
+      # the script (errexit is on here) if the caller also enabled pipefail.
+      ERR_LINE="$(grep -E -i 'error|fail' "$LOG_FILE" | tail -n 1 || true)"
+      if [ -z "$ERR_LINE" ]; then
+        ERR_LINE="$(tail -n 1 "$LOG_FILE")"
+      fi
+    fi
+    echo "::error::Export failed for ${MODELS[$i]} (${QUANTS[$i]}): ${ERR_LINE}"
+  fi
+done
+
+# Mirror summary to the GitHub job summary panel when available.
+if [ -n "${GITHUB_STEP_SUMMARY:-}" ]; then
+  {
+    emit_summary github
+    for i in "${!MODELS[@]}"; do
+      if [ "${STATUSES[$i]}" != "success" ]; then
+        echo "<details><summary>Error tail: ${MODELS[$i]} (${QUANTS[$i]})</summary>"
+        echo ""
+        echo '```'
+        MODEL_SAFE="$(echo "${MODELS[$i]}" | tr '/' '_')"
+        LOG_FILE="$STATUS_DIR/$MODEL_SAFE.export.log"
+        if [ -f "$LOG_FILE" ]; then tail -n 40 "$LOG_FILE"; fi
+        echo '```'
+        echo "</details>"
+        echo ""
+      fi
+    done
+  } >> "$GITHUB_STEP_SUMMARY"
+fi
+
+if [ "$OVERALL_RC" -ne 0 ]; then
+  echo "One or more models failed to export."
+else
+  echo "All models exported successfully."
+fi
+
+exit "$OVERALL_RC"
diff --git a/.ci/scripts/test_model_e2e.sh b/.ci/scripts/test_model_e2e.sh
index e1ba976b0cc..83e0fe563bd 100755
--- a/.ci/scripts/test_model_e2e.sh
+++ b/.ci/scripts/test_model_e2e.sh
@@ -247,10 +247,18 @@ case "$HF_MODEL" in
     ;;
 esac
 
-echo "::group::Setup ExecuTorch Requirements"
-./install_requirements.sh
-pip list
-echo "::endgroup::"
+# Allow callers (e.g. the A100 batch orchestrator) to run ./install_requirements.sh
+# once up front and skip the per-model install here. Default behavior is unchanged.
+if [ -n "${SKIP_INSTALL_REQUIREMENTS:-}" ]; then
+  echo "::group::Setup ExecuTorch Requirements (skipped: SKIP_INSTALL_REQUIREMENTS set)"
+  pip list
+  echo "::endgroup::"
+else
+  echo "::group::Setup ExecuTorch Requirements"
+  ./install_requirements.sh
+  pip list
+  echo "::endgroup::"
+fi
 
 echo "::group::Prepare $MODEL_NAME Artifacts"
 
diff --git a/.ci/scripts/test_model_e2e_batch.sh b/.ci/scripts/test_model_e2e_batch.sh
new file mode 100755
index 00000000000..a1d04f7d9c6
--- /dev/null
+++ b/.ci/scripts/test_model_e2e_batch.sh
@@ -0,0 +1,255 @@
+#!/bin/bash
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+#
+# Batch end-to-end inference orchestrator for the CUDA models that share a
+# single A100 runner.
+#
+# Loops over a shared config file (see .ci/scripts/cuda_a100_models.txt) and
+# runs each model's e2e test by delegating to the existing per-model script
+# .ci/scripts/test_model_e2e.sh. The shared runtime setup
+# (./install_requirements.sh) runs once up front; each per-model invocation is
+# told to skip it via SKIP_INSTALL_REQUIREMENTS.
+#
+# Models are tested independently: a failure (or a missing/failed export) in
+# one model never aborts the others. For each model we first consult the export
+# status written by export_model_artifact_batch.sh under
+# <artifact_root>/_status/<model_safe>.export.status:
+#   - if export did not succeed, the model is recorded as "skipped: export
+#     failed" (its export error is surfaced) and the run continues;
+#   - otherwise the per-model e2e test runs.
+#
+# A summary section is printed at the end (and mirrored to $GITHUB_STEP_SUMMARY
+# when available). This script exits non-zero unless EVERY model both exported
+# and passed its e2e test, so the CI job is green only when all models pass.
+
+show_help() {
+  cat << EOF
+Usage: test_model_e2e_batch.sh <device> <config_file> <artifact_root>
+
+Run end-to-end inference tests for every model listed in <config_file>, reading
+each model's artifacts from <artifact_root>/<model_safe>/ and its export status
+from <artifact_root>/_status/.
+
+Arguments:
+  device         cuda, metal, or xnnpack (required) — passed through to
+                 test_model_e2e.sh.
+  config_file    Path to the shared model list (required). One model per line:
+                   <hf_repo>/<hf_name> <quant>
+                 Blank lines and lines starting with '#' are ignored.
+  artifact_root  Root directory containing per-model artifacts + _status/
+                 (required). Typically \${RUNNER_ARTIFACT_DIR} in CI.
+
+Example:
+  test_model_e2e_batch.sh cuda .ci/scripts/cuda_a100_models.txt "\${RUNNER_ARTIFACT_DIR}"
+EOF
+}
+
+if [ "${1:-}" = "-h" ] || [ "${1:-}" = "--help" ]; then
+  show_help
+  exit 0
+fi
+
+DEVICE="${1:-}"
+CONFIG_FILE="${2:-}"
+ARTIFACT_ROOT="${3:-}"
+
+if [ -z "$DEVICE" ] || [ -z "$CONFIG_FILE" ] || [ -z "$ARTIFACT_ROOT" ]; then
+  echo "Error: missing required argument(s)"
+  show_help
+  exit 1
+fi
+
+if [ ! -f "$CONFIG_FILE" ]; then
+  echo "Error: config file not found: $CONFIG_FILE"
+  exit 1
+fi
+
+# Locate the per-model e2e script and the repo root from this script's location.
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+E2E_SCRIPT="$SCRIPT_DIR/test_model_e2e.sh"
+EXECUTORCH_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
+if [ ! -f "$E2E_SCRIPT" ]; then
+  echo "Error: test_model_e2e.sh not found at $E2E_SCRIPT"
+  exit 1
+fi
+
+STATUS_DIR="$ARTIFACT_ROOT/_status"
+
+# Shared runtime setup — run once for all models.
+echo "::group::Setup ExecuTorch Requirements (shared)"
+pushd "$EXECUTORCH_ROOT" > /dev/null
+./install_requirements.sh
+pip list
+popd > /dev/null
+echo "::endgroup::"
+
+# Track per-model results (parallel arrays; bash 3.2 compatible).
+MODELS=()
+QUANTS=()
+EXPORT_STATUSES=()
+E2E_STATUSES=()
+
+OVERALL_RC=0
+
+while IFS= read -r raw_line || [ -n "$raw_line" ]; do
+  line="$(echo "$raw_line" | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//')"
+  case "$line" in
+    ''|\#*) continue ;;
+  esac
+
+  HF_MODEL="$(echo "$line" | awk '{print $1}')"
+  QUANT="$(echo "$line" | awk '{print $2}')"
+  if [ -z "$QUANT" ]; then
+    QUANT="non-quantized"
+  fi
+
+  MODEL_SAFE="$(echo "$HF_MODEL" | tr '/' '_')"
+  MODEL_DIR="$ARTIFACT_ROOT/$MODEL_SAFE"
+  EXPORT_STATUS_FILE="$STATUS_DIR/$MODEL_SAFE.export.status"
+  E2E_LOG_FILE="$STATUS_DIR/$MODEL_SAFE.e2e.log"
+
+  EXPORT_STATUS="missing"
+  if [ -f "$EXPORT_STATUS_FILE" ]; then
+    EXPORT_STATUS="$(cat "$EXPORT_STATUS_FILE")"
+  fi
+
+  MODELS+=("$HF_MODEL")
+  QUANTS+=("$QUANT")
+  EXPORT_STATUSES+=("$EXPORT_STATUS")
+
+  if [ "$EXPORT_STATUS" != "success" ]; then
+    echo "::group::Skip $HF_MODEL ($QUANT) — export status: $EXPORT_STATUS"
+    echo "Skipping e2e for $HF_MODEL: export did not succeed (status: $EXPORT_STATUS)"
+    EXPORT_LOG_FILE="$STATUS_DIR/$MODEL_SAFE.export.log"
+    if [ -f "$EXPORT_LOG_FILE" ]; then
+      echo "----- Export error tail -----"
+      tail -n 40 "$EXPORT_LOG_FILE"
+    fi
+    echo "::endgroup::"
+    E2E_STATUSES+=("skipped: export failed")
+    OVERALL_RC=1
+    continue
+  fi
+
+  echo "::group::E2E $HF_MODEL ($QUANT)"
+  # Run the per-model script as a subprocess so its `set -e`, pushd, and cwd
+  # changes are isolated. Its stdin is /dev/null so it cannot consume the
+  # config lines this loop reads. Shared setup already ran: skip the install.
+  set +e
+  SKIP_INSTALL_REQUIREMENTS=1 bash "$E2E_SCRIPT" "$DEVICE" "$HF_MODEL" "$QUANT" "$MODEL_DIR" < /dev/null 2>&1 | tee "$E2E_LOG_FILE"
+  RC=${PIPESTATUS[0]}
+  set -e
+  echo "::endgroup::"
+
+  if [ "$RC" -eq 0 ]; then
+    E2E_STATUSES+=("success")
+    echo "E2E succeeded: $HF_MODEL ($QUANT)"
+  else
+    E2E_STATUSES+=("failed")
+    OVERALL_RC=1
+    echo "E2E FAILED: $HF_MODEL ($QUANT) (exit $RC)"
+  fi
+done < "$CONFIG_FILE"
+
+if [ "${#MODELS[@]}" -eq 0 ]; then
+  echo "Error: no models found in config file: $CONFIG_FILE"
+  exit 1
+fi
+
+# ----------------------------------------------------------------------------
+# Summary section
+# ----------------------------------------------------------------------------
+emit_summary() {
+  # $1 = format: "github" -> Markdown table, anything else -> plain text (always written to stdout).
+  local sink="$1"
+  local i export_icon e2e_label
+  if [ "$sink" = "github" ]; then
+    echo "## A100 e2e summary"
+    echo ""
+    echo "| Model | Quant | Export | E2E |"
+    echo "| --- | --- | --- | --- |"
+    for i in "${!MODELS[@]}"; do
+      if [ "${EXPORT_STATUSES[$i]}" = "success" ]; then export_icon="✅"; else export_icon="❌ ${EXPORT_STATUSES[$i]}"; fi
+      case "${E2E_STATUSES[$i]}" in
+        success) e2e_label="✅ success" ;;
+        *) e2e_label="❌ ${E2E_STATUSES[$i]}" ;;
+      esac
+      echo "| ${MODELS[$i]} | ${QUANTS[$i]} | ${export_icon} | ${e2e_label} |"
+    done
+    echo ""
+  else
+    echo "============================================================"
+    echo "A100 e2e summary"
+    echo "============================================================"
+    for i in "${!MODELS[@]}"; do
+      printf '  %-40s %-26s export=%-8s e2e=%s\n' \
+        "${MODELS[$i]}" "${QUANTS[$i]}" "${EXPORT_STATUSES[$i]}" "${E2E_STATUSES[$i]}"
+    done
+    echo "============================================================"
+  fi
+}
+
+echo ""
+emit_summary stdout
+
+# Print error tails and emit GitHub annotations for any non-success model.
+for i in "${!MODELS[@]}"; do
+  if [ "${E2E_STATUSES[$i]}" = "success" ]; then continue; fi
+  MODEL_SAFE="$(echo "${MODELS[$i]}" | tr '/' '_')"
+  echo ""
+  echo "----- Error tail for ${MODELS[$i]} (${QUANTS[$i]}) -----"
+  if [ "${EXPORT_STATUSES[$i]}" != "success" ]; then
+    # Export failed/missing: surface the export log.
+    EXPORT_LOG_FILE="$STATUS_DIR/$MODEL_SAFE.export.log"
+    if [ -f "$EXPORT_LOG_FILE" ]; then tail -n 40 "$EXPORT_LOG_FILE"; fi
+    echo "::error::E2E skipped for ${MODELS[$i]} (${QUANTS[$i]}): export ${EXPORT_STATUSES[$i]}"
+  else
+    # Export succeeded but e2e failed: surface the e2e log.
+    E2E_LOG_FILE="$STATUS_DIR/$MODEL_SAFE.e2e.log"
+    ERR_LINE=""
+    if [ -f "$E2E_LOG_FILE" ]; then
+      tail -n 40 "$E2E_LOG_FILE"
+      # grep exits 1 when nothing matches; `|| true` keeps that from aborting
+      # the script (errexit is on after an e2e run) if pipefail is also set.
+      ERR_LINE="$(grep -E -i 'error|fail|expected' "$E2E_LOG_FILE" | tail -n 1 || true)"
+      if [ -z "$ERR_LINE" ]; then ERR_LINE="$(tail -n 1 "$E2E_LOG_FILE")"; fi
+    fi
+    echo "::error::E2E failed for ${MODELS[$i]} (${QUANTS[$i]}): ${ERR_LINE}"
+  fi
+done
+
+# Mirror summary to the GitHub job summary panel when available.
+if [ -n "${GITHUB_STEP_SUMMARY:-}" ]; then
+  {
+    emit_summary github
+    for i in "${!MODELS[@]}"; do
+      if [ "${E2E_STATUSES[$i]}" = "success" ]; then continue; fi
+      MODEL_SAFE="$(echo "${MODELS[$i]}" | tr '/' '_')"
+      echo "<details><summary>Error tail: ${MODELS[$i]} (${QUANTS[$i]})</summary>"
+      echo ""
+      echo '```'
+      if [ "${EXPORT_STATUSES[$i]}" != "success" ]; then
+        EXPORT_LOG_FILE="$STATUS_DIR/$MODEL_SAFE.export.log"
+        if [ -f "$EXPORT_LOG_FILE" ]; then tail -n 40 "$EXPORT_LOG_FILE"; fi
+      else
+        E2E_LOG_FILE="$STATUS_DIR/$MODEL_SAFE.e2e.log"
+        if [ -f "$E2E_LOG_FILE" ]; then tail -n 40 "$E2E_LOG_FILE"; fi
+      fi
+      echo '```'
+      echo "</details>"
+      echo ""
+    done
+  } >> "$GITHUB_STEP_SUMMARY"
+fi
+
+if [ "$OVERALL_RC" -ne 0 ]; then
+  echo "One or more models failed (export or e2e)."
+else
+  echo "All models passed export and e2e."
+fi
+
+exit "$OVERALL_RC"
diff --git a/.github/workflows/cuda.yml b/.github/workflows/cuda.yml
index f19b937994f..a2b1df058da 100644
--- a/.github/workflows/cuda.yml
+++ b/.github/workflows/cuda.yml
@@ -183,10 +183,6 @@ jobs:
             name: "parakeet-tdt"
           - repo: "facebook"
             name: "dinov2-small-imagenet1k-1-layer"
-          - repo: "SocialLocalMobile"
-            name: "Qwen3.5-35B-A3B-HQQ-INT4"
-          - repo: "SocialLocalMobile"
-            name: "gemma-4-31B-it-HQQ-INT4"
         quant:
           - "non-quantized"
           - "quantized-int4-tile-packed"
@@ -197,24 +193,6 @@ jobs:
               repo: "google"
               name: "gemma-3-4b-it"
             quant: "quantized-int4-weight-only"
-          # Qwen3.5 MoE uses a prequantized checkpoint, only tile-packed
-          - model:
-              repo: "SocialLocalMobile"
-              name: "Qwen3.5-35B-A3B-HQQ-INT4"
-            quant: "non-quantized"
-          - model:
-              repo: "SocialLocalMobile"
-              name: "Qwen3.5-35B-A3B-HQQ-INT4"
-            quant: "quantized-int4-weight-only"
-          # Gemma 4 31B uses a prequantized checkpoint, only tile-packed
-          - model:
-              repo: "SocialLocalMobile"
-              name: "gemma-4-31B-it-HQQ-INT4"
-            quant: "non-quantized"
-          - model:
-              repo: "SocialLocalMobile"
-              name: "gemma-4-31B-it-HQQ-INT4"
-            quant: "quantized-int4-weight-only"
           # Voxtral Realtime only supports int4-tile-packed on CUDA
           - model:
               repo: "mistralai"
@@ -269,7 +247,7 @@ jobs:
     with:
       timeout: 90
       secrets-env: EXECUTORCH_HF_TOKEN
-      runner: ${{ (matrix.model.name == 'Qwen3.5-35B-A3B-HQQ-INT4' || matrix.model.name == 'gemma-4-31B-it-HQQ-INT4') && 'linux.aws.a100' || 'linux.g5.4xlarge.nvidia.gpu' }}
+      runner: linux.g5.4xlarge.nvidia.gpu
       gpu-arch-type: cuda
       gpu-arch-version: "13.0"
       use-custom-docker-registry: false
@@ -325,10 +303,6 @@ jobs:
             name: "parakeet-tdt"
           - repo: "facebook"
             name: "dinov2-small-imagenet1k-1-layer"
-          - repo: "SocialLocalMobile"
-            name: "Qwen3.5-35B-A3B-HQQ-INT4"
-          - repo: "SocialLocalMobile"
-            name: "gemma-4-31B-it-HQQ-INT4"
         quant:
           - "non-quantized"
           - "quantized-int4-tile-packed"
@@ -339,24 +313,6 @@ jobs:
               repo: "google"
               name: "gemma-3-4b-it"
             quant: "quantized-int4-weight-only"
-          # Qwen3.5 MoE uses a prequantized checkpoint, only tile-packed
-          - model:
-              repo: "SocialLocalMobile"
-              name: "Qwen3.5-35B-A3B-HQQ-INT4"
-            quant: "non-quantized"
-          - model:
-              repo: "SocialLocalMobile"
-              name: "Qwen3.5-35B-A3B-HQQ-INT4"
-            quant: "quantized-int4-weight-only"
-          # Gemma 4 31B uses a prequantized checkpoint, only tile-packed
-          - model:
-              repo: "SocialLocalMobile"
-              name: "gemma-4-31B-it-HQQ-INT4"
-            quant: "non-quantized"
-          - model:
-              repo: "SocialLocalMobile"
-              name: "gemma-4-31B-it-HQQ-INT4"
-            quant: "quantized-int4-weight-only"
           # Voxtral Realtime only supports int4-tile-packed on CUDA
           - model:
               repo: "mistralai"
@@ -405,7 +361,7 @@ jobs:
             quant: "non-quantized"
     with:
       timeout: 90
-      runner: ${{ (matrix.model.name == 'Qwen3.5-35B-A3B-HQQ-INT4' || matrix.model.name == 'gemma-4-31B-it-HQQ-INT4') && 'linux.aws.a100' || 'linux.g5.4xlarge.nvidia.gpu' }}
+      runner: linux.g5.4xlarge.nvidia.gpu
       gpu-arch-type: cuda
       gpu-arch-version: "13.0"
       use-custom-docker-registry: false
@@ -415,6 +371,71 @@ jobs:
       script: |
         source .ci/scripts/test_model_e2e.sh cuda "${{ matrix.model.repo }}/${{ matrix.model.name }}" "${{ matrix.quant }}" "${RUNNER_ARTIFACT_DIR}"
 
+  # Consolidated A100 export job: all A100 models share one runner and one setup stage for better resource utilization.
+  export-models-cuda-a100:
+    name: export-models-cuda-a100
+    # Skip this job if the pull request is from a fork (HuggingFace secrets are not available)
+    if: github.event.pull_request.head.repo.full_name == github.repository || github.event_name != 'pull_request'
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    permissions:
+      id-token: write
+      contents: read
+    secrets: inherit
+    with:
+      timeout: 150
+      secrets-env: EXECUTORCH_HF_TOKEN
+      runner: linux.aws.a100
+      gpu-arch-type: cuda
+      gpu-arch-version: "13.0"
+      use-custom-docker-registry: false
+      submodules: recursive
+      upload-artifact: cuda-a100-models
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      script: |
+        set -eux
+
+        echo "::group::Setup ExecuTorch"
+        # Disable MKL to avoid duplicate target error when conda has multiple MKL installations
+        export USE_MKL=OFF
+        ./install_executorch.sh
+        echo "::endgroup::"
+
+        echo "::group::Setup Huggingface"
+        pip install -U "huggingface_hub[cli]>=1.2.1,<2.0" accelerate "optimum~=2.0.0" "transformers==5.0.0rc1"
+        HF_AUTH_TOKEN="$(printf '%s' "$SECRET_EXECUTORCH_HF_TOKEN" | tr -d '\r\n')"
+        hf auth login --token "$HF_AUTH_TOKEN"
+        OPTIMUM_ET_VERSION=$(cat .ci/docker/ci_commit_pins/optimum-executorch.txt)
+        pip install --no-deps git+https://github.com/huggingface/optimum-executorch.git@${OPTIMUM_ET_VERSION}
+        echo "::endgroup::"
+
+        # Export every A100 model listed in the shared config on this one runner.
+        source .ci/scripts/export_model_artifact_batch.sh cuda .ci/scripts/cuda_a100_models.txt "${RUNNER_ARTIFACT_DIR}"
+
+  # Consolidated A100 e2e job: all A100 models share one runner and one setup stage for better resource utilization.
+  test-models-cuda-a100-e2e:
+    name: test-models-cuda-a100-e2e
+    needs: export-models-cuda-a100
+    # Run even when export partially failed (so exported models still get tested),
+    # but keep the same fork guard so we don't run on forks where export is skipped.
+    if: ${{ always() && (github.event.pull_request.head.repo.full_name == github.repository || github.event_name != 'pull_request') }}
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    permissions:
+      id-token: write
+      contents: read
+    with:
+      timeout: 150
+      runner: linux.aws.a100
+      gpu-arch-type: cuda
+      gpu-arch-version: "13.0"
+      use-custom-docker-registry: false
+      submodules: recursive
+      download-artifact: cuda-a100-models
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      script: |
+        # Run inference for every A100 model listed in the shared config on this
+        # one runner; reads per-model export status from the downloaded artifact.
+        source .ci/scripts/test_model_e2e_batch.sh cuda .ci/scripts/cuda_a100_models.txt "${RUNNER_ARTIFACT_DIR}"
+
   test-cuda-pybind:
     name: test-cuda-pybind
     needs: export-model-cuda-artifact