diff --git a/.ci/docker/build.sh b/.ci/docker/build.sh index e6122c498ce..7c4a80044e4 100755 --- a/.ci/docker/build.sh +++ b/.ci/docker/build.sh @@ -92,18 +92,6 @@ esac TORCH_VERSION=$(cat ci_commit_pins/pytorch.txt) BUILD_DOCS=1 -# Pull channel + spec/url helpers out of torch_pin.py so install_pytorch.sh -# (which runs inside the docker build, where torch_pin.py isn't available) -# can decide between wheel install (test/release) and source build (nightly). -# Self-hosted runners often have python3 but not the unversioned python alias. -PYTHON_BIN=$(command -v python3 || command -v python) -TORCH_PIN_HELPERS=$(cd ../.. && "$PYTHON_BIN" -c "from torch_pin import CHANNEL, torch_spec, torchaudio_spec, torchvision_spec, torch_index_url_base; print(CHANNEL); print(torch_spec()); print(torchaudio_spec()); print(torchvision_spec()); print(torch_index_url_base())") -TORCH_CHANNEL=$(echo "${TORCH_PIN_HELPERS}" | sed -n '1p') -TORCH_SPEC=$(echo "${TORCH_PIN_HELPERS}" | sed -n '2p') -TORCHAUDIO_SPEC=$(echo "${TORCH_PIN_HELPERS}" | sed -n '3p') -TORCHVISION_SPEC=$(echo "${TORCH_PIN_HELPERS}" | sed -n '4p') -TORCH_INDEX_URL=$(echo "${TORCH_PIN_HELPERS}" | sed -n '5p') - # Copy requirements-lintrunner.txt from root to here cp ../../requirements-lintrunner.txt ./ @@ -116,11 +104,6 @@ docker build \ --build-arg "PYTHON_VERSION=${PYTHON_VERSION}" \ --build-arg "MINICONDA_VERSION=${MINICONDA_VERSION}" \ --build-arg "TORCH_VERSION=${TORCH_VERSION}" \ - --build-arg "TORCH_CHANNEL=${TORCH_CHANNEL}" \ - --build-arg "TORCH_SPEC=${TORCH_SPEC}" \ - --build-arg "TORCHAUDIO_SPEC=${TORCHAUDIO_SPEC}" \ - --build-arg "TORCHVISION_SPEC=${TORCHVISION_SPEC}" \ - --build-arg "TORCH_INDEX_URL=${TORCH_INDEX_URL}" \ --build-arg "BUCK2_VERSION=${BUCK2_VERSION}" \ --build-arg "LINTRUNNER=${LINTRUNNER:-}" \ --build-arg "BUILD_DOCS=${BUILD_DOCS}" \ diff --git a/.ci/docker/ci_commit_pins/pytorch.txt b/.ci/docker/ci_commit_pins/pytorch.txt index 0932a9ef6b8..f6e39a63b92 100644 --- 
a/.ci/docker/ci_commit_pins/pytorch.txt +++ b/.ci/docker/ci_commit_pins/pytorch.txt @@ -1 +1 @@ -release/2.11 +release/2.11 \ No newline at end of file diff --git a/.ci/docker/common/install_pytorch.sh b/.ci/docker/common/install_pytorch.sh index ddf2f21baa9..548a24f885d 100755 --- a/.ci/docker/common/install_pytorch.sh +++ b/.ci/docker/common/install_pytorch.sh @@ -17,24 +17,6 @@ install_domains() { } install_pytorch_and_domains() { - if [ "${TORCH_CHANNEL}" != "nightly" ]; then - # Test/release: install the published wheels directly. The specs and URL - # are passed in as docker build args (computed from torch_pin.py by - # .ci/docker/build.sh). RC wheels at /whl/test/ get re-uploaded under the - # same version, so use --no-cache-dir there to avoid stale cache hits. - local cache_flag="" - if [ "${TORCH_CHANNEL}" = "test" ]; then - cache_flag="--no-cache-dir" - fi - pip_install --force-reinstall ${cache_flag} \ - "${TORCH_SPEC}" "${TORCHVISION_SPEC}" "${TORCHAUDIO_SPEC}" \ - --index-url "${TORCH_INDEX_URL}/cpu" - return - fi - - # Nightly: build pytorch from source against the pinned SHA in pytorch.txt - # so we catch upstream regressions, then install audio/vision from the - # commits that pytorch itself pins. git clone https://github.com/pytorch/pytorch.git # Fetch the target commit @@ -45,19 +27,14 @@ install_pytorch_and_domains() { chown -R ci-user . export _GLIBCXX_USE_CXX11_ABI=1 - # PyTorch's FindARM.cmake hard-fails when the SVE+BF16 compile probe - # doesn't pass — gcc-11 in this image is too old to accept the combined - # NEON/SVE/bfloat16 intrinsics the probe exercises. Executorch's aarch64 - # runtime targets (phones, embedded) don't use SVE, so bypass the check. - export BUILD_IGNORE_SVE_UNAVAILABLE=1 # Then build and install PyTorch conda_run python setup.py bdist_wheel pip_install "$(echo dist/*.whl)" - # Defer to PyTorch's own pinned audio/vision commits. 
- TORCHAUDIO_VERSION=$(cat .github/ci_commit_pins/audio.txt) + # Grab the pinned audio and vision commits from PyTorch + TORCHAUDIO_VERSION=release/2.11 export TORCHAUDIO_VERSION - TORCHVISION_VERSION=$(cat .github/ci_commit_pins/vision.txt) + TORCHVISION_VERSION=release/0.26 export TORCHVISION_VERSION install_domains diff --git a/.ci/docker/ubuntu/Dockerfile b/.ci/docker/ubuntu/Dockerfile index 98268d49675..0e2d7e48eb9 100644 --- a/.ci/docker/ubuntu/Dockerfile +++ b/.ci/docker/ubuntu/Dockerfile @@ -64,11 +64,6 @@ ENV SCCACHE_S3_KEY_PREFIX executorch ENV SCCACHE_REGION us-east-1 ARG TORCH_VERSION -ARG TORCH_CHANNEL -ARG TORCH_SPEC -ARG TORCHAUDIO_SPEC -ARG TORCHVISION_SPEC -ARG TORCH_INDEX_URL ARG SKIP_PYTORCH COPY ./common/install_pytorch.sh install_pytorch.sh COPY ./common/utils.sh utils.sh diff --git a/.ci/scripts/download_hf_hub.sh b/.ci/scripts/download_hf_hub.sh index b47fc5dd215..c0487e687c7 100644 --- a/.ci/scripts/download_hf_hub.sh +++ b/.ci/scripts/download_hf_hub.sh @@ -1,5 +1,8 @@ #!/bin/bash +# Disable HF Xet storage to avoid stalled downloads on CI runners +export HF_HUB_DISABLE_XET=1 + # Function to download files from the Hugging Face Hub # Arguments: # 1. 
model_id: The Hugging Face repository ID (e.g., "organization/model_name") diff --git a/.ci/scripts/export_model_artifact.sh b/.ci/scripts/export_model_artifact.sh index 78053c33e7a..1f75d850e84 100755 --- a/.ci/scripts/export_model_artifact.sh +++ b/.ci/scripts/export_model_artifact.sh @@ -67,6 +67,9 @@ if [ -z "${1:-}" ]; then exit 1 fi +# Disable HF Xet storage to avoid stalled downloads on CI runners +export HF_HUB_DISABLE_XET=1 + set -eux DEVICE="$1" diff --git a/.ci/scripts/setup-macos.sh b/.ci/scripts/setup-macos.sh index 4b43a730710..6bd26e0b171 100755 --- a/.ci/scripts/setup-macos.sh +++ b/.ci/scripts/setup-macos.sh @@ -116,7 +116,6 @@ setup_macos_env_variables # buck2 atm install_buck brew install libomp -install_pip_dependencies # TODO(huydhn): Unlike our self-hosted runner, GitHub runner doesn't have access # to our infra, so compiler caching needs to be setup differently using GitHub @@ -125,10 +124,17 @@ if [[ -z "${GITHUB_RUNNER:-}" ]]; then install_sccache fi +# Install pinned torch before requirements-ci.txt so torchsr's transitive +# torch dep is satisfied by the existing install and pip does not pull a +# separate copy from PyPI. sccache is initialized above so source-build +# cache misses still hit the cache. print_cmake_info install_pytorch_and_domains -# We build PyTorch from source here instead of using nightly. This allows CI to test against -# the pinned commit from PyTorch + +install_pip_dependencies + +# install_executorch's --use-pt-pinned-commit skips re-installing torch since +# install_pytorch_and_domains already installed the pinned build above. if [[ "$EDITABLE" == "true" ]]; then install_executorch --use-pt-pinned-commit --editable else diff --git a/.ci/scripts/test_backend.sh b/.ci/scripts/test_backend.sh index e035b44cb62..a7f89f820b2 100755 --- a/.ci/scripts/test_backend.sh +++ b/.ci/scripts/test_backend.sh @@ -35,6 +35,7 @@ export PYTHON_EXECUTABLE=python # CMake options to use, in addition to the defaults. 
EXTRA_BUILD_ARGS="" +PYTEST_RETRY_ARGS=() if [[ "$FLOW" == *qnn* ]]; then # Setup QNN sdk and deps - note that this is a bit hacky due to the nature of the @@ -57,6 +58,9 @@ if [[ "$FLOW" == *vulkan* ]]; then fi if [[ "$FLOW" == *arm* ]]; then + if [[ "$SUITE" == "operators" ]]; then + PYTEST_RETRY_ARGS=(--reruns 2 --reruns-delay 1) + fi # Setup ARM deps. if [[ "$FLOW" == *vgf* ]]; then @@ -95,6 +99,11 @@ GOLDEN_DIR="${ARTIFACT_DIR}/golden-artifacts" export GOLDEN_ARTIFACTS_DIR="${GOLDEN_DIR}" EXIT_CODE=0 -${CONDA_RUN_CMD} pytest -c /dev/null -n auto backends/test/suite/$SUITE/ -m flow_$FLOW --json-report --json-report-file="$REPORT_FILE" || EXIT_CODE=$? +PYTEST_ARGS=(-c /dev/null -n auto) +if [[ ${#PYTEST_RETRY_ARGS[@]} -gt 0 ]]; then + PYTEST_ARGS+=("${PYTEST_RETRY_ARGS[@]}") +fi +PYTEST_ARGS+=("backends/test/suite/$SUITE/" -m "flow_$FLOW" --json-report --json-report-file="$REPORT_FILE") +${CONDA_RUN_CMD} pytest "${PYTEST_ARGS[@]}" || EXIT_CODE=$? # Generate markdown summary. ${CONDA_RUN_CMD} python -m executorch.backends.test.suite.generate_markdown_summary_json "$REPORT_FILE" > ${GITHUB_STEP_SUMMARY:-"step_summary.md"} --exit-code $EXIT_CODE diff --git a/.ci/scripts/test_coreml_bc.sh b/.ci/scripts/test_coreml_bc.sh index ac379481ea5..b077726832e 100644 --- a/.ci/scripts/test_coreml_bc.sh +++ b/.ci/scripts/test_coreml_bc.sh @@ -23,7 +23,7 @@ source "${REPO_ROOT}/.ci/scripts/utils.sh" # Create a conda environment with Python 3.10 for compatibility with old ET versions # ET 1.0.0 only supports Python >=3.10,<3.13 CONDA_ENV_NAME="coreml_bc_test_env" -conda create -y -n "${CONDA_ENV_NAME}" python=3.10 +conda create -y -n "${CONDA_ENV_NAME}" python=3.10 pip packaging # Use conda run to execute commands in the new environment CONDA_RUN="conda run --no-capture-output -n ${CONDA_ENV_NAME}" @@ -69,7 +69,7 @@ git submodule sync --recursive git submodule update --init --recursive # Install executorch -${CONDA_RUN} pip install --upgrade pip +${CONDA_RUN} python -m pip 
install --upgrade pip ${CONDA_RUN} python install_executorch.py # Step 3: Export model @@ -129,7 +129,7 @@ git submodule update --init --recursive # Step 5: Install current version echo "=== Step 5: Installing current ET version ===" -${CONDA_RUN} pip install --upgrade pip +${CONDA_RUN} python -m pip install --upgrade pip ${CONDA_RUN} python install_executorch.py # Step 6: Run the old pte file diff --git a/.ci/scripts/test_huggingface_optimum_model.py b/.ci/scripts/test_huggingface_optimum_model.py index 59e5fa3d03a..7b0e69ff9b4 100644 --- a/.ci/scripts/test_huggingface_optimum_model.py +++ b/.ci/scripts/test_huggingface_optimum_model.py @@ -2,11 +2,17 @@ import gc import logging import math +import os +import shutil import subprocess import tempfile +import time from pathlib import Path from typing import List +# Disable HF Xet storage to avoid stalled downloads on CI runners +os.environ.setdefault("HF_HUB_DISABLE_XET", "1") + import torch from datasets import load_dataset @@ -25,6 +31,17 @@ ) +EXPORT_RETRIES = 3 + + +def _clear_export_dir(model_dir): + for path in Path(model_dir).iterdir(): + if path.is_dir() and not path.is_symlink(): + shutil.rmtree(path) + else: + path.unlink() + + def cli_export(command, model_dir): p = Path(model_dir) if p.exists(): @@ -34,11 +51,19 @@ def cli_export(command, model_dir): raise Exception( f"Existing directory {model_dir} is non-empty. Please remove it first." 
) - try: - subprocess.run(command, check=True) - print("Export completed successfully.") - except subprocess.CalledProcessError as e: - print(f"Export failed with error: {e}") + + for attempt in range(1, EXPORT_RETRIES + 1): + try: + subprocess.run(command, check=True) + print("Export completed successfully.") + return + except subprocess.CalledProcessError as e: + print(f"Export attempt {attempt}/{EXPORT_RETRIES} failed with error: {e}") + if attempt == EXPORT_RETRIES: + raise + if p.exists(): + _clear_export_dir(model_dir) + time.sleep(attempt * 10) def check_causal_lm_output_quality( diff --git a/.ci/scripts/test_lora.sh b/.ci/scripts/test_lora.sh index e323aed114d..102347a08fd 100644 --- a/.ci/scripts/test_lora.sh +++ b/.ci/scripts/test_lora.sh @@ -6,6 +6,8 @@ # LICENSE file in the root directory of this source tree. set -exu +# Disable HF Xet storage to avoid stalled downloads on CI runners +export HF_HUB_DISABLE_XET=1 # shellcheck source=/dev/null source "$(dirname "${BASH_SOURCE[0]}")/utils.sh" @@ -33,6 +35,24 @@ cleanup_files() { rm result*.txt } +matches_base_response_prefix() { + local output_file="$1" + python - "$output_file" <<'PY' +import pathlib +import re +import sys + +text = pathlib.Path(sys.argv[1]).read_text() +pattern = re.compile( + r"^<\|im_start\|>user Calculate 15% of 80\?<\|im_end\|><\|im_start\|>assistant:\n" + r"(?:\n)+" + r"Okay, so I need to calculate 15% of 80\.", + re.MULTILINE, +) +sys.exit(0 if pattern.match(text) else 1) +PY +} + # Hosting lora adapter in personal repo for now. python -m pip install -q huggingface_hub HF_ADAPTER_REPO="lucylq/qwen3_06B_lora_math" @@ -139,7 +159,15 @@ Okay, so I need to calculate 15% of 80." EXPECTED_QUANT_LORA_PREFIX=" <|im_start|>user Calculate 15% of 80?<|im_end|><|im_start|>assistant To calculate 15% of 80, we can multiply 80 by 15/100. -So, 15% of 80 is equal to (80 * 15) / 100 = 1200 / 100 = 12. +80 * 15/100 = 12. +So, 15% of 80 is 12. 
+#### 12 +The answer is: 12<|im_end|>" +EXPECTED_QUANT_LORA_ALTERNATE_PREFIX=" +<|im_start|>user Calculate 15% of 80?<|im_end|><|im_start|>assistant +To calculate 15% of 80, we can multiply 80 by 15/100. +80 * 15/100 = 12. +So, 15% of 80 is 12. #### 12 The answer is: 12<|im_end|>" @@ -186,7 +214,7 @@ cmake-out/examples/models/llama/llama_main --model_path=qwen_q.pte --data_paths= NOW=$(date +"%H:%M:%S") echo "Finished at ${NOW}" RESULT=$(cat result.txt) -if [[ "${RESULT}" == "${EXPECTED_QUANT_PREFIX}"* ]]; then +if matches_base_response_prefix result.txt; then echo "Expected result prefix: ${EXPECTED_QUANT_PREFIX}" echo "Actual result: ${RESULT}" echo "Test 3: Success" @@ -207,12 +235,13 @@ NOW=$(date +"%H:%M:%S") echo "Finished at ${NOW}" RESULT=$(cat result.txt) -if [[ "${RESULT}" == "${EXPECTED_QUANT_LORA_PREFIX}"* ]]; then +if [[ "${RESULT}" == "${EXPECTED_QUANT_LORA_PREFIX}"* ]] || [[ "${RESULT}" == "${EXPECTED_QUANT_LORA_ALTERNATE_PREFIX}"* ]]; then echo "Expected result prefix: ${EXPECTED_QUANT_LORA_PREFIX}" echo "Actual result: ${RESULT}" echo "Test 4: Success" else echo "Expected result prefix: ${EXPECTED_QUANT_LORA_PREFIX}" + echo "Alternate expected result prefix: ${EXPECTED_QUANT_LORA_ALTERNATE_PREFIX}" echo "Actual result: ${RESULT}" echo "Test 4: Failure; results not the same" cleanup_files diff --git a/.ci/scripts/test_lora_multimethod.sh b/.ci/scripts/test_lora_multimethod.sh index 8f4ae1a4f68..f0b30bd4be1 100755 --- a/.ci/scripts/test_lora_multimethod.sh +++ b/.ci/scripts/test_lora_multimethod.sh @@ -6,6 +6,8 @@ # LICENSE file in the root directory of this source tree. 
set -exu +# Disable HF Xet storage to avoid stalled downloads on CI runners +export HF_HUB_DISABLE_XET=1 # shellcheck source=/dev/null source "$(dirname "${BASH_SOURCE[0]}")/utils.sh" @@ -33,6 +35,24 @@ cleanup_files() { rm -f result*.txt } +matches_base_response_prefix() { + local output_file="$1" + python - "$output_file" <<'PY' +import pathlib +import re +import sys + +text = pathlib.Path(sys.argv[1]).read_text() +pattern = re.compile( + r"^<\|im_start\|>user Calculate 15% of 80\?<\|im_end\|><\|im_start\|>assistant:\n" + r"(?:\n)+" + r"Okay, so I need to calculate 15% of 80\.", + re.MULTILINE, +) +sys.exit(0 if pattern.match(text) else 1) +PY +} + # Download LoRA adapter. python -m pip install -q huggingface_hub HF_ADAPTER_REPO="lucylq/qwen3_06B_lora_math" @@ -107,7 +127,7 @@ NOW=$(date +"%H:%M:%S") echo "Finished at ${NOW}" RESULT=$(cat result_base.txt) -if [[ "${RESULT}" == "${EXPECTED_BASE_PREFIX}"* ]]; then +if matches_base_response_prefix result_base.txt; then echo "Test 2 (base_forward): Success" else echo "Test 2 (base_forward): Failure" diff --git a/.ci/scripts/test_model_e2e.sh b/.ci/scripts/test_model_e2e.sh index 7205bb6d49c..1678b0a4fbb 100755 --- a/.ci/scripts/test_model_e2e.sh +++ b/.ci/scripts/test_model_e2e.sh @@ -258,12 +258,21 @@ fi if [ "$AUDIO_URL" != "" ]; then curl -L $AUDIO_URL -o ${MODEL_DIR}/$AUDIO_FILE elif [[ "$MODEL_NAME" == *whisper* ]] || [ "$MODEL_NAME" = "voxtral_realtime" ]; then - conda install -y -c conda-forge "ffmpeg<8" + if ! command -v ffmpeg >/dev/null; then + if [ "$(uname -s)" = "Linux" ] && command -v apt-get >/dev/null; then + if [ "$(id -u)" -eq 0 ]; then + apt-get update + apt-get install -y --no-install-recommends ffmpeg + else + sudo apt-get update + sudo apt-get install -y --no-install-recommends ffmpeg + fi + else + conda install -y -c conda-forge ffmpeg + fi + fi pip install datasets soundfile - # We pushd'd into EXECUTORCH_ROOT above, so torch_pin is importable here. 
- TORCHCODEC_PKG=$(python -c "from torch_pin import torchcodec_spec; print(torchcodec_spec())") - TORCHCODEC_INDEX=$(python -c "from torch_pin import torch_index_url_base; print(torch_index_url_base())") - pip install "$TORCHCODEC_PKG" --extra-index-url "${TORCHCODEC_INDEX}/cpu" + pip install torchcodec==0.11.0 --extra-index-url https://download.pytorch.org/whl/test/cpu python -c "from datasets import load_dataset;import soundfile as sf;sample = load_dataset('distil-whisper/librispeech_long', 'clean', split='validation')[0]['audio'];sf.write('${MODEL_DIR}/$AUDIO_FILE', sample['array'][:sample['sampling_rate']*30], sample['sampling_rate'])" fi diff --git a/.ci/scripts/test_phi_3_mini.sh b/.ci/scripts/test_phi_3_mini.sh index 086822bbad4..76ebb2ff3d1 100644 --- a/.ci/scripts/test_phi_3_mini.sh +++ b/.ci/scripts/test_phi_3_mini.sh @@ -6,6 +6,8 @@ # LICENSE file in the root directory of this source tree. set -exu +# Disable HF Xet storage to avoid stalled downloads on CI runners +export HF_HUB_DISABLE_XET=1 BUILD_TYPE=${1:-Debug} BUILD_DIR=${3:-cmake-out} diff --git a/.ci/scripts/test_wheel_package_qnn.sh b/.ci/scripts/test_wheel_package_qnn.sh index f44fafadb58..763bd8733c1 100644 --- a/.ci/scripts/test_wheel_package_qnn.sh +++ b/.ci/scripts/test_wheel_package_qnn.sh @@ -150,26 +150,25 @@ run_core_tests () { echo "=== [$LABEL] Installing wheel & deps ===" "$PIPBIN" install --upgrade pip "$PIPBIN" install "$WHEEL_FILE" - # runpy.run_path uses a relative path, so the caller must run this script - # from the executorch repo root (where torch_pin.py lives). 
- TORCH_SPEC=$( + TORCH_VERSION=$( "$PYBIN" - <<'PY' import runpy module_vars = runpy.run_path("torch_pin.py") -print(module_vars["torch_spec"]()) +print(module_vars["TORCH_VERSION"]) PY ) - TORCH_INDEX=$( - "$PYBIN" - <<'PY' -import runpy -module_vars = runpy.run_path("torch_pin.py") -print(module_vars["torch_index_url_base"]()) -PY -) - echo "=== [$LABEL] Install $TORCH_SPEC from ${TORCH_INDEX}/cpu ===" - # Install torch based on the pinned PyTorch version from the channel index. - "$PIPBIN" install "$TORCH_SPEC" --index-url "${TORCH_INDEX}/cpu" +# NIGHTLY_VERSION=$( +# "$PYBIN" - <<'PY' +# import runpy +# module_vars = runpy.run_path("torch_pin.py") +# print(module_vars["NIGHTLY_VERSION"]) +# PY +# ) + echo "=== [$LABEL] Install torch==${TORCH_VERSION} ===" + + # Install torch based on the pinned PyTorch version, preferring the PyTorch test index + "$PIPBIN" install torch=="${TORCH_VERSION}" --extra-index-url "https://download.pytorch.org/whl/test" "$PIPBIN" install wheel # Install torchao based on the pinned commit from third-party/ao submodule diff --git a/.ci/scripts/tests/test_torch_pin.py b/.ci/scripts/tests/test_torch_pin.py deleted file mode 100644 index 6c475aeaa05..00000000000 --- a/.ci/scripts/tests/test_torch_pin.py +++ /dev/null @@ -1,54 +0,0 @@ -import importlib - -import pytest - - -@pytest.fixture -def pin(): - """Yield a fresh import of torch_pin so tests can mutate CHANNEL safely.""" - import torch_pin - - yield torch_pin - importlib.reload(torch_pin) - - -@pytest.mark.parametrize( - "channel, expected_torch, expected_url", - [ - ( - "nightly", - "torch=={TORCH_VERSION}.{NIGHTLY_VERSION}", - "https://download.pytorch.org/whl/nightly", - ), - ("test", "torch=={TORCH_VERSION}", "https://download.pytorch.org/whl/test"), - ("release", "torch=={TORCH_VERSION}", "https://download.pytorch.org/whl"), - ], -) -def test_channel_resolution(pin, channel, expected_torch, expected_url): - pin.CHANNEL = channel - expected = expected_torch.format( - 
TORCH_VERSION=pin.TORCH_VERSION, NIGHTLY_VERSION=pin.NIGHTLY_VERSION - ) - assert pin.torch_spec() == expected - assert pin.torch_index_url_base() == expected_url - - -def test_all_specs_share_nightly_suffix(pin): - pin.CHANNEL = "nightly" - suffix = f".{pin.NIGHTLY_VERSION}" - assert pin.torch_spec().endswith(suffix) - assert pin.torchaudio_spec().endswith(suffix) - assert pin.torchcodec_spec().endswith(suffix) - assert pin.torchvision_spec().endswith(suffix) - - -def test_specs_drop_suffix_off_nightly(pin): - pin.CHANNEL = "test" - assert pin.torch_spec() == f"torch=={pin.TORCH_VERSION}" - assert pin.torchaudio_spec() == f"torchaudio=={pin.TORCHAUDIO_VERSION}" - assert pin.torchcodec_spec() == f"torchcodec=={pin.TORCHCODEC_VERSION}" - assert pin.torchvision_spec() == f"torchvision=={pin.TORCHVISION_VERSION}" - - -def test_torch_branch_derived_from_version(pin): - assert pin.torch_branch() == f"release/{pin.TORCH_VERSION.rsplit('.', 1)[0]}" diff --git a/.ci/scripts/utils.sh b/.ci/scripts/utils.sh index 12e7f3d2067..b291374d667 100644 --- a/.ci/scripts/utils.sh +++ b/.ci/scripts/utils.sh @@ -53,7 +53,7 @@ dedupe_macos_loader_path_rpaths() { pushd .. torch_lib_dir=$(python -c "import importlib.util; print(importlib.util.find_spec('torch').submodule_search_locations[0])")/lib popd - + if [[ -z "${torch_lib_dir}" || ! -d "${torch_lib_dir}" ]]; then return fi @@ -89,30 +89,6 @@ install_domains() { } install_pytorch_and_domains() { - # CWD is the executorch repo root, where torch_pin.py lives. - TORCH_CHANNEL=$(python -c "from torch_pin import CHANNEL; print(CHANNEL)") - - if [ "${TORCH_CHANNEL}" != "nightly" ]; then - # Test/release: install the published wheels directly from torch_pin.py's - # channel index, skipping the source-build path entirely. RC wheels at - # /whl/test/ get re-uploaded under the same version, so use --no-cache-dir - # there to avoid stale cache hits. 
- local torch_spec=$(python -c "from torch_pin import torch_spec; print(torch_spec())") - local torchvision_spec=$(python -c "from torch_pin import torchvision_spec; print(torchvision_spec())") - local torchaudio_spec=$(python -c "from torch_pin import torchaudio_spec; print(torchaudio_spec())") - local torch_index_url=$(python -c "from torch_pin import torch_index_url_base; print(torch_index_url_base())") - local cache_flag="" - if [ "${TORCH_CHANNEL}" = "test" ]; then - cache_flag="--no-cache-dir" - fi - pip install --force-reinstall ${cache_flag} \ - "${torch_spec}" "${torchvision_spec}" "${torchaudio_spec}" \ - --index-url "${torch_index_url}/cpu" - return - fi - - # Nightly: source-build pytorch from the pinned SHA so CI catches upstream - # regressions; pytorch's own audio/vision pins drive those installs. pushd .ci/docker || return TORCH_VERSION=$(cat ci_commit_pins/pytorch.txt) popd || return @@ -129,7 +105,11 @@ install_pytorch_and_domains() { fi local python_version=$(python -c 'import platform; v=platform.python_version_tuple(); print(f"{v[0]}{v[1]}")') local torch_release=$(cat version.txt) - local torch_short_hash=${TORCH_VERSION:0:7} + # Download key must match the upload key below (basename of dist/*.whl, + # which always carries setup.py's resolved +gitHASH). Branch-ref pins + # like `release/2.11` would otherwise produce `+gitrelease` here and + # never hit the cache. 
+ local torch_short_hash=$(git rev-parse --short=7 HEAD) local torch_wheel_path="cached_artifacts/pytorch/executorch/pytorch_wheels/${system_name}/${python_version}" local torch_wheel_name="torch-${torch_release}%2Bgit${torch_short_hash}-cp${python_version}-cp${python_version}-${platform:-}.whl" @@ -147,10 +127,38 @@ install_pytorch_and_domains() { if [[ "${torch_wheel_not_found}" == "1" ]]; then echo "No cached wheel found, continue with building PyTorch at ${TORCH_VERSION}" + # Install PyTorch's own build-time deps so the source build does not + # silently inherit them from whatever else happens to be in the env + # (e.g. executorch's requirements-ci.txt). + pip install -r requirements-build.txt git submodule update --init --recursive USE_DISTRIBUTED=1 python setup.py bdist_wheel pip install "$(echo dist/*.whl)" + # Invariant: the basename setup.py just produced must match the cache + # URL we'd reconstruct on the next run. If they diverge (someone edits + # torch_wheel_name above, or PyTorch renames its wheels), the cache + # will silently miss and every macOS run will fall back to a ~30-min + # source build. Fail loudly so the regression is caught immediately. 
+ shopt -s nullglob + local built_wheels=(dist/*.whl) + shopt -u nullglob + if [[ ${#built_wheels[@]} -ne 1 ]]; then + echo "ERROR: expected exactly 1 wheel in dist/, found ${#built_wheels[@]}" >&2 + exit 1 + fi + local built_wheel_name + built_wheel_name=$(basename "${built_wheels[0]}") + local expected_wheel_name="${torch_wheel_name//\%2B/+}" + if [[ "${built_wheel_name}" != "${expected_wheel_name}" ]]; then + echo "ERROR: built torch wheel name does not match cache URL key:" >&2 + echo " built: ${built_wheel_name}" >&2 + echo " expected: ${expected_wheel_name}" >&2 + echo "Fix torch_wheel_name construction in install_pytorch_and_domains" >&2 + echo "in .ci/scripts/utils.sh" >&2 + exit 1 + fi + # Only AWS runners have access to S3 if command -v aws && [[ -z "${GITHUB_RUNNER:-}" ]]; then for wheel_path in dist/*.whl; do @@ -164,10 +172,10 @@ install_pytorch_and_domains() { fi dedupe_macos_loader_path_rpaths - # We're on the nightly path here; defer to PyTorch's own pinned commits. - TORCHAUDIO_VERSION=$(cat .github/ci_commit_pins/audio.txt) + # Grab the pinned audio and vision commits from PyTorch + TORCHAUDIO_VERSION=release/2.11 export TORCHAUDIO_VERSION - TORCHVISION_VERSION=$(cat .github/ci_commit_pins/vision.txt) + TORCHVISION_VERSION=release/0.26 export TORCHVISION_VERSION install_domains @@ -242,21 +250,17 @@ download_stories_model_artifacts() { } do_not_use_nightly_on_ci() { - # Sanity check that prevents accidentally landing a PR that pins to PyTorch - # nightly without exercising the source-build path (see #6564). - # - # For CHANNEL=nightly, CI source-builds pytorch from the SHA in pytorch.txt, - # so the installed torch shows up as e.g. 2.13.0a0+gitc8a648d — assert that. - # For CHANNEL=test/release, we install published wheels by design (e.g. - # 2.11.0), so the +git assertion doesn't apply. 
- TORCH_CHANNEL=$(python -c "from torch_pin import CHANNEL; print(CHANNEL)") - if [ "${TORCH_CHANNEL}" != "nightly" ]; then - return 0 - fi - + # An assert to make sure that we are not using PyTorch nightly on CI to prevent + # regression as documented in https://github.com/pytorch/executorch/pull/6564 TORCH_VERSION=$(pip list | grep -w 'torch ' | awk -F ' ' {'print $2'} | tr -d '\n') + + # The version of PyTorch building from source looks like 2.6.0a0+gitc8a648d that + # includes the commit while nightly (2.6.0.dev20241019+cpu) or release (2.6.0) + # won't have that. Note that we couldn't check for the exact commit from the pin + # ci_commit_pins/pytorch.txt here because the value will be different when running + # this on PyTorch CI if [[ "${TORCH_VERSION}" != *"+git"* ]]; then - echo "Unexpected torch version. Expected binary built from source for CHANNEL=nightly, got ${TORCH_VERSION}" + echo "Unexpected torch version. Expected binary built from source, got ${TORCH_VERSION}" exit 1 fi } diff --git a/.github/scripts/docathon-label-sync.py b/.github/scripts/docathon-label-sync.py new file mode 100644 index 00000000000..4f00067905d --- /dev/null +++ b/.github/scripts/docathon-label-sync.py @@ -0,0 +1,54 @@ +import os +import re +import sys + +from github import Github + + +def main() -> None: + token = os.environ.get("GITHUB_TOKEN") + + repo_owner = "pytorch" + repo_name = "pytorch" + pull_request_number = int(sys.argv[1]) + + g = Github(token) + repo = g.get_repo(f"{repo_owner}/{repo_name}") + pull_request = repo.get_pull(pull_request_number) + pull_request_body = pull_request.body + # PR without description + if pull_request_body is None: + return + + # get issue number from the PR body + if not re.search(r"#\d{1,6}", pull_request_body): + print("The pull request does not mention an issue.") + return + issue_number = int(re.findall(r"#(\d{1,6})", pull_request_body)[0]) + issue = repo.get_issue(issue_number) + issue_labels = issue.labels + docathon_label_present = 
any( + label.name == "docathon-2026" for label in issue_labels + ) + + # if the issue has a docathon label, add all labels from the issue to the PR. + if not docathon_label_present: + print("The 'docathon-2026' label is not present in the issue.") + return + pull_request_labels = pull_request.get_labels() + pull_request_label_names = [label.name for label in pull_request_labels] + issue_label_names = [label.name for label in issue_labels] + labels_to_add = [ + label + for label in issue_label_names + if label not in pull_request_label_names and label != "actionable" + ] + if not labels_to_add: + print("The pull request already has the same labels.") + return + pull_request.add_to_labels(*labels_to_add) + print("Labels added to the pull request!") + + +if __name__ == "__main__": + main() diff --git a/.github/scripts/update_pytorch_pin.py b/.github/scripts/update_pytorch_pin.py index 9f2698917b2..dbc48552d9b 100644 --- a/.github/scripts/update_pytorch_pin.py +++ b/.github/scripts/update_pytorch_pin.py @@ -8,12 +8,6 @@ import urllib.request from pathlib import Path -# torch_pin.py lives at the repo root. Locate it relative to this script so -# the import works regardless of where the script is invoked from. -_REPO_ROOT = Path(__file__).resolve().parents[2] -sys.path.insert(0, str(_REPO_ROOT)) -from torch_pin import CHANNEL, NIGHTLY_VERSION, torch_branch - def parse_nightly_version(nightly_version): """ @@ -33,6 +27,23 @@ def parse_nightly_version(nightly_version): return f"{year}-{month}-{day}" +def get_torch_nightly_version(): + """ + Read NIGHTLY_VERSION from torch_pin.py. 
+ + Returns: + NIGHTLY_VERSION string + """ + with open("torch_pin.py", "r") as f: + content = f.read() + + match = re.search(r'NIGHTLY_VERSION\s*=\s*["\']([^"\']+)["\']', content) + if not match: + raise ValueError("Could not find NIGHTLY_VERSION in torch_pin.py") + + return match.group(1) + + def get_commit_hash_for_nightly(date_str): """ Fetch commit hash from PyTorch nightly branch for a given date. @@ -80,16 +91,17 @@ def extract_hash_from_title(title): return match.group(1) -def update_pytorch_pin(ref): +def update_pytorch_pin(commit_hash): """ - Update .ci/docker/ci_commit_pins/pytorch.txt with the new ref. + Update .ci/docker/ci_commit_pins/pytorch.txt with the new commit hash. Args: - ref: Either a commit SHA (nightly) or a branch name (test/release). + commit_hash: Commit hash to write """ - pin_file = _REPO_ROOT / ".ci/docker/ci_commit_pins/pytorch.txt" - pin_file.write_text(f"{ref}\n") - print(f"Updated {pin_file} with ref: {ref}") + pin_file = ".ci/docker/ci_commit_pins/pytorch.txt" + with open(pin_file, "w") as f: + f.write(f"{commit_hash}\n") + print(f"Updated {pin_file} with commit hash: {commit_hash}") def should_skip_file(filename): @@ -106,20 +118,18 @@ def should_skip_file(filename): return filename in skip_files -def fetch_file_content(ref, file_path): +def fetch_file_content(commit_hash, file_path): """ Fetch file content from GitHub API. 
Args: - ref: Commit SHA or branch name to fetch from + commit_hash: Commit hash to fetch from file_path: File path in the repository Returns: File content as bytes """ - api_url = ( - f"https://api.github.com/repos/pytorch/pytorch/contents/{file_path}?ref={ref}" - ) + api_url = f"https://api.github.com/repos/pytorch/pytorch/contents/{file_path}?ref={commit_hash}" req = urllib.request.Request(api_url) req.add_header("Accept", "application/vnd.github.v3+json") @@ -136,7 +146,7 @@ def fetch_file_content(ref, file_path): raise -def sync_directory(et_dir, pt_path, ref): +def sync_directory(et_dir, pt_path, commit_hash): """ Sync files from PyTorch to ExecuTorch using GitHub API. Only syncs files that already exist in ExecuTorch - does not add new files. @@ -144,7 +154,7 @@ def sync_directory(et_dir, pt_path, ref): Args: et_dir: ExecuTorch directory path pt_path: PyTorch directory path in the repository (e.g., "c10") - ref: Commit SHA or branch name to fetch from + commit_hash: Commit hash to fetch from Returns: Number of files grafted @@ -171,12 +181,12 @@ def sync_directory(et_dir, pt_path, ref): # Fetch content from PyTorch and compare try: - pt_content = fetch_file_content(ref, pt_file_path) + pt_content = fetch_file_content(commit_hash, pt_file_path) et_content = et_file.read_bytes() if pt_content != et_content: print(f"āš ļø Difference detected in {rel_path}") - print(f"šŸ“‹ Grafting from PyTorch ref {ref}...") + print(f"šŸ“‹ Grafting from PyTorch commit {commit_hash}...") et_file.write_bytes(pt_content) print(f"āœ… Grafted {et_file}") @@ -191,34 +201,37 @@ def sync_directory(et_dir, pt_path, ref): return files_grafted -def sync_c10_directories(ref): +def sync_c10_directories(commit_hash): """ Sync c10 and torch/headeronly directories from PyTorch to ExecuTorch using GitHub API. 
Args: - ref: PyTorch commit SHA or branch name to sync from + commit_hash: PyTorch commit hash to sync from Returns: Total number of files grafted """ print("\nšŸ”„ Syncing c10 directories from PyTorch via GitHub API...") + # Get repository root + repo_root = Path.cwd() + # Define directory pairs to sync (from check_c10_sync.sh) # Format: (executorch_dir, pytorch_path_in_repo) dir_pairs = [ ( - _REPO_ROOT / "runtime/core/portable_type/c10/c10", + repo_root / "runtime/core/portable_type/c10/c10", "c10", ), ( - _REPO_ROOT / "runtime/core/portable_type/c10/torch/headeronly", + repo_root / "runtime/core/portable_type/c10/torch/headeronly", "torch/headeronly", ), ] total_grafted = 0 for et_dir, pt_path in dir_pairs: - files_grafted = sync_directory(et_dir, pt_path, ref) + files_grafted = sync_directory(et_dir, pt_path, commit_hash) total_grafted += files_grafted if total_grafted > 0: @@ -231,26 +244,27 @@ def sync_c10_directories(ref): def main(): try: - print(f"CHANNEL: {CHANNEL}") - if CHANNEL == "nightly": - # Nightly pins to an immutable SHA looked up by date. - print(f"Found NIGHTLY_VERSION: {NIGHTLY_VERSION}") - date_str = parse_nightly_version(NIGHTLY_VERSION) - print(f"Parsed date: {date_str}") - pin_ref = get_commit_hash_for_nightly(date_str) - else: - # For test/release, pin to the branch name so CI picks up - # cherry-picks / security patches as they land on the branch. 
- pin_ref = torch_branch() - print(f"Pin ref: {pin_ref}") + # Read NIGHTLY_VERSION from torch_pin.py + nightly_version = get_torch_nightly_version() + print(f"Found NIGHTLY_VERSION: {nightly_version}") + + # Parse to date string + date_str = parse_nightly_version(nightly_version) + print(f"Parsed date: {date_str}") + + # Fetch commit hash from PyTorch nightly branch + commit_hash = get_commit_hash_for_nightly(date_str) + print(f"Found commit hash: {commit_hash}") # Update the pin file - update_pytorch_pin(pin_ref) + update_pytorch_pin(commit_hash) - # Sync c10 directories from PyTorch (ref param accepts branches too) - sync_c10_directories(pin_ref) + # Sync c10 directories from PyTorch + sync_c10_directories(commit_hash) - print("\nāœ… Successfully updated PyTorch pin and synced c10 directories!") + print( + "\nāœ… Successfully updated PyTorch commit pin and synced c10 directories!" + ) except Exception as e: print(f"Error: {e}", file=sys.stderr) diff --git a/.github/workflows/_android.yml b/.github/workflows/_android.yml index 54622a33cd8..6f152607d7c 100644 --- a/.github/workflows/_android.yml +++ b/.github/workflows/_android.yml @@ -109,6 +109,9 @@ jobs: ram-size: 16384M heap-size: 12288M force-avd-creation: false - disable-animations: true - emulator-options: -no-snapshot-save -no-window -gpu swiftshader_indirect -noaudio -no-boot-anim -camera-back none + # The action's built-in animation disabling runs immediately after + # boot and is not retried. Software-emulated boots can briefly drop + # adb there, so scripts/run_android_emulator.sh handles it instead. 
+ disable-animations: false + emulator-options: -no-snapshot-save -no-window -gpu swiftshader_indirect -noaudio -no-boot-anim -camera-back none -no-metrics emulator-boot-timeout: 900 diff --git a/.github/workflows/_test_backend.yml b/.github/workflows/_test_backend.yml index bfa874a440d..6323c007f4a 100644 --- a/.github/workflows/_test_backend.yml +++ b/.github/workflows/_test_backend.yml @@ -129,6 +129,7 @@ jobs: uses: pytorch/test-infra/.github/workflows/macos_job.yml@main with: + default-packages: "" ref: ${{ inputs.ref }} runner: macos-m1-stable python-version: "3.12" diff --git a/.github/workflows/_test_cadence.yml b/.github/workflows/_test_cadence.yml index 5b81ddc3f82..2e98d21db1c 100644 --- a/.github/workflows/_test_cadence.yml +++ b/.github/workflows/_test_cadence.yml @@ -45,9 +45,9 @@ jobs: ./install_requirements.sh > /dev/null pip install -e . --no-build-isolation > /dev/null - pip install beartype later pyre_extensions pytest-xdist + pip install beartype later pyre_extensions pytest-rerunfailures==15.1 pytest-xdist - python -m pytest backends/cadence/aot/tests/ -v -n auto + python -m pytest backends/cadence/aot/tests/ -v -n auto --reruns 2 --reruns-delay 1 test-ops: uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main @@ -66,7 +66,7 @@ jobs: ./install_requirements.sh > /dev/null pip install -e . 
--no-build-isolation > /dev/null - pip install beartype later pyre_extensions pytest-xdist + pip install beartype later pyre_extensions pytest-rerunfailures==15.1 pytest-xdist # Use the pre-built runner from the build job mkdir -p cmake-out/backends/cadence @@ -74,4 +74,4 @@ jobs: chmod +x cmake-out/backends/cadence/cadence_runner export PYTHONPATH="${PYTHONPATH:-}:$(pwd)/backends/cadence/utils/FACTO" - python -m pytest examples/cadence/operators/ -v -n auto + python -m pytest examples/cadence/operators/ -v -n auto --reruns 2 --reruns-delay 1 diff --git a/.github/workflows/_unittest.yml b/.github/workflows/_unittest.yml index 457480099c3..15c87bd79e4 100644 --- a/.github/workflows/_unittest.yml +++ b/.github/workflows/_unittest.yml @@ -44,6 +44,7 @@ jobs: macos: uses: pytorch/test-infra/.github/workflows/macos_job.yml@main with: + default-packages: "" runner: macos-m1-stable python-version: '3.11' submodules: 'recursive' diff --git a/.github/workflows/apple.yml b/.github/workflows/apple.yml index 0d8995a8259..20c7352e40d 100644 --- a/.github/workflows/apple.yml +++ b/.github/workflows/apple.yml @@ -88,6 +88,7 @@ jobs: uses: pytorch/test-infra/.github/workflows/macos_job.yml@main secrets: inherit with: + default-packages: "" runner: macos-14-xlarge python-version: '3.11' submodules: 'recursive' @@ -175,6 +176,7 @@ jobs: needs: set-version uses: pytorch/test-infra/.github/workflows/macos_job.yml@main with: + default-packages: "" runner: macos-14-xlarge python-version: '3.11' submodules: 'recursive' @@ -315,6 +317,7 @@ jobs: uses: pytorch/test-infra/.github/workflows/macos_job.yml@main secrets: inherit with: + default-packages: "" runner: macos-14-xlarge python-version: '3.11' submodules: 'recursive' diff --git a/.github/workflows/assigntome-docathon.yml b/.github/workflows/assigntome-docathon.yml new file mode 100644 index 00000000000..92dec519296 --- /dev/null +++ b/.github/workflows/assigntome-docathon.yml @@ -0,0 +1,60 @@ +name: Assign User on Comment + +on: + 
workflow_dispatch: + issue_comment: + types: [created] + +jobs: + assign: + runs-on: ubuntu-latest + permissions: + issues: write + steps: + - name: Check for "/assigntome" in comment + uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1 + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + with: + script: | + const issueComment = context.payload.comment.body; + const assignRegex = /\/assigntome/i; + if (assignRegex.test(issueComment)) { + const assignee = context.payload.comment.user.login; + const issueNumber = context.payload.issue.number; + try { + const { data: issue } = await github.rest.issues.get({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: issueNumber + }); + const hasLabel = issue.labels.some(label => label.name === 'docathon-2026'); + if (hasLabel) { + if (issue.assignee !== null) { + await github.rest.issues.createComment({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: issueNumber, + body: "The issue is already assigned. Please pick an opened and unassigned issue with the [docathon-2026 label](https://github.com/pytorch/executorch/issues?q=is%3Aopen+is%3Aissue+label%3Adocathon-2026)" + }); + } else { + await github.rest.issues.addAssignees({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: issueNumber, + assignees: [assignee] + }); + } + } else { + const commentMessage = "This issue does not have the correct label. 
Please pick an opened and unassigned issue with the [docathon-2026 label](https://github.com/pytorch/executorch/issues?q=is%3Aopen+is%3Aissue+label%3Adocathon-2026)"; + await github.rest.issues.createComment({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: issueNumber, + body: commentMessage + }); + } + } catch (error) { + console.error(error); + } + } diff --git a/.github/workflows/build-presets.yml b/.github/workflows/build-presets.yml index 7c5a37e0f6c..37854aed174 100644 --- a/.github/workflows/build-presets.yml +++ b/.github/workflows/build-presets.yml @@ -20,6 +20,7 @@ jobs: matrix: preset: [macos, ios, ios-simulator, pybind, profiling, llm] with: + default-packages: "" job-name: build ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} runner: macos-14-xlarge diff --git a/.github/workflows/docathon-sync-label.yml b/.github/workflows/docathon-sync-label.yml new file mode 100644 index 00000000000..bf8197f8d64 --- /dev/null +++ b/.github/workflows/docathon-sync-label.yml @@ -0,0 +1,31 @@ +name: Docathon Labels Sync + +on: + pull_request_target: + types: [opened, synchronize, edited] + branches: [main] + +jobs: + check-labels: + if: github.repository_owner == 'pytorch' + runs-on: ubuntu-latest + permissions: + issues: write + pull-requests: write + steps: + - name: Check out the repo + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + with: + fetch-depth: 1 + - name: Set up Python + uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 + with: + python-version: 3.x + - name: Install dependencies + run: | + pip install requests==2.32.3 + pip install PyGithub==2.3.0 + - name: Run Python script + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: python ./.github/scripts/docathon-label-sync.py ${{ github.event.pull_request.number }} diff --git a/.github/workflows/metal.yml b/.github/workflows/metal.yml index 2ab1f1e0e22..de6507e035a 100644 
--- a/.github/workflows/metal.yml +++ b/.github/workflows/metal.yml @@ -25,6 +25,7 @@ jobs: name: test-executorch-metal-build uses: pytorch/test-infra/.github/workflows/macos_job.yml@main with: + default-packages: "" runner: macos-m2-stable python-version: '3.11' submodules: 'recursive' @@ -41,6 +42,7 @@ jobs: name: test-metal-backend-modules uses: pytorch/test-infra/.github/workflows/macos_job.yml@main with: + default-packages: "" runner: macos-m2-stable python-version: '3.11' submodules: 'recursive' @@ -65,6 +67,7 @@ jobs: name: test-metal-qwen35-moe-tiny uses: pytorch/test-infra/.github/workflows/macos_job.yml@main with: + default-packages: "" runner: macos-m2-stable python-version: '3.11' submodules: 'recursive' @@ -187,6 +190,7 @@ jobs: name: "Voxtral-Mini-4B-Realtime-2602" quant: "non-quantized" with: + default-packages: "" runner: macos-m2-stable python-version: '3.11' submodules: 'recursive' @@ -252,6 +256,7 @@ jobs: name: "Voxtral-Mini-4B-Realtime-2602" quant: "non-quantized" with: + default-packages: "" runner: macos-m2-stable python-version: '3.11' submodules: 'recursive' diff --git a/.github/workflows/mlx.yml b/.github/workflows/mlx.yml index a40198ea36f..cdc9cd8a3d0 100644 --- a/.github/workflows/mlx.yml +++ b/.github/workflows/mlx.yml @@ -28,6 +28,7 @@ jobs: test-mlx: uses: pytorch/test-infra/.github/workflows/macos_job.yml@main with: + default-packages: "" job-name: test-mlx runner: macos-14-xlarge python-version: "3.12" @@ -77,6 +78,7 @@ jobs: test-mlx-qwen35-moe: uses: pytorch/test-infra/.github/workflows/macos_job.yml@main with: + default-packages: "" job-name: test-mlx-qwen35-moe runner: macos-14-xlarge python-version: "3.12" @@ -132,6 +134,7 @@ jobs: suite: [models, operators] uses: pytorch/test-infra/.github/workflows/macos_job.yml@main with: + default-packages: "" job-name: test-mlx-backend-${{ matrix.suite }} runner: macos-14-xlarge python-version: "3.12" @@ -173,6 +176,7 @@ jobs: test-mlx-parakeet: uses: 
pytorch/test-infra/.github/workflows/macos_job.yml@main with: + default-packages: "" job-name: test-mlx-parakeet runner: macos-14-xlarge python-version: "3.12" @@ -231,6 +235,7 @@ jobs: uses: pytorch/test-infra/.github/workflows/macos_job.yml@main secrets: inherit with: + default-packages: "" job-name: test-mlx-voxtral runner: macos-14-xlarge python-version: "3.12" @@ -291,6 +296,7 @@ jobs: uses: pytorch/test-infra/.github/workflows/macos_job.yml@main secrets: inherit with: + default-packages: "" job-name: test-mlx-voxtral-realtime runner: macos-14-xlarge python-version: "3.12" @@ -300,6 +306,8 @@ jobs: timeout: 90 script: | set -eux + # Disable HF Xet storage to avoid stalled downloads on CI runners + export HF_HUB_DISABLE_XET=1 echo "::group::Install ExecuTorch" ${CONDA_RUN} python install_executorch.py > /dev/null @@ -366,6 +374,7 @@ jobs: uses: pytorch/test-infra/.github/workflows/macos_job.yml@main secrets: inherit with: + default-packages: "" job-name: test-mlx-whisper runner: macos-14-xlarge python-version: "3.12" @@ -375,6 +384,8 @@ jobs: timeout: 90 script: | set -eux + # Disable HF Xet storage to avoid stalled downloads on CI runners + export HF_HUB_DISABLE_XET=1 echo "::group::Install ExecuTorch and configure MLX build" ${CONDA_RUN} python install_executorch.py > /dev/null @@ -413,6 +424,7 @@ jobs: test-mlx-stories110m: uses: pytorch/test-infra/.github/workflows/macos_job.yml@main with: + default-packages: "" job-name: test-mlx-stories110m runner: macos-14-xlarge python-version: "3.12" @@ -492,6 +504,7 @@ jobs: uses: pytorch/test-infra/.github/workflows/macos_job.yml@main secrets: inherit with: + default-packages: "" job-name: test-mlx-llm-${{ matrix.model.name }}${{ matrix.use-custom && '-custom' || '' }}-${{ matrix.qconfig }} runner: macos-14-xlarge python-version: "3.12" @@ -501,6 +514,8 @@ jobs: timeout: 90 script: | set -eux + # Disable HF Xet storage to avoid stalled downloads on CI runners + export HF_HUB_DISABLE_XET=1 MODEL_ID="${{ 
matrix.model.id }}" MODEL_NAME="${{ matrix.model.name }}" diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index 6a4439b4254..97633965652 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -1521,6 +1521,7 @@ jobs: runner: [macos-m1-stable, macos-m2-stable] fail-fast: false with: + default-packages: "" runner: ${{ matrix.runner }} submodules: 'recursive' ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml index 68c2e68436e..670517f836b 100644 --- a/.github/workflows/trunk.yml +++ b/.github/workflows/trunk.yml @@ -40,6 +40,7 @@ jobs: backend: portable fail-fast: false with: + default-packages: "" runner: macos-m1-stable python-version: '3.11' submodules: 'recursive' @@ -275,6 +276,7 @@ jobs: - build-tool: cmake fail-fast: false with: + default-packages: "" runner: macos-m1-stable python-version: '3.11' submodules: 'recursive' @@ -297,6 +299,7 @@ jobs: - build-tool: cmake fail-fast: false with: + default-packages: "" runner: macos-m1-stable python-version: '3.11' submodules: 'recursive' @@ -457,6 +460,7 @@ jobs: name: test-coreml-delegate uses: pytorch/test-infra/.github/workflows/macos_job.yml@main with: + default-packages: "" runner: macos-14-xlarge python-version: '3.11' submodules: 'recursive' @@ -475,6 +479,7 @@ jobs: name: test-static-llama-ane uses: pytorch/test-infra/.github/workflows/macos_job.yml@main with: + default-packages: "" runner: macos-m1-stable python-version: '3.11' submodules: 'recursive' @@ -497,6 +502,7 @@ jobs: name: test-llama-torchao-lowbit uses: pytorch/test-infra/.github/workflows/macos_job.yml@main with: + default-packages: "" runner: macos-m1-stable python-version: '3.11' submodules: 'recursive' @@ -581,6 +587,7 @@ jobs: mode: [mps, coreml, xnnpack+custom+quantize_kv] fail-fast: false with: + default-packages: "" runner: macos-m1-stable python-version: '3.11' submodules: 
'recursive' @@ -667,6 +674,7 @@ jobs: matrix: model: ["gemma3-4b"] # llava gives segfault so not covering. with: + default-packages: "" secrets-env: EXECUTORCH_HF_TOKEN runner: macos-15-xlarge python-version: '3.11' @@ -754,6 +762,7 @@ jobs: model: [dl3, edsr, efficient_sam, emformer_join, emformer_transcribe, ic3, ic4, mobilebert, mv2, mv3, resnet50, vit, w2l] fail-fast: false with: + default-packages: "" runner: macos-m1-stable python-version: '3.11' submodules: 'recursive' @@ -793,6 +802,7 @@ jobs: strategy: fail-fast: false with: + default-packages: "" runner: macos-m1-stable python-version: '3.11' submodules: 'recursive' @@ -932,6 +942,7 @@ jobs: ] fail-fast: false with: + default-packages: "" secrets-env: EXECUTORCH_HF_TOKEN runner: macos-15-xlarge python-version: '3.11' diff --git a/.github/workflows/weekly-pytorch-pin-bump.yml b/.github/workflows/weekly-pytorch-pin-bump.yml index ba8f48505d5..30579c77701 100644 --- a/.github/workflows/weekly-pytorch-pin-bump.yml +++ b/.github/workflows/weekly-pytorch-pin-bump.yml @@ -22,46 +22,29 @@ jobs: with: python-version: '3.11' - - name: Check torch_pin channel - id: channel - run: | - CHANNEL=$(python -c "from torch_pin import CHANNEL; print(CHANNEL)") - echo "channel=${CHANNEL}" >> "$GITHUB_OUTPUT" - if [ "${CHANNEL}" != "nightly" ]; then - echo "torch_pin.py CHANNEL is '${CHANNEL}'; weekly nightly bump only runs when CHANNEL == 'nightly'." 
- fi - - name: Determine nightly version - if: steps.channel.outputs.channel == 'nightly' id: nightly run: | NIGHTLY_DATE=$(date -u -d 'yesterday' '+%Y%m%d') NIGHTLY_VERSION="dev${NIGHTLY_DATE}" echo "version=${NIGHTLY_VERSION}" >> "$GITHUB_OUTPUT" + - name: Read current TORCH_VERSION + id: torch + run: | + TORCH_VERSION=$(python -c "exec(open('torch_pin.py').read()); print(TORCH_VERSION)") + echo "version=${TORCH_VERSION}" >> "$GITHUB_OUTPUT" + - name: Update torch_pin.py with new NIGHTLY_VERSION - if: steps.channel.outputs.channel == 'nightly' - env: - NIGHTLY_VERSION: ${{ steps.nightly.outputs.version }} run: | - python - <<'PY' - import os, pathlib, re - p = pathlib.Path('torch_pin.py') - p.write_text(re.sub( - r'^NIGHTLY_VERSION\s*=\s*".*"$', - f'NIGHTLY_VERSION = "{os.environ["NIGHTLY_VERSION"]}"', - p.read_text(), - count=1, - flags=re.MULTILINE, - )) - PY + printf 'TORCH_VERSION = "%s"\nNIGHTLY_VERSION = "%s"\n' \ + "${{ steps.torch.outputs.version }}" \ + "${{ steps.nightly.outputs.version }}" > torch_pin.py - name: Run pin bump script - if: steps.channel.outputs.channel == 'nightly' run: python .github/scripts/update_pytorch_pin.py - name: Create branch and PR - if: steps.channel.outputs.channel == 'nightly' env: GH_TOKEN: ${{ secrets.UPDATEBOT_TOKEN }} run: | diff --git a/backends/aoti/slim/cuda/test/targets.bzl b/backends/aoti/slim/cuda/test/targets.bzl index bf38b599637..079f769a509 100644 --- a/backends/aoti/slim/cuda/test/targets.bzl +++ b/backends/aoti/slim/cuda/test/targets.bzl @@ -1,8 +1,8 @@ -load("@fbcode_macros//build_defs:cpp_unittest.bzl", "cpp_unittest") +load("@fbcode_macros//build_defs:gpu_cpp_unittest.bzl", "gpu_cpp_unittest") load("@fbcode_macros//build_defs/lib:re_test_utils.bzl", "re_test_utils") def cuda_slim_cpp_unittest(name): - cpp_unittest( + gpu_cpp_unittest( name = "test_" + name, srcs = [ "test_" + name + ".cpp", @@ -16,6 +16,7 @@ def cuda_slim_cpp_unittest(name): external_deps = [ ("cuda", None, "cuda-lazy"), ], + 
hip_compatible = False, keep_gpu_sections = True, remote_execution = re_test_utils.remote_execution( platform = "gpu-remote-execution", diff --git a/backends/aoti/slim/cuda/test/test_cuda_guard.cpp b/backends/aoti/slim/cuda/test/test_cuda_guard.cpp index c9938bf5cd8..70da3108aba 100644 --- a/backends/aoti/slim/cuda/test/test_cuda_guard.cpp +++ b/backends/aoti/slim/cuda/test/test_cuda_guard.cpp @@ -94,19 +94,22 @@ TEST_F(CUDAGuardTest, NegativeDeviceIndex) { EXPECT_FALSE(guard_result.ok()); } -TEST_F(CUDAGuardTest, CopyConstructorDeleted) { +// Compile-time type-trait checks. These do not need a CUDA device, so they +// live outside the CUDAGuardTest fixture (whose SetUp() calls GTEST_SKIP +// when no CUDA device is available). +TEST(CUDAGuardCompileTimeTest, CopyConstructorDeleted) { static_assert( !std::is_copy_constructible_v, "CUDAGuard should not be copy constructible"); } -TEST_F(CUDAGuardTest, CopyAssignmentDeleted) { +TEST(CUDAGuardCompileTimeTest, CopyAssignmentDeleted) { static_assert( !std::is_copy_assignable_v, "CUDAGuard should not be copy assignable"); } -TEST_F(CUDAGuardTest, MoveAssignmentDeleted) { +TEST(CUDAGuardCompileTimeTest, MoveAssignmentDeleted) { static_assert( !std::is_move_assignable_v, "CUDAGuard should not be move assignable"); diff --git a/backends/aoti/slim/cuda/test/test_cuda_stream_guard.cpp b/backends/aoti/slim/cuda/test/test_cuda_stream_guard.cpp index 613bc6ffe19..1f1acdac5db 100644 --- a/backends/aoti/slim/cuda/test/test_cuda_stream_guard.cpp +++ b/backends/aoti/slim/cuda/test/test_cuda_stream_guard.cpp @@ -234,19 +234,22 @@ TEST_F(CUDAStreamGuardTest, NegativeDeviceIndex) { EXPECT_FALSE(guard_result.ok()); } -TEST_F(CUDAStreamGuardTest, CopyConstructorDeleted) { +// Compile-time type-trait checks. These do not need a CUDA device, so they +// live outside the CUDAStreamGuardTest fixture (whose SetUp() calls +// GTEST_SKIP when no CUDA device is available). 
+TEST(CUDAStreamGuardCompileTimeTest, CopyConstructorDeleted) { static_assert( !std::is_copy_constructible_v, "CUDAStreamGuard should not be copy constructible"); } -TEST_F(CUDAStreamGuardTest, CopyAssignmentDeleted) { +TEST(CUDAStreamGuardCompileTimeTest, CopyAssignmentDeleted) { static_assert( !std::is_copy_assignable_v, "CUDAStreamGuard should not be copy assignable"); } -TEST_F(CUDAStreamGuardTest, MoveAssignmentDeleted) { +TEST(CUDAStreamGuardCompileTimeTest, MoveAssignmentDeleted) { static_assert( !std::is_move_assignable_v, "CUDAStreamGuard should not be move assignable"); diff --git a/backends/arm/CMakeLists.txt b/backends/arm/CMakeLists.txt index 12c46107104..0c8b241522c 100644 --- a/backends/arm/CMakeLists.txt +++ b/backends/arm/CMakeLists.txt @@ -54,8 +54,10 @@ if(EXECUTORCH_BUILD_ARM_BAREMETAL OR EXECUTORCH_BUILD_ARM_ETHOSU_LINUX) set(THIRD_PARTY_ROOT "${CMAKE_CURRENT_SOURCE_DIR}/third-party") - set(_arm_backend_sources backends/arm/runtime/EthosUBackend.cpp - backends/arm/runtime/VelaBinStream.cpp + set(_arm_backend_sources + backends/arm/runtime/EthosUBackend.cpp + backends/arm/runtime/EthosUBackend_IoMemcpy.cpp + backends/arm/runtime/VelaBinStream.cpp ) list(TRANSFORM _arm_backend_sources PREPEND "${EXECUTORCH_ROOT}/") diff --git a/backends/arm/runtime/EthosUBackend.cpp b/backends/arm/runtime/EthosUBackend.cpp index 2b17cf2c43d..4b78f9a7e28 100644 --- a/backends/arm/runtime/EthosUBackend.cpp +++ b/backends/arm/runtime/EthosUBackend.cpp @@ -26,6 +26,12 @@ #include #include +// Overridable memcpy used by the EthosU backend for input/output scratch +// shuffling. Default (weak) implementation in EthosUBackend_IoMemcpy.cpp does +// std::memcpy. Firmware targets can supply a strong override (e.g. routing +// through a DMA engine) to reduce CPU memcpy load on the host MCU. 
+extern "C" void arm_ethos_io_memcpy(void* dst, const void* src, size_t size); + using namespace std; using executorch::aten::ScalarType; @@ -237,8 +243,9 @@ class EthosUBackend final : public ::executorch::runtime::BackendInterface { if (both_char || both_int || both_short || both_bool) { EXECUTORCH_PROF_SCOPE( event_tracer, "+EthosUBackend::execute()handles.input.memcpy()"); - // Sizes match and elt size matches so memcpy - memcpy( + // Sizes match and elt size matches so memcpy. + // Routed through arm_ethos_io_memcpy so firmware can DMA-accelerate. + arm_ethos_io_memcpy( scratch_addr, tensor_in.mutable_data_ptr(), tensor_in.nbytes()); @@ -389,7 +396,8 @@ Error copy_with_layout_adjustment( } const char* src_bytes = src; for (size_t chunk_idx = 0; chunk_idx < chunk_count; ++chunk_idx) { - memcpy(dest, src_bytes, chunk_size); + // Routed through arm_ethos_io_memcpy so firmware can DMA-accelerate. + arm_ethos_io_memcpy(dest, src_bytes, chunk_size); src_bytes += vela_chunk_size; dest += chunk_size; } diff --git a/backends/arm/runtime/EthosUBackend_Cortex_M.cpp b/backends/arm/runtime/EthosUBackend_Cortex_M.cpp index 7962ef846df..96398762302 100644 --- a/backends/arm/runtime/EthosUBackend_Cortex_M.cpp +++ b/backends/arm/runtime/EthosUBackend_Cortex_M.cpp @@ -42,6 +42,12 @@ extern "C" __attribute__((weak)) struct ethosu_driver* ethosu_reserve_driver_ex( return ethosu_reserve_driver(); } +// Overridable memcpy used by the EthosU backend for output scratch +// shuffling. Default (weak) implementation in EthosUBackend_IoMemcpy.cpp does +// std::memcpy. Firmware targets can supply a strong override (e.g. routing +// through a DMA engine) to reduce CPU memcpy load on the host MCU. 
+extern "C" void arm_ethos_io_memcpy(void* dst, const void* src, size_t size); + namespace executorch { namespace backends { namespace arm { @@ -136,7 +142,8 @@ Error platform_execute( } io_bytes_total += tensor_bytes; } else { - memcpy( + // Routed through arm_ethos_io_memcpy so firmware can DMA-accelerate. + arm_ethos_io_memcpy( tensor_out.mutable_data_ptr(), static_cast(output_addr), tensor_bytes); diff --git a/backends/arm/runtime/EthosUBackend_IoMemcpy.cpp b/backends/arm/runtime/EthosUBackend_IoMemcpy.cpp new file mode 100644 index 00000000000..1ef5b747b81 --- /dev/null +++ b/backends/arm/runtime/EthosUBackend_IoMemcpy.cpp @@ -0,0 +1,19 @@ +/* + * Copyright 2026 Arm Limited and/or its affiliates. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include + +// Weak default for arm_ethos_io_memcpy. Firmware targets can provide a +// strong-symbol override (e.g. routing through DMA on Cortex-M55) without +// touching the upstream EthosUBackend code. Lives in its own translation +// unit so the compiler in the call-site TUs cannot inline this body and +// bypass the link-time override (same trick as bolt_arm_memcpy_external). 
+extern "C" __attribute__((weak)) void +arm_ethos_io_memcpy(void* dst, const void* src, size_t size) { + std::memcpy(dst, src, size); +} diff --git a/backends/arm/runtime/targets.bzl b/backends/arm/runtime/targets.bzl index 42df03fb58b..51c0bf93f55 100644 --- a/backends/arm/runtime/targets.bzl +++ b/backends/arm/runtime/targets.bzl @@ -15,6 +15,7 @@ def define_common_targets(): srcs = [ "EthosUBackend.cpp", "EthosUBackend_Cortex_M.cpp", + "EthosUBackend_IoMemcpy.cpp", ], headers = ["EthosUBackend_Internal.h"], compatible_with = ["ovr_config//cpu:arm32-embedded", "ovr_config//cpu:arm32-embedded-fpu"], diff --git a/backends/arm/scripts/fvp_utils.sh b/backends/arm/scripts/fvp_utils.sh index 978ec363848..73f67112efd 100644 --- a/backends/arm/scripts/fvp_utils.sh +++ b/backends/arm/scripts/fvp_utils.sh @@ -67,10 +67,9 @@ function install_fvp() { log_step "fvp" "Downloading FVP ${fvp}" url_variable=${fvp}_url fvp_url=${!url_variable} - curl --output "FVP_${fvp}.tgz" "${fvp_url}" md5_variable=${fvp}_md5_checksum fvp_md5_checksum=${!md5_variable} - verify_md5 ${fvp_md5_checksum} FVP_${fvp}.tgz || exit 1 + download_with_retry "fvp" "${fvp_url}" "FVP_${fvp}.tgz" "${fvp_md5_checksum}" || exit 1 fi log_step "fvp" "Installing FVP ${fvp}" diff --git a/backends/arm/scripts/toolchain_utils.sh b/backends/arm/scripts/toolchain_utils.sh index 3d5f12e556b..5b37bcee7b4 100644 --- a/backends/arm/scripts/toolchain_utils.sh +++ b/backends/arm/scripts/toolchain_utils.sh @@ -107,8 +107,7 @@ function setup_toolchain() { if [[ ! 
-e "${toolchain_archive}" ]]; then log_step "toolchain" "Downloading ${toolchain_dir} toolchain" - curl --output "${toolchain_archive}" -L "${toolchain_url}" - verify_md5 ${toolchain_md5_checksum} "${toolchain_archive}" || exit 1 + download_with_retry "toolchain" "${toolchain_url}" "${toolchain_archive}" "${toolchain_md5_checksum}" || exit 1 fi log_step "toolchain" "Installing ${toolchain_dir} toolchain" diff --git a/backends/arm/scripts/utils.sh b/backends/arm/scripts/utils.sh index 2253311a19f..a7f151140f2 100644 --- a/backends/arm/scripts/utils.sh +++ b/backends/arm/scripts/utils.sh @@ -47,7 +47,10 @@ function verify_md5() { # Arg 1: Expected checksum for file # Arg 2: Path to file # Exits with return code 1 if the number of arguments is incorrect. - # Exits with return code 2 if the calculated mf5 does not match the given. + # Returns 2 if the calculated md5 does not match the given. Returning + # rather than exiting lets callers like download_with_retry treat a bad + # checksum as a retryable failure (e.g. truncated download) instead of + # tearing down the whole script. [[ $# -ne 2 ]] \ && { echo "[${FUNCNAME[0]}] Invalid number of args, expecting 2, but got $#"; exit 1; } @@ -60,11 +63,50 @@ function verify_md5() { local file_checksum="$(md5sum $file | awk '{print $1}')" fi if [[ ${ref_checksum} != ${file_checksum} ]]; then - echo "Mismatched MD5 checksum for file: ${file}. Expecting ${ref_checksum} but got ${file_checksum}. Exiting." - exit 2 + echo "Mismatched MD5 checksum for file: ${file}. Expecting ${ref_checksum} but got ${file_checksum}." + return 2 fi } +function download_with_retry() { + # Download a URL to a path and validate its MD5, retrying on transport + # or checksum errors. developer.arm.com's CDN intermittently aborts the + # download mid-stream with HTTP/2 INTERNAL_ERROR (curl exit 92), and + # rare cases return a short error body that curl treats as success; + # both are caught here. 
--fail rejects HTTP errors, + # --retry-all-errors handles transport errors, and verify_md5 catches + # truncation / wrong-content via the published archive checksum. + + # Arg 1: log context (passed to log_step) + # Arg 2: URL to download + # Arg 3: Output path + # Arg 4: Expected MD5 checksum + + [[ $# -ne 4 ]] \ + && { echo "[${FUNCNAME[0]}] Invalid number of args, expecting 4, but got $#"; exit 1; } + local context="${1}" + local url="${2}" + local output="${3}" + local expected_md5="${4}" + + local max_attempts=5 + for attempt in $(seq 1 ${max_attempts}); do + rm -f "${output}" + if curl --fail --retry 3 --retry-delay 5 --retry-connrefused --retry-all-errors \ + -L --output "${output}" "${url}" \ + && verify_md5 "${expected_md5}" "${output}"; then + return 0 + fi + ls -l "${output}" 2>&1 || true + if [[ "${attempt}" = "${max_attempts}" ]]; then + log_step "${context}" "ERROR: download of ${url} failed after ${attempt} attempts" + return 1 + fi + log_step "${context}" "download attempt ${attempt} failed; retrying in $((attempt * 10))s..." + sleep $((attempt * 10)) + done +} + function patch_repo() { # Patch git repo found in $repo_dir, starting from patch $base_rev and applying patches found in $patch_dir/$name. 
diff --git a/backends/arm/test/ops/test_addmm.py b/backends/arm/test/ops/test_addmm.py index 799b770e863..da44992ee28 100644 --- a/backends/arm/test/ops/test_addmm.py +++ b/backends/arm/test/ops/test_addmm.py @@ -28,73 +28,91 @@ test_data_suite = { - "basic": [ + "basic": lambda: [ torch.tensor([[1.0, 2.0], [3.0, 4.0]]), torch.tensor([[1.0, 0.0], [0.0, 1.0]]), torch.tensor([[1.0, 2.0], [3.0, 4.0]]), 1.0, 1.0, ], - "zeros": [torch.zeros(2, 2), torch.zeros(2, 3), torch.zeros(3, 2), 1.0, 1.0], - "beta_only": [ + "zeros": lambda: [ + torch.zeros(2, 2), + torch.zeros(2, 3), + torch.zeros(3, 2), + 1.0, + 1.0, + ], + "beta_only": lambda: [ torch.tensor([[10.0, 20.0], [30.0, 40.0]]), torch.randn(2, 3), torch.randn(3, 2), 0.0, 1.0, ], - "alpha_only": [ + "alpha_only": lambda: [ torch.tensor([[10.0, 20.0], [30.0, 40.0]]), torch.randn(2, 3), torch.randn(3, 2), 1.0, 0.0, ], - "scaled": [ + "scaled": lambda: [ torch.ones(2, 2), torch.tensor([[1.0, 2.0], [3.0, 4.0]]), torch.tensor([[5.0, 6.0], [7.0, 8.0]]), 0.5, 2.0, ], - "negative_scalars": [ + "negative_scalars": lambda: [ torch.tensor([[1.0, -1.0], [-1.0, 1.0]]), torch.tensor([[2.0, 0.0], [0.0, 2.0]]), torch.tensor([[1.0, 1.0], [1.0, 1.0]]), -1.0, -1.0, ], - "non_square": [torch.ones(3, 4), torch.rand(3, 2), torch.rand(2, 4), 1.0, 1.0], - "large_values": [ + "non_square": lambda: [ + torch.ones(3, 4), + torch.rand(3, 2), + torch.rand(2, 4), + 1.0, + 1.0, + ], + "large_values": lambda: [ torch.full((2, 2), 1e6), torch.full((2, 3), 1e3), torch.full((3, 2), 1e3), 1.0, 1.0, ], - "small_values": [ + "small_values": lambda: [ torch.full((2, 2), 1e-6), torch.full((2, 3), 1e-3), torch.full((3, 2), 1e-3), 1.0, 1.0, ], - "random": [torch.randn(4, 5), torch.randn(4, 3), torch.randn(3, 5), 1.0, 1.0], - "broadcast_bias_row": [ + "random": lambda: [ + torch.randn(4, 5), + torch.randn(4, 3), + torch.randn(3, 5), + 1.0, + 1.0, + ], + "broadcast_bias_row": lambda: [ torch.randn(1, 2), torch.randn(3, 4), torch.randn(4, 2), 1.0, 1.0, ], - 
"row_bias": [ + "row_bias": lambda: [ torch.randn(3, 1), torch.randn(3, 4), torch.randn(4, 4), 1.0, 1.0, ], - "scalar_bias": [ + "scalar_bias": lambda: [ torch.tensor(2.0), torch.randn(5, 3), torch.randn(3, 6), @@ -120,7 +138,7 @@ def forward( def test_addmm_tosa_FP(test_data: Tuple): pipeline = TosaPipelineFP[input_t1]( Addmm(), - (*test_data,), + (*test_data(),), aten_op=aten_op, exir_op=exir_op, ) @@ -131,7 +149,7 @@ def test_addmm_tosa_FP(test_data: Tuple): def test_addmm_tosa_INT(test_data: Tuple): pipeline = TosaPipelineINT[input_t1]( Addmm(), - (*test_data,), + (*test_data(),), aten_op=[], exir_op=exir_op, ) @@ -143,7 +161,7 @@ def test_addmm_tosa_INT(test_data: Tuple): def test_addmm_u55_INT(test_data: Tuple): pipeline = EthosU55PipelineINT[input_t1]( Addmm(), - (*test_data,), + (*test_data(),), aten_ops=[], exir_ops=exir_op, ) @@ -155,7 +173,7 @@ def test_addmm_u55_INT(test_data: Tuple): def test_addmm_u85_INT(test_data: Tuple): pipeline = EthosU85PipelineINT[input_t1]( Addmm(), - (*test_data,), + (*test_data(),), aten_ops=[], exir_ops=exir_op, ) @@ -167,7 +185,7 @@ def test_addmm_u85_INT(test_data: Tuple): def test_addmm_vgf_no_quant(test_data: input_t1): pipeline = VgfPipeline[input_t1]( Addmm(), - (*test_data,), + (*test_data(),), aten_op=aten_op, exir_op=exir_op, quantize=False, @@ -180,7 +198,7 @@ def test_addmm_vgf_no_quant(test_data: input_t1): def test_addmm_vgf_quant(test_data: input_t1): pipeline = VgfPipeline[input_t1]( Addmm(), - (*test_data,), + (*test_data(),), aten_op=[], exir_op=exir_op, quantize=True, @@ -197,7 +215,7 @@ def test_addmm_16a8w_tosa_INT(test_data: input_t1): pipeline = TosaPipelineINT[input_t1]( Addmm(), - (*test_data,), + (*test_data(),), aten_op=[], exir_op=[], per_channel_quantization=per_channel_quantization, @@ -223,7 +241,7 @@ def test_addmm_16a8w_u55_INT(test_data: input_t1): pipeline = EthosU55PipelineINT[input_t1]( Addmm(), - (*test_data,), + (*test_data(),), aten_ops=[], exir_ops=[], 
per_channel_quantization=per_channel_quantization, @@ -245,7 +263,7 @@ def test_addmm_16a8w_u85_INT(test_data: input_t1): pipeline = EthosU85PipelineINT[input_t1]( Addmm(), - (*test_data,), + (*test_data(),), aten_ops=[], exir_ops=[], per_channel_quantization=per_channel_quantization, diff --git a/backends/arm/test/ops/test_atan.py b/backends/arm/test/ops/test_atan.py index 4e103dcaa82..5ceae6fa189 100644 --- a/backends/arm/test/ops/test_atan.py +++ b/backends/arm/test/ops/test_atan.py @@ -1,4 +1,4 @@ -# Copyright 2025 Arm Limited and/or its affiliates. +# Copyright 2025-2026 Arm Limited and/or its affiliates. # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. @@ -22,14 +22,14 @@ input_t1 = Tuple[torch.Tensor] test_data_suite = { - "zeros": torch.zeros(1, 10, 10, 10), - "zeros_alt_shape": torch.zeros(1, 10, 3, 5), - "ones": torch.ones(10, 10, 10), - "rand": torch.rand(10, 10) - 0.5, - "rand_alt_shape": torch.rand(1, 10, 3, 5) - 0.5, - "randn_pos": torch.randn(10) + 10, - "randn_neg": torch.randn(10) - 10, - "ramp": torch.arange(-16, 16, 0.2), + "zeros": lambda: torch.zeros(1, 10, 10, 10), + "zeros_alt_shape": lambda: torch.zeros(1, 10, 3, 5), + "ones": lambda: torch.ones(10, 10, 10), + "rand": lambda: torch.rand(10, 10) - 0.5, + "rand_alt_shape": lambda: torch.rand(1, 10, 3, 5) - 0.5, + "randn_pos": lambda: torch.randn(10) + 10, + "randn_neg": lambda: torch.randn(10) - 10, + "ramp": lambda: torch.arange(-16, 16, 0.2), } @@ -43,7 +43,7 @@ def forward(self, x: torch.Tensor): def test_atan_tosa_FP(test_data: Tuple): pipeline = TosaPipelineFP[input_t1]( Atan(), - (test_data,), + (test_data(),), aten_op=aten_op, exir_op=exir_op, ) @@ -54,7 +54,7 @@ def test_atan_tosa_FP(test_data: Tuple): def test_atan_tosa_INT(test_data: Tuple): pipeline = TosaPipelineINT[input_t1]( Atan(), - (test_data,), + (test_data(),), aten_op=aten_op, exir_op=exir_op, ) @@ -66,7 +66,7 @@ def 
test_atan_tosa_INT(test_data: Tuple): def test_atan_u55_INT(test_data: Tuple): pipeline = EthosU55PipelineINT[input_t1]( Atan(), - (test_data,), + (test_data(),), aten_ops=aten_op, exir_ops=exir_op, ) @@ -78,7 +78,7 @@ def test_atan_u55_INT(test_data: Tuple): def test_atan_u85_INT(test_data: Tuple): pipeline = EthosU85PipelineINT[input_t1]( Atan(), - (test_data,), + (test_data(),), aten_ops=aten_op, exir_ops=exir_op, ) @@ -90,7 +90,7 @@ def test_atan_u85_INT(test_data: Tuple): def test_atan_vgf_no_quant(test_data: Tuple): pipeline = VgfPipeline[input_t1]( Atan(), - (test_data,), + (test_data(),), aten_op, exir_op, quantize=False, @@ -103,7 +103,7 @@ def test_atan_vgf_no_quant(test_data: Tuple): def test_atan_vgf_quant(test_data: Tuple): pipeline = VgfPipeline[input_t1]( Atan(), - (test_data,), + (test_data(),), aten_op, exir_op, quantize=True, diff --git a/backends/arm/test/ops/test_atanh.py b/backends/arm/test/ops/test_atanh.py index 2eae5fcade2..2f621d8a02c 100644 --- a/backends/arm/test/ops/test_atanh.py +++ b/backends/arm/test/ops/test_atanh.py @@ -24,13 +24,13 @@ test_data_suite = { - "zeros": torch.zeros(1, 10, 10, 10), - "zeros_alt_shape": torch.zeros(1, 10, 3, 5), - "rand": torch.rand(10, 10) - 0.5, - "rand_alt_shape": torch.rand(1, 10, 3, 5) - 0.5, - "ramp": torch.arange(-1, 1, 0.2), - "near_bounds": torch.tensor([-0.99, -0.9, 0.9, 0.99]), - "on_bounds": torch.tensor([-1.0, 1.0]), + "zeros": lambda: torch.zeros(1, 10, 10, 10), + "zeros_alt_shape": lambda: torch.zeros(1, 10, 3, 5), + "rand": lambda: torch.rand(10, 10) - 0.5, + "rand_alt_shape": lambda: torch.rand(1, 10, 3, 5) - 0.5, + "ramp": lambda: torch.arange(-1, 1, 0.2), + "near_bounds": lambda: torch.tensor([-0.99, -0.9, 0.9, 0.99]), + "on_bounds": lambda: torch.tensor([-1.0, 1.0]), } @@ -43,7 +43,7 @@ def forward(self, x: torch.Tensor): def test_atanh_tosa_FP(test_data: Tuple): pipeline = TosaPipelineFP[input_t1]( Atanh(), - (test_data,), + (test_data(),), aten_op=aten_op, exir_op=exir_op, ) @@ 
-52,13 +52,14 @@ def test_atanh_tosa_FP(test_data: Tuple): @common.parametrize("test_data", test_data_suite) def test_atanh_tosa_INT(test_data: Tuple): + input_data = test_data() pipeline = TosaPipelineINT[input_t1]( Atanh(), - (test_data,), + (input_data,), aten_op=aten_op, exir_op=exir_op, ) - if torch.any(test_data >= 1) or torch.any(test_data <= -1): + if torch.any(input_data >= 1) or torch.any(input_data <= -1): # The quantized model will saturate to max/min values while the # original model will return inf/-inf, so comparison wont be valid here. pipeline.pop_stage("run_method_and_compare_outputs.original_model") @@ -70,7 +71,7 @@ def test_atanh_tosa_INT(test_data: Tuple): def test_atanh_u55_INT(test_data: Tuple): pipeline = EthosU55PipelineINT[input_t1]( Atanh(), - (test_data,), + (test_data(),), aten_ops=aten_op, exir_ops=exir_op, ) @@ -82,7 +83,7 @@ def test_atanh_u55_INT(test_data: Tuple): def test_atanh_u85_INT(test_data: Tuple): pipeline = EthosU85PipelineINT[input_t1]( Atanh(), - (test_data,), + (test_data(),), aten_ops=aten_op, exir_ops=exir_op, ) @@ -94,7 +95,7 @@ def test_atanh_u85_INT(test_data: Tuple): def test_atanh_vgf_no_quant(test_data: input_t1): pipeline = VgfPipeline[input_t1]( Atanh(), - (test_data,), + (test_data(),), aten_op=aten_op, exir_op=exir_op, quantize=False, @@ -107,7 +108,7 @@ def test_atanh_vgf_no_quant(test_data: input_t1): def test_atanh_vgf_quant(test_data: input_t1): pipeline = VgfPipeline[input_t1]( Atanh(), - (test_data,), + (test_data(),), aten_op=aten_op, exir_op=exir_op, quantize=True, diff --git a/backends/arm/test/ops/test_bitwise_not.py b/backends/arm/test/ops/test_bitwise_not.py index a4304476183..4f390fcca15 100644 --- a/backends/arm/test/ops/test_bitwise_not.py +++ b/backends/arm/test/ops/test_bitwise_not.py @@ -22,20 +22,20 @@ input_t1 = Tuple[torch.Tensor] test_data_suite_non_bool = { - "zeros": torch.zeros(1, 10, 10, 10, dtype=torch.int32), - "ones": torch.ones(10, 2, 3, dtype=torch.int8), - "pattern1_int8": 
0xAA * torch.ones(1, 2, 2, 2, dtype=torch.int8), - "pattern1_int16": 0xAAAA * torch.ones(1, 2, 2, 2, dtype=torch.int16), - "pattern1_int32": 0xAAAAAAAA * torch.ones(1, 2, 2, 2, dtype=torch.int32), - "pattern2_int8": 0xCC * torch.ones(1, 2, 2, 2, dtype=torch.int8), - "pattern2_int16": 0xCCCC * torch.ones(1, 2, 2, 2, dtype=torch.int16), - "pattern2_int32": 0xCCCCCCCC * torch.ones(1, 2, 2, 2, dtype=torch.int32), - "rand_rank2": torch.randint(-128, 127, (10, 10), dtype=torch.int8), - "rand_rank4": torch.randint(-128, 127, (1, 10, 10, 10), dtype=torch.int8), + "zeros": lambda: torch.zeros(1, 10, 10, 10, dtype=torch.int32), + "ones": lambda: torch.ones(10, 2, 3, dtype=torch.int8), + "pattern1_int8": lambda: 0xAA * torch.ones(1, 2, 2, 2, dtype=torch.int8), + "pattern1_int16": lambda: 0xAAAA * torch.ones(1, 2, 2, 2, dtype=torch.int16), + "pattern1_int32": lambda: 0xAAAAAAAA * torch.ones(1, 2, 2, 2, dtype=torch.int32), + "pattern2_int8": lambda: 0xCC * torch.ones(1, 2, 2, 2, dtype=torch.int8), + "pattern2_int16": lambda: 0xCCCC * torch.ones(1, 2, 2, 2, dtype=torch.int16), + "pattern2_int32": lambda: 0xCCCCCCCC * torch.ones(1, 2, 2, 2, dtype=torch.int32), + "rand_rank2": lambda: torch.randint(-128, 127, (10, 10), dtype=torch.int8), + "rand_rank4": lambda: torch.randint(-128, 127, (1, 10, 10, 10), dtype=torch.int8), } test_data_suite_bool = { - "pattern_bool": torch.tensor([True, False, True], dtype=torch.bool), + "pattern_bool": lambda: torch.tensor([True, False, True], dtype=torch.bool), } test_data_suite = {**test_data_suite_non_bool, **test_data_suite_bool} @@ -52,7 +52,7 @@ def test_bitwise_not_tosa_FP(test_data: Tuple): # We don't delegate bitwise_not since it is not supported on the FP profile. 
pipeline = OpNotSupportedPipeline[input_t1]( BitwiseNot(), - (test_data,), + (test_data(),), {exir_op: 1}, quantize=False, ) @@ -63,7 +63,7 @@ def test_bitwise_not_tosa_FP(test_data: Tuple): def test_bitwise_not_tosa_FP_bool(test_data: Tuple): pipeline = TosaPipelineFP[input_t1]( BitwiseNot(), - (test_data,), + (test_data(),), aten_op, "executorch_exir_dialects_edge__ops_aten_logical_not_default", atol=0, @@ -77,7 +77,7 @@ def test_bitwise_not_tosa_FP_bool(test_data: Tuple): def test_bitwise_not_tosa_INT(test_data: Tuple): pipeline = TosaPipelineINT[input_t1]( BitwiseNot(), - (test_data,), + (test_data(),), aten_op=aten_op, exir_op=exir_op, ) @@ -89,7 +89,7 @@ def test_bitwise_not_u55_INT(test_data: Tuple): # We don't delegate bitwise_not since it is not supported on U55. pipeline = OpNotSupportedPipeline[input_t1]( BitwiseNot(), - (test_data,), + (test_data(),), {exir_op: 1}, quantize=True, u55_subset=True, @@ -102,7 +102,7 @@ def test_bitwise_not_u55_INT(test_data: Tuple): def test_bitwise_not_u85_INT(test_data: Tuple): pipeline = EthosU85PipelineINT[input_t1]( BitwiseNot(), - (test_data,), + (test_data(),), aten_ops=aten_op, exir_ops=exir_op, ) @@ -114,7 +114,7 @@ def test_bitwise_not_u85_INT(test_data: Tuple): def test_bitwise_not_vgf_no_quant(test_data: Tuple): pipeline = VgfPipeline[input_t1]( BitwiseNot(), - (test_data,), + (test_data(),), aten_op, exir_op, quantize=False, @@ -127,7 +127,7 @@ def test_bitwise_not_vgf_no_quant(test_data: Tuple): def test_bitwise_not_vgf_quant(test_data: Tuple): pipeline = VgfPipeline[input_t1]( BitwiseNot(), - (test_data,), + (test_data(),), aten_op, exir_op, quantize=True, diff --git a/backends/arm/test/ops/test_conv_constant_pad_nd.py b/backends/arm/test/ops/test_conv_constant_pad_nd.py index d26a1f2d90d..b732ffaf972 100644 --- a/backends/arm/test/ops/test_conv_constant_pad_nd.py +++ b/backends/arm/test/ops/test_conv_constant_pad_nd.py @@ -25,15 +25,31 @@ input_t1 = Tuple[torch.Tensor] # Input x test_data_suite = { - 
"4dim_last1dim": (torch.rand(1, 1, 16, 16), (1, 1, 0, 0, 0, 0, 0, 0), 1), - "4dim_last2dim": (torch.rand(1, 1, 16, 16), (1, 0, 1, 0, 0, 0, 0, 0), 2), - "4dim_last3dim": (torch.rand(1, 1, 16, 16), (1, 1, 0, 2, 0, 2, 0, 0), 3), - "4dim_last4dim": (torch.rand(1, 1, 16, 16), (1, 0, 1, 1, 0, 2, 0, 2), 4), - "3dim_last1dim": (torch.rand(1, 1, 16), (1, 1, 0, 0, 0, 0), 1), - "3dim_last2dim": (torch.rand(1, 1, 16), (1, 0, 1, 1, 0, 0), 2), - "3dim_last3dim": (torch.rand(1, 1, 16), (1, 0, 1, 0, 1, 1), 3), - "2dim_last1dim": (torch.rand(1, 1, 16), (1, 1, 0, 0), 1), - "2dim_last2dim": (torch.rand(1, 1, 16), (1, 0, 1, 1), 2), + "4dim_last1dim": lambda: ( + torch.rand(1, 1, 16, 16), + (1, 1, 0, 0, 0, 0, 0, 0), + 1, + ), + "4dim_last2dim": lambda: ( + torch.rand(1, 1, 16, 16), + (1, 0, 1, 0, 0, 0, 0, 0), + 2, + ), + "4dim_last3dim": lambda: ( + torch.rand(1, 1, 16, 16), + (1, 1, 0, 2, 0, 2, 0, 0), + 3, + ), + "4dim_last4dim": lambda: ( + torch.rand(1, 1, 16, 16), + (1, 0, 1, 1, 0, 2, 0, 2), + 4, + ), + "3dim_last1dim": lambda: (torch.rand(1, 1, 16), (1, 1, 0, 0, 0, 0), 1), + "3dim_last2dim": lambda: (torch.rand(1, 1, 16), (1, 0, 1, 1, 0, 0), 2), + "3dim_last3dim": lambda: (torch.rand(1, 1, 16), (1, 0, 1, 0, 1, 1), 3), + "2dim_last1dim": lambda: (torch.rand(1, 1, 16), (1, 1, 0, 0), 1), + "2dim_last2dim": lambda: (torch.rand(1, 1, 16), (1, 0, 1, 1), 2), } """Tests conv + pad.""" @@ -91,7 +107,7 @@ def forward(self, x: torch.Tensor): @common.parametrize("test_data", test_data_suite) def test_constant_pad_nd_tosa_FP(test_data: Tuple): - test_data, padding, value = test_data + test_data, padding, value = test_data() pipeline = TosaPipelineFP[input_t1]( ConstantPadND(padding, value), (test_data,), @@ -103,7 +119,7 @@ def test_constant_pad_nd_tosa_FP(test_data: Tuple): @common.parametrize("test_data", test_data_suite) def test_constant_pad_nd_tosa_INT(test_data: Tuple): - test_data, padding, value = test_data + test_data, padding, value = test_data() pipeline = TosaPipelineINT[input_t1]( 
ConstantPadND(padding, value), (test_data,), @@ -118,7 +134,7 @@ def test_constant_pad_nd_tosa_INT(test_data: Tuple): @common.parametrize("test_data", test_data_suite) @common.SkipIfNoModelConverter def test_constant_pad_nd_vgf_no_quant(test_data: Tuple): - test_data, padding, value = test_data + test_data, padding, value = test_data() pipeline = VgfPipeline[input_t1]( ConstantPadND(padding, value), (test_data,), @@ -132,7 +148,7 @@ def test_constant_pad_nd_vgf_no_quant(test_data: Tuple): @common.parametrize("test_data", test_data_suite) @common.SkipIfNoModelConverter def test_constant_pad_nd_vgf_quant(test_data: Tuple): - test_data, padding, value = test_data + test_data, padding, value = test_data() pipeline = VgfPipeline[input_t1]( ConstantPadND(padding, value), (test_data,), diff --git a/backends/arm/test/ops/test_cos.py b/backends/arm/test/ops/test_cos.py index da5d1470028..e020c3de971 100644 --- a/backends/arm/test/ops/test_cos.py +++ b/backends/arm/test/ops/test_cos.py @@ -23,21 +23,21 @@ test_data_suite = { # (test_name, test_data) - "zeros": torch.zeros(10, 10, 10, 10), - "ones": torch.ones(10, 10, 10), - "rand": torch.rand(10, 10) - 0.5, - "randn_pos": torch.randn(10) + 10, - "randn_neg": torch.randn(10) - 10, - "ramp": torch.arange(-16, 16, 0.2), + "zeros": lambda: torch.zeros(10, 10, 10, 10), + "ones": lambda: torch.ones(10, 10, 10), + "rand": lambda: torch.rand(10, 10) - 0.5, + "randn_pos": lambda: torch.randn(10) + 10, + "randn_neg": lambda: torch.randn(10) - 10, + "ramp": lambda: torch.arange(-16, 16, 0.2), } test_data_suite_bf16 = { - "rand_bf16": torch.rand(4, 4, dtype=torch.bfloat16) - 0.5, - "ramp_bf16": torch.arange(-8, 8, 0.5, dtype=torch.bfloat16), + "rand_bf16": lambda: torch.rand(4, 4, dtype=torch.bfloat16) - 0.5, + "ramp_bf16": lambda: torch.arange(-8, 8, 0.5, dtype=torch.bfloat16), } test_data_suite_fp16 = { - "rand_fp16": torch.rand(4, 4, dtype=torch.float16) - 0.5, - "ramp_fp16": torch.arange(-8, 8, 0.5, dtype=torch.float16), + 
"rand_fp16": lambda: torch.rand(4, 4, dtype=torch.float16) - 0.5, + "ramp_fp16": lambda: torch.arange(-8, 8, 0.5, dtype=torch.float16), } @@ -54,7 +54,7 @@ def forward(self, x: torch.Tensor): def test_cos_tosa_FP(test_data: Tuple): pipeline = TosaPipelineFP[input_t1]( Cos(), - (test_data,), + (test_data(),), aten_op, exir_op=[], tosa_extensions=["bf16"], @@ -67,7 +67,7 @@ def test_cos_tosa_FP(test_data: Tuple): def test_cos_tosa_INT(test_data: Tuple): pipeline = TosaPipelineINT[input_t1]( Cos(), - (test_data,), + (test_data(),), aten_op, exir_op=[], ) @@ -79,7 +79,7 @@ def test_cos_tosa_INT(test_data: Tuple): def test_cos_u55_INT(test_data: Tuple): pipeline = EthosU55PipelineINT[input_t1]( Cos(), - (test_data,), + (test_data(),), aten_op, exir_ops=[], ) @@ -91,7 +91,7 @@ def test_cos_u55_INT(test_data: Tuple): def test_cos_u85_INT(test_data: Tuple): pipeline = EthosU85PipelineINT[input_t1]( Cos(), - (test_data,), + (test_data(),), aten_op, exir_ops=[], ) @@ -103,7 +103,7 @@ def test_cos_u85_INT(test_data: Tuple): def test_cos_vgf_no_quant(test_data: Tuple): pipeline = VgfPipeline[input_t1]( Cos(), - (test_data,), + (test_data(),), aten_op, exir_op=[], quantize=False, @@ -116,7 +116,7 @@ def test_cos_vgf_no_quant(test_data: Tuple): def test_cos_vgf_quant(test_data: Tuple): pipeline = VgfPipeline[input_t1]( Cos(), - (test_data,), + (test_data(),), aten_op, exir_op=[], quantize=True, diff --git a/backends/arm/test/ops/test_cosh.py b/backends/arm/test/ops/test_cosh.py index f07a87d5e2c..cc319b4087f 100644 --- a/backends/arm/test/ops/test_cosh.py +++ b/backends/arm/test/ops/test_cosh.py @@ -1,4 +1,4 @@ -# Copyright 2025 Arm Limited and/or its affiliates. +# Copyright 2025-2026 Arm Limited and/or its affiliates. # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
@@ -21,21 +21,21 @@ test_data_suite = { # (test_name, test_data) - "zeros": torch.zeros(10, 10, 10), - "zeros_4D": torch.zeros(1, 10, 32, 7), - "zeros_alt_shape": torch.zeros(10, 3, 5), - "ones": torch.ones(15, 10, 7), - "ones_4D": torch.ones(1, 3, 32, 16), - "rand": torch.rand(10, 10) - 0.5, - "rand_alt_shape": torch.rand(10, 3, 5) - 0.5, - "rand_4D": torch.rand(1, 6, 5, 7) - 0.5, - "randn_pos": torch.randn(10) + 3, - "randn_neg": torch.randn(10) - 3, - "ramp": torch.arange(-16, 16, 0.2), - "large": 100 * torch.ones(1, 1), - "small": 0.000001 * torch.ones(1, 1), - "small_rand": torch.rand(100) * 0.01, - "biggest": torch.tensor([700.0, 710.0, 750.0]), + "zeros": lambda: torch.zeros(10, 10, 10), + "zeros_4D": lambda: torch.zeros(1, 10, 32, 7), + "zeros_alt_shape": lambda: torch.zeros(10, 3, 5), + "ones": lambda: torch.ones(15, 10, 7), + "ones_4D": lambda: torch.ones(1, 3, 32, 16), + "rand": lambda: torch.rand(10, 10) - 0.5, + "rand_alt_shape": lambda: torch.rand(10, 3, 5) - 0.5, + "rand_4D": lambda: torch.rand(1, 6, 5, 7) - 0.5, + "randn_pos": lambda: torch.randn(10) + 3, + "randn_neg": lambda: torch.randn(10) - 3, + "ramp": lambda: torch.arange(-16, 16, 0.2), + "large": lambda: 100 * torch.ones(1, 1), + "small": lambda: 0.000001 * torch.ones(1, 1), + "small_rand": lambda: torch.rand(100) * 0.01, + "biggest": lambda: torch.tensor([700.0, 710.0, 750.0]), } @@ -48,7 +48,7 @@ def forward(self, x: torch.Tensor): def test_cosh_tosa_FP(test_data: Tuple): pipeline = TosaPipelineFP[input_t1]( Cosh(), - (test_data,), + (test_data(),), aten_op, exir_op, ) @@ -58,7 +58,7 @@ def test_cosh_tosa_FP(test_data: Tuple): @common.parametrize("test_data", test_data_suite) def test_cosh_tosa_INT(test_data: Tuple): pipeline = TosaPipelineINT[input_t1]( - Cosh(), (test_data,), aten_op=aten_op, exir_op=exir_op + Cosh(), (test_data(),), aten_op=aten_op, exir_op=exir_op ) pipeline.run() @@ -67,7 +67,7 @@ def test_cosh_tosa_INT(test_data: Tuple): @common.parametrize("test_data", 
test_data_suite) def test_cosh_u55_INT(test_data: Tuple): pipeline = EthosU55PipelineINT[input_t1]( - Cosh(), (test_data,), aten_ops=aten_op, exir_ops=exir_op + Cosh(), (test_data(),), aten_ops=aten_op, exir_ops=exir_op ) pipeline.run() @@ -80,7 +80,7 @@ def test_cosh_u55_INT(test_data: Tuple): ) def test_cosh_u85_INT(test_data: Tuple): pipeline = EthosU85PipelineINT[input_t1]( - Cosh(), (test_data,), aten_ops=aten_op, exir_ops=exir_op + Cosh(), (test_data(),), aten_ops=aten_op, exir_ops=exir_op ) pipeline.run() @@ -90,7 +90,7 @@ def test_cosh_u85_INT(test_data: Tuple): def test_cosh_vgf_no_quant(test_data: Tuple): pipeline = VgfPipeline[input_t1]( Cosh(), - (test_data,), + (test_data(),), [], [], quantize=False, @@ -103,7 +103,7 @@ def test_cosh_vgf_no_quant(test_data: Tuple): def test_cosh_vgf_quant(test_data: Tuple): pipeline = VgfPipeline[input_t1]( Cosh(), - (test_data,), + (test_data(),), [], [], quantize=True, diff --git a/backends/arm/test/ops/test_detach_copy.py b/backends/arm/test/ops/test_detach_copy.py index c8715ca847a..9fba6c44ca6 100644 --- a/backends/arm/test/ops/test_detach_copy.py +++ b/backends/arm/test/ops/test_detach_copy.py @@ -19,10 +19,10 @@ exir_op = "executorch_exir_dialects_edge__ops_aten__detach_copy_default" test_data_suite = { - "zeros_2d": torch.zeros(3, 5), - "ones_3d": torch.ones(2, 3, 4), - "rand_2d": torch.rand(10, 10) - 0.5, - "ramp_1d": torch.arange(-8.0, 8.0, 0.5), + "zeros_2d": lambda: torch.zeros(3, 5), + "ones_3d": lambda: torch.ones(2, 3, 4), + "rand_2d": lambda: torch.rand(10, 10) - 0.5, + "ramp_1d": lambda: torch.arange(-8.0, 8.0, 0.5), } @@ -38,7 +38,7 @@ def forward(self, x: torch.Tensor): def test_detach_tosa_FP(test_data: torch.Tensor): pipeline = TosaPipelineFP[input_t1]( DetachCopy(), - (test_data,), + (test_data(),), aten_op=DetachCopy.aten_op, exir_op=DetachCopy.exir_op, ) @@ -49,7 +49,7 @@ def test_detach_tosa_FP(test_data: torch.Tensor): def test_detach_tosa_INT(test_data: torch.Tensor): pipeline = 
TosaPipelineINT[input_t1]( DetachCopy(), - (test_data,), + (test_data(),), aten_op=DetachCopy.aten_op, exir_op=DetachCopy.exir_op, ) diff --git a/backends/arm/test/ops/test_erfinv.py b/backends/arm/test/ops/test_erfinv.py index 204a4c50455..efb6efb0028 100644 --- a/backends/arm/test/ops/test_erfinv.py +++ b/backends/arm/test/ops/test_erfinv.py @@ -22,26 +22,26 @@ input_t1 = Tuple[torch.Tensor] test_data_suite = { - "zeros": torch.zeros(1, 10, 10, 10), - "small": torch.randn(100) * 0.01, - "mid": torch.rand(10, 10) * 1.8 - 0.9, - "near_pos_bound": torch.full((32,), 0.99), - "near_neg_bound": torch.full((32,), -0.99), - "pos_one": torch.full((32,), 1.0), - "neg_one": torch.full((32,), -1.0), - "ramp": torch.arange(-0.99, 0.99, 0.02), + "zeros": lambda: torch.zeros(1, 10, 10, 10), + "small": lambda: torch.randn(100) * 0.01, + "mid": lambda: torch.rand(10, 10) * 1.8 - 0.9, + "near_pos_bound": lambda: torch.full((32,), 0.99), + "near_neg_bound": lambda: torch.full((32,), -0.99), + "pos_one": lambda: torch.full((32,), 1.0), + "neg_one": lambda: torch.full((32,), -1.0), + "ramp": lambda: torch.arange(-0.99, 0.99, 0.02), } test_data_nan_outputs = { - "pos_two": torch.full((32,), 2.0), - "neg_two": torch.full((32,), -2.0), + "pos_two": lambda: torch.full((32,), 2.0), + "neg_two": lambda: torch.full((32,), -2.0), } test_data_fp16 = { - "rand_fp16": (torch.rand(8, 8, dtype=torch.float16) * 1.8 - 0.9), - "ramp_fp16": torch.arange(-0.9, 0.9, 0.1, dtype=torch.float16), + "rand_fp16": lambda: (torch.rand(8, 8, dtype=torch.float16) * 1.8 - 0.9), + "ramp_fp16": lambda: torch.arange(-0.9, 0.9, 0.1, dtype=torch.float16), } @@ -56,7 +56,7 @@ def forward(self, x: torch.Tensor): def test_erfinv_tosa_FP(test_data: torch.Tensor): pipeline = TosaPipelineFP[input_t1]( Erfinv(), - (test_data,), + (test_data(),), aten_op, exir_op, ) @@ -65,7 +65,7 @@ def test_erfinv_tosa_FP(test_data: torch.Tensor): @common.parametrize("test_data", test_data_suite) def test_erfinv_tosa_INT(test_data: 
torch.Tensor): - pipeline = TosaPipelineINT[input_t1](Erfinv(), (test_data,), aten_op, exir_op) + pipeline = TosaPipelineINT[input_t1](Erfinv(), (test_data(),), aten_op, exir_op) pipeline.run() @@ -74,7 +74,7 @@ def test_erfinv_tosa_INT(test_data: torch.Tensor): def test_erfinv_u55_INT(test_data: torch.Tensor): pipeline = EthosU55PipelineINT[input_t1]( Erfinv(), - (test_data,), + (test_data(),), aten_ops=aten_op, exir_ops=exir_op, ) @@ -86,7 +86,7 @@ def test_erfinv_u55_INT(test_data: torch.Tensor): def test_erfinv_u85_INT(test_data: torch.Tensor): pipeline = EthosU85PipelineINT[input_t1]( Erfinv(), - (test_data,), + (test_data(),), aten_ops=aten_op, exir_ops=exir_op, ) @@ -100,7 +100,7 @@ def test_erfinv_u85_INT(test_data: torch.Tensor): def test_erfinv_vgf_no_quant(test_data: torch.Tensor): pipeline = VgfPipeline[input_t1]( Erfinv(), - (test_data,), + (test_data(),), aten_op, exir_op, quantize=False, @@ -113,7 +113,7 @@ def test_erfinv_vgf_no_quant(test_data: torch.Tensor): def test_erfinv_vgf_quant(test_data: torch.Tensor): pipeline = VgfPipeline[input_t1]( Erfinv(), - (test_data,), + (test_data(),), aten_op, exir_op, quantize=True, diff --git a/backends/arm/test/ops/test_expm1.py b/backends/arm/test/ops/test_expm1.py index 7556d1e45a8..2fb4f11d7ef 100644 --- a/backends/arm/test/ops/test_expm1.py +++ b/backends/arm/test/ops/test_expm1.py @@ -1,4 +1,4 @@ -# Copyright 2025 Arm Limited and/or its affiliates. +# Copyright 2025-2026 Arm Limited and/or its affiliates. # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
@@ -22,16 +22,16 @@ input_t1 = Tuple[torch.Tensor] test_data_suite = { - "zeroes": torch.zeros(1, 10, 10, 10), - "ones": torch.ones(10, 2, 3), - "rand": torch.rand(10, 10) - 0.5, - "near_zero": torch.randn(100) * 0.01, - "taylor_small": torch.empty(5).uniform_( + "zeroes": lambda: torch.zeros(1, 10, 10, 10), + "ones": lambda: torch.ones(10, 2, 3), + "rand": lambda: torch.rand(10, 10) - 0.5, + "near_zero": lambda: torch.randn(100) * 0.01, + "taylor_small": lambda: torch.empty(5).uniform_( -0.35, 0.35 ), # test cases for taylor series expansion - "randn_large_pos": torch.randn(10) + 10, - "randn_large_neg": torch.randn(10) - 10, - "ramp": torch.arange(-16, 16, 0.2), + "randn_large_pos": lambda: torch.randn(10) + 10, + "randn_large_neg": lambda: torch.randn(10) - 10, + "ramp": lambda: torch.arange(-16, 16, 0.2), } @@ -45,7 +45,7 @@ def forward(self, x: torch.Tensor): def test_expm1_tosa_FP(test_data: Tuple): pipeline = TosaPipelineFP[input_t1]( Expm1(), - (test_data,), + (test_data(),), aten_op=aten_op, exir_op=exir_op, ) @@ -56,7 +56,7 @@ def test_expm1_tosa_FP(test_data: Tuple): def test_expm1_tosa_INT(test_data: Tuple): pipeline = TosaPipelineINT[input_t1]( Expm1(), - (test_data,), + (test_data(),), aten_op=aten_op, exir_op=exir_op, ) @@ -68,7 +68,7 @@ def test_expm1_tosa_INT(test_data: Tuple): def test_expm1_u55_INT(test_data: Tuple): pipeline = EthosU55PipelineINT[input_t1]( Expm1(), - (test_data,), + (test_data(),), aten_ops=aten_op, exir_ops=exir_op, ) @@ -80,7 +80,7 @@ def test_expm1_u55_INT(test_data: Tuple): def test_expm1_u85_INT(test_data: Tuple): pipeline = EthosU85PipelineINT[input_t1]( Expm1(), - (test_data,), + (test_data(),), aten_ops=aten_op, exir_ops=exir_op, ) @@ -92,7 +92,7 @@ def test_expm1_u85_INT(test_data: Tuple): def test_expm1_vgf_no_quant(test_data: Tuple): pipeline = VgfPipeline[input_t1]( Expm1(), - (test_data,), + (test_data(),), aten_op, exir_op, quantize=False, @@ -105,7 +105,7 @@ def test_expm1_vgf_no_quant(test_data: Tuple): def 
test_expm1_vgf_quant(test_data: Tuple): pipeline = VgfPipeline[input_t1]( Expm1(), - (test_data,), + (test_data(),), aten_op, exir_op, quantize=True, diff --git a/backends/arm/test/ops/test_glu.py b/backends/arm/test/ops/test_glu.py index 0ad4e35d4ca..d8f3ed89e5c 100644 --- a/backends/arm/test/ops/test_glu.py +++ b/backends/arm/test/ops/test_glu.py @@ -23,14 +23,14 @@ input_t1 = Tuple[torch.Tensor] test_data_suite = { - "zeros": [torch.zeros(10, 10, 2), -1], - "ones": [torch.ones(10, 10, 2), -1], - "rand": [torch.rand(10, 10, 2) - 0.5, -1], - "randn_pos": [torch.randn(10, 2) + 10, -1], - "randn_neg": [torch.randn(10, 2) - 10, -1], - "ramp": [torch.linspace(-16, 15.8, 160).reshape(-1, 2), -1], - "zeros_custom_dim": [torch.zeros(7, 10, 5), 1], - "rand_custom_dim": [torch.rand(10, 3, 3) - 0.5, 0], + "zeros": lambda: [torch.zeros(10, 10, 2), -1], + "ones": lambda: [torch.ones(10, 10, 2), -1], + "rand": lambda: [torch.rand(10, 10, 2) - 0.5, -1], + "randn_pos": lambda: [torch.randn(10, 2) + 10, -1], + "randn_neg": lambda: [torch.randn(10, 2) - 10, -1], + "ramp": lambda: [torch.linspace(-16, 15.8, 160).reshape(-1, 2), -1], + "zeros_custom_dim": lambda: [torch.zeros(7, 10, 5), 1], + "rand_custom_dim": lambda: [torch.rand(10, 3, 3) - 0.5, 0], } @@ -47,7 +47,7 @@ def forward(self, a: torch.Tensor, dim: int) -> torch.Tensor: def test_glu_tosa_FP(test_data: Tuple): pipeline = TosaPipelineFP[input_t1]( Glu(), - (*test_data,), + (*test_data(),), aten_op, exir_op, ) @@ -59,14 +59,15 @@ def test_glu_tosa_FP(test_data: Tuple): test_data_suite, ) def test_glu_tosa_INT(test_data: Tuple): + input_data = test_data() pipeline = TosaPipelineINT[input_t1]( Glu(), - (*test_data,), + (*input_data,), aten_op=[], exir_op=exir_op, # These tests don't make sense when output is ~= 0 - frobenius_threshold=1.0 if (test_data[0].max() < 5) else 0.1, - cosine_threshold=0.0 if (test_data[0].max() < 5) else 0.9, + frobenius_threshold=1.0 if (input_data[0].max() < 5) else 0.1, + cosine_threshold=0.0 if 
(input_data[0].max() < 5) else 0.9, ) pipeline.run() @@ -79,7 +80,7 @@ def test_glu_tosa_INT(test_data: Tuple): def test_glu_u55_INT(test_data: Tuple): pipeline = EthosU55PipelineINT[input_t1]( Glu(), - (*test_data,), + (*test_data(),), aten_ops=[], exir_ops=exir_op, ) @@ -94,7 +95,7 @@ def test_glu_u55_INT(test_data: Tuple): def test_glu_u85_INT(test_data: Tuple): pipeline = EthosU85PipelineINT[input_t1]( Glu(), - (*test_data,), + (*test_data(),), aten_ops=[], exir_ops=exir_op, ) @@ -109,7 +110,7 @@ def test_glu_u85_INT(test_data: Tuple): def test_glu_vgf_no_quant(test_data: input_t1): pipeline = VgfPipeline[input_t1]( Glu(), - (*test_data,), + (*test_data(),), [], [], quantize=False, @@ -125,7 +126,7 @@ def test_glu_vgf_no_quant(test_data: input_t1): def test_glu_vgf_quant(test_data: input_t1): pipeline = VgfPipeline[input_t1]( Glu(), - (*test_data,), + (*test_data(),), [], [], quantize=True, diff --git a/backends/arm/test/ops/test_group_norm.py b/backends/arm/test/ops/test_group_norm.py index 32e6babf101..f31698093e9 100644 --- a/backends/arm/test/ops/test_group_norm.py +++ b/backends/arm/test/ops/test_group_norm.py @@ -40,29 +40,39 @@ def forward( input_t = tuple[torch.Tensor] test_data_suite = { - "rand_4_6_groups_1": ((torch.rand(4, 6),), GroupNorm(1, 6)), - "rand_4_6_groups_2": ((torch.rand(4, 6),), GroupNorm(2, 6)), - "rand_4_6_groups_6": ((torch.rand(4, 6),), GroupNorm(6, 6)), - "rand_4_6_8_groups_2_eps_no_affine": ( + "rand_4_6_groups_1": lambda: ((torch.rand(4, 6),), GroupNorm(1, 6)), + "rand_4_6_groups_2": lambda: ((torch.rand(4, 6),), GroupNorm(2, 6)), + "rand_4_6_groups_6": lambda: ((torch.rand(4, 6),), GroupNorm(6, 6)), + "rand_4_6_8_groups_2_eps_no_affine": lambda: ( (torch.rand(4, 6, 8),), GroupNorm(2, 6, eps=1e-3, affine=False), ), - "randn_1_12_8_6_groups_6_eps": ( + "randn_1_12_8_6_groups_6_eps": lambda: ( (torch.randn(1, 12, 8, 6),), GroupNorm(6, 12, eps=1e-2), ), - "randn_1_12_8_6_groups_12": ((torch.randn(1, 12, 8, 6),), GroupNorm(12, 12)), - 
"rand_6_8_10_12_groups_1": ((torch.rand(6, 8, 10, 12),), GroupNorm(1, 8)), - "rand_6_8_10_12_groups_4_no_affine": ( + "randn_1_12_8_6_groups_12": lambda: ( + (torch.randn(1, 12, 8, 6),), + GroupNorm(12, 12), + ), + "rand_6_8_10_12_groups_1": lambda: ( + (torch.rand(6, 8, 10, 12),), + GroupNorm(1, 8), + ), + "rand_6_8_10_12_groups_4_no_affine": lambda: ( (torch.rand(6, 8, 10, 12),), GroupNorm(4, 8, affine=False), ), - "rand_6_8_10_12_groups_8": ((torch.rand(6, 8, 10, 12),), GroupNorm(8, 8)), + "rand_6_8_10_12_groups_8": lambda: ( + (torch.rand(6, 8, 10, 12),), + GroupNorm(8, 8), + ), } @common.parametrize("test_data", test_data_suite) def test_native_group_norm_tosa_FP(test_data): + test_data = test_data() aten_op = "torch.ops.aten.group_norm.default" exir_op = "executorch_exir_dialects_edge__ops_aten_native_group_norm_default" pipeline = TosaPipelineFP[input_t]( @@ -79,6 +89,7 @@ def test_native_group_norm_tosa_FP(test_data): test_data_suite, ) def test_native_group_norm_tosa_INT(test_data): + test_data = test_data() aten_op = "torch.ops.aten.sub.Tensor" # 'sub' op arbitrarily chosen to confirm groupnorm was decomposed exir_op = "executorch_exir_dialects_edge__ops_aten_native_group_norm_default" pipeline = TosaPipelineINT[input_t]( @@ -97,6 +108,7 @@ def test_native_group_norm_tosa_INT(test_data): ) @common.XfailIfNoCorstone300 def test_native_group_norm_u55_INT(test_data): + test_data = test_data() pipeline = EthosU55PipelineINT[input_t]( test_data[1], test_data[0], @@ -113,6 +125,7 @@ def test_native_group_norm_u55_INT(test_data): ) @common.XfailIfNoCorstone320 def test_native_group_norm_u85_INT(test_data): + test_data = test_data() pipeline = EthosU85PipelineINT[input_t]( test_data[1], test_data[0], @@ -131,7 +144,7 @@ def test_native_group_norm_u85_INT(test_data): def test_native_group_norm_vgf_no_quant(test_data): aten_op = "torch.ops.aten.group_norm.default" exir_op = "executorch_exir_dialects_edge__ops_aten_native_group_norm_default" - model, inp = test_data 
+ model, inp = test_data() pipeline = VgfPipeline[input_t]( inp, model, @@ -150,7 +163,7 @@ def test_native_group_norm_vgf_no_quant(test_data): def test_native_group_norm_vgf_quant(test_data): aten_op = "torch.ops.aten.sub.Tensor" exir_op = "executorch_exir_dialects_edge__ops_aten_native_group_norm_default" - model, inp = test_data + model, inp = test_data() pipeline = VgfPipeline[input_t]( inp, model, diff --git a/backends/arm/test/ops/test_linalg_vector_norm.py b/backends/arm/test/ops/test_linalg_vector_norm.py index 1b2fc169fce..fa3290eb5d1 100644 --- a/backends/arm/test/ops/test_linalg_vector_norm.py +++ b/backends/arm/test/ops/test_linalg_vector_norm.py @@ -53,17 +53,17 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: test_modules = { - "default": (VectorNormModel(dim=1), (torch.rand(10, 4),)), - "ord1": (VectorNormModel(ord=1, dim=1), (torch.rand(10, 4),)), - "ord2": (VectorNormModel(ord=2, dim=1), (torch.rand(10, 20),)), + "default": lambda: (VectorNormModel(dim=1), (torch.rand(10, 4),)), + "ord1": lambda: (VectorNormModel(ord=1, dim=1), (torch.rand(10, 4),)), + "ord2": lambda: (VectorNormModel(ord=2, dim=1), (torch.rand(10, 20),)), # Norm computed along a specific dimension of a 3D tensor - "dim_3d": (VectorNormModel(dim=2), (torch.rand(4, 5, 6),)), + "dim_3d": lambda: (VectorNormModel(dim=2), (torch.rand(4, 5, 6),)), } @common.parametrize("test_module", test_modules) def test_vector_norm_tosa_FP(test_module): - model, input_tensor = test_module + model, input_tensor = test_module() # We decompose LinalgVectorNorm before quantize stage to have annotations # with q/dq nodes. 
In case of FP, this operator will be decomposed @@ -79,7 +79,7 @@ def test_vector_norm_tosa_FP(test_module): @common.parametrize("test_module", test_modules) def test_vector_norm_tosa_INT(test_module): - model, input_tensor = test_module + model, input_tensor = test_module() # Should not found this op exir_op = "executorch_exir_dialects_edge__ops_aten_linalg_vector_norm_default" @@ -97,7 +97,7 @@ def test_vector_norm_tosa_INT(test_module): @common.parametrize("test_module", test_modules) @common.XfailIfNoCorstone300 def test_vector_norm_u55_INT_fvp(test_module): - model, input_tensor = test_module + model, input_tensor = test_module() pipeline = EthosU55PipelineINT[input_t]( model, @@ -113,7 +113,7 @@ def test_vector_norm_u55_INT_fvp(test_module): @common.parametrize("test_module", test_modules) @common.XfailIfNoCorstone320 def test_vector_norm_u85_INT_fvp(test_module): - model, input_tensor = test_module + model, input_tensor = test_module() # The should be decomposed and annotated in DecomposeLinalgVectorNorm pass. 
pipeline = EthosU85PipelineINT[input_t]( @@ -130,7 +130,7 @@ def test_vector_norm_u85_INT_fvp(test_module): @common.parametrize("test_module", test_modules) @common.SkipIfNoModelConverter def test_vector_norm_vgf_no_quant(test_module): - model, input_tensor = test_module + model, input_tensor = test_module() # FP VGF aten_op = "torch.ops.aten.linalg_vector_norm.default" exir_op = "executorch_exir_dialects_edge__ops_aten_linalg_vector_norm_default" @@ -147,7 +147,7 @@ def test_vector_norm_vgf_no_quant(test_module): @common.parametrize("test_module", test_modules) @common.SkipIfNoModelConverter def test_vector_norm_vgf_quant(test_module): - model, input_tensor = test_module + model, input_tensor = test_module() # Should not found this op exir_op = "executorch_exir_dialects_edge__ops_aten_linalg_vector_norm_default" diff --git a/backends/arm/test/ops/test_logit.py b/backends/arm/test/ops/test_logit.py index cf5b4f7f07e..b0ce9aad1f5 100644 --- a/backends/arm/test/ops/test_logit.py +++ b/backends/arm/test/ops/test_logit.py @@ -22,16 +22,16 @@ input_t1 = Tuple[torch.Tensor] test_data_suite = { - "zeros": [torch.zeros((10, 10, 10)), None], - "ones": [torch.ones((10, 10, 10)), None], - "uniform_valid": [torch.rand((10, 10, 10)), None], - "near_zero": [torch.full((10, 10), 1e-8), None], - "near_one": [torch.full((10, 10), 1 - 1e-8), None], - "mixed": [torch.tensor([0.0, 1e-5, 0.5, 1 - 1e-5, 1.0]), None], - "multi_dim": [torch.rand((2, 3, 4)), None], - "eps": [torch.zeros((10, 10, 10)), 1e-6], - "invalid_neg": [torch.full((5,), -0.1), 1e-6], - "invalid_gt1": [torch.full((5,), 1.1), 1e-6], + "zeros": lambda: (torch.zeros((10, 10, 10)), None), + "ones": lambda: (torch.ones((10, 10, 10)), None), + "uniform_valid": lambda: (torch.rand((10, 10, 10)), None), + "near_zero": lambda: (torch.full((10, 10), 1e-8), None), + "near_one": lambda: (torch.full((10, 10), 1 - 1e-8), None), + "mixed": lambda: (torch.tensor([0.0, 1e-5, 0.5, 1 - 1e-5, 1.0]), None), + "multi_dim": lambda: 
(torch.rand((2, 3, 4)), None), + "eps": lambda: (torch.zeros((10, 10, 10)), 1e-6), + "invalid_neg": lambda: (torch.full((5,), -0.1), 1e-6), + "invalid_gt1": lambda: (torch.full((5,), 1.1), 1e-6), } @@ -45,7 +45,7 @@ def forward(self, x: torch.Tensor, eps: torch.float32): def test_logit_tosa_FP(test_data: Tuple): pipeline = TosaPipelineFP[input_t1]( Logit(), - (*test_data,), + (*test_data(),), aten_op=aten_op, exir_op=exir_op, ) @@ -56,7 +56,7 @@ def test_logit_tosa_FP(test_data: Tuple): def test_logit_tosa_INT(test_data: Tuple): pipeline = TosaPipelineINT[input_t1]( Logit(), - (*test_data,), + (*test_data(),), aten_op=[], exir_op=exir_op, # Quantization issues when logit(x) -> inf @@ -71,7 +71,7 @@ def test_logit_tosa_INT(test_data: Tuple): def test_logit_u55_INT(test_data: Tuple): pipeline = EthosU55PipelineINT[input_t1]( Logit(), - (*test_data,), + (*test_data(),), aten_ops=[], exir_ops=exir_op, ) @@ -83,7 +83,7 @@ def test_logit_u55_INT(test_data: Tuple): def test_logit_u85_INT(test_data: Tuple): pipeline = EthosU85PipelineINT[input_t1]( Logit(), - (*test_data,), + (*test_data(),), aten_ops=[], exir_ops=exir_op, ) @@ -98,7 +98,7 @@ def test_logit_u85_INT(test_data: Tuple): def test_logit_vgf_no_quant(test_data: input_t1): pipeline = VgfPipeline[input_t1]( Logit(), - (*test_data,), + (*test_data(),), [], [], quantize=False, @@ -114,7 +114,7 @@ def test_logit_vgf_no_quant(test_data: input_t1): def test_logit_vgf_quant(test_data: input_t1): pipeline = VgfPipeline[input_t1]( Logit(), - (*test_data,), + (*test_data(),), [], [], quantize=True, diff --git a/backends/arm/test/ops/test_permute.py b/backends/arm/test/ops/test_permute.py index 98fb034e311..8864324dbd5 100644 --- a/backends/arm/test/ops/test_permute.py +++ b/backends/arm/test/ops/test_permute.py @@ -17,6 +17,7 @@ from executorch.backends.arm.test.tester.test_pipeline import ( EthosU55PipelineINT, EthosU85PipelineINT, + OpNotSupportedPipeline, TosaPipelineFP, TosaPipelineINT, VgfPipeline, @@ -39,6 +40,9 @@ 
"rank_3_large": lambda: (torch.rand(16, 64, 65), [1, 2, 0]), "reshape_large_1": lambda: (torch.rand(1, 1, 65537), [0, 2, 1]), "reshape_large_2": lambda: (torch.rand(65537, 1, 1), [1, 2, 0]), +} + +test_data_suite_u55_reject = { "rank2_bool": lambda: (torch.randint(0, 2, (5, 5), dtype=torch.bool), [1, 0]), } @@ -111,10 +115,19 @@ def test_permute_u55_INT(test_data): aten_op, exir_ops="executorch_exir_dialects_edge__ops_aten_permute_copy_default", ) - if test_data[0].dtype == torch.bool: - pipeline.tester.use_portable_ops = True - pipeline.pop_stage("check_count.exir") - pipeline.pop_stage("check_not.exir") + pipeline.run() + + +@common.parametrize("test_data", test_data_suite_u55_reject) +def test_permute_u55_INT_not_delegated(test_data: torch.Tensor): + test_data, dims = test_data() + pipeline = OpNotSupportedPipeline[input_t1]( + SimplePermute(dims=dims), + (test_data,), + non_delegated_ops={exir_op: 1}, + quantize=True, + u55_subset=True, + ) pipeline.run() diff --git a/backends/arm/test/ops/test_sign.py b/backends/arm/test/ops/test_sign.py index 62c1a0db63b..5e5a88011e9 100644 --- a/backends/arm/test/ops/test_sign.py +++ b/backends/arm/test/ops/test_sign.py @@ -22,17 +22,17 @@ input_t1 = Tuple[torch.Tensor] test_data_suite = { - "zeros": torch.zeros(3, 5), - "ones": torch.ones(4, 4), - "neg_ones": -torch.ones(4, 4), - "mixed_signs": torch.tensor([[-2.0, -1.0, 0.0, 1.0, 2.0]]), - "positive_ramp": torch.arange(0.1, 1.1, 0.2), - "negative_ramp": torch.arange(-1.0, -0.1, 0.2), - "small_values": torch.tensor( + "zeros": lambda: torch.zeros(3, 5), + "ones": lambda: torch.ones(4, 4), + "neg_ones": lambda: -torch.ones(4, 4), + "mixed_signs": lambda: torch.tensor([[-2.0, -1.0, 0.0, 1.0, 2.0]]), + "positive_ramp": lambda: torch.arange(0.1, 1.1, 0.2), + "negative_ramp": lambda: torch.arange(-1.0, -0.1, 0.2), + "small_values": lambda: torch.tensor( [-1e-3, 0.0, 1e-3] ), # Only values > observer's .eps are of interest. 
- "rand": torch.rand(10, 10) - 0.5, - "rand_alt_shape": torch.rand(10, 3, 5) - 0.5, + "rand": lambda: torch.rand(10, 10) - 0.5, + "rand_alt_shape": lambda: torch.rand(10, 3, 5) - 0.5, } @@ -45,7 +45,7 @@ def forward(self, x: torch.Tensor): def test_sign_tosa_FP(test_data: Tuple): pipeline = TosaPipelineFP[input_t1]( Sign(), - (test_data,), + (test_data(),), aten_op=aten_op, exir_op=exir_op, ) @@ -55,7 +55,7 @@ def test_sign_tosa_FP(test_data: Tuple): @common.parametrize("test_data", test_data_suite) def test_sign_tosa_INT(test_data: Tuple): pipeline = TosaPipelineINT[input_t1]( - Sign(), (test_data,), aten_op=[], exir_op=exir_op, frobenius_threshold=None + Sign(), (test_data(),), aten_op=[], exir_op=exir_op, frobenius_threshold=None ) pipeline.run() @@ -66,7 +66,7 @@ def test_sign_tosa_INT(test_data: Tuple): def test_sign_u55_INT(test_data: Tuple): pipeline = EthosU55PipelineINT[input_t1]( Sign(), - (test_data,), + (test_data(),), aten_ops=[], exir_ops=exir_op, ) @@ -78,7 +78,7 @@ def test_sign_u55_INT(test_data: Tuple): def test_sign_u85_INT(test_data: Tuple): pipeline = EthosU85PipelineINT[input_t1]( Sign(), - (test_data,), + (test_data(),), aten_ops=[], exir_ops=exir_op, ) @@ -90,7 +90,7 @@ def test_sign_u85_INT(test_data: Tuple): def test_sign_vgf_no_quant(test_data: Tuple): pipeline = VgfPipeline[input_t1]( Sign(), - (test_data,), + (test_data(),), aten_op=aten_op, exir_op=exir_op, quantize=False, @@ -103,7 +103,7 @@ def test_sign_vgf_no_quant(test_data: Tuple): def test_sign_vgf_quant(test_data: Tuple): pipeline = VgfPipeline[input_t1]( Sign(), - (test_data,), + (test_data(),), aten_op=[], exir_op=exir_op, quantize=True, diff --git a/backends/arm/test/ops/test_sin.py b/backends/arm/test/ops/test_sin.py index 3073c44be85..f9b3c2585e6 100644 --- a/backends/arm/test/ops/test_sin.py +++ b/backends/arm/test/ops/test_sin.py @@ -22,20 +22,20 @@ test_data_suite = { # (test_name, test_data) - "zeros": torch.zeros(10, 10, 10, 10), - "ones": torch.ones(10, 10, 10), - 
"rand": torch.rand(10, 10) - 0.5, - "randn_pos": torch.randn(10) + 10, - "randn_neg": torch.randn(10) - 10, - "ramp": torch.arange(-16, 16, 0.2), + "zeros": lambda: torch.zeros(10, 10, 10, 10), + "ones": lambda: torch.ones(10, 10, 10), + "rand": lambda: torch.rand(10, 10) - 0.5, + "randn_pos": lambda: torch.randn(10) + 10, + "randn_neg": lambda: torch.randn(10) - 10, + "ramp": lambda: torch.arange(-16, 16, 0.2), } test_data_suite_fp16 = { - "rand_fp16": torch.rand(10, 10, dtype=torch.float16), + "rand_fp16": lambda: torch.rand(10, 10, dtype=torch.float16), } test_data_suite_bf16 = { - "rand_bf16": torch.rand(3, 3, dtype=torch.bfloat16), + "rand_bf16": lambda: torch.rand(3, 3, dtype=torch.bfloat16), } @@ -51,7 +51,7 @@ def forward(self, x: torch.Tensor): def test_sin_tosa_FP(test_data: Tuple): pipeline = TosaPipelineFP[input_t1]( Sin(), - (test_data,), + (test_data(),), aten_op, exir_op=[], tosa_extensions=["bf16"], @@ -63,7 +63,7 @@ def test_sin_tosa_FP(test_data: Tuple): def test_sin_tosa_INT(test_data: Tuple): pipeline = TosaPipelineINT[input_t1]( Sin(), - (test_data,), + (test_data(),), aten_op, exir_op=[], ) @@ -75,7 +75,7 @@ def test_sin_tosa_INT(test_data: Tuple): def test_sin_u55_INT(test_data: Tuple): pipeline = EthosU55PipelineINT[input_t1]( Sin(), - (test_data,), + (test_data(),), aten_op, exir_ops=[], ) @@ -87,7 +87,7 @@ def test_sin_u55_INT(test_data: Tuple): def test_sin_u85_INT(test_data: Tuple): pipeline = EthosU85PipelineINT[input_t1]( Sin(), - (test_data,), + (test_data(),), aten_op, exir_ops=[], ) @@ -99,7 +99,7 @@ def test_sin_u85_INT(test_data: Tuple): def test_sin_vgf_no_quant(test_data: Tuple): pipeline = VgfPipeline[input_t1]( Sin(), - (test_data,), + (test_data(),), aten_op, quantize=False, ) @@ -111,7 +111,7 @@ def test_sin_vgf_no_quant(test_data: Tuple): def test_sin_vgf_quant(test_data: Tuple): pipeline = VgfPipeline[input_t1]( Sin(), - (test_data,), + (test_data(),), aten_op, quantize=True, ) diff --git 
a/backends/arm/test/ops/test_sinh.py b/backends/arm/test/ops/test_sinh.py index 703d3e52011..911d9da077b 100644 --- a/backends/arm/test/ops/test_sinh.py +++ b/backends/arm/test/ops/test_sinh.py @@ -1,4 +1,4 @@ -# Copyright 2025 Arm Limited and/or its affiliates. +# Copyright 2025-2026 Arm Limited and/or its affiliates. # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. @@ -23,16 +23,16 @@ test_data_suite = { # (test_name, test_data) - "zeros": torch.zeros(10, 10, 10), - "zeros_alt_shape": torch.zeros(10, 3, 5), - "ones": torch.ones(10, 10, 10), - "rand": torch.rand(10, 10) - 0.5, - "rand_alt_shape": torch.rand(10, 3, 5) - 0.5, - "randn_pos": torch.randn(10) + 10, - "randn_neg": torch.randn(10) - 10, - "ramp": torch.arange(-16, 16, 0.2), - "large": 100 * torch.ones(1, 1), - "small": 0.000001 * torch.ones(1, 1), + "zeros": lambda: torch.zeros(10, 10, 10), + "zeros_alt_shape": lambda: torch.zeros(10, 3, 5), + "ones": lambda: torch.ones(10, 10, 10), + "rand": lambda: torch.rand(10, 10) - 0.5, + "rand_alt_shape": lambda: torch.rand(10, 3, 5) - 0.5, + "randn_pos": lambda: torch.randn(10) + 10, + "randn_neg": lambda: torch.randn(10) - 10, + "ramp": lambda: torch.arange(-16, 16, 0.2), + "large": lambda: 100 * torch.ones(1, 1), + "small": lambda: 0.000001 * torch.ones(1, 1), } @@ -46,7 +46,7 @@ def forward(self, x: torch.Tensor): def test_sinh_tosa_FP(test_data: Tuple): pipeline = TosaPipelineFP[input_t1]( Sinh(), - (test_data,), + (test_data(),), aten_op, exir_op, ) @@ -56,7 +56,7 @@ def test_sinh_tosa_FP(test_data: Tuple): @common.parametrize("test_data", test_data_suite) def test_sinh_tosa_INT(test_data: Tuple): pipeline = TosaPipelineINT[input_t1]( - Sinh(), (test_data,), aten_op=aten_op, exir_op=exir_op + Sinh(), (test_data(),), aten_op=aten_op, exir_op=exir_op ) pipeline.run() @@ -65,7 +65,7 @@ def test_sinh_tosa_INT(test_data: Tuple): @common.parametrize("test_data", test_data_suite) def 
test_sinh_u55_INT(test_data: Tuple): pipeline = EthosU55PipelineINT[input_t1]( - Sinh(), (test_data,), aten_ops=aten_op, exir_ops=exir_op + Sinh(), (test_data(),), aten_ops=aten_op, exir_ops=exir_op ) pipeline.run() @@ -74,7 +74,7 @@ def test_sinh_u55_INT(test_data: Tuple): @common.parametrize("test_data", test_data_suite) def test_sinh_u85_INT(test_data: Tuple): pipeline = EthosU85PipelineINT[input_t1]( - Sinh(), (test_data,), aten_ops=aten_op, exir_ops=exir_op + Sinh(), (test_data(),), aten_ops=aten_op, exir_ops=exir_op ) pipeline.run() @@ -84,7 +84,7 @@ def test_sinh_u85_INT(test_data: Tuple): def test_sinh_vgf_no_quant(test_data: Tuple): pipeline = VgfPipeline[input_t1]( Sinh(), - (test_data,), + (test_data(),), aten_op, quantize=False, ) @@ -96,7 +96,7 @@ def test_sinh_vgf_no_quant(test_data: Tuple): def test_sinh_vgf_quant(test_data: Tuple): pipeline = VgfPipeline[input_t1]( Sinh(), - (test_data,), + (test_data(),), aten_op, quantize=True, ) diff --git a/backends/arm/test/ops/test_sum.py b/backends/arm/test/ops/test_sum.py index 1075055c4f0..d727eb0408a 100644 --- a/backends/arm/test/ops/test_sum.py +++ b/backends/arm/test/ops/test_sum.py @@ -5,6 +5,8 @@ from typing import Callable, Tuple +import pytest + import torch from executorch.backends.arm.test import common @@ -96,7 +98,16 @@ def test_sum_dim_intlist_tosa_INT(test_data: input_t1): pipeline.run() -@common.parametrize("test_data", Sum.test_parameters) +# dim=None cases skipped: executorch.devtools.bundled_program.config rejects +# None as a model input (cannot be serialized into the bundled program). 
+_DIM_NONE_SKIP_REASON = "bundled_program cannot serialize None as a model input" +_dim_none_skips = { + "dim_None": _DIM_NONE_SKIP_REASON, + "dim_None_4d_tensor": _DIM_NONE_SKIP_REASON, +} + + +@common.parametrize("test_data", Sum.test_parameters, skips=_dim_none_skips) @common.XfailIfNoCorstone300 def test_sum_u55_INT_1_0(test_data: Tuple): pipeline = EthosU55PipelineINT[input_t1]( @@ -108,7 +119,7 @@ def test_sum_u55_INT_1_0(test_data: Tuple): pipeline.run() -@common.parametrize("test_data", Sum.test_parameters) +@common.parametrize("test_data", Sum.test_parameters, skips=_dim_none_skips) @common.XfailIfNoCorstone320 def test_sum_u85_INT_1_0(test_data: Tuple): pipeline = EthosU85PipelineINT[input_t1]( @@ -220,3 +231,60 @@ def test_sum_tosa_FP(test_data: Callable[[], input_t2]): def test_sum_tosa_INT(test_data: Callable[[], input_t2]): pipeline = TosaPipelineINT[input_t1](SumDefault(), test_data(), SumDefault.aten_op) pipeline.run() + + +# a16w8 (int16 IO + int8 weights) coverage for sum.dim_IntList. Surfaces the +# Ethos-U85 int16 ReduceSum silent-zero issue tracked upstream at +# https://gitlab.arm.com/artificial-intelligence/ethos-u/ethos-u-vela/-/issues/23. + + +class SumLastDim(torch.nn.Module): + """Reduce the last dim with keepdim=True.""" + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return x.sum(dim=-1, keepdim=True) + + +a16w8_sum_test_parameters = { + "rank1_16": lambda: (torch.rand(16),), + "rank3_8x1x16": lambda: (torch.rand(8, 1, 16),), + "rank3_4x4x16": lambda: (torch.rand(4, 4, 16),), +} + + +@common.parametrize("test_data", a16w8_sum_test_parameters) +@common.XfailIfNoCorstone300 +def test_sum_dim_intlist_a16w8_u55_INT(test_data: Callable[[], input_t1]): + pipeline = EthosU55PipelineINT[input_t1]( + SumLastDim(), + test_data(), + aten_op, + exir_ops=[], + a16w8_quantization=True, + symmetric_io_quantization=True, + qtol=128, + epsilon=2**-16, + ) + pipeline.run() + + +# All cases hit upstream Vela issue #23 (linked above). 
strict=False so the +# test target stays green both on stock Vela 5.0 (cases XFAIL) and once the +# Vela fix is in tree (cases XPASS). +@common.parametrize("test_data", a16w8_sum_test_parameters) +@common.XfailIfNoCorstone320 +@pytest.mark.xfail( + reason="Ethos-U85 int16 ReduceSum returns zero (vela#23)", strict=False +) +def test_sum_dim_intlist_a16w8_u85_INT(test_data: Callable[[], input_t1]): + pipeline = EthosU85PipelineINT[input_t1]( + SumLastDim(), + test_data(), + aten_op, + exir_ops=[], + a16w8_quantization=True, + symmetric_io_quantization=True, + qtol=128, + epsilon=2**-16, + ) + pipeline.run() diff --git a/backends/arm/test/ops/test_tan.py b/backends/arm/test/ops/test_tan.py index 62b8d1cd09a..31bd2ca85f7 100644 --- a/backends/arm/test/ops/test_tan.py +++ b/backends/arm/test/ops/test_tan.py @@ -25,16 +25,16 @@ tiny32 = torch.finfo(torch.float32).tiny test_data_suite = { - "zeros": torch.zeros(1, 10, 10, 10), - "zeros_alt_shape": torch.zeros(1, 10, 3, 5), - "ones": torch.ones(10, 15, 25), - "rand": torch.rand(10, 10) - 0.5, - "rand_alt_shape": torch.rand(1, 10, 3, 5) - 0.5, - "randn_pos": torch.randn(10) + 10, - "randn_neg": torch.randn(10) - 10, - "ramp": torch.arange(-16, 16, 0.2), - "pi_multiples": (torch.arange(-5, 6, dtype=torch.float32) * math.pi), - "common_angles": torch.tensor( + "zeros": lambda: torch.zeros(1, 10, 10, 10), + "zeros_alt_shape": lambda: torch.zeros(1, 10, 3, 5), + "ones": lambda: torch.ones(10, 15, 25), + "rand": lambda: torch.rand(10, 10) - 0.5, + "rand_alt_shape": lambda: torch.rand(1, 10, 3, 5) - 0.5, + "randn_pos": lambda: torch.randn(10) + 10, + "randn_neg": lambda: torch.randn(10) - 10, + "ramp": lambda: torch.arange(-16, 16, 0.2), + "pi_multiples": lambda: (torch.arange(-5, 6, dtype=torch.float32) * math.pi), + "common_angles": lambda: torch.tensor( [ -math.pi, -2 * math.pi / 3, @@ -52,7 +52,7 @@ ], dtype=torch.float32, ), - "near_asymptote_pos": torch.tensor( + "near_asymptote_pos": lambda: torch.tensor( [ math.pi / 2 - 
1e-7, math.pi / 2 - 1e-6, @@ -63,12 +63,12 @@ ], dtype=torch.float32, ), - "high_rank": torch.randn(1, 3, 7, 4, 5), - "very_small": torch.tensor( + "high_rank": lambda: torch.randn(1, 3, 7, 4, 5), + "very_small": lambda: torch.tensor( [-tiny32, -eps32, -1e-10, 0.0, 1e-10, eps32, tiny32], dtype=torch.float32 ), - "large_values": torch.linspace(-1e6, 1e6, steps=257, dtype=torch.float32), - "undefined": torch.tensor([math.pi / 2, -math.pi / 2, 3 * math.pi / 2]), + "large_values": lambda: torch.linspace(-1e6, 1e6, steps=257, dtype=torch.float32), + "undefined": lambda: torch.tensor([math.pi / 2, -math.pi / 2, 3 * math.pi / 2]), } @@ -82,7 +82,7 @@ def forward(self, x: torch.Tensor): def test_tan_tosa_FP(test_data: Tuple): pipeline = TosaPipelineFP[input_t]( Tan(), - (test_data,), + (test_data(),), aten_op, exir_op, ) @@ -93,7 +93,7 @@ def test_tan_tosa_FP(test_data: Tuple): def test_tan_tosa_INT(test_data: Tuple): pipeline = TosaPipelineINT[input_t]( Tan(), - (test_data,), + (test_data(),), aten_op, exir_op, frobenius_threshold=None, @@ -107,7 +107,7 @@ def test_tan_tosa_INT(test_data: Tuple): def test_tan_u55_INT(test_data: Tuple): pipeline = EthosU55PipelineINT[input_t]( Tan(), - (test_data,), + (test_data(),), aten_ops=aten_op, exir_ops=exir_op, ) @@ -119,7 +119,7 @@ def test_tan_u55_INT(test_data: Tuple): def test_tan_u85_INT(test_data: Tuple): pipeline = EthosU85PipelineINT[input_t]( Tan(), - (test_data,), + (test_data(),), aten_ops=aten_op, exir_ops=exir_op, ) @@ -129,7 +129,7 @@ def test_tan_u85_INT(test_data: Tuple): @common.parametrize("test_data", test_data_suite) @common.SkipIfNoModelConverter def test_tan_vgf_no_quant(test_data: Tuple): - pipeline = VgfPipeline[input_t](Tan(), (test_data,), [], [], quantize=False) + pipeline = VgfPipeline[input_t](Tan(), (test_data(),), [], [], quantize=False) pipeline.run() @@ -138,7 +138,7 @@ def test_tan_vgf_no_quant(test_data: Tuple): def test_tan_vgf_quant(test_data: Tuple): pipeline = VgfPipeline[input_t]( Tan(), - 
(test_data,), + (test_data(),), [], [], quantize=True, diff --git a/backends/arm/test/ops/test_upsample_bilinear2d.py b/backends/arm/test/ops/test_upsample_bilinear2d.py index d06d1688ffe..f084e0ebe14 100644 --- a/backends/arm/test/ops/test_upsample_bilinear2d.py +++ b/backends/arm/test/ops/test_upsample_bilinear2d.py @@ -24,44 +24,89 @@ test_data_suite_tosa = { # (test_name, test_data, size, scale_factor, compare_outputs) - "rand_double_scale": (torch.rand(2, 4, 8, 3), None, 2.0, True), - "rand_double_scale_one_dim": (torch.rand(2, 4, 8, 3), None, (1.0, 2.0), True), - "rand_double_size": (torch.rand(2, 4, 8, 3), (16, 6), None, True), - "rand_one_double_scale": (torch.rand(2, 4, 1, 1), None, 2.0, True), - "rand_one_double_size": (torch.rand(2, 4, 1, 1), (2, 2), None, True), - "rand_one_same_scale": (torch.rand(2, 4, 1, 1), None, 1.0, True), - "rand_one_same_size": (torch.rand(2, 4, 1, 1), (1, 1), None, True), + "rand_double_scale": lambda: (torch.rand(2, 4, 8, 3), None, 2.0, True), + "rand_double_scale_one_dim": lambda: ( + torch.rand(2, 4, 8, 3), + None, + (1.0, 2.0), + True, + ), + "rand_double_size": lambda: (torch.rand(2, 4, 8, 3), (16, 6), None, True), + "rand_one_double_scale": lambda: (torch.rand(2, 4, 1, 1), None, 2.0, True), + "rand_one_double_size": lambda: (torch.rand(2, 4, 1, 1), (2, 2), None, True), + "rand_one_same_scale": lambda: (torch.rand(2, 4, 1, 1), None, 1.0, True), + "rand_one_same_size": lambda: (torch.rand(2, 4, 1, 1), (1, 1), None, True), # Can't compare outputs as the rounding when selecting the nearest pixel is # different between PyTorch and TOSA. Just check the legalization went well. # TODO Improve the test infrastructure to support more in depth verification # of the TOSA legalization results. 
- "rand_half_scale": (torch.rand(2, 4, 8, 6), None, 0.5, False), - "rand_half_size": (torch.rand(2, 4, 8, 6), (4, 3), None, False), - "rand_one_and_half_scale": (torch.rand(2, 4, 8, 3), None, 1.5, False), - "rand_one_and_half_size": (torch.rand(2, 4, 8, 3), (12, 4), None, False), + "rand_half_scale": lambda: (torch.rand(2, 4, 8, 6), None, 0.5, False), + "rand_half_size": lambda: (torch.rand(2, 4, 8, 6), (4, 3), None, False), + "rand_one_and_half_scale": lambda: ( + torch.rand(2, 4, 8, 3), + None, + 1.5, + False, + ), + "rand_one_and_half_size": lambda: ( + torch.rand(2, 4, 8, 3), + (12, 4), + None, + False, + ), # Use randn for a bunch of tests to get random numbers from the # normal distribution where negative is also a possibilty - "randn_double_scale_negative": (torch.randn(2, 4, 8, 3), None, 2.0, True), - "randn_double_scale_one_dim_negative": ( + "randn_double_scale_negative": lambda: ( + torch.randn(2, 4, 8, 3), + None, + 2.0, + True, + ), + "randn_double_scale_one_dim_negative": lambda: ( torch.randn(2, 4, 8, 3), None, (1.0, 2.0), True, ), - "randn_double_size_negative": (torch.randn(2, 4, 8, 3), (16, 6), None, True), - "randn_one_double_scale_negative": (torch.randn(2, 4, 1, 1), None, 2.0, True), - "randn_one_double_size_negative": (torch.randn(2, 4, 1, 1), (2, 2), None, True), - "randn_one_same_scale_negative": (torch.randn(2, 4, 1, 1), None, 1.0, True), - "randn_one_same_size_negative": (torch.randn(2, 4, 1, 1), (1, 1), None, True), + "randn_double_size_negative": lambda: ( + torch.randn(2, 4, 8, 3), + (16, 6), + None, + True, + ), + "randn_one_double_scale_negative": lambda: ( + torch.randn(2, 4, 1, 1), + None, + 2.0, + True, + ), + "randn_one_double_size_negative": lambda: ( + torch.randn(2, 4, 1, 1), + (2, 2), + None, + True, + ), + "randn_one_same_scale_negative": lambda: ( + torch.randn(2, 4, 1, 1), + None, + 1.0, + True, + ), + "randn_one_same_size_negative": lambda: ( + torch.randn(2, 4, 1, 1), + (1, 1), + None, + True, + ), } 
test_data_suite_tosa_bf16 = { - "randn_double_scale_bf16": ( + "randn_double_scale_bf16": lambda: ( torch.randn(1, 2, 2, 2, dtype=torch.bfloat16), None, 2.0, True, ), - "randn_double_size_bf16": ( + "randn_double_size_bf16": lambda: ( torch.randn(1, 1, 3, 2, dtype=torch.bfloat16), (6, 4), None, @@ -69,13 +114,13 @@ ), } test_data_suite_tosa_fp16 = { - "randn_double_scale_fp16": ( + "randn_double_scale_fp16": lambda: ( torch.randn(1, 2, 2, 2, dtype=torch.float16), None, 2.0, True, ), - "randn_double_size_fp16": ( + "randn_double_size_fp16": lambda: ( torch.randn(1, 1, 3, 2, dtype=torch.float16), (6, 4), None, @@ -84,14 +129,24 @@ } test_data_suite_Uxx = { - "rand_half_scale": (torch.rand(2, 4, 8, 6), None, 0.5, False), - "rand_half_size": (torch.rand(2, 4, 8, 6), (4, 3), None, False), - "rand_one_and_half_scale": (torch.rand(2, 4, 8, 3), None, 1.5, False), - "rand_one_and_half_size": (torch.rand(2, 4, 8, 3), (12, 4), None, False), + "rand_half_scale": lambda: (torch.rand(2, 4, 8, 6), None, 0.5, False), + "rand_half_size": lambda: (torch.rand(2, 4, 8, 6), (4, 3), None, False), + "rand_one_and_half_scale": lambda: ( + torch.rand(2, 4, 8, 3), + None, + 1.5, + False, + ), + "rand_one_and_half_size": lambda: ( + torch.rand(2, 4, 8, 3), + (12, 4), + None, + False, + ), } test_data_u55 = { - "rand_double_size": (torch.rand(2, 4, 8, 3), (16, 6), None, True), + "rand_double_size": lambda: (torch.rand(2, 4, 8, 3), (16, 6), None, True), } @@ -166,7 +221,7 @@ def forward(self, x): def test_upsample_bilinear2d_vec_tosa_FP_UpsamplingBilinear2d( test_data: torch.Tensor, ): - test_data, size, scale_factor, compare_outputs = test_data + test_data, size, scale_factor, compare_outputs = test_data() match test_data.dtype: case torch.bfloat16: atol = 1e-2 @@ -196,7 +251,7 @@ def test_upsample_bilinear2d_vec_tosa_FP_UpsamplingBilinear2d( def test_upsample_bilinear2d_vec_tosa_FP_Upsample( test_data: torch.Tensor, ): - test_data, size, scale_factor, compare_outputs = test_data + test_data, 
size, scale_factor, compare_outputs = test_data() match test_data.dtype: case torch.bfloat16: atol = 1e-2 @@ -227,7 +282,7 @@ def test_upsample_bilinear2d_vec_tosa_FP_Upsample( def test_upsample_bilinear2d_vec_tosa_FP_Interpolate( test_data: torch.Tensor, ): - test_data, size, scale_factor, compare_outputs = test_data + test_data, size, scale_factor, compare_outputs = test_data() match test_data.dtype: case torch.bfloat16: atol = 1e-2 @@ -265,7 +320,7 @@ def test_upsample_bilinear2d_vec_tosa_does_not_delegate_exact_one_sixteenth_down def test_upsample_bilinear2d_vec_tosa_INT_intropolate( test_data: torch.Tensor, ): - test_data, size, scale_factor, compare_outputs = test_data + test_data, size, scale_factor, compare_outputs = test_data() pipeline = TosaPipelineINT[input_t1]( UpsamplingBilinear2d(size, scale_factor), @@ -282,7 +337,7 @@ def test_upsample_bilinear2d_vec_tosa_INT_intropolate( def test_upsample_bilinear2d_vec_tosa_INT_Upsample( test_data: torch.Tensor, ): - test_data, size, scale_factor, compare_outputs = test_data + test_data, size, scale_factor, compare_outputs = test_data() pipeline = TosaPipelineINT[input_t1]( Upsample(size, scale_factor), @@ -302,7 +357,7 @@ def test_upsample_bilinear2d_vec_tosa_INT_a16w8( """Test upsample_bilinear2d vector op with int16 I/O quantization for TOSA INT. 
""" - test_data, size, scale_factor, compare_outputs = test_data + test_data, size, scale_factor, compare_outputs = test_data() pipeline = TosaPipelineINT[input_t1]( Upsample(size, scale_factor), (test_data,), @@ -320,7 +375,7 @@ def test_upsample_bilinear2d_vec_tosa_INT_a16w8( def test_upsample_bilinear2d_vec_u55_INT_Upsample_not_delegated( test_data: torch.Tensor, ): - test_data, size, scale_factor, compare_outputs = test_data + test_data, size, scale_factor, compare_outputs = test_data() pipeline = OpNotSupportedPipeline[input_t1]( Upsample(size, scale_factor), (test_data,), @@ -338,7 +393,7 @@ def test_upsample_bilinear2d_vec_u55_INT_Upsample_not_delegated( def test_upsample_bilinear2d_vec_u55_INT_Interpolate_not_delegated( test_data: torch.Tensor, ): - test_data, size, scale_factor, compare_outputs = test_data + test_data, size, scale_factor, compare_outputs = test_data() pipeline = OpNotSupportedPipeline[input_t1]( Interpolate(size, scale_factor), (test_data,), @@ -356,7 +411,7 @@ def test_upsample_bilinear2d_vec_u55_INT_Interpolate_not_delegated( def test_upsample_bilinear2d_vec_u55_INT_UpsamplingBilinear2d_not_delegated( test_data: torch.Tensor, ): - test_data, size, scale_factor, compare_outputs = test_data + test_data, size, scale_factor, compare_outputs = test_data() pipeline = OpNotSupportedPipeline[input_t1]( UpsamplingBilinear2d(size, scale_factor), (test_data,), @@ -372,7 +427,7 @@ def test_upsample_bilinear2d_vec_u55_INT_UpsamplingBilinear2d_not_delegated( @common.parametrize("test_data", test_data_suite_Uxx) @common.XfailIfNoCorstone320 def test_upsample_bilinear2d_vec_u85_INT_Upsample(test_data: input_t1): - test_data, size, scale_factor, compare_outputs = test_data + test_data, size, scale_factor, compare_outputs = test_data() pipeline = EthosU85PipelineINT[input_t1]( Upsample(size, scale_factor), @@ -391,7 +446,7 @@ def test_upsample_bilinear2d_vec_u85_INT_Upsample(test_data: input_t1): def test_upsample_bilinear2d_vec_u85_INT_Interpolate( 
test_data: torch.Tensor, ): - test_data, size, scale_factor, compare_outputs = test_data + test_data, size, scale_factor, compare_outputs = test_data() pipeline = EthosU85PipelineINT[input_t1]( Interpolate(size, scale_factor), @@ -410,7 +465,7 @@ def test_upsample_bilinear2d_vec_u85_INT_Interpolate( def test_upsample_bilinear2d_vec_u85_INT_UpsamplingBilinear2d( test_data: torch.Tensor, ): - test_data, size, scale_factor, compare_outputs = test_data + test_data, size, scale_factor, compare_outputs = test_data() pipeline = EthosU85PipelineINT[input_t1]( UpsamplingBilinear2d(size, scale_factor), @@ -432,7 +487,7 @@ def test_upsample_bilinear2d_vec_u85_INT_a16w8( """Test upsample_bilinear2d vec op with 16A8W quantization on U85 (16-bit activations, 8-bit weights) """ - data, size, scale_factor, compare_outputs = test_data + data, size, scale_factor, compare_outputs = test_data() pipeline = EthosU85PipelineINT[input_t1]( UpsamplingBilinear2d(size, scale_factor), @@ -452,7 +507,7 @@ def test_upsample_bilinear2d_vec_u85_INT_a16w8( def test_upsample_bilinear2d_vec_vgf_no_quant_UpsamplingBilinear2d( test_data: torch.Tensor, ): - data, size, scale_factor, compare = test_data + data, size, scale_factor, compare = test_data() pipeline = VgfPipeline[input_t1]( UpsamplingBilinear2d(size, scale_factor), (data,), @@ -470,8 +525,8 @@ def test_upsample_bilinear2d_vec_vgf_no_quant_UpsamplingBilinear2d( @common.parametrize("test_data", test_data_suite_tosa | test_data_suite_tosa_fp16) @common.SkipIfNoModelConverter def test_upsample_bilinear2d_vec_vgf_no_quant_Upsample(test_data: torch.Tensor): - data, size, scale_factor, compare = test_data - match test_data[0].dtype: + data, size, scale_factor, compare = test_data() + match data.dtype: case torch.float16: atol = 1e-2 rtol = 1e-2 @@ -495,7 +550,7 @@ def test_upsample_bilinear2d_vec_vgf_no_quant_Upsample(test_data: torch.Tensor): @common.parametrize("test_data", test_data_suite_tosa | test_data_suite_tosa_fp16) 
@common.SkipIfNoModelConverter def test_upsample_bilinear2d_vec_vgf_no_quant_Interpolate(test_data: torch.Tensor): - data, size, scale_factor, compare = test_data + data, size, scale_factor, compare = test_data() pipeline = VgfPipeline[input_t1]( Interpolate(size, scale_factor), (data,), @@ -513,7 +568,7 @@ def test_upsample_bilinear2d_vec_vgf_no_quant_Interpolate(test_data: torch.Tenso def test_upsample_bilinear2d_vec_vgf_quant_UpsamplingBilinear2d( test_data: torch.Tensor, ): - data, size, scale_factor, compare = test_data + data, size, scale_factor, compare = test_data() pipeline = VgfPipeline[input_t1]( UpsamplingBilinear2d(size, scale_factor), (data,), @@ -529,7 +584,7 @@ def test_upsample_bilinear2d_vec_vgf_quant_UpsamplingBilinear2d( @common.parametrize("test_data", test_data_suite_tosa) @common.SkipIfNoModelConverter def test_upsample_bilinear2d_vec_vgf_quant_Upsample(test_data: torch.Tensor): - data, size, scale_factor, compare = test_data + data, size, scale_factor, compare = test_data() pipeline = VgfPipeline[input_t1]( Upsample(size, scale_factor), (data,), @@ -545,7 +600,7 @@ def test_upsample_bilinear2d_vec_vgf_quant_Upsample(test_data: torch.Tensor): @common.parametrize("test_data", test_data_suite_tosa) @common.SkipIfNoModelConverter def test_upsample_bilinear2d_vec_vgf_quant_Interpolate(test_data: torch.Tensor): - data, size, scale_factor, compare = test_data + data, size, scale_factor, compare = test_data() pipeline = VgfPipeline[input_t1]( Interpolate(size, scale_factor), (data,), @@ -563,7 +618,7 @@ def test_upsample_bilinear2d_vec_vgf_quant_Interpolate(test_data: torch.Tensor): def test_upsample_bilinear2d_vec_vgf_quant_a16w8_UpsamplingBilinear2d( test_data: torch.Tensor, ): - data, size, scale_factor, compare = test_data + data, size, scale_factor, compare = test_data() pipeline = VgfPipeline[input_t1]( UpsamplingBilinear2d(size, scale_factor), (data,), @@ -583,7 +638,7 @@ def test_upsample_bilinear2d_vec_vgf_quant_a16w8_UpsamplingBilinear2d( 
def test_upsample_bilinear2d_vec_vgf_quant_a16w8_Upsample( test_data: torch.Tensor, ): - data, size, scale_factor, compare = test_data + data, size, scale_factor, compare = test_data() pipeline = VgfPipeline[input_t1]( Upsample(size, scale_factor), (data,), @@ -603,7 +658,7 @@ def test_upsample_bilinear2d_vec_vgf_quant_a16w8_Upsample( def test_upsample_bilinear2d_vec_vgf_quant_a16w8_Interpolate( test_data: torch.Tensor, ): - data, size, scale_factor, compare = test_data + data, size, scale_factor, compare = test_data() pipeline = VgfPipeline[input_t1]( Interpolate(size, scale_factor), (data,), diff --git a/backends/arm/test/passes/test_rewrite_conv_pass.py b/backends/arm/test/passes/test_rewrite_conv_pass.py index 09176f26f28..fc8478afee5 100644 --- a/backends/arm/test/passes/test_rewrite_conv_pass.py +++ b/backends/arm/test/passes/test_rewrite_conv_pass.py @@ -3,6 +3,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
+import os + import pytest import torch import torch.nn as nn @@ -34,6 +36,8 @@ from torch.export import Dim, export from torch.export.exported_program import _get_shape_env +_VGF_ENABLED = "LAVAPIPE_LIB_PATH" in os.environ + class TinyConvReluCat(nn.Module): def __init__(self, conv1_bias: bool = True) -> None: @@ -214,6 +218,7 @@ def test_rewrite_conv_tosa_FP(): pipeline.run() +@pytest.mark.skipif(not _VGF_ENABLED, reason="VGF not enabled") def test_fold_and_annotate_q_params_vgf_quant_preserves_output_qparams_on_non_fuseable_clamp() -> ( None ): @@ -228,6 +233,7 @@ def test_fold_and_annotate_q_params_vgf_quant_preserves_output_qparams_on_non_fu assert clamp.meta["output_qparams"] +@pytest.mark.skipif(not _VGF_ENABLED, reason="VGF not enabled") def test_rewrite_conv_vgf_quant_handles_non_fuseable_conv_clamp_cat_branch() -> None: exported_program = _export_quantized(TinyConvReluCat()) compile_spec = _compile_spec() @@ -239,6 +245,7 @@ def test_rewrite_conv_vgf_quant_handles_non_fuseable_conv_clamp_cat_branch() -> ) +@pytest.mark.skipif(not _VGF_ENABLED, reason="VGF not enabled") def test_rewrite_conv_vgf_quant_infers_quantized_bias_dtype_from_inputs() -> None: exported_program = _export_quantized(TinyConvReluCat(conv1_bias=False)) edge_program = to_edge( diff --git a/backends/arm/test/targets.bzl b/backends/arm/test/targets.bzl index 52d1b651b75..6a39d1fe5c1 100644 --- a/backends/arm/test/targets.bzl +++ b/backends/arm/test/targets.bzl @@ -3,7 +3,7 @@ load("@fbcode_macros//build_defs:python_pytest.bzl", "python_pytest") load("@bazel_skylib//lib:paths.bzl", "paths") load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") -_ENABLE_VGF = False # Disabled: memfd_create blocked by seccomp on Sandcastle causes segfaults before Python pre-flight check can run +_ENABLE_VGF = True def define_arm_tests(): # TODO [fbonly] Add more tests @@ -30,6 +30,7 @@ def define_arm_tests(): "ops/test_slice.py", "ops/test_sigmoid.py", "ops/test_sub.py", + 
"ops/test_sum.py", "ops/test_tanh.py", "ops/test_view.py", "ops/test_cos.py", @@ -84,11 +85,12 @@ def define_arm_tests(): "EMULATION_LAYER_TENSOR_JSON": "$(location fbsource//third-party/arm-ml-emulation-layer/v0.9.0/src:VkLayer_Tensor_json)", "EMULATION_LAYER_GRAPH_JSON": "$(location fbsource//third-party/arm-ml-emulation-layer/v0.9.0/src:VkLayer_Graph_json)", } if _ENABLE_VGF else {}), - preload_deps = [] if runtime.is_oss or not _ENABLE_VGF else [ + preload_deps = [ "//executorch/kernels/quantized:custom_ops_generated_lib", + ] + ([] if runtime.is_oss or not _ENABLE_VGF else [ "fbsource//third-party/khronos:vulkan", "//executorch/backends/arm/runtime:vgf_backend", - ], + ]), deps = [ "//executorch/backends/arm/test:arm_tester" if runtime.is_oss else "//executorch/backends/arm/test/tester/fb:arm_tester_fb", "//executorch/backends/arm/test:conftest", @@ -98,6 +100,7 @@ def define_arm_tests(): "//executorch/backends/arm/tosa:partitioner", "//executorch/backends/arm:vgf", "//executorch/backends/test:graph_builder", + "//executorch/backends/test:program_builder", "//executorch/exir:lib", "fbsource//third-party/pypi/pytest:pytest", "fbsource//third-party/pypi/parameterized:parameterized", diff --git a/backends/arm/test/test_arm_baremetal.sh b/backends/arm/test/test_arm_baremetal.sh index 382ad9f633d..ad8cd8b7d3a 100755 --- a/backends/arm/test/test_arm_baremetal.sh +++ b/backends/arm/test/test_arm_baremetal.sh @@ -48,6 +48,7 @@ fi TEST_SUITE_NAME="$(basename "$0") ${TEST_SUITE}" EXCLUDE_TARGET_EXPR="(not u55) and (not u85) and (not tosa) and (not _vgf_)" +PYTEST_RETRY_ARGS=(--reruns 2 --reruns-delay 1) all() { # Run all tests # This will list all lines in this file that is starting with test_ remove () { and add this script name in @@ -80,7 +81,7 @@ test_pytest_ops_no_target() { echo "${TEST_SUITE_NAME}: Run pytest ops for target-less tests" # Run arm baremetal pytest tests without target - pytest --verbose --color=yes --numprocesses=auto --durations=10 
backends/arm/test/ --ignore=backends/arm/test/models -k "${EXCLUDE_TARGET_EXPR}" + pytest "${PYTEST_RETRY_ARGS[@]}" --verbose --color=yes --numprocesses=auto --durations=10 backends/arm/test/ --ignore=backends/arm/test/models -k "${EXCLUDE_TARGET_EXPR}" echo "${TEST_SUITE_NAME}: PASS" } @@ -91,7 +92,7 @@ test_pytest_models_no_target() { source backends/arm/scripts/install_models_for_test.sh # Run arm baremetal pytest tests without FVP - pytest --verbose --color=yes --numprocesses=auto --durations=0 backends/arm/test/models -k "${EXCLUDE_TARGET_EXPR}" + pytest "${PYTEST_RETRY_ARGS[@]}" --verbose --color=yes --numprocesses=auto --durations=0 backends/arm/test/models -k "${EXCLUDE_TARGET_EXPR}" echo "${TEST_SUITE_NAME}: PASS" } @@ -101,7 +102,7 @@ test_pytest_models_no_target() { test_pytest_ops_tosa() { echo "${TEST_SUITE_NAME}: Run pytest ops for TOSA" - pytest --verbose --color=yes --numprocesses=auto --durations=10 backends/arm/test/ --ignore=backends/arm/test/models -k tosa + pytest "${PYTEST_RETRY_ARGS[@]}" --verbose --color=yes --numprocesses=auto --durations=10 backends/arm/test/ --ignore=backends/arm/test/models -k tosa echo "${TEST_SUITE_NAME}: PASS" } @@ -111,7 +112,7 @@ test_pytest_models_tosa() { # Install model dependencies for pytest source backends/arm/scripts/install_models_for_test.sh - pytest --verbose --color=yes --numprocesses=auto --durations=0 backends/arm/test/models -k tosa + pytest "${PYTEST_RETRY_ARGS[@]}" --verbose --color=yes --numprocesses=auto --durations=0 backends/arm/test/models -k tosa echo "${TEST_SUITE_NAME}: PASS" } @@ -134,7 +135,7 @@ test_pytest_ops_ethos_u55() { backends/arm/scripts/build_executorch.sh backends/arm/test/setup_testing.sh - pytest --verbose --color=yes --numprocesses=auto --durations=10 backends/arm/test/ --ignore=backends/arm/test/models -k u55 + pytest "${PYTEST_RETRY_ARGS[@]}" --verbose --color=yes --numprocesses=auto --durations=10 backends/arm/test/ --ignore=backends/arm/test/models -k u55 echo 
"${TEST_SUITE_NAME}: PASS" } @@ -147,7 +148,7 @@ test_pytest_models_ethos_u55() { # Install model dependencies for pytest source backends/arm/scripts/install_models_for_test.sh - pytest --verbose --color=yes --numprocesses=auto --durations=0 backends/arm/test/models -k u55 + pytest "${PYTEST_RETRY_ARGS[@]}" --verbose --color=yes --numprocesses=auto --durations=0 backends/arm/test/models -k u55 echo "${TEST_SUITE_NAME}: PASS" } @@ -188,7 +189,7 @@ test_pytest_ops_ethos_u85() { backends/arm/test/setup_testing.sh # Run arm baremetal pytest tests with FVP - pytest --verbose --color=yes --numprocesses=auto --durations=10 backends/arm/test/ --ignore=backends/arm/test/models -k u85 + pytest "${PYTEST_RETRY_ARGS[@]}" --verbose --color=yes --numprocesses=auto --durations=10 backends/arm/test/ --ignore=backends/arm/test/models -k u85 echo "${TEST_SUITE_NAME}: PASS" } @@ -201,7 +202,7 @@ test_pytest_models_ethos_u85() { # Install model dependencies for pytest source backends/arm/scripts/install_models_for_test.sh - pytest --verbose --color=yes --numprocesses=auto --durations=0 backends/arm/test/models -k u85 + pytest "${PYTEST_RETRY_ARGS[@]}" --verbose --color=yes --numprocesses=auto --durations=0 backends/arm/test/models -k u85 echo "${TEST_SUITE_NAME}: PASS" } @@ -235,7 +236,7 @@ test_pytest_ops_vkml() { source backends/arm/test/setup_testing_vkml.sh - pytest --verbose --color=yes --numprocesses=auto --durations=10 backends/arm/test/ \ + pytest "${PYTEST_RETRY_ARGS[@]}" --verbose --color=yes --numprocesses=auto --durations=10 backends/arm/test/ \ --ignore=backends/arm/test/models -k _vgf_ echo "${TEST_SUITE_NAME}: PASS" } @@ -248,7 +249,7 @@ test_pytest_models_vkml() { # Install model dependencies for pytest source backends/arm/scripts/install_models_for_test.sh - pytest --verbose --color=yes --numprocesses=auto --durations=0 backends/arm/test/models -k _vgf_ + pytest "${PYTEST_RETRY_ARGS[@]}" --verbose --color=yes --numprocesses=auto --durations=0 backends/arm/test/models 
-k _vgf_ echo "${TEST_SUITE_NAME}: PASS" } @@ -295,6 +296,7 @@ test_smaller_stories_llama() { # Get path to source directory pytest \ -c /dev/null \ + "${PYTEST_RETRY_ARGS[@]}" \ --verbose \ --color=yes \ --numprocesses=auto \ diff --git a/backends/cadence/aot/functions.yaml b/backends/cadence/aot/functions.yaml index 60fda2853a3..754b781cb7b 100644 --- a/backends/cadence/aot/functions.yaml +++ b/backends/cadence/aot/functions.yaml @@ -399,7 +399,7 @@ - arg_meta: null kernel_name: impl::generic::quantized_conv1d_ncl_per_tensor_out -- func: cadence::quantized_conv1d_nlc.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) +- func: cadence::quantized_conv1d_nlc.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, Tensor? offset=None, *, Tensor(a!) out) -> Tensor(a!) kernels: - arg_meta: null kernel_name: impl::generic::quantized_conv1d_nlc_per_tensor_out diff --git a/backends/cadence/aot/functions_hifi.yaml b/backends/cadence/aot/functions_hifi.yaml index 3b1932d01ec..bf9ef2976a9 100644 --- a/backends/cadence/aot/functions_hifi.yaml +++ b/backends/cadence/aot/functions_hifi.yaml @@ -574,7 +574,7 @@ - arg_meta: null kernel_name: impl::HiFi::quantized_conv1d_ncl_per_tensor_out -- func: cadence::quantized_conv1d_nlc.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) 
+- func: cadence::quantized_conv1d_nlc.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, Tensor? offset=None, *, Tensor(a!) out) -> Tensor(a!) kernels: - arg_meta: null kernel_name: impl::HiFi::quantized_conv1d_nlc_per_tensor_out diff --git a/backends/cadence/aot/fuse_ops.py b/backends/cadence/aot/fuse_ops.py index aaf13562388..42be54d48b9 100644 --- a/backends/cadence/aot/fuse_ops.py +++ b/backends/cadence/aot/fuse_ops.py @@ -190,7 +190,7 @@ def maybe_remove_or_replace(self, node: torch.fx.Node) -> bool: # is True) # 2. The single successor of addmm is not a view op. if len(addmm_node.users) == 0: - return False + return True addmm_user = list(addmm_node.users.keys())[0] if intermediate_view and not self._is_view_node(addmm_user): diff --git a/backends/cadence/aot/ops_registrations.py b/backends/cadence/aot/ops_registrations.py index a1d3ab871e1..f3e73028169 100644 --- a/backends/cadence/aot/ops_registrations.py +++ b/backends/cadence/aot/ops_registrations.py @@ -263,10 +263,10 @@ def register_fake( "quantized_conv1d_nlc.out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, Tensor weight_zero_point, Tensor bias_scale, float out_scale, int out_zero_point, Tensor out_multiplier, Tensor out_shift, *, Tensor(a!) 
out) -> Tensor(a!)" ) lib.define( - "quantized_conv1d_nlc.per_tensor(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift) -> (Tensor Z)" + "quantized_conv1d_nlc.per_tensor(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, Tensor? offset=None) -> (Tensor Z)" ) lib.define( - "quantized_conv1d_nlc.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!)" + "quantized_conv1d_nlc.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, Tensor? offset=None, *, Tensor(a!) 
out) -> Tensor(a!)" ) lib.define( "quantized_depthwise_conv1d_ncl.per_tensor(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift) -> (Tensor Z)" @@ -1305,6 +1305,7 @@ def quantized_conv1d_nlc_per_tensor_meta( output_zero_point: int, out_multiplier: int, out_shift: int, + offset: Optional[torch.Tensor] = None, ) -> torch.Tensor: torch._check(bias.dtype == torch.int32, lambda: "expected int32") # NLC format: input is [N, L, C], weight is [OC, K, IC/groups] diff --git a/backends/cadence/aot/reorder_ops.py b/backends/cadence/aot/reorder_ops.py index a8eda5cc457..2ca766316f3 100644 --- a/backends/cadence/aot/reorder_ops.py +++ b/backends/cadence/aot/reorder_ops.py @@ -11,7 +11,7 @@ from collections import defaultdict from math import prod -from typing import cast, DefaultDict, List, Tuple +from typing import Callable, cast, DefaultDict, List, Tuple import torch import torch.fx @@ -719,6 +719,182 @@ def maybe_remove_or_replace(self, node: torch.fx.Node) -> bool: return True +@register_cadence_pass(CadencePassAttribute(opt_level=1)) +class PropagateSlice(RemoveOrReplacePassInterface): + """Propagate slice_copy before element-wise ops when the cost model + indicates it reduces total data movement. + + Supported ops (extensible via dispatch table): + - quantize_per_tensor: unary element-wise + - dequantize_per_tensor: unary element-wise + - add.Tensor: binary with broadcast — slices non-broadcasting inputs + - mul.Tensor: binary with broadcast — slices non-broadcasting inputs + + Handles any slice dim and any step size. 
+ """ + + def __init__(self) -> None: + super().__init__() + elementwise_targets = [ + exir_ops.edge.quantized_decomposed.quantize_per_tensor.default, + exir_ops.edge.cadence.quantize_per_tensor.default, + exir_ops.edge.quantized_decomposed.dequantize_per_tensor.default, + exir_ops.edge.cadence.dequantize_per_tensor.default, + ] + binary_targets = [ + exir_ops.edge.aten.add.Tensor, + exir_ops.edge.aten.mul.Tensor, + ] + self._dispatch: dict[ + EdgeOpOverload, + tuple[ + Callable[[torch.fx.Node, torch.fx.Node], bool], + Callable[[torch.fx.Node, torch.fx.Node], bool], + ], + ] = {} + for t in elementwise_targets: + self._dispatch[t] = ( + self._should_swap_elementwise, + self._swap_elementwise_slice, + ) + + for t in binary_targets: + self._dispatch[t] = ( + self._should_swap_binary_elementwise, + self._swap_binary_elementwise_slice, + ) + + @property + def targets(self) -> list[EdgeOpOverload]: + return [exir_ops.edge.aten.slice_copy.Tensor] + + def _should_swap_elementwise( + self, op_node: torch.fx.Node, slice_node: torch.fx.Node + ) -> bool: + full_size = prod(op_node.meta["val"].shape) + sliced_size = prod(slice_node.meta["val"].shape) + return sliced_size < full_size + + def _swap_elementwise_slice( + self, op_node: torch.fx.Node, slice_node: torch.fx.Node + ) -> bool: + op_input = get_arg(op_node, "input", torch.fx.Node) + graph = slice_node.graph + + slice_dim = get_arg(slice_node, "dim", int) + slice_start = get_arg(slice_node, "start") + slice_end = get_arg(slice_node, "end") + slice_step = get_arg(slice_node, "step", int) + + with graph.inserting_before(op_node): + new_slice = graph.call_function( + exir_ops.edge.aten.slice_copy.Tensor, + args=(op_input, slice_dim, slice_start, slice_end, slice_step), + ) + new_slice.meta["val"] = exir_ops.edge.aten.slice_copy.Tensor( + op_input.meta["val"], slice_dim, slice_start, slice_end, slice_step + ) + + new_args = list(op_node.args) + new_args[0] = new_slice + target = cast(EdgeOpOverload, op_node.target) + new_op 
= graph.call_function( + target, + args=tuple(new_args), + kwargs=op_node.kwargs, + ) + new_op.meta["val"] = target( + new_slice.meta["val"], + *[ + a.meta["val"] if isinstance(a, torch.fx.Node) else a + for a in new_args[1:] + ], + **{ + k: v.meta["val"] if isinstance(v, torch.fx.Node) else v + for k, v in op_node.kwargs.items() + }, + ) + + slice_node.replace_all_uses_with(new_op) + graph.erase_node(slice_node) + graph.erase_node(op_node) + return True + + def _should_swap_binary_elementwise( + self, op_node: torch.fx.Node, slice_node: torch.fx.Node + ) -> bool: + lhs, rhs = op_node.args[0], op_node.args[1] + assert isinstance(lhs, torch.fx.Node) and isinstance(rhs, torch.fx.Node) + if lhs.meta["val"].shape == rhs.meta["val"].shape: + return False + full_size = prod(op_node.meta["val"].shape) + sliced_size = prod(slice_node.meta["val"].shape) + return sliced_size < full_size + + def _swap_binary_elementwise_slice( + self, op_node: torch.fx.Node, slice_node: torch.fx.Node + ) -> bool: + lhs, rhs = op_node.args[0], op_node.args[1] + assert isinstance(lhs, torch.fx.Node) and isinstance(rhs, torch.fx.Node) + graph = slice_node.graph + + slice_dim = get_arg(slice_node, "dim", int) + slice_start = get_arg(slice_node, "start") + slice_end = get_arg(slice_node, "end") + slice_step = get_arg(slice_node, "step", int) + + output_shape = op_node.meta["val"].shape + + new_args = list(op_node.args) + with graph.inserting_before(op_node): + for i, inp in enumerate([lhs, rhs]): + if inp.meta["val"].shape[slice_dim] == output_shape[slice_dim]: + new_slice = graph.call_function( + exir_ops.edge.aten.slice_copy.Tensor, + args=(inp, slice_dim, slice_start, slice_end, slice_step), + ) + new_slice.meta["val"] = exir_ops.edge.aten.slice_copy.Tensor( + inp.meta["val"], slice_dim, slice_start, slice_end, slice_step + ) + new_args[i] = new_slice + + target = cast(EdgeOpOverload, op_node.target) + new_op = graph.call_function( + target, + args=tuple(new_args), + kwargs=op_node.kwargs, + ) 
+ new_op.meta["val"] = target( + *[ + a.meta["val"] if isinstance(a, torch.fx.Node) else a + for a in new_args + ], + **{ + k: v.meta["val"] if isinstance(v, torch.fx.Node) else v + for k, v in op_node.kwargs.items() + }, + ) + + slice_node.replace_all_uses_with(new_op) + graph.erase_node(slice_node) + graph.erase_node(op_node) + return True + + def maybe_remove_or_replace(self, node: torch.fx.Node) -> bool: + parent = get_arg(node, "input", torch.fx.Node) + if len(parent.users) != 1: + return False + if not isinstance(parent.target, EdgeOpOverload): + return False + + entry = self._dispatch.get(parent.target) + if entry is None: + return False + + should_swap, do_swap = entry + return should_swap(parent, node) and do_swap(parent, node) + + # The following class consolidates functions to reoder ops (i.e., either hoist # or sink some ops in the graph). class CadenceReorderOpsInGraph: diff --git a/backends/cadence/aot/tests/test_reorder_ops_passes.py b/backends/cadence/aot/tests/test_reorder_ops_passes.py index ba9089a652e..ea8943df8e8 100644 --- a/backends/cadence/aot/tests/test_reorder_ops_passes.py +++ b/backends/cadence/aot/tests/test_reorder_ops_passes.py @@ -26,6 +26,7 @@ MoveSliceBeforePermutePass, PostponeDequantizeOpBelowUseChainPass, PostponePermuteOpBelowSqueezeOrUnsqueezeLikeView, + PropagateSlice, SinkOpsCloserToUsePass, ) from executorch.backends.test.graph_builder import GraphBuilder @@ -761,3 +762,265 @@ def test_non_dim0_slice_always_moved(self) -> None: MoveSliceBeforePermutePass(), ) self.assertTrue(result.modified) + + +class TestPropagateSlice(unittest.TestCase): + def test_swap_quantize_slice(self) -> None: + builder = GraphBuilder() + x = builder.placeholder("x", torch.randn(4, 60, 1, 1)) + quant = builder.call_operator( + exir_ops.edge.cadence.quantize_per_tensor.default, + args=(x, 0.5, 0, 0, 255, torch.uint8), + ) + sliced = builder.call_operator( + exir_ops.edge.aten.slice_copy.Tensor, + args=(quant, 0, 0, 4, 2), + ) + 
builder.output([sliced]) + gm = builder.get_graph_module() + + result = PropagateSlice().call(gm) + + self.assertTrue(result.modified) + + slice_nodes = gm.graph.find_nodes( + op="call_function", target=exir_ops.edge.aten.slice_copy.Tensor + ) + self.assertEqual(len(slice_nodes), 1) + slice_node = slice_nodes[0] + self.assertEqual(slice_node.args[0].name, "x") + self.assertEqual(list(slice_node.meta["val"].shape), [2, 60, 1, 1]) + + quant_nodes = gm.graph.find_nodes( + op="call_function", + target=exir_ops.edge.cadence.quantize_per_tensor.default, + ) + self.assertEqual(len(quant_nodes), 1) + self.assertEqual(quant_nodes[0].args[0], slice_node) + self.assertEqual(list(quant_nodes[0].meta["val"].shape), [2, 60, 1, 1]) + + def test_swap_dequantize_slice(self) -> None: + builder = GraphBuilder() + x = builder.placeholder( + "x", torch.randint(0, 255, (4, 60, 4, 4), dtype=torch.uint8) + ) + dequant = builder.call_operator( + exir_ops.edge.cadence.dequantize_per_tensor.default, + args=(x, 0.5, 0, 0, 255, torch.uint8), + ) + sliced = builder.call_operator( + exir_ops.edge.aten.slice_copy.Tensor, + args=(dequant, 0, 0, 4, 2), + ) + builder.output([sliced]) + gm = builder.get_graph_module() + + result = PropagateSlice().call(gm) + + self.assertTrue(result.modified) + + slice_nodes = gm.graph.find_nodes( + op="call_function", target=exir_ops.edge.aten.slice_copy.Tensor + ) + self.assertEqual(len(slice_nodes), 1) + self.assertEqual(slice_nodes[0].args[0].name, "x") + + def test_step_2_through_quantize(self) -> None: + builder = GraphBuilder() + x = builder.placeholder("x", torch.randn(4, 60, 1, 1)) + quant = builder.call_operator( + exir_ops.edge.cadence.quantize_per_tensor.default, + args=(x, 0.5, 0, 0, 255, torch.uint8), + ) + sliced = builder.call_operator( + exir_ops.edge.aten.slice_copy.Tensor, + args=(quant, 0, 0, 4, 2), + ) + builder.output([sliced]) + gm = builder.get_graph_module() + + result = PropagateSlice().call(gm) + + self.assertTrue(result.modified) + + 
slice_nodes = gm.graph.find_nodes( + op="call_function", target=exir_ops.edge.aten.slice_copy.Tensor + ) + self.assertEqual(len(slice_nodes), 1) + self.assertEqual(slice_nodes[0].args[4], 2) + self.assertEqual(list(slice_nodes[0].meta["val"].shape), [2, 60, 1, 1]) + + def test_non_batch_dim_slice(self) -> None: + builder = GraphBuilder() + x = builder.placeholder("x", torch.randn(4, 60, 4, 4)) + quant = builder.call_operator( + exir_ops.edge.cadence.quantize_per_tensor.default, + args=(x, 0.5, 0, 0, 255, torch.uint8), + ) + sliced = builder.call_operator( + exir_ops.edge.aten.slice_copy.Tensor, + args=(quant, 1, 0, 30, 1), + ) + builder.output([sliced]) + gm = builder.get_graph_module() + + result = PropagateSlice().call(gm) + + self.assertTrue(result.modified) + + slice_nodes = gm.graph.find_nodes( + op="call_function", target=exir_ops.edge.aten.slice_copy.Tensor + ) + self.assertEqual(len(slice_nodes), 1) + self.assertEqual(list(slice_nodes[0].meta["val"].shape), [4, 30, 4, 4]) + + def test_no_swap_when_multi_user(self) -> None: + builder = GraphBuilder() + x = builder.placeholder("x", torch.randn(4, 60, 1, 1)) + quant = builder.call_operator( + exir_ops.edge.cadence.quantize_per_tensor.default, + args=(x, 0.5, 0, 0, 255, torch.uint8), + ) + sliced = builder.call_operator( + exir_ops.edge.aten.slice_copy.Tensor, + args=(quant, 0, 0, 4, 2), + ) + builder.output([sliced, quant]) + gm = builder.get_graph_module() + + result = PropagateSlice().call(gm) + + self.assertFalse(result.modified) + + def test_no_swap_noop_slice(self) -> None: + builder = GraphBuilder() + x = builder.placeholder("x", torch.randn(4, 60, 1, 1)) + quant = builder.call_operator( + exir_ops.edge.cadence.quantize_per_tensor.default, + args=(x, 0.5, 0, 0, 255, torch.uint8), + ) + sliced = builder.call_operator( + exir_ops.edge.aten.slice_copy.Tensor, + args=(quant, 0, 0, 4, 1), + ) + builder.output([sliced]) + gm = builder.get_graph_module() + + result = PropagateSlice().call(gm) + + 
self.assertFalse(result.modified) + + def test_unsupported_parent_not_swapped(self) -> None: + builder = GraphBuilder() + x = builder.placeholder("x", torch.randn(4, 60, 1, 1)) + relu = builder.call_operator( + exir_ops.edge.aten.relu.default, + args=(x,), + ) + sliced = builder.call_operator( + exir_ops.edge.aten.slice_copy.Tensor, + args=(relu, 0, 0, 4, 2), + ) + builder.output([sliced]) + gm = builder.get_graph_module() + + result = PropagateSlice().call(gm) + + self.assertFalse(result.modified) + + def test_swap_broadcast_mul_slice_on_broadcast_dim(self) -> None: + """[1,60,1,1] * [4,1,1,1] → [4,60,1,1] → slice(dim=0, step=2) + Only the [4,1,1,1] input should be sliced.""" + builder = GraphBuilder() + a = builder.placeholder("a", torch.randn(1, 60, 1, 1)) + b = builder.placeholder("b", torch.randn(4, 1, 1, 1)) + mul = builder.call_operator(exir_ops.edge.aten.mul.Tensor, args=(a, b)) + sliced = builder.call_operator( + exir_ops.edge.aten.slice_copy.Tensor, + args=(mul, 0, 0, 4, 2), + ) + builder.output([sliced]) + gm = builder.get_graph_module() + + result = PropagateSlice().call(gm) + + self.assertTrue(result.modified) + + slice_nodes = gm.graph.find_nodes( + op="call_function", target=exir_ops.edge.aten.slice_copy.Tensor + ) + self.assertEqual(len(slice_nodes), 1) + self.assertEqual(slice_nodes[0].args[0].name, "b") + self.assertEqual(list(slice_nodes[0].meta["val"].shape), [2, 1, 1, 1]) + + mul_nodes = gm.graph.find_nodes( + op="call_function", target=exir_ops.edge.aten.mul.Tensor + ) + self.assertEqual(len(mul_nodes), 1) + self.assertEqual(list(mul_nodes[0].meta["val"].shape), [2, 60, 1, 1]) + + def test_swap_broadcast_add_lhs_broadcasts(self) -> None: + """[1,60,4,4] + [4,60,4,4] → [4,60,4,4] → slice(dim=0, step=2) + Only the [4,60,4,4] (rhs) should be sliced.""" + builder = GraphBuilder() + a = builder.placeholder("a", torch.randn(1, 60, 4, 4)) + b = builder.placeholder("b", torch.randn(4, 60, 4, 4)) + add = 
builder.call_operator(exir_ops.edge.aten.add.Tensor, args=(a, b)) + sliced = builder.call_operator( + exir_ops.edge.aten.slice_copy.Tensor, + args=(add, 0, 0, 4, 2), + ) + builder.output([sliced]) + gm = builder.get_graph_module() + + result = PropagateSlice().call(gm) + + self.assertTrue(result.modified) + + slice_nodes = gm.graph.find_nodes( + op="call_function", target=exir_ops.edge.aten.slice_copy.Tensor + ) + self.assertEqual(len(slice_nodes), 1) + self.assertEqual(slice_nodes[0].args[0].name, "b") + + def test_swap_broadcast_mul_slice_on_non_broadcast_dim(self) -> None: + """[4,60,1,1] * [4,1,1,1] → [4,60,1,1] → slice(dim=1, start=0, end=30) + Only the [4,60,1,1] (lhs) should be sliced since rhs has dim1=1.""" + builder = GraphBuilder() + a = builder.placeholder("a", torch.randn(4, 60, 1, 1)) + b = builder.placeholder("b", torch.randn(4, 1, 1, 1)) + mul = builder.call_operator(exir_ops.edge.aten.mul.Tensor, args=(a, b)) + sliced = builder.call_operator( + exir_ops.edge.aten.slice_copy.Tensor, + args=(mul, 1, 0, 30, 1), + ) + builder.output([sliced]) + gm = builder.get_graph_module() + + result = PropagateSlice().call(gm) + + self.assertTrue(result.modified) + + slice_nodes = gm.graph.find_nodes( + op="call_function", target=exir_ops.edge.aten.slice_copy.Tensor + ) + self.assertEqual(len(slice_nodes), 1) + self.assertEqual(slice_nodes[0].args[0].name, "a") + self.assertEqual(list(slice_nodes[0].meta["val"].shape), [4, 30, 1, 1]) + + def test_no_swap_binary_same_shape(self) -> None: + """Same-shape binary ops are not swapped (no broadcast).""" + builder = GraphBuilder() + a = builder.placeholder("a", torch.randn(4, 60, 4, 4)) + b = builder.placeholder("b", torch.randn(4, 60, 4, 4)) + add = builder.call_operator(exir_ops.edge.aten.add.Tensor, args=(a, b)) + sliced = builder.call_operator( + exir_ops.edge.aten.slice_copy.Tensor, + args=(add, 0, 0, 4, 2), + ) + builder.output([sliced]) + gm = builder.get_graph_module() + + result = PropagateSlice().call(gm) + + 
self.assertFalse(result.modified) diff --git a/backends/cadence/generic/operators/op_quantized_conv1d_nlc.cpp b/backends/cadence/generic/operators/op_quantized_conv1d_nlc.cpp index b4e253ef366..6f42543cfc1 100644 --- a/backends/cadence/generic/operators/op_quantized_conv1d_nlc.cpp +++ b/backends/cadence/generic/operators/op_quantized_conv1d_nlc.cpp @@ -256,6 +256,7 @@ ::executorch::aten::Tensor& quantized_conv1d_nlc_per_tensor_out( int64_t output_zero_point, __ET_UNUSED int64_t out_multiplier, __ET_UNUSED int64_t out_shift, + __ET_UNUSED const ::executorch::aten::optional& offset, Tensor& out) { (void)ctx; quantized_conv1d_nlc( diff --git a/backends/cadence/generic/operators/op_quantized_conv1d_nlc.h b/backends/cadence/generic/operators/op_quantized_conv1d_nlc.h index 7713121cf97..4f4d2877b27 100644 --- a/backends/cadence/generic/operators/op_quantized_conv1d_nlc.h +++ b/backends/cadence/generic/operators/op_quantized_conv1d_nlc.h @@ -54,6 +54,7 @@ ::executorch::aten::Tensor& quantized_conv1d_nlc_per_tensor_out( int64_t output_zero_point, int64_t out_multiplier, int64_t out_shift, + const ::executorch::aten::optional& offset, Tensor& out); } // namespace native diff --git a/backends/cadence/generic/operators/op_quantized_depthwise_conv1d_nlc.cpp b/backends/cadence/generic/operators/op_quantized_depthwise_conv1d_nlc.cpp index 2ae06a651d2..a8f98a76ffc 100644 --- a/backends/cadence/generic/operators/op_quantized_depthwise_conv1d_nlc.cpp +++ b/backends/cadence/generic/operators/op_quantized_depthwise_conv1d_nlc.cpp @@ -57,6 +57,7 @@ ::executorch::aten::Tensor& quantized_depthwise_conv1d_nlc_per_tensor_out( output_zero_point, out_multiplier, out_shift, + ::executorch::aten::optional(), out); } diff --git a/backends/cadence/hifi/operators/op_quantized_conv1d_nlc.cpp b/backends/cadence/hifi/operators/op_quantized_conv1d_nlc.cpp index d4631752495..5171c2908bc 100644 --- a/backends/cadence/hifi/operators/op_quantized_conv1d_nlc.cpp +++ 
b/backends/cadence/hifi/operators/op_quantized_conv1d_nlc.cpp @@ -238,6 +238,7 @@ void quantized_conv1d_nlc_per_tensor_out( int64_t output_zero_point, int64_t out_multiplier, int64_t out_shift, + __ET_UNUSED const ::executorch::aten::optional& offset, Tensor& out) { // HiFi nnlib kernels only support dilation=1. // Fall back to generic implementation for dilation > 1. @@ -258,6 +259,7 @@ void quantized_conv1d_nlc_per_tensor_out( output_zero_point, out_multiplier, out_shift, + offset, out); return; } @@ -284,6 +286,7 @@ void quantized_conv1d_nlc_per_tensor_out( output_zero_point, out_multiplier, out_shift, + offset, out); } else { xa_opt_quantized_conv1d_nlc_asym8sxsym8s_asym8s( @@ -320,6 +323,7 @@ void quantized_conv1d_nlc_per_tensor_out( output_zero_point, out_multiplier, out_shift, + offset, out); } else { xa_opt_quantized_conv1d_nlc_asym8uxsym8u_asym8u( diff --git a/backends/cadence/hifi/operators/op_quantized_depthwise_conv1d_nlc.cpp b/backends/cadence/hifi/operators/op_quantized_depthwise_conv1d_nlc.cpp index 9e7e13477ca..4299990b52a 100644 --- a/backends/cadence/hifi/operators/op_quantized_depthwise_conv1d_nlc.cpp +++ b/backends/cadence/hifi/operators/op_quantized_depthwise_conv1d_nlc.cpp @@ -206,6 +206,7 @@ void quantized_depthwise_conv1d_nlc_per_tensor_out( output_zero_point, out_multiplier, out_shift, + ::executorch::aten::optional(), out); return; } diff --git a/backends/cuda/tests/test_fused_moe.py b/backends/cuda/tests/test_fused_moe.py index bbc351bc47b..324fd88907d 100644 --- a/backends/cuda/tests/test_fused_moe.py +++ b/backends/cuda/tests/test_fused_moe.py @@ -503,6 +503,11 @@ class TestFusedMoEBatchedInt8(unittest.TestCase): (55, 64, 64, 32, 4, 2, 32, "64tok"), (99, 128, 128, 64, 8, 2, 32, "128tok"), (0, 256, 128, 64, 8, 2, 32, "256tok"), + # Realistic-scale configs to catch precision/alignment issues with + # K > PREQUANT_BLOCK_K (matches Qwen3.5-MoE shapes: hidden=2048, + # intermediate=1024, num_experts=8, top_k=2, group_size=128). 
+ (77, 512, 2048, 1024, 8, 2, 128, "512tok_real_dims"), + (21, 1, 2048, 1024, 8, 2, 128, "1tok_decode"), ] def test_int8_correctness(self): diff --git a/backends/cuda/tests/test_int4_matmul.py b/backends/cuda/tests/test_int4_matmul.py index 2f33f888ac1..ed0ca47f3f6 100644 --- a/backends/cuda/tests/test_int4_matmul.py +++ b/backends/cuda/tests/test_int4_matmul.py @@ -19,7 +19,6 @@ import unittest import torch - from executorch.backends.cuda.triton.kernels.int4_matmul import ( dequant_w4_to_bf16, int4_matmul, @@ -28,6 +27,41 @@ ATOL = 0.01 DEVICE = "cuda" +SNR_THRESHOLD_DB = 50.0 + + +def _assert_snr(test_case, actual, expected, label, threshold_db=SNR_THRESHOLD_DB): + """Assert signal-to-noise ratio (in dB) of `actual` vs `expected` >= threshold. + + SNR = 20*log10(||expected||_2 / ||actual - expected||_2) + + Why SNR rather than torch.allclose(atol/rtol): + * Size-invariant: ||signal|| and ||noise|| both scale with sqrt(N) and + with sqrt(K) (CLT + random-walk rounding), so the ratio is independent + of tensor size and reduction depth. The same threshold works for + K=64 and K=4096, M=1 and M=1024. + * Robust to bf16 ULP outliers: with K=2048 and output magnitudes ~200, + a single element can differ by ~1.0 just from differing reduction + orders (Triton fused vs cuBLAS). atol/rtol false-fails on these; + SNR averages them out. + * Sensitive to real bugs: wrong stride, flipped nibble, off-by-one + group_idx, or a missing mask all collapse SNR to <20 dB. The 50 dB + threshold (ā‰ˆ0.3% RMS error) sits comfortably between observed clean + noise floor (~80-90 dB) and any genuine functional break. 
+ """ + a = actual.float() + b = expected.float() + diff = a - b + signal = b.norm() + noise = diff.norm() + snr_db = (20.0 * torch.log10(signal / noise.clamp(min=1e-9))).item() + test_case.assertGreater( + snr_db, + threshold_db, + f"{label}: SNR={snr_db:.1f} dB (threshold {threshold_db:.1f} dB), " + f"max_abs_err={diff.abs().max().item():.4f}, " + f"signal_norm={signal.item():.2f}, noise_norm={noise.item():.4f}", + ) def _quantize_simple(w_bf16, group_size): @@ -118,12 +152,7 @@ def _run_matmul(self, M, N, K, group_size): self.assertEqual(out.shape, (M, N)) self.assertEqual(out.dtype, torch.bfloat16) - self.assertTrue( - torch.allclose(out.float(), ref.float(), atol=ATOL, rtol=0.01), - f"int4_matmul M={M} [{N}x{K}] gs={group_size}: " - f"max_abs_err={(out.float() - ref.float()).abs().max().item():.4f}, " - f"max_rel_err={((out.float() - ref.float()).abs() / ref.float().abs().clamp(min=1e-6)).max().item():.4f}", - ) + _assert_snr(self, out, ref, f"int4_matmul M={M} [{N}x{K}] gs={group_size}") # --- Decode (M=1) --- def test_decode_square(self): @@ -189,13 +218,7 @@ def _run_matvec(self, N, K, group_size): self.assertEqual(out.shape, (1, N)) self.assertEqual(out.dtype, torch.bfloat16) - # atol=1.0 for large accumulation across K, rtol=0.01 for relative - self.assertTrue( - torch.allclose(out.float(), ref.float(), atol=1.0, rtol=0.01), - f"int4_matvec [{N}x{K}] gs={group_size}: " - f"max_err={(out.float() - ref.float()).abs().max().item():.4f}, " - f"max_rel={((out.float()-ref.float()).abs()/(ref.float().abs().clamp(min=0.1))).max().item():.4f}", - ) + _assert_snr(self, out, ref, f"int4_matvec [{N}x{K}] gs={group_size}") def test_qkv_proj(self): self._run_matvec(2048, 2048, 128) @@ -226,10 +249,7 @@ def test_matches_int4_matmul(self): out_mv = int4_matvec(x, packed, scale, gs) out_mm = int4_matmul(x, packed, scale, gs) - self.assertTrue( - torch.allclose(out_mv.float(), out_mm.float(), atol=1.0, rtol=0.01), - f"matvec vs matmul: max_err={(out_mv.float() - 
out_mm.float()).abs().max().item():.4f}", - ) + _assert_snr(self, out_mv, out_mm, "matvec vs matmul") class TestDequantThenMatmul(unittest.TestCase): @@ -248,13 +268,7 @@ def _run(self, M, N, K, group_size): w_bf16 = dequant_w4_to_bf16(packed, scale, group_size) out_dequant = torch.nn.functional.linear(x, w_bf16) - self.assertTrue( - torch.allclose( - out_fused.float(), out_dequant.float(), atol=ATOL, rtol=0.01 - ), - f"fused vs dequant M={M} [{N}x{K}]: " - f"max_abs_err={(out_fused.float() - out_dequant.float()).abs().max().item():.4f}", - ) + _assert_snr(self, out_fused, out_dequant, f"fused vs dequant M={M} [{N}x{K}]") def test_decode(self): self._run(1, 2048, 2048, 128) diff --git a/backends/cuda/triton/kernels/__init__.py b/backends/cuda/triton/kernels/__init__.py index d9f76f9909e..4db10fbf82d 100644 --- a/backends/cuda/triton/kernels/__init__.py +++ b/backends/cuda/triton/kernels/__init__.py @@ -8,6 +8,7 @@ fused_moe, fused_moe_batched, fused_moe_batched_gemm, + fused_moe_batched_gemm_int8, moe_align_block_size, ) @@ -23,6 +24,7 @@ "fused_moe", "fused_moe_batched", "fused_moe_batched_gemm", + "fused_moe_batched_gemm_int8", "int4_matvec", "moe_align_block_size", "sdpa", diff --git a/backends/cuda/triton/kernels/fused_moe.py b/backends/cuda/triton/kernels/fused_moe.py index e35c3008a24..9aeb8ef7dbe 100644 --- a/backends/cuda/triton/kernels/fused_moe.py +++ b/backends/cuda/triton/kernels/fused_moe.py @@ -42,6 +42,131 @@ from torch.library import triton_op, wrap_triton +# --------------------------------------------------------------------------- +# W4A8 batched MoE kernels (INT8 activations + INT4 weights). +# +# Activation INT8 quantization is HOISTED out of the GEMM K-loop into a +# dedicated pre-quantization kernel: +# - _quantize_activations_int8_kernel writes [max_padded, K] INT8 + +# [max_padded, num_k_tiles] float32 per-row-per-tile scales. +# - _fused_moe_batched_int8_kernel (GEMM1) loads pre-quantized INT8 + scale. 
+# - _silu_quantize_int8_kernel fuses SiLU(gate)*up with INT8 quantization +# between GEMM1 and GEMM2. +# - _fused_moe_silu_batched_int8_kernel (GEMM2) loads pre-quantized INT8. +# +# Hoisting eliminates ~256 redundant tl.max reductions per program +# (cdiv(K, BLOCK_SIZE_K) tiles * BLOCK_SIZE_M rows) and halves activation HBM +# bandwidth in the GEMM K-loop (bf16 -> int8). +# +# BLOCK_SIZE_K is fixed at PREQUANT_BLOCK_K (= 32, matches the llama.cpp +# group_size) so the per-tile activation scales line up with the GEMM K-loop. +# --------------------------------------------------------------------------- +PREQUANT_BLOCK_K = 32 + + +@triton.jit +def _quantize_activations_int8_kernel( + A, # [M+1, K] bf16 input activations (with sentinel zero row) + A_int8, # [max_padded, K] int8 output (sorted order) + A_scale, # [max_padded, num_k_tiles] float32 per-row-per-tile scales + sorted_token_ids, # [max_padded] int64 pair indices + K: tl.constexpr, + NUM_K_TILES: tl.constexpr, + top_k: tl.constexpr, + BLOCK_SIZE_K: tl.constexpr, + stride_am, + stride_ak, + stride_qm, + stride_qk, + stride_sm, + stride_sk, +): + """Quantize one sorted M-row to INT8 with per-tile scales. + + Grid: (max_padded,) — one program per sorted row. Each program loops + over K-tiles. Sentinel pair_ids map to the appended zero row in A. 
+ """ + row_id = tl.program_id(0) + pair_id = tl.load(sorted_token_ids + row_id) + token_id = pair_id // top_k + + offs_k = tl.arange(0, BLOCK_SIZE_K) + + for k_tile in range(NUM_K_TILES): + k_offset = k_tile * BLOCK_SIZE_K + k_full_offs = k_offset + offs_k + k_mask = k_full_offs < K + + # Load bf16 activation slice [BLOCK_SIZE_K] + a_ptrs = A + token_id * stride_am + k_full_offs * stride_ak + a_bf16 = tl.load(a_ptrs, mask=k_mask, other=0.0) + + # Compute per-tile scale (scalar) + a_f32 = a_bf16.to(tl.float32) + a_absmax = tl.max(tl.abs(a_f32)) + a_scale_val = a_absmax / 127.0 + 1e-12 + + # Quantize to INT8 + a_scaled = a_f32 / a_scale_val + a_int8 = (a_scaled + tl.where(a_scaled >= 0, 0.5, -0.5)).to(tl.int8) + + # Store quantized activations + q_ptrs = A_int8 + row_id * stride_qm + k_full_offs * stride_qk + tl.store(q_ptrs, a_int8, mask=k_mask) + + # Store scale + s_ptr = A_scale + row_id * stride_sm + k_tile * stride_sk + tl.store(s_ptr, a_scale_val) + + +@triton.jit +def _silu_quantize_int8_kernel( + A, # [num_tokens_post_padded, 2*inter] bf16 GEMM1 output (sorted) + A_int8, # [num_tokens_post_padded, inter] int8 SiLU-quantized output + A_scale, # [num_tokens_post_padded, num_k_tiles] float32 per-tile scales + K: tl.constexpr, # intermediate_size + NUM_K_TILES: tl.constexpr, + BLOCK_SIZE_K: tl.constexpr, + stride_am, + stride_ak, + stride_qm, + stride_qk, + stride_sm, + stride_sk, +): + """SiLU(gate)*up + INT8 quantization for the batched GEMM2 input. + + Grid: (max_padded,). Reads gate at columns [0, K), up at [K, 2K), + computes SiLU(gate)*up, quantizes to INT8 with per-tile scales. 
+ """ + row_id = tl.program_id(0) + + offs_k = tl.arange(0, BLOCK_SIZE_K) + + for k_tile in range(NUM_K_TILES): + k_offset = k_tile * BLOCK_SIZE_K + k_full_offs = k_offset + offs_k + k_mask = k_full_offs < K + + gate_ptrs = A + row_id * stride_am + k_full_offs * stride_ak + up_ptrs = gate_ptrs + K * stride_ak + + gate = tl.load(gate_ptrs, mask=k_mask, other=0.0).to(tl.float32) + up = tl.load(up_ptrs, mask=k_mask, other=0.0).to(tl.float32) + silu_out = gate * tl.sigmoid(gate) * up + + a_absmax = tl.max(tl.abs(silu_out)) + a_scale_val = a_absmax / 127.0 + 1e-12 + a_scaled = silu_out / a_scale_val + a_int8 = (a_scaled + tl.where(a_scaled >= 0, 0.5, -0.5)).to(tl.int8) + + q_ptrs = A_int8 + row_id * stride_qm + k_full_offs * stride_qk + tl.store(q_ptrs, a_int8, mask=k_mask) + + s_ptr = A_scale + row_id * stride_sm + k_tile * stride_sk + tl.store(s_ptr, a_scale_val) + + # Autotune configs for GEMM1 (_fused_moe_kernel). # Top performers from CI benchmark on A100-SXM4-80GB, Qwen3.5 MoE dimensions # (M=1, N=1024, K=2048, 8 experts, group_size=128). @@ -68,6 +193,7 @@ triton.Config({"BLOCK_SIZE_N": 8, "BLOCK_SIZE_K": 256}, num_warps=4, num_stages=3), triton.Config({"BLOCK_SIZE_N": 16, "BLOCK_SIZE_K": 128}, num_warps=2, num_stages=3), triton.Config({"BLOCK_SIZE_N": 32, "BLOCK_SIZE_K": 128}, num_warps=4, num_stages=4), + triton.Config({"BLOCK_SIZE_N": 8, "BLOCK_SIZE_K": 256}, num_warps=2, num_stages=2), ] @@ -451,9 +577,12 @@ def _fused_moe_fake( # --------------------------------------------------------------------------- # Fixed BLOCK_M for the batched kernel. Not autotuned because the token -# sorting layout depends on it. 16 is the minimum for tl.dot and wastes -# the least padding with typical Qwen3.5 expert load (~30 tokens/expert). -_BATCHED_BLOCK_M = 16 +# sorting layout depends on it. 
Microbenchmarked on Qwen3.5 MoE prefill +# (M=1696, top_k=8, 256 experts) — BLOCK_M=64 is ~1.32x faster than 16 +# despite the extra padding, because the per-expert M block (~30 tokens +# Ɨ 8 top_k = ~53 active rows/expert) saturates 64-row tensor-core MMAs +# and reduces total program count. +_BATCHED_BLOCK_M = 64 def moe_align_block_size( @@ -712,35 +841,39 @@ def _fused_moe_batched_kernel( tl.store(c_ptrs, acc.to(compute_type), mask=n_mask[None, :]) -# Autotune configs for batched INT8 GEMM1 (gate+up projection, W4A8). +# Autotune configs for the prequant GEMM1 INT8 kernel. +# BLOCK_SIZE_K is FIXED at PREQUANT_BLOCK_K — only N/warps/stages tunable. _BATCHED_GEMM1_INT8_CONFIGS = [ - triton.Config({"BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128}, num_warps=4, num_stages=3), - triton.Config( - {"BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128}, num_warps=4, num_stages=2 - ), - triton.Config({"BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64}, num_warps=4, num_stages=3), - triton.Config({"BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64}, num_warps=4, num_stages=3), - triton.Config({"BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 32}, num_warps=4, num_stages=4), - triton.Config({"BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32}, num_warps=4, num_stages=4), + triton.Config({"BLOCK_SIZE_N": 128}, num_warps=2, num_stages=3), + triton.Config({"BLOCK_SIZE_N": 128}, num_warps=2, num_stages=4), + triton.Config({"BLOCK_SIZE_N": 128}, num_warps=4, num_stages=3), + triton.Config({"BLOCK_SIZE_N": 128}, num_warps=4, num_stages=4), + triton.Config({"BLOCK_SIZE_N": 128}, num_warps=8, num_stages=3), + triton.Config({"BLOCK_SIZE_N": 64}, num_warps=4, num_stages=3), + triton.Config({"BLOCK_SIZE_N": 64}, num_warps=4, num_stages=4), + triton.Config({"BLOCK_SIZE_N": 256}, num_warps=8, num_stages=3), + triton.Config({"BLOCK_SIZE_N": 256}, num_warps=8, num_stages=2), ] @triton.autotune(configs=_BATCHED_GEMM1_INT8_CONFIGS, key=["N", "K"]) @triton.jit def _fused_moe_batched_int8_kernel( - # Pointers - A, # [M+1, K] bf16 activations (row M is 
zero-padding sentinel) + # Pointers — A is INT8 pre-quantized in sorted order, A_scale per-tile + A_int8, # [max_padded, K] int8 pre-quantized activations + A_scale, # [max_padded, num_k_tiles] float32 per-tile scales B, # [E, N, K//2] int8 packed INT4 weights C, # [num_tokens_post_padded, N] bf16 output (sorted order) B_scale, # [E, N, K//group_size] bf16 scales - sorted_token_ids, # [num_tokens_post_padded] int64 pair indices expert_ids, # [num_expert_blocks] int64 # Dimensions N: tl.constexpr, K: tl.constexpr, # Strides - stride_am, - stride_ak, + stride_qm, + stride_qk, + stride_sm, + stride_sk, stride_be, stride_bk, stride_bn, @@ -750,18 +883,14 @@ def _fused_moe_batched_int8_kernel( stride_bsk, stride_bsn, # Config - top_k: tl.constexpr, group_size: tl.constexpr, BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_K: tl.constexpr, compute_type: tl.constexpr, ): - """Batched GEMM1 (gate+up) with INT8 tensor cores (W4A8). - - Dynamically quantizes bf16 activations to INT8 per-row per-tile, - dequantizes INT4 weights to INT8 (skipping bf16), and uses - tl.dot(int8, int8) → int32 accumulation with per-tile float32 rescale. + """Batched GEMM1 (gate+up) with INT8 tensor cores, consuming pre-quantized + activations + per-row-per-tile scales. No quantization in the K-loop. 
""" pid = tl.program_id(0) num_n_blocks = tl.cdiv(N, BLOCK_SIZE_N) @@ -771,14 +900,13 @@ def _fused_moe_batched_int8_kernel( expert_id = tl.load(expert_ids + expert_block_idx).to(tl.int64) offs_m = expert_block_idx * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M) - pair_ids = tl.load(sorted_token_ids + offs_m) - token_ids = pair_ids // top_k offs_n = n_block * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N).to(tl.int64) n_mask = offs_n < N offs_k = tl.arange(0, BLOCK_SIZE_K) - a_ptrs = A + token_ids[:, None] * stride_am + offs_k[None, :] * stride_ak + # A_int8 is in sorted order, indexed directly by offs_m + a_ptrs = A_int8 + offs_m[:, None] * stride_qm + offs_k[None, :] * stride_qk b_ptrs = ( B @@ -788,27 +916,22 @@ def _fused_moe_batched_int8_kernel( ) b_shifter = (offs_k[:, None] % 2) * 4 - # Float32 accumulator for cross-tile summation (rescaled per tile) acc = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) for k_step in range(0, tl.cdiv(K, BLOCK_SIZE_K)): k_remaining = K - k_step * BLOCK_SIZE_K k_mask = offs_k < k_remaining - # Load bf16 activation tile [BLOCK_M, BLOCK_K] - a_bf16 = tl.load(a_ptrs, mask=k_mask[None, :], other=0.0) + # Load pre-quantized INT8 activation tile [BLOCK_M, BLOCK_K] + a_int8 = tl.load(a_ptrs, mask=k_mask[None, :], other=0) - # Per-row dynamic INT8 quantization - a_f32 = a_bf16.to(tl.float32) - a_absmax = tl.max(tl.abs(a_f32), axis=1) # [BLOCK_M] - a_scale = a_absmax / 127.0 + 1e-12 # avoid division by zero - a_scaled = a_f32 / a_scale[:, None] - a_int8 = (a_scaled + tl.where(a_scaled >= 0, 0.5, -0.5)).to(tl.int8) + # Load pre-computed per-row-per-tile scale [BLOCK_M] + a_scale = tl.load(A_scale + offs_m * stride_sm + k_step * stride_sk) # Load and unpack INT4 weights to INT8 [BLOCK_K, BLOCK_N] b = tl.load(b_ptrs, mask=k_mask[:, None] & n_mask[None, :], other=0) b = (b >> b_shifter) & 0xF - b_int8 = (b - 8).to(tl.int8) # symmetric dequant to [-8, 7] + b_int8 = (b - 8).to(tl.int8) # Per-group weight scale if BLOCK_SIZE_K <= group_size: 
@@ -822,6 +945,8 @@ def _fused_moe_batched_int8_kernel( b_scale = tl.load(scale_ptrs, mask=n_mask[None, :], other=0.0).to( tl.float32 ) + dot_i32 = tl.dot(a_int8, b_int8) + acc += dot_i32.to(tl.float32) * a_scale[:, None] * b_scale else: scale_ptrs = ( B_scale @@ -832,24 +957,15 @@ def _fused_moe_batched_int8_kernel( b_scale = tl.load( scale_ptrs, mask=k_mask[:, None] & n_mask[None, :], other=0.0 ).to(tl.float32) - - if BLOCK_SIZE_K <= group_size: - # INT8 tensor core GEMM: [BLOCK_M, BLOCK_K] @ [BLOCK_K, BLOCK_N] → int32 - dot_i32 = tl.dot(a_int8, b_int8) - # b_scale is [1, BLOCK_N], broadcast - acc += dot_i32.to(tl.float32) * a_scale[:, None] * b_scale - else: - # Multi-group tile: dequantize weights per group, use float matmul b_dequant = (b_int8.to(tl.float32) * b_scale).to(compute_type) acc += ( tl.dot(a_int8.to(compute_type), b_dequant).to(tl.float32) * a_scale[:, None] ) - a_ptrs += BLOCK_SIZE_K * stride_ak + a_ptrs += BLOCK_SIZE_K * stride_qk b_ptrs += (BLOCK_SIZE_K // 2) * stride_bk - # Write output in sorted order [BLOCK_M, BLOCK_N] c_ptrs = C + offs_m[:, None] * stride_cm + offs_n[None, :] * stride_cn tl.store(c_ptrs, acc.to(compute_type), mask=n_mask[None, :]) @@ -985,37 +1101,38 @@ def _fused_moe_silu_batched_kernel( tl.store(c_ptrs, acc.to(compute_type), mask=n_mask[None, :]) -# Autotune configs for batched INT8 GEMM2 (down projection + SiLU, W4A8). 
_BATCHED_GEMM2_INT8_CONFIGS = [ - triton.Config({"BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 128}, num_warps=4, num_stages=2), - triton.Config( - {"BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 128}, num_warps=4, num_stages=2 - ), - triton.Config({"BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 64}, num_warps=4, num_stages=3), - triton.Config({"BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 64}, num_warps=4, num_stages=3), - triton.Config({"BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 32}, num_warps=4, num_stages=4), - triton.Config({"BLOCK_SIZE_N": 128, "BLOCK_SIZE_K": 32}, num_warps=4, num_stages=4), + triton.Config({"BLOCK_SIZE_N": 64}, num_warps=2, num_stages=2), + triton.Config({"BLOCK_SIZE_N": 64}, num_warps=2, num_stages=3), # num_warps=2 + triton.Config({"BLOCK_SIZE_N": 128}, num_warps=4, num_stages=3), + triton.Config({"BLOCK_SIZE_N": 128}, num_warps=4, num_stages=4), + triton.Config({"BLOCK_SIZE_N": 128}, num_warps=8, num_stages=3), + triton.Config({"BLOCK_SIZE_N": 64}, num_warps=4, num_stages=3), + triton.Config({"BLOCK_SIZE_N": 256}, num_warps=8, num_stages=3), + triton.Config({"BLOCK_SIZE_N": 256}, num_warps=8, num_stages=2), ] @triton.autotune(configs=_BATCHED_GEMM2_INT8_CONFIGS, key=["N", "K"]) @triton.jit def _fused_moe_silu_batched_int8_kernel( - # Pointers - A, # [num_tokens_post_padded, 2*inter] bf16 GEMM1 output (sorted order) + A_int8, # [max_padded, K] int8 pre-quantized SiLU output + A_scale, # [max_padded, num_k_tiles] float32 per-tile scales B, # [E, N, K//2] int8 packed INT4 weights - C, # [M*top_k + 1, N] bf16 output (scatter to original pair order) + C, # [M*top_k + 1, N] bf16 output (scatter to pair order) B_scale, # [E, N, K//group_size] bf16 scales sorted_token_ids, # [num_tokens_post_padded] int64 pair indices expert_ids, # [num_expert_blocks] int64 - topk_weights, # [M*top_k] float32 router weights (flat) + topk_weights, # [M*top_k] float32 router weights # Dimensions N: tl.constexpr, - K: tl.constexpr, # intermediate_size - num_pairs, # M * top_k (for clamping sentinel weight 
lookups) + K: tl.constexpr, + num_pairs, # Strides - stride_am, - stride_ak, + stride_qm, + stride_qk, + stride_sm, + stride_sk, stride_be, stride_bk, stride_bn, @@ -1025,18 +1142,14 @@ def _fused_moe_silu_batched_int8_kernel( stride_bsk, stride_bsn, # Config - top_k: tl.constexpr, group_size: tl.constexpr, BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_K: tl.constexpr, compute_type: tl.constexpr, ): - """Batched GEMM2 with fused SiLU, INT8 tensor cores, and scatter-back (W4A8). - - SiLU(gate)*up is computed in float32, then dynamically quantized to INT8 - per-row per-tile. INT4 weights are dequantized directly to INT8. - tl.dot(int8, int8) → int32, with per-tile float32 rescale. + """GEMM2 with INT8 tensor cores, consuming pre-quantized SiLU(gate)*up + activations + per-row-per-tile scales. Scatter-back to pair order. """ pid = tl.program_id(0) num_n_blocks = tl.cdiv(N, BLOCK_SIZE_N) @@ -1052,9 +1165,7 @@ def _fused_moe_silu_batched_int8_kernel( n_mask = offs_n < N offs_k = tl.arange(0, BLOCK_SIZE_K) - # A pointers: gate at [0, K), up at [K, 2K) - a_gate_ptrs = A + offs_m[:, None] * stride_am + offs_k[None, :] * stride_ak - a_up_ptrs = a_gate_ptrs + K * stride_ak + a_ptrs = A_int8 + offs_m[:, None] * stride_qm + offs_k[None, :] * stride_qk b_ptrs = ( B @@ -1070,23 +1181,13 @@ def _fused_moe_silu_batched_int8_kernel( k_remaining = K - k_step * BLOCK_SIZE_K k_mask = offs_k < k_remaining - # Load gate and up tiles, apply SiLU in float32 - gate = tl.load(a_gate_ptrs, mask=k_mask[None, :], other=0.0).to(tl.float32) - up = tl.load(a_up_ptrs, mask=k_mask[None, :], other=0.0) - silu_out = gate * tl.sigmoid(gate) * up.to(tl.float32) # [BLOCK_M, BLOCK_K] - - # Per-row dynamic INT8 quantization of SiLU output - a_absmax = tl.max(tl.abs(silu_out), axis=1) # [BLOCK_M] - a_scale = a_absmax / 127.0 + 1e-12 - a_scaled = silu_out / a_scale[:, None] - a_int8 = (a_scaled + tl.where(a_scaled >= 0, 0.5, -0.5)).to(tl.int8) + a_int8 = tl.load(a_ptrs, 
mask=k_mask[None, :], other=0) + a_scale = tl.load(A_scale + offs_m * stride_sm + k_step * stride_sk) - # Load and unpack INT4 weights to INT8 [BLOCK_K, BLOCK_N] b = tl.load(b_ptrs, mask=k_mask[:, None] & n_mask[None, :], other=0) b = (b >> b_shifter) & 0xF b_int8 = (b - 8).to(tl.int8) - # Per-group weight scale if BLOCK_SIZE_K <= group_size: group_idx = (BLOCK_SIZE_K * k_step) // group_size scale_ptrs = ( @@ -1098,6 +1199,8 @@ def _fused_moe_silu_batched_int8_kernel( b_scale = tl.load(scale_ptrs, mask=n_mask[None, :], other=0.0).to( tl.float32 ) + dot_i32 = tl.dot(a_int8, b_int8) + acc += dot_i32.to(tl.float32) * a_scale[:, None] * b_scale else: scale_ptrs = ( B_scale @@ -1108,21 +1211,13 @@ def _fused_moe_silu_batched_int8_kernel( b_scale = tl.load( scale_ptrs, mask=k_mask[:, None] & n_mask[None, :], other=0.0 ).to(tl.float32) - - if BLOCK_SIZE_K <= group_size: - # INT8 tensor core GEMM: [BLOCK_M, BLOCK_K] @ [BLOCK_K, BLOCK_N] → int32 - dot_i32 = tl.dot(a_int8, b_int8) - acc += dot_i32.to(tl.float32) * a_scale[:, None] * b_scale - else: - # Multi-group tile: dequantize weights per group, use float matmul b_dequant = (b_int8.to(tl.float32) * b_scale).to(compute_type) acc += ( tl.dot(a_int8.to(compute_type), b_dequant).to(tl.float32) * a_scale[:, None] ) - a_gate_ptrs += BLOCK_SIZE_K * stride_ak - a_up_ptrs += BLOCK_SIZE_K * stride_ak + a_ptrs += BLOCK_SIZE_K * stride_qk b_ptrs += (BLOCK_SIZE_K // 2) * stride_bk # Apply router weights per row @@ -1132,7 +1227,6 @@ def _fused_moe_silu_batched_int8_kernel( weights = tl.where(is_valid, weights, 0.0) acc = acc * weights[:, None] - # Scatter to original pair order scatter_ids = tl.where(is_valid, pair_ids, num_pairs) c_ptrs = C + scatter_ids[:, None] * stride_cm + offs_n[None, :] * stride_cn tl.store(c_ptrs, acc.to(compute_type), mask=n_mask[None, :]) @@ -1284,7 +1378,18 @@ def fused_moe_batched_gemm_int8( num_experts: int, group_size: int, ) -> torch.Tensor: - """Batched W4A8 GEMM1 + GEMM2+SiLU with INT8 tensor 
cores.""" + """Batched W4A8 GEMM1 + GEMM2+SiLU with INT8 tensor cores. + + Pipeline: + 1. moe_align_block_size: sort pairs by expert. + 2. _quantize_activations_int8_kernel: quantize hidden_states to INT8 + in sorted order with per-row-per-tile scales. + 3. _fused_moe_batched_int8_kernel (GEMM1): consumes INT8 + scales. + 4. _silu_quantize_int8_kernel: fuse SiLU(gate)*up + INT8 quantization + on the GEMM1 output. + 5. _fused_moe_silu_batched_int8_kernel (GEMM2): consumes INT8 + scales, + scatter-back to original pair order. + """ M, K = hidden_states.shape N1 = w1.shape[1] intermediate = N1 // 2 @@ -1308,6 +1413,35 @@ def fused_moe_batched_gemm_int8( topk_weights_flat = topk_weights.reshape(-1) + # ---- Pre-quantize activations for GEMM1 ---- + BLOCK_K_QUANT = PREQUANT_BLOCK_K + num_k_tiles_g1 = (K + BLOCK_K_QUANT - 1) // BLOCK_K_QUANT + + a_int8_g1 = torch.empty( + max_padded, K, dtype=torch.int8, device=hidden_states.device + ) + a_scale_g1 = torch.empty( + max_padded, num_k_tiles_g1, dtype=torch.float32, device=hidden_states.device + ) + + grid_quant_g1 = (max_padded,) + wrap_triton(_quantize_activations_int8_kernel)[grid_quant_g1]( + hidden_padded, + a_int8_g1, + a_scale_g1, + sorted_token_ids, + K=K, + NUM_K_TILES=num_k_tiles_g1, + top_k=top_k, + BLOCK_SIZE_K=BLOCK_K_QUANT, + stride_am=hidden_padded.stride(0), + stride_ak=hidden_padded.stride(1), + stride_qm=a_int8_g1.stride(0), + stride_qk=a_int8_g1.stride(1), + stride_sm=a_scale_g1.stride(0), + stride_sk=a_scale_g1.stride(1), + ) + cache1 = torch.empty( max_padded, N1, @@ -1319,16 +1453,18 @@ def grid1(meta): return (num_expert_blocks * triton.cdiv(N1, meta["BLOCK_SIZE_N"]),) wrap_triton(_fused_moe_batched_int8_kernel)[grid1]( - hidden_padded, + a_int8_g1, + a_scale_g1, w1, cache1, w1_scale, - sorted_token_ids, expert_ids, N=N1, K=K, - stride_am=hidden_padded.stride(0), - stride_ak=hidden_padded.stride(1), + stride_qm=a_int8_g1.stride(0), + stride_qk=a_int8_g1.stride(1), + stride_sm=a_scale_g1.stride(0), + 
stride_sk=a_scale_g1.stride(1), stride_be=w1.stride(0), stride_bk=w1.stride(2), stride_bn=w1.stride(1), @@ -1337,12 +1473,37 @@ def grid1(meta): stride_bse=w1_scale.stride(0), stride_bsk=w1_scale.stride(2), stride_bsn=w1_scale.stride(1), - top_k=top_k, group_size=group_size, BLOCK_SIZE_M=BLOCK_M, + BLOCK_SIZE_K=BLOCK_K_QUANT, compute_type=tl.bfloat16, ) + # ---- SiLU + pre-quantize for GEMM2 ---- + num_k_tiles_g2 = (intermediate + BLOCK_K_QUANT - 1) // BLOCK_K_QUANT + a_int8_g2 = torch.empty( + max_padded, intermediate, dtype=torch.int8, device=hidden_states.device + ) + a_scale_g2 = torch.empty( + max_padded, num_k_tiles_g2, dtype=torch.float32, device=hidden_states.device + ) + + grid_silu = (max_padded,) + wrap_triton(_silu_quantize_int8_kernel)[grid_silu]( + cache1, + a_int8_g2, + a_scale_g2, + K=intermediate, + NUM_K_TILES=num_k_tiles_g2, + BLOCK_SIZE_K=BLOCK_K_QUANT, + stride_am=cache1.stride(0), + stride_ak=cache1.stride(1), + stride_qm=a_int8_g2.stride(0), + stride_qk=a_int8_g2.stride(1), + stride_sm=a_scale_g2.stride(0), + stride_sk=a_scale_g2.stride(1), + ) + out_buf = torch.zeros( num_pairs + 1, N2, @@ -1354,7 +1515,8 @@ def grid2(meta): return (num_expert_blocks * triton.cdiv(N2, meta["BLOCK_SIZE_N"]),) wrap_triton(_fused_moe_silu_batched_int8_kernel)[grid2]( - cache1, + a_int8_g2, + a_scale_g2, w2, out_buf, w2_scale, @@ -1364,8 +1526,10 @@ def grid2(meta): N=N2, K=intermediate, num_pairs=num_pairs, - stride_am=cache1.stride(0), - stride_ak=cache1.stride(1), + stride_qm=a_int8_g2.stride(0), + stride_qk=a_int8_g2.stride(1), + stride_sm=a_scale_g2.stride(0), + stride_sk=a_scale_g2.stride(1), stride_be=w2.stride(0), stride_bk=w2.stride(2), stride_bn=w2.stride(1), @@ -1374,9 +1538,9 @@ def grid2(meta): stride_bse=w2_scale.stride(0), stride_bsk=w2_scale.stride(2), stride_bsn=w2_scale.stride(1), - top_k=top_k, group_size=group_size, BLOCK_SIZE_M=BLOCK_M, + BLOCK_SIZE_K=BLOCK_K_QUANT, compute_type=tl.bfloat16, ) diff --git 
a/backends/cuda/triton/kernels/int4_matmul.py b/backends/cuda/triton/kernels/int4_matmul.py index 78cce9ea7cf..848e0057f9f 100644 --- a/backends/cuda/triton/kernels/int4_matmul.py +++ b/backends/cuda/triton/kernels/int4_matmul.py @@ -285,6 +285,11 @@ def _int4_matmul_fake( triton.Config({"BLOCK_N": 8, "BLOCK_K": 128}, num_warps=2, num_stages=3), triton.Config({"BLOCK_N": 8, "BLOCK_K": 256}, num_warps=2, num_stages=3), triton.Config({"BLOCK_N": 4, "BLOCK_K": 256}, num_warps=2, num_stages=3), + triton.Config({"BLOCK_N": 1, "BLOCK_K": 512}, num_warps=4, num_stages=3), + triton.Config({"BLOCK_N": 4, "BLOCK_K": 256}, num_warps=4, num_stages=4), + triton.Config({"BLOCK_N": 4, "BLOCK_K": 256}, num_warps=1, num_stages=2), + triton.Config({"BLOCK_N": 4, "BLOCK_K": 64}, num_warps=1, num_stages=3), + triton.Config({"BLOCK_N": 4, "BLOCK_K": 256}, num_warps=8, num_stages=3), ] diff --git a/backends/mlx/llm/cache.py b/backends/mlx/llm/cache.py index 9709980689b..6b281818610 100644 --- a/backends/mlx/llm/cache.py +++ b/backends/mlx/llm/cache.py @@ -23,7 +23,7 @@ class KVCache(nn.Module): """ - MLX-optimized KV cache with ExecutorTorch llama KVCache interface. + MLX-optimized KV cache with ExecuTorch llama KVCache interface. This class follows the same interface as examples/models/llama/attention.py KVCache, making it a drop-in replacement, but uses the mlx::kv_cache_update op internally diff --git a/backends/mlx/llm/et_attention.py b/backends/mlx/llm/et_attention.py index 10c758f94fe..ea39133db3a 100644 --- a/backends/mlx/llm/et_attention.py +++ b/backends/mlx/llm/et_attention.py @@ -7,7 +7,7 @@ # LICENSE file in the root directory of this source tree. """ -MLX-optimized attention for ExecutorTorch's Llama attention registry. +MLX-optimized attention for ExecuTorch's Llama attention registry. Registers an "mlx" attention type that uses mlx::kv_cache_update and mlx::custom_sdpa for efficient execution on Apple Silicon. 
diff --git a/backends/mlx/ops.py b/backends/mlx/ops.py index 27d214e0ae9..9651d4a7b58 100644 --- a/backends/mlx/ops.py +++ b/backends/mlx/ops.py @@ -117,6 +117,7 @@ RepeatNode, ReshapeNode, RMSNormNode, + RollNode, RopeNode, RoundNode, RsqrtNode, @@ -1678,6 +1679,45 @@ def _repeat_handler(P: MLXProgramBuilder, n: Node) -> Slot: return out +@REGISTRY.register(target=[torch.ops.aten.roll.default]) +def _roll_handler(P: MLXProgramBuilder, n: Node) -> Slot: + args = P.args(n) + require_args(args, 2, 3, "aten.roll") + require_kwargs(P.kwargs(n), set(), "aten.roll") + x = args[0] + shifts_arg = args[1] + dims_arg = args[2] if len(args) > 2 else [] + + shifts = [shifts_arg] if isinstance(shifts_arg, int) else list(shifts_arg) + dims: List[int] = [dims_arg] if isinstance(dims_arg, int) else list(dims_arg) + + # Flat roll (torch.roll with dims=[]) would require reshape + roll + + # reshape at the graph level. Not yet supported; Swin-style usage always + # passes explicit dims. + if not dims: + raise NotImplementedError( + "aten.roll without dims (flat roll) is not supported by the MLX " + "delegate yet." 
+ ) + if len(shifts) != len(dims): + raise ValueError( + f"aten.roll: shifts and dims must have the same length, got " + f"shifts={shifts} (len={len(shifts)}) dims={dims} (len={len(dims)})" + ) + require_static_ints(dims, "dims", "aten.roll") + + out = P.make_or_get_slot(n) + P.emit( + RollNode( + x=P.slot_to_tid(x), + out=P.slot_to_tid(out), + shift=[P.to_int_or_vid(s) for s in shifts], + axes=dims, + ) + ) + return out + + @REGISTRY.register(target=[torch.ops.aten.index.Tensor]) def _index_handler(P: MLXProgramBuilder, n: Node) -> Slot: args = P.args(n) diff --git a/backends/mlx/runtime/MLXInterpreter.h b/backends/mlx/runtime/MLXInterpreter.h index 304fdfe9805..57f24993499 100644 --- a/backends/mlx/runtime/MLXInterpreter.h +++ b/backends/mlx/runtime/MLXInterpreter.h @@ -1733,6 +1733,13 @@ inline void exec_all(const AllNode& n, ExecutionState& st, StreamOrDevice s) { } } +inline void exec_roll(const RollNode& n, ExecutionState& st, StreamOrDevice s) { + const auto& x = st.const_tensor_ref(n.x); + auto shifts = to_shape(n.shift, st); + std::vector axes(n.axes.begin(), n.axes.end()); + st.set_tensor(n.out, roll(x, shifts, axes, s)); +} + inline void exec_repeat(const RepeatNode& n, ExecutionState& st, StreamOrDevice s) { const auto& x = st.const_tensor_ref(n.x); @@ -2210,6 +2217,9 @@ class Interpreter { case OpCode::REPEAT: ops::exec_repeat(std::get(instr.node), st, s); break; + case OpCode::ROLL: + ops::exec_roll(std::get(instr.node), st, s); + break; case OpCode::SORT: ops::exec_sort(std::get(instr.node), st, s); break; diff --git a/backends/mlx/serialization/schema.fbs b/backends/mlx/serialization/schema.fbs index 67b4636f0be..42c5754f4f7 100644 --- a/backends/mlx/serialization/schema.fbs +++ b/backends/mlx/serialization/schema.fbs @@ -673,6 +673,16 @@ table ArgPartitionNode { axis: int32; } +// Shift tensor elements along specified axes with wrap-around. +// Maps to mlx::core::roll(a, shifts, axes). +// Flat roll (torch.roll with dims=None) is not yet supported. 
+table RollNode { + x: Tid (required); + out: Tid (required); + shift: [IntOrVid] (required); // Shift amount per axis (can be dynamic) + axes: [int32] (required); // Axes to roll along; len(shift) == len(axes) +} + // ============================================================================= // Math ops - Unary element-wise @@ -1119,7 +1129,8 @@ union OpNode { GatherQmmNode, ScanNode, MetalKernelNode, - BitwiseInvertNode + BitwiseInvertNode, + RollNode // BC: Add new op nodes here (append only) } diff --git a/backends/mlx/test/test_ops.py b/backends/mlx/test/test_ops.py index 459d5aa1e73..9624d49b05e 100644 --- a/backends/mlx/test/test_ops.py +++ b/backends/mlx/test/test_ops.py @@ -855,6 +855,59 @@ def create_inputs(self) -> Tuple[torch.Tensor, ...]: return (x,) +class RollModel(nn.Module): + """Model that rolls a tensor along specified dimensions.""" + + def __init__(self, shifts: Tuple[int, ...], dims: Tuple[int, ...]): + super().__init__() + self.shifts = shifts + self.dims = dims + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return torch.roll(x, shifts=self.shifts, dims=self.dims) + + +@register_test +class RollTest(OpTestCase): + """Test case for torch.roll().""" + + name = "roll" + rtol = 1e-5 + atol = 1e-5 + + def __init__( + self, + input_shape: Tuple[int, ...] = (4, 5), + shifts: Tuple[int, ...] = (1,), + dims: Tuple[int, ...] 
= (0,), + ): + self.input_shape = input_shape + self.shifts = shifts + self.dims = dims + shift_str = ",".join(str(s) for s in shifts) + dim_str = ",".join(str(d) for d in dims) + self.name = f"roll_shift({shift_str})_dim({dim_str})" + + @classmethod + def get_test_configs(cls) -> List["RollTest"]: + return [ + cls(input_shape=(8,), shifts=(2,), dims=(0,)), + cls(input_shape=(4, 5), shifts=(1,), dims=(0,)), + cls(input_shape=(4, 5), shifts=(-2,), dims=(1,)), + cls(input_shape=(3, 4, 5), shifts=(3,), dims=(2,)), + cls(input_shape=(3, 4, 5), shifts=(1, 2), dims=(0, 2)), + cls(input_shape=(3, 4, 5), shifts=(-1, -2, -3), dims=(0, 1, 2)), + cls(input_shape=(3, 4, 5), shifts=(2,), dims=(-1,)), + ] + + def create_model(self) -> nn.Module: + return RollModel(self.shifts, self.dims) + + def create_inputs(self) -> Tuple[torch.Tensor, ...]: + x = torch.randn(self.input_shape) + return (x,) + + class CatNModel(nn.Module): """Model that concatenates N tensors along a dimension.""" @@ -1757,7 +1810,7 @@ class KVCacheModel(nn.Module): """ Test model wrapping KVCache from cache.py. - This tests the ExecutorTorch llama KVCache-compatible interface that uses + This tests the ExecuTorch llama KVCache-compatible interface that uses the mlx::kv_cache_update op internally. """ @@ -1792,7 +1845,7 @@ def forward( @register_test class KVCacheTest(OpTestCase): """ - Test case for MLX KVCache with ExecutorTorch llama KVCache interface. + Test case for MLX KVCache with ExecuTorch llama KVCache interface. This verifies that KVCache: 1. 
Accepts the ET llama KVCache update interface diff --git a/backends/qualcomm/scripts/install_qnn_sdk.sh b/backends/qualcomm/scripts/install_qnn_sdk.sh index 5bc0f7eeb1d..7921b48da2f 100644 --- a/backends/qualcomm/scripts/install_qnn_sdk.sh +++ b/backends/qualcomm/scripts/install_qnn_sdk.sh @@ -64,7 +64,27 @@ install_qnn() { mkdir -p "${QNN_INSTALLATION_DIR}" QNN_ZIP_FILE="v${QNN_VERSION}.zip" - curl --retry 3 -Lo "/tmp/${QNN_ZIP_FILE}" "${QNN_ZIP_URL}" + # softwarecenter.qualcomm.com intermittently aborts the download with + # HTTP/2 INTERNAL_ERROR mid-stream, and occasionally returns a tiny + # error body that curl treats as success — both cases get caught here: + # --fail rejects HTTP errors, --retry-all-errors retries transport + # errors, and `unzip -t` validates the archive before we proceed. + QNN_DOWNLOAD_MAX_ATTEMPTS=5 + for attempt in $(seq 1 ${QNN_DOWNLOAD_MAX_ATTEMPTS}); do + rm -f "/tmp/${QNN_ZIP_FILE}" + if curl --fail --retry 3 --retry-delay 5 --retry-connrefused --retry-all-errors \ + -Lo "/tmp/${QNN_ZIP_FILE}" "${QNN_ZIP_URL}" \ + && unzip -tq "/tmp/${QNN_ZIP_FILE}"; then + break + fi + ls -l "/tmp/${QNN_ZIP_FILE}" 2>&1 || true + if [ "${attempt}" = "${QNN_DOWNLOAD_MAX_ATTEMPTS}" ]; then + echo "ERROR: QNN SDK download failed after ${attempt} attempts" >&2 + exit 1 + fi + echo "QNN SDK download attempt ${attempt} failed; retrying in $((attempt * 10))s..." + sleep $((attempt * 10)) + done echo "Finishing downloading qnn sdk." unzip -qo "/tmp/${QNN_ZIP_FILE}" -d /tmp echo "Finishing unzip qnn sdk." 
diff --git a/backends/qualcomm/tests/BUCK b/backends/qualcomm/tests/BUCK index 25834c7e616..c73a8f89536 100644 --- a/backends/qualcomm/tests/BUCK +++ b/backends/qualcomm/tests/BUCK @@ -64,6 +64,10 @@ fbcode_target(_kind = runtime.python_test, srcs = [ "test_passes.py", ], + env = {} if runtime.is_oss else { + "LD_LIBRARY_PATH": "$(location fbsource//third-party/qualcomm/qnn/qnn-{0}:qnn_offline_compile_libs)".format(get_qnn_library_version()), + "QNN_SDK_ROOT": "$(location fbsource//third-party/qualcomm/qnn/qnn-{0}:__dir__)".format(get_qnn_library_version()), + }, deps = [ ":models", "fbsource//third-party/pypi/expecttest:expecttest", # @manual @@ -77,6 +81,7 @@ fbcode_target(_kind = runtime.python_test, "//executorch/backends/qualcomm/builders:builders", ] + ([] if runtime.is_oss else [ # These deps fail in OSS: keep_gpu_sections kwarg breaks TARGETS evaluation + "//executorch/devtools:lib", "//executorch/examples/models/llama:transformer_modules", "//executorch/examples/qualcomm/oss_scripts/llama:masking_utils", "//executorch/examples/qualcomm/oss_scripts/llama:static_llama", diff --git a/backends/vulkan/runtime/api/Context.cpp b/backends/vulkan/runtime/api/Context.cpp index 80aef97fc04..d090a62f370 100644 --- a/backends/vulkan/runtime/api/Context.cpp +++ b/backends/vulkan/runtime/api/Context.cpp @@ -149,10 +149,14 @@ vkapi::DescriptorSet Context::get_descriptor_set( spec_constants.append(additional_constants); + const uint32_t resolved_required_subgroup_size = + vkapi::resolve_required_subgroup_size(shader_descriptor, adapter_p_); + VkPipeline pipeline = pipeline_cache().retrieve( {pipeline_layout_cache().retrieve(shader_layout, push_constants_size), shader_cache().retrieve(shader_descriptor), - spec_constants}); + spec_constants, + resolved_required_subgroup_size}); cmd_.bind_pipeline(pipeline, pipeline_layout, local_workgroup_size); @@ -315,8 +319,14 @@ VkPipeline Context::get_shader_pipeline( spec_constants.append(additional_constants); + const uint32_t 
resolved_required_subgroup_size = + vkapi::resolve_required_subgroup_size(shader, adapter_p_); + VkPipeline pipeline = pipeline_cache().retrieve( - {pipeline_layout, shader_cache().retrieve(shader), spec_constants}); + {pipeline_layout, + shader_cache().retrieve(shader), + spec_constants, + resolved_required_subgroup_size}); return pipeline; } diff --git a/backends/vulkan/runtime/gen_vulkan_spv.py b/backends/vulkan/runtime/gen_vulkan_spv.py index dab33fb3097..d12fdce2c5c 100644 --- a/backends/vulkan/runtime/gen_vulkan_spv.py +++ b/backends/vulkan/runtime/gen_vulkan_spv.py @@ -281,8 +281,9 @@ def layout_declare_buffer( dtype: str, precision: str = "PRECISION", is_scalar_array: bool = True, + vec_size: int = 4, ) -> str: - array_type = buffer_gvec_type(dtype, 4) + array_type = buffer_gvec_type(dtype, vec_size) if is_scalar_array: array_type = buffer_scalar_type(dtype) @@ -341,6 +342,7 @@ def layout_declare_tensor( storage_type: str, is_scalar_array: bool = True, precision: str = "PRECISION", + vec_size: int = 4, ) -> str: assert storage_type.lower() in ["buffer", "texture3d", "texture2d"] @@ -357,6 +359,7 @@ def layout_declare_tensor( dtype, precision, is_scalar_array=is_scalar_array, + vec_size=vec_size, ) # Create image/sampler binding @@ -785,6 +788,10 @@ def parseTemplateYaml(self, yaml_file: str) -> None: # noqa: C901 "generate_variant_forall", None ) + reserved_yaml_keys = { + "generate_variant_forall", + } + for variant in params_dict["shader_variants"]: default_iterated_params_names = set( default_iterated_params.keys() @@ -797,7 +804,7 @@ def parseTemplateYaml(self, yaml_file: str) -> None: # noqa: C901 variant_params_names - default_iterated_params_names - params_names - - {"generate_variant_forall"} + - reserved_yaml_keys ) assert len(invalid_keys) == 0 @@ -813,7 +820,7 @@ def parseTemplateYaml(self, yaml_file: str) -> None: # noqa: C901 for combination in variant_combinations: default_params_copy = copy.deepcopy(default_params) for key in variant: - if 
key != "generate_variant_forall": + if key not in reserved_yaml_keys: default_params_copy[key] = variant[key] variant_name = variant["NAME"] @@ -842,7 +849,8 @@ def parseTemplateYaml(self, yaml_file: str) -> None: # noqa: C901 else: default_params_copy = copy.deepcopy(default_params) for key in variant: - default_params_copy[key] = variant[key] + if key not in reserved_yaml_keys: + default_params_copy[key] = variant[key] self.shader_template_params[template_name].append( default_params_copy @@ -1026,6 +1034,27 @@ def generate_src_file(shader_paths_pair) -> Tuple[bool, List[str]]: print(f"template_file_path: {template_file_path}") output_text = preprocess(input_text, codegen_params) + # If the shader yaml declared a SUBGROUP_SIZE template parameter, + # embed it into the generated GLSL as a comment. getShaderInfo() + # parses it back out alongside TILE_SIZE, WEIGHT_STORAGE, etc., + # avoiding a side-channel name -> value map. + subgroup_size = codegen_params.get("SUBGROUP_SIZE") + if subgroup_size is not None: + try: + subgroup_size_int = int(subgroup_size) + except (TypeError, ValueError) as e: + raise RuntimeError( + f"Shader variant {src_file_name!r} declared " + f"SUBGROUP_SIZE={subgroup_size!r}, which is not " + f"parseable as an integer. Fix the SUBGROUP_SIZE " + f"value in the shader's yaml." 
+ ) from e + if subgroup_size_int > 0: + output_text = ( + f"// REQUIRED_SUBGROUP_SIZE = {subgroup_size_int}\n" + + output_text + ) + included_files = get_glsl_includes(output_text) with codecs.open(gen_out_path, "w", encoding="utf-8") as output_file: @@ -1184,6 +1213,12 @@ class ShaderInfo: requires_integer_dot_product_ext: bool = False requires_shader_int64_ext: bool = False requires_shader_float64_ext: bool = False + # Subgroup size requirement (matches the C++ ShaderInfo encoding): + # 0 = no requirement + # >0 = literal fixed size; sourced from the shader yaml's + # `SUBGROUP_SIZE` template parameter (single source of truth for + # both GLSL substitution and the Vulkan pipeline pin). + required_subgroup_size: int = 0 def getName(filePath: str) -> str: @@ -1208,6 +1243,17 @@ def findTileSizes(lineStr: str) -> List[int]: return [int(matches.group(1)), int(matches.group(2)), int(matches.group(3))] +def isRequiredSubgroupSizeLine(lineStr: str) -> bool: + return re.search(r"^// REQUIRED_SUBGROUP_SIZE = ", lineStr) is not None + + +def findRequiredSubgroupSize(lineStr: str) -> int: + matches = re.search(r"^// REQUIRED_SUBGROUP_SIZE = ([0-9]+)", lineStr) + if matches is None: + raise AssertionError("matches is None in findRequiredSubgroupSize") + return int(matches.group(1)) + + def isWeightStorageTypeLine(lineStr: str) -> bool: weight_storage_id = r"^ \* WEIGHT_STORAGE = " return re.search(weight_storage_id, lineStr) is not None @@ -1281,6 +1327,8 @@ def getShaderInfo(srcFilePath: str) -> ShaderInfo: # noqa: C901 shader_info.layouts.append(determineDescriptorType(line)) if isTileSizeLine(line): shader_info.tile_size = findTileSizes(line) + if isRequiredSubgroupSizeLine(line): + shader_info.required_subgroup_size = findRequiredSubgroupSize(line) if isWeightStorageTypeLine(line): shader_info.weight_storage_type = getWeightStorageType(line) if isBiasStorageTypeLine(line): @@ -1378,6 +1426,7 @@ def to_cpp_str(val: bool): 
to_cpp_str(shader_info.requires_integer_dot_product_ext), to_cpp_str(shader_info.requires_shader_int64_ext), to_cpp_str(shader_info.requires_shader_float64_ext), + str(shader_info.required_subgroup_size), ] shader_info_str = textwrap.indent( @@ -1406,7 +1455,9 @@ def generateShaderDispatchStr(shader_info: ShaderInfo, name: str) -> str: def genCppFiles( - spv_files: Dict[str, str], cpp_header_path: str, cpp_src_file_path: str + spv_files: Dict[str, str], + cpp_header_path: str, + cpp_src_file_path: str, ) -> None: spv_bin_strs = [] register_shader_info_strs = [] diff --git a/backends/vulkan/runtime/graph/ComputeGraph.cpp b/backends/vulkan/runtime/graph/ComputeGraph.cpp index f0b61e128bb..3accdf375cb 100644 --- a/backends/vulkan/runtime/graph/ComputeGraph.cpp +++ b/backends/vulkan/runtime/graph/ComputeGraph.cpp @@ -825,10 +825,20 @@ void ComputeGraph::register_pipeline_to_create( spec_constants.append(spec_vars); + // Resolve any shader-declared required subgroup size into a concrete value + // so the pre-built pipeline matches the one created at dispatch time. The + // shared helper throws ShaderNotSupportedError when the adapter cannot honor + // the requirement; let it propagate so a stale unused pipeline doesn't sit + // in the cache while dispatch later throws on the same shader. 
+ const uint32_t resolved_required_subgroup_size = + vkapi::resolve_required_subgroup_size( + shader_info, context()->adapter_ptr()); + const vkapi::ComputePipelineCache::Key desc = { context()->pipeline_layout_cache().retrieve(shader_layout, pc_offset), context()->shader_cache().retrieve(shader_info), - spec_constants}; + spec_constants, + resolved_required_subgroup_size}; if (context_->pipeline_cache().contains(desc)) { return; diff --git a/backends/vulkan/runtime/vk_api/Adapter.cpp b/backends/vulkan/runtime/vk_api/Adapter.cpp index e965687bc00..b762c95205b 100644 --- a/backends/vulkan/runtime/vk_api/Adapter.cpp +++ b/backends/vulkan/runtime/vk_api/Adapter.cpp @@ -129,6 +129,9 @@ VkDevice create_logical_device( #ifdef VK_NV_cooperative_matrix2 VK_NV_COOPERATIVE_MATRIX_2_EXTENSION_NAME, #endif /* VK_NV_cooperative_matrix2 */ +#ifdef VK_EXT_subgroup_size_control + VK_EXT_SUBGROUP_SIZE_CONTROL_EXTENSION_NAME, +#endif /* VK_EXT_subgroup_size_control */ }; std::vector enabled_device_extensions; @@ -199,6 +202,19 @@ VkDevice create_logical_device( extension_list_top = &cooperative_matrix2_features; #endif /* VK_NV_cooperative_matrix2 */ +#ifdef VK_EXT_subgroup_size_control + // Only enable the feature struct if the extension was actually requested + // and the feature flag is set on the physical device. The extension itself + // is filtered into enabled_device_extensions by + // find_requested_device_extensions. 
+ VkPhysicalDeviceSubgroupSizeControlFeaturesEXT subgroup_size_control_features{ + physical_device.subgroup_size_control_features}; + if (physical_device.supports_subgroup_size_control) { + subgroup_size_control_features.pNext = extension_list_top; + extension_list_top = &subgroup_size_control_features; + } +#endif /* VK_EXT_subgroup_size_control */ + device_create_info.pNext = extension_list_top; VkDevice handle = nullptr; @@ -405,7 +421,7 @@ std::string Adapter::stringize() const { ss << " deviceType: " << device_type << std::endl; ss << " deviceName: " << properties.deviceName << std::endl; -#define PRINT_BOOL(value, name) \ +#define PRINT_VALUE(value, name) \ ss << " " << std::left << std::setw(36) << #name << value << std::endl; #define PRINT_PROP(struct, name) \ @@ -452,7 +468,7 @@ std::string Adapter::stringize() const { #endif /* VK_KHR_8bit_storage */ ss << " Shader 16bit and 8bit Features {" << std::endl; - PRINT_BOOL(physical_device_.supports_int16_shader_types, shaderInt16) + PRINT_VALUE(physical_device_.supports_int16_shader_types, shaderInt16) #ifdef VK_KHR_shader_float16_int8 PRINT_PROP(physical_device_.shader_float16_int8_types, shaderFloat16); PRINT_PROP(physical_device_.shader_float16_int8_types, shaderInt8); @@ -460,8 +476,29 @@ std::string Adapter::stringize() const { ss << " }" << std::endl; ss << " Shader 64bit Features {" << std::endl; - PRINT_BOOL(physical_device_.supports_int64_shader_types, shaderInt64) - PRINT_BOOL(physical_device_.supports_float64_shader_types, shaderFloat64) + PRINT_VALUE(physical_device_.supports_int64_shader_types, shaderInt64) + PRINT_VALUE(physical_device_.supports_float64_shader_types, shaderFloat64) + ss << " }" << std::endl; + + ss << " Subgroup Properties {" << std::endl; + PRINT_VALUE(subgroup_size(), subgroupSize) + PRINT_VALUE(supports_subgroup_compute_basic(), computeSubgroupBasic) + PRINT_VALUE(supports_subgroup_compute_shuffle(), computeSubgroupShuffle) + PRINT_VALUE(supports_subgroup_compute_ballot(), 
computeSubgroupBallot) + PRINT_VALUE(supports_subgroup_compute_vote(), computeSubgroupVote) + PRINT_VALUE(supports_subgroup_compute_arithmetic(), computeSubgroupArithmetic) + PRINT_VALUE( + supports_subgroup_compute_shuffle_relative(), + computeSubgroupShuffleRelative) + PRINT_VALUE(supports_subgroup_compute_clustered(), computeSubgroupClustered) + PRINT_VALUE(supports_subgroup_compute_quad(), computeSubgroupQuad) + PRINT_VALUE(min_subgroup_size(), minSubgroupSize) + PRINT_VALUE(max_subgroup_size(), maxSubgroupSize) + PRINT_VALUE(supports_subgroup_size_control(), subgroupSizeControl) + PRINT_VALUE(supports_compute_full_subgroups(), computeFullSubgroups) + PRINT_VALUE( + supports_required_subgroup_size_for_compute(), + requiredSubgroupSizeStages_compute) ss << " }" << std::endl; #ifdef VK_KHR_shader_integer_dot_product @@ -614,5 +651,24 @@ std::ostream& operator<<(std::ostream& os, const Adapter& adapter) { return os; } +uint32_t resolve_required_subgroup_size( + const ShaderInfo& shader, + Adapter* adapter) { + if (shader.required_subgroup_size == 0u) { + return 0u; + } + if (!adapter->supports_required_subgroup_size_for_compute()) { + throw ShaderNotSupportedError( + shader.kernel_name, VulkanExtension::SUBGROUP_SIZE_CONTROL); + } + const uint32_t resolved = shader.required_subgroup_size; + if (resolved < adapter->min_subgroup_size() || + resolved > adapter->max_subgroup_size()) { + throw ShaderNotSupportedError( + shader.kernel_name, VulkanExtension::SUBGROUP_SIZE_CONTROL); + } + return resolved; +} + } // namespace vkapi } // namespace vkcompute diff --git a/backends/vulkan/runtime/vk_api/Adapter.h b/backends/vulkan/runtime/vk_api/Adapter.h index 3c503deab70..68ae0b6528d 100644 --- a/backends/vulkan/runtime/vk_api/Adapter.h +++ b/backends/vulkan/runtime/vk_api/Adapter.h @@ -285,6 +285,106 @@ class Adapter final { return physical_device_.min_ubo_alignment; } + // Subgroup properties + + inline uint32_t subgroup_size() const { + return 
physical_device_.subgroup_size; + } + + inline bool supports_subgroup_compute_basic() const { + return (physical_device_.supported_subgroup_ops & + VK_SUBGROUP_FEATURE_BASIC_BIT) != 0 && + (physical_device_.supported_subgroup_stages & + VK_SHADER_STAGE_COMPUTE_BIT) != 0; + } + + inline bool supports_subgroup_compute_shuffle() const { + return (physical_device_.supported_subgroup_ops & + VK_SUBGROUP_FEATURE_SHUFFLE_BIT) != 0 && + (physical_device_.supported_subgroup_stages & + VK_SHADER_STAGE_COMPUTE_BIT) != 0; + } + + inline bool supports_subgroup_compute_ballot() const { + return (physical_device_.supported_subgroup_ops & + VK_SUBGROUP_FEATURE_BALLOT_BIT) != 0 && + (physical_device_.supported_subgroup_stages & + VK_SHADER_STAGE_COMPUTE_BIT) != 0; + } + + inline bool supports_subgroup_compute_vote() const { + return (physical_device_.supported_subgroup_ops & + VK_SUBGROUP_FEATURE_VOTE_BIT) != 0 && + (physical_device_.supported_subgroup_stages & + VK_SHADER_STAGE_COMPUTE_BIT) != 0; + } + + inline bool supports_subgroup_compute_arithmetic() const { + return (physical_device_.supported_subgroup_ops & + VK_SUBGROUP_FEATURE_ARITHMETIC_BIT) != 0 && + (physical_device_.supported_subgroup_stages & + VK_SHADER_STAGE_COMPUTE_BIT) != 0; + } + + inline bool supports_subgroup_compute_shuffle_relative() const { + return (physical_device_.supported_subgroup_ops & + VK_SUBGROUP_FEATURE_SHUFFLE_RELATIVE_BIT) != 0 && + (physical_device_.supported_subgroup_stages & + VK_SHADER_STAGE_COMPUTE_BIT) != 0; + } + + inline bool supports_subgroup_compute_clustered() const { + return (physical_device_.supported_subgroup_ops & + VK_SUBGROUP_FEATURE_CLUSTERED_BIT) != 0 && + (physical_device_.supported_subgroup_stages & + VK_SHADER_STAGE_COMPUTE_BIT) != 0; + } + + inline bool supports_subgroup_compute_quad() const { + return (physical_device_.supported_subgroup_ops & + VK_SUBGROUP_FEATURE_QUAD_BIT) != 0 && + (physical_device_.supported_subgroup_stages & + VK_SHADER_STAGE_COMPUTE_BIT) != 0; + } + 
+ // Subgroup size control (VK_EXT_subgroup_size_control / Vulkan 1.3 core). + + inline uint32_t min_subgroup_size() const { + return physical_device_.min_subgroup_size; + } + + inline uint32_t max_subgroup_size() const { + return physical_device_.max_subgroup_size; + } + + inline bool supports_subgroup_size_control() const { +#ifdef ETVK_FORCE_NO_EXTENSIONS + return false; +#endif +#ifdef VK_EXT_subgroup_size_control + return physical_device_.supports_subgroup_size_control; +#else + return false; +#endif /* VK_EXT_subgroup_size_control */ + } + + inline bool supports_compute_full_subgroups() const { +#ifdef ETVK_FORCE_NO_EXTENSIONS + return false; +#endif +#ifdef VK_EXT_subgroup_size_control + return physical_device_.supports_compute_full_subgroups; +#else + return false; +#endif /* VK_EXT_subgroup_size_control */ + } + + inline bool supports_required_subgroup_size_for_compute() const { + return supports_subgroup_size_control() && + (physical_device_.required_subgroup_size_stages & + VK_SHADER_STAGE_COMPUTE_BIT) != 0; + } + inline uint32_t max_texture2d_dim() const { return physical_device_.properties.limits.maxImageDimension2D; } @@ -312,5 +412,15 @@ class Adapter final { friend std::ostream& operator<<(std::ostream&, const Adapter&); }; +// Resolve the shader-declared required subgroup size into a concrete value +// (or 0 = no requirement) given the adapter capabilities. Throws +// ShaderNotSupportedError when the shader declares a required subgroup size +// but the adapter cannot honor it (extension unsupported, or value out of +// the adapter's [min, max] range). Silent fallback would create a +// correctness/perf landmine, so callers must be prepared for this throw. 
+uint32_t resolve_required_subgroup_size( + const ShaderInfo& shader, + Adapter* adapter); + } // namespace vkapi } // namespace vkcompute diff --git a/backends/vulkan/runtime/vk_api/Device.cpp b/backends/vulkan/runtime/vk_api/Device.cpp index cb6a54dc489..4deaecbe12c 100644 --- a/backends/vulkan/runtime/vk_api/Device.cpp +++ b/backends/vulkan/runtime/vk_api/Device.cpp @@ -78,6 +78,14 @@ PhysicalDevice::PhysicalDevice( cooperative_matrix2_features{ VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COOPERATIVE_MATRIX_2_FEATURES_NV}, #endif /* VK_NV_cooperative_matrix2 */ +#ifdef VK_EXT_subgroup_size_control + subgroup_size_control_features{ + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SUBGROUP_SIZE_CONTROL_FEATURES_EXT, + nullptr}, + subgroup_size_control_properties{ + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SUBGROUP_SIZE_CONTROL_PROPERTIES_EXT, + nullptr}, +#endif /* VK_EXT_subgroup_size_control */ queue_families{}, num_compute_queues(0), api_version_major(0), @@ -89,6 +97,14 @@ PhysicalDevice::PhysicalDevice( has_timestamps(false), timestamp_period(0), min_ubo_alignment(0), + subgroup_size(0), + supported_subgroup_ops(0), + supported_subgroup_stages(0), + min_subgroup_size(0), + max_subgroup_size(0), + required_subgroup_size_stages(0), + supports_subgroup_size_control(false), + supports_compute_full_subgroups(false), device_name{}, device_type{DeviceType::UNKNOWN} { // Extract physical device properties @@ -275,6 +291,11 @@ void PhysicalDevice::query_extensions_vk_1_1() { extension_list_top = &cooperative_matrix2_features; #endif /* VK_NV_cooperative_matrix2 */ +#ifdef VK_EXT_subgroup_size_control + subgroup_size_control_features.pNext = extension_list_top; + extension_list_top = &subgroup_size_control_features; +#endif /* VK_EXT_subgroup_size_control */ + features2.pNext = extension_list_top; vkGetPhysicalDeviceFeatures2(handle, &features2); @@ -289,16 +310,60 @@ void PhysicalDevice::query_extensions_vk_1_1() { supports_float64_shader_types = true; } +#ifdef VK_EXT_subgroup_size_control + 
supports_subgroup_size_control = + subgroup_size_control_features.subgroupSizeControl == VK_TRUE; + supports_compute_full_subgroups = + subgroup_size_control_features.computeFullSubgroups == VK_TRUE; +#endif /* VK_EXT_subgroup_size_control */ + // Query properties separately from features VkPhysicalDeviceProperties2 properties2{ VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2}; + void* properties_list_top = nullptr; + + VkPhysicalDeviceSubgroupProperties subgroup_properties{ + VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SUBGROUP_PROPERTIES}; + subgroup_properties.pNext = properties_list_top; + properties_list_top = &subgroup_properties; + #ifdef VK_KHR_shader_integer_dot_product - shader_int_dot_product_properties.pNext = nullptr; - properties2.pNext = &shader_int_dot_product_properties; + shader_int_dot_product_properties.pNext = properties_list_top; + properties_list_top = &shader_int_dot_product_properties; #endif /* VK_KHR_shader_integer_dot_product */ +#ifdef VK_EXT_subgroup_size_control + subgroup_size_control_properties.pNext = properties_list_top; + properties_list_top = &subgroup_size_control_properties; +#endif /* VK_EXT_subgroup_size_control */ + + properties2.pNext = properties_list_top; + vkGetPhysicalDeviceProperties2(handle, &properties2); + + subgroup_size = subgroup_properties.subgroupSize; + supported_subgroup_ops = subgroup_properties.supportedOperations; + supported_subgroup_stages = subgroup_properties.supportedStages; + +#ifdef VK_EXT_subgroup_size_control + if (supports_subgroup_size_control) { + min_subgroup_size = subgroup_size_control_properties.minSubgroupSize; + max_subgroup_size = subgroup_size_control_properties.maxSubgroupSize; + required_subgroup_size_stages = + subgroup_size_control_properties.requiredSubgroupSizeStages; + } else { + // Default to the single subgroup_size when control is unavailable so + // callers can use min/max range queries unconditionally. 
+ min_subgroup_size = subgroup_size; + max_subgroup_size = subgroup_size; + required_subgroup_size_stages = 0; + } +#else + min_subgroup_size = subgroup_size; + max_subgroup_size = subgroup_size; + required_subgroup_size_stages = 0; +#endif /* VK_EXT_subgroup_size_control */ } void PhysicalDevice::override_device_name(const std::string& new_name) { diff --git a/backends/vulkan/runtime/vk_api/Device.h b/backends/vulkan/runtime/vk_api/Device.h index 9fa413b2457..05660e779b8 100644 --- a/backends/vulkan/runtime/vk_api/Device.h +++ b/backends/vulkan/runtime/vk_api/Device.h @@ -60,6 +60,12 @@ struct PhysicalDevice final { VkPhysicalDeviceCooperativeMatrix2FeaturesNV cooperative_matrix2_features; #endif /* VK_NV_cooperative_matrix2 */ +#ifdef VK_EXT_subgroup_size_control + VkPhysicalDeviceSubgroupSizeControlFeaturesEXT subgroup_size_control_features; + VkPhysicalDeviceSubgroupSizeControlPropertiesEXT + subgroup_size_control_properties; +#endif /* VK_EXT_subgroup_size_control */ + // Available GPU queues std::vector queue_families; @@ -75,6 +81,22 @@ struct PhysicalDevice final { float timestamp_period; size_t min_ubo_alignment; + // Subgroup properties (queried via VkPhysicalDeviceSubgroupProperties). + // Populated from VK_VERSION_1_1+ devices; otherwise left at safe defaults. + uint32_t subgroup_size; + VkSubgroupFeatureFlags supported_subgroup_ops; + VkShaderStageFlags supported_subgroup_stages; + + // Subgroup size control (VK_EXT_subgroup_size_control / Vulkan 1.3 core). + // Populated only if the extension/feature is supported; default to safe + // values otherwise. min/max set to subgroup_size when the extension is + // missing so callers can use the same range queries unconditionally. 
+ uint32_t min_subgroup_size; + uint32_t max_subgroup_size; + VkShaderStageFlags required_subgroup_size_stages; + bool supports_subgroup_size_control; + bool supports_compute_full_subgroups; + // Device identity std::string device_name; DeviceType device_type; diff --git a/backends/vulkan/runtime/vk_api/Exception.cpp b/backends/vulkan/runtime/vk_api/Exception.cpp index 5bcf047aaf1..3181476543e 100644 --- a/backends/vulkan/runtime/vk_api/Exception.cpp +++ b/backends/vulkan/runtime/vk_api/Exception.cpp @@ -118,6 +118,9 @@ std::ostream& operator<<(std::ostream& out, const VulkanExtension result) { case VulkanExtension::SHADER_FLOAT64: out << "shaderFloat64"; break; + case VulkanExtension::SUBGROUP_SIZE_CONTROL: + out << "VK_EXT_subgroup_size_control (compute stage required size)"; + break; } return out; } diff --git a/backends/vulkan/runtime/vk_api/Exception.h b/backends/vulkan/runtime/vk_api/Exception.h index aa1ef1f2526..ab45ed83bcd 100644 --- a/backends/vulkan/runtime/vk_api/Exception.h +++ b/backends/vulkan/runtime/vk_api/Exception.h @@ -85,6 +85,7 @@ enum class VulkanExtension : uint8_t { INTEGER_DOT_PRODUCT, SHADER_INT64, SHADER_FLOAT64, + SUBGROUP_SIZE_CONTROL, }; class ShaderNotSupportedError : public std::exception { diff --git a/backends/vulkan/runtime/vk_api/Pipeline.cpp b/backends/vulkan/runtime/vk_api/Pipeline.cpp index 522c4b8589b..2b42e25c92e 100644 --- a/backends/vulkan/runtime/vk_api/Pipeline.cpp +++ b/backends/vulkan/runtime/vk_api/Pipeline.cpp @@ -287,9 +287,21 @@ ComputePipeline::ComputePipeline( descriptor.specialization_constants.data(), // pData }; + const void* shader_stage_pnext = nullptr; +#ifdef VK_EXT_subgroup_size_control + VkPipelineShaderStageRequiredSubgroupSizeCreateInfoEXT + required_subgroup_size_info{ + VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_REQUIRED_SUBGROUP_SIZE_CREATE_INFO_EXT, + nullptr, + descriptor.required_subgroup_size}; + if (descriptor.required_subgroup_size > 0u) { + shader_stage_pnext = &required_subgroup_size_info; + } 
+#endif /* VK_EXT_subgroup_size_control */ + const VkPipelineShaderStageCreateInfo shader_stage_create_info{ VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, // sType - nullptr, // pNext + shader_stage_pnext, // pNext 0u, // flags VK_SHADER_STAGE_COMPUTE_BIT, // stage descriptor.shader_module, // module @@ -355,7 +367,8 @@ bool operator==( return ( _1.pipeline_layout == _2.pipeline_layout && _1.shader_module == _2.shader_module && - _1.specialization_constants == _2.specialization_constants); + _1.specialization_constants == _2.specialization_constants && + _1.required_subgroup_size == _2.required_subgroup_size); } // @@ -489,7 +502,18 @@ void ComputePipelineCache::create_pipelines( std::vector create_infos; create_infos.reserve(num_pipelines); - for (const auto& key : keys_to_create) { +#ifdef VK_EXT_subgroup_size_control + // Stable storage for any required-subgroup-size structs that need to live + // until vkCreateComputePipelines returns. Indexed by pipeline index; only + // the entries for pipelines that actually request a fixed subgroup size + // are populated. 
+ std::vector + required_subgroup_size_infos(num_pipelines); +#endif /* VK_EXT_subgroup_size_control */ + + for (size_t pipeline_idx = 0; pipeline_idx < keys_to_create.size(); + ++pipeline_idx) { + const auto& key = keys_to_create[pipeline_idx]; map_entries.push_back(key.specialization_constants.generate_map_entries()); specialization_infos.push_back(VkSpecializationInfo{ @@ -499,9 +523,20 @@ void ComputePipelineCache::create_pipelines( key.specialization_constants.data(), // pData }); + const void* shader_stage_pnext = nullptr; +#ifdef VK_EXT_subgroup_size_control + if (key.required_subgroup_size > 0u) { + required_subgroup_size_infos[pipeline_idx] = { + VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_REQUIRED_SUBGROUP_SIZE_CREATE_INFO_EXT, + nullptr, + key.required_subgroup_size}; + shader_stage_pnext = &required_subgroup_size_infos[pipeline_idx]; + } +#endif /* VK_EXT_subgroup_size_control */ + shader_stage_create_infos.push_back(VkPipelineShaderStageCreateInfo{ VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, // sType - nullptr, // pNext + shader_stage_pnext, // pNext 0u, // flags VK_SHADER_STAGE_COMPUTE_BIT, // stage key.shader_module, // module diff --git a/backends/vulkan/runtime/vk_api/Pipeline.h b/backends/vulkan/runtime/vk_api/Pipeline.h index 67dfaebe75b..5e286ad77d7 100644 --- a/backends/vulkan/runtime/vk_api/Pipeline.h +++ b/backends/vulkan/runtime/vk_api/Pipeline.h @@ -157,6 +157,14 @@ class ComputePipeline final { VkPipelineLayout pipeline_layout; VkShaderModule shader_module; SpecVarList specialization_constants; + // Optional: when nonzero, the pipeline is created with + // VkPipelineShaderStageRequiredSubgroupSizeCreateInfo chained into the + // shader stage pNext, locking the subgroup size to this value. Must be a + // power of two within [adapter.min_subgroup_size(), + // adapter.max_subgroup_size()] and the adapter must support + // VK_EXT_subgroup_size_control with VK_SHADER_STAGE_COMPUTE_BIT in the + // required_subgroup_size_stages mask. 
0 = no requirement. + uint32_t required_subgroup_size = 0u; }; explicit ComputePipeline(VkDevice device, VkPipeline handle); @@ -281,6 +289,9 @@ class ComputePipelineCache final { seed = utils::hash_combine(seed, new_seed); } + seed = utils::hash_combine( + seed, std::hash()(descriptor.required_subgroup_size)); + return seed; } }; diff --git a/backends/vulkan/runtime/vk_api/Shader.cpp b/backends/vulkan/runtime/vk_api/Shader.cpp index c932d0a264b..f55cc88e5c3 100644 --- a/backends/vulkan/runtime/vk_api/Shader.cpp +++ b/backends/vulkan/runtime/vk_api/Shader.cpp @@ -34,7 +34,8 @@ ShaderInfo::ShaderInfo( const bool requires_8bit_storage_ext, const bool requires_integer_dot_product_ext, const bool requires_shader_int64_ext, - const bool requires_shader_float64_ext) + const bool requires_shader_float64_ext, + const uint32_t required_subgroup_size_arg) : src_code{ spirv_bin, size, @@ -47,7 +48,8 @@ ShaderInfo::ShaderInfo( requires_8bit_storage(requires_8bit_storage_ext), requires_integer_dot_product(requires_integer_dot_product_ext), requires_shader_int64(requires_shader_int64_ext), - requires_shader_float64(requires_shader_float64_ext) { + requires_shader_float64(requires_shader_float64_ext), + required_subgroup_size(required_subgroup_size_arg) { } bool operator==(const ShaderInfo& _1, const ShaderInfo& _2) { diff --git a/backends/vulkan/runtime/vk_api/Shader.h b/backends/vulkan/runtime/vk_api/Shader.h index 6311710f02b..6cef4d923e9 100644 --- a/backends/vulkan/runtime/vk_api/Shader.h +++ b/backends/vulkan/runtime/vk_api/Shader.h @@ -69,6 +69,13 @@ struct ShaderInfo final { bool requires_shader_int64 = false; bool requires_shader_float64 = false; + // Subgroup size requirement declared in the shader's yaml. + // 0 = no requirement (default) + // >0 = literal fixed size; pipeline is pinned to this subgroup size. + // Sourced from the yaml's `SUBGROUP_SIZE` template parameter — single + // source of truth shared with GLSL ${SUBGROUP_SIZE} substitution. 
+ uint32_t required_subgroup_size = 0u; + explicit ShaderInfo(); explicit ShaderInfo( @@ -82,7 +89,8 @@ struct ShaderInfo final { const bool requires_8bit_storage_ext, const bool requires_integer_dot_product_ext, const bool requires_shader_int64_ext, - const bool requires_shader_float64_ext); + const bool requires_shader_float64_ext, + const uint32_t required_subgroup_size_arg = 0u); operator bool() const { return src_code.bin != nullptr; diff --git a/backends/vulkan/targets.bzl b/backends/vulkan/targets.bzl index 10775a428bb..7689d522aa6 100644 --- a/backends/vulkan/targets.bzl +++ b/backends/vulkan/targets.bzl @@ -130,7 +130,6 @@ def vulkan_spv_shader_lib(name, spv_filegroups, is_fbcode = False, no_volk = Fal }, cmd = genrule_cmd, default_outs = ["."], - labels = ["uses_dotslash"], ) suffix = "_no_volk" if no_volk else "" diff --git a/backends/vulkan/test/custom_ops/targets.bzl b/backends/vulkan/test/custom_ops/targets.bzl index 5fb0f7f4cbf..3c9a67d8ca4 100644 --- a/backends/vulkan/test/custom_ops/targets.bzl +++ b/backends/vulkan/test/custom_ops/targets.bzl @@ -6,14 +6,13 @@ load( "vulkan_spv_shader_lib", ) -def define_custom_op_test_binary(custom_op_name, extra_deps = [], src_file = None): +def define_custom_op_test_binary(custom_op_name, extra_deps = [], src_file = None, include_torch = False): deps_list = [ ":prototyping_utils", ":operator_implementations", ":custom_ops_shaderlib", "//executorch/backends/vulkan:vulkan_graph_runtime", - runtime.external_dep_location("libtorch"), - ] + extra_deps + ] + ([runtime.external_dep_location("libtorch")] if include_torch else []) + extra_deps src_file_str = src_file if src_file else "{}.cpp".format(custom_op_name) diff --git a/backends/xnnpack/CMakeLists.txt b/backends/xnnpack/CMakeLists.txt index 625e3d2523f..1b46c993b17 100644 --- a/backends/xnnpack/CMakeLists.txt +++ b/backends/xnnpack/CMakeLists.txt @@ -154,12 +154,16 @@ install( xnnpack-normalization xnnpack-operators xnnpack-operator-run + xnnpack-operator-delete 
xnnpack-operator-utils xnnpack-pack-lh xnnpack-packing xnnpack-sanitizers xnnpack-subgraph xnnpack-datatype + xnnpack-fingerprint-id + xnnpack-fingerprint-cache + xnnpack-fingerprint-check xnnpack-reference-ukernels xnnpack-logging EXPORT ExecuTorchTargets diff --git a/backends/xnnpack/third-party/XNNPACK b/backends/xnnpack/third-party/XNNPACK index 3131afead79..1adaa7c709d 160000 --- a/backends/xnnpack/third-party/XNNPACK +++ b/backends/xnnpack/third-party/XNNPACK @@ -1 +1 @@ -Subproject commit 3131afead790c5c69a9aa12273dfc40399789ad7 +Subproject commit 1adaa7c709d4839d29e1f219cb962b01c9e6a905 diff --git a/backends/xnnpack/third-party/cpuinfo b/backends/xnnpack/third-party/cpuinfo index 8a9210069b5..f9a03241f8c 160000 --- a/backends/xnnpack/third-party/cpuinfo +++ b/backends/xnnpack/third-party/cpuinfo @@ -1 +1 @@ -Subproject commit 8a9210069b5a37dd89ed118a783945502a30a4ae +Subproject commit f9a03241f8c3d4ed0c9728f5d70bff873d43d4e0 diff --git a/backends/xnnpack/third-party/pthreadpool b/backends/xnnpack/third-party/pthreadpool index c2ba5c50bb5..a56dcd79c69 160000 --- a/backends/xnnpack/third-party/pthreadpool +++ b/backends/xnnpack/third-party/pthreadpool @@ -1 +1 @@ -Subproject commit c2ba5c50bb58d1397b693740cf75fad836a0d1bf +Subproject commit a56dcd79c699366e7ac6466792c3025883ff7704 diff --git a/backends/xnnpack/third-party/xnnpack.buck.bzl b/backends/xnnpack/third-party/xnnpack.buck.bzl index 14520b07664..ac861435af8 100644 --- a/backends/xnnpack/third-party/xnnpack.buck.bzl +++ b/backends/xnnpack/third-party/xnnpack.buck.bzl @@ -41,7 +41,10 @@ def define_xnnpack(): "XNNPACK/src/memory.c", "XNNPACK/src/mutex.c", "XNNPACK/src/normalization.c", + "XNNPACK/src/operator-delete.c", "XNNPACK/src/operator-utils.c", + "XNNPACK/src/operators/fingerprint_cache.c", + "XNNPACK/src/operators/fingerprint_id.c", "XNNPACK/src/reference/packing.cc", ], headers = get_xnnpack_headers(), @@ -1039,7 +1042,7 @@ def define_xnnpack(): native.cxx_library( name = 
"ukernels_avx512vnnigfni", srcs = select({ - "DEFAULT": prod_srcs_for_arch_wrapper("avx512vnnifgni"), + "DEFAULT": prod_srcs_for_arch_wrapper("avx512vnnigfni"), "ovr_config//cpu:arm32": DEFAULT_DUMMY_SRC, "ovr_config//cpu:arm64": DEFAULT_DUMMY_SRC, }), @@ -1068,6 +1071,7 @@ def define_xnnpack(): "-mavxvnni", "-mf16c", "-mfma", + "-mgfni", ] # @lint-ignore BUCKLINT: native and fb_native are explicitly forbidden in fbcode. @@ -1172,6 +1176,14 @@ def define_xnnpack(): # "-DXNN_ENABLE_DWCONV_MULTIPLASS=0", "-DXNN_ENABLE_ARM_I8MM=1", "-DXNN_ENABLE_ARM_FP16_VECTOR=1", + "-DXNN_ENABLE_SSE=1", + "-DXNN_ENABLE_SSE2=1", + "-DXNN_ENABLE_SSSE3=1", + "-DXNN_ENABLE_SSE41=1", + "-DXNN_ENABLE_AVX=1", + "-DXNN_ENABLE_F16C=1", + "-DXNN_ENABLE_FMA3=1", + "-DXNN_ENABLE_AVX2=1", "-DXNN_ENABLE_AVX512F=1", "-DXNN_ENABLE_AVX512SKX=1", "-DXNN_ENABLE_AVX512VNNI=1", diff --git a/docs/source/Doxyfile b/docs/source/Doxyfile index 2b895b215a3..c6b8fb0275b 100644 --- a/docs/source/Doxyfile +++ b/docs/source/Doxyfile @@ -963,6 +963,11 @@ INPUT = ../devtools/bundled_program/bundled_program.h \ ../runtime/core/span.h \ ../runtime/core/tag.h \ ../runtime/core/tensor_shape_dynamism.h \ + ../extension/module/bundled_module.h \ + ../extension/module/module.h \ + ../extension/tensor/tensor_accessor.h \ + ../extension/tensor/tensor_ptr.h \ + ../extension/tensor/tensor_ptr_maker.h \ ../runtime/platform/compiler.h \ ../runtime/executor/ \ ../runtime/platform/ @@ -2374,7 +2379,7 @@ ENABLE_PREPROCESSING = YES # The default value is: NO. # This tag requires that the tag ENABLE_PREPROCESSING is set to YES. -MACRO_EXPANSION = NO +MACRO_EXPANSION = YES # If the EXPAND_ONLY_PREDEF and MACRO_EXPANSION tags are both set to YES then # the macro expansion is limited to the macros specified with the PREDEFINED and @@ -2382,7 +2387,7 @@ MACRO_EXPANSION = NO # The default value is: NO. # This tag requires that the tag ENABLE_PREPROCESSING is set to YES. 
-EXPAND_ONLY_PREDEF = NO +EXPAND_ONLY_PREDEF = YES # If the SEARCH_INCLUDES tag is set to YES, the include files in the # INCLUDE_PATH will be searched if a #include is found. @@ -2415,7 +2420,8 @@ INCLUDE_FILE_PATTERNS = # recursively expanded use the := operator instead of the = operator. # This tag requires that the tag ENABLE_PREPROCESSING is set to YES. -PREDEFINED = +PREDEFINED = ET_MODULE_NAMESPACE=module \ + ET_BUNDLED_MODULE_NAMESPACE=bundled_module # If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then this # tag can be used to specify a list of macro names that should be expanded. The diff --git a/docs/source/android-backends.md b/docs/source/android-backends.md index d4da0966ed9..bbbbb1418e0 100644 --- a/docs/source/android-backends.md +++ b/docs/source/android-backends.md @@ -16,7 +16,7 @@ Available hardware acceleration backends for Android deployment. - {doc}`android-qualcomm` — Qualcomm AI Engine (NPU) - {doc}`android-mediatek` — MediaTek NPU acceleration - {doc}`android-arm-vgf` — ARM VGF Backend -- {doc}`backends/samsung/samsung-overview` — Samsung Exynos NPU +- {doc}`android-samsung-exynos` — Samsung Exynos NPU ```{toctree} :hidden: @@ -25,4 +25,5 @@ android-vulkan android-qualcomm android-mediatek android-arm-vgf -backends/samsung/samsung-overview +android-samsung-exynos +``` diff --git a/docs/source/api-life-cycle.md b/docs/source/api-life-cycle.md index 0327f23a985..3ccaa4eddb1 100644 --- a/docs/source/api-life-cycle.md +++ b/docs/source/api-life-cycle.md @@ -104,7 +104,7 @@ decorator. Use .. warning:: in the docstrings of deprecated and experimental APIs. See -example +example usage. @@ -115,7 +115,7 @@ usage. -Use the ET_DEPRECATED annotation macro. See example usage. +Use the ET_DEPRECATED annotation macro. See example usage.

@@ -125,7 +125,7 @@ Use the ET_EXPERIMENTAL annotation macro. Start Doxygen comments with DEPRECATED: See -example +example usage.

diff --git a/docs/source/backends-qualcomm.md b/docs/source/backends-qualcomm.md index 6feddcc803c..c4465c8290d 100644 --- a/docs/source/backends-qualcomm.md +++ b/docs/source/backends-qualcomm.md @@ -608,7 +608,7 @@ Supports: For details, see: backends/qualcomm/quantizer/quantizer.py ### Operator Support -[The full operator support matrix](https://github.com/pytorch/executorch/tree/f32cdc3de6f7176d70a80228f1a60bcd45d93437/backends/qualcomm/builders#operator-support-status) is tracked and frequently updated in the ExecuTorch repository. +[The full operator support matrix](https://github.com/pytorch/executorch/tree/main/backends/qualcomm/builders#operator-support-status) is tracked and frequently updated in the ExecuTorch repository. It lists: - Supported PyTorch ops (aten.*, custom ops) @@ -633,4 +633,4 @@ If you encounter any issues while reproducing the tutorial, please file a github [issue](https://github.com/pytorch/executorch/issues) on ExecuTorch repo and tag use `#qcom_aisw` tag ### Debugging tips - - Before trying any complicated models, try out [a simple model example](https://github.com/pytorch/executorch/tree/f32cdc3de6f7176d70a80228f1a60bcd45d93437/examples/qualcomm#simple-examples-to-verify-the-backend-is-working) and see it if works one device. + - Before trying any complicated models, try out [a simple model example](https://github.com/pytorch/executorch/tree/main/examples/qualcomm#simple-examples-to-verify-the-backend-is-working) and see if it works on your device. 
diff --git a/docs/source/backends/arm-ethos-u/arm-ethos-u-overview.md b/docs/source/backends/arm-ethos-u/arm-ethos-u-overview.md index 1a2cd1b44be..faffedece35 100644 --- a/docs/source/backends/arm-ethos-u/arm-ethos-u-overview.md +++ b/docs/source/backends/arm-ethos-u/arm-ethos-u-overview.md @@ -4,7 +4,7 @@ The Arm® Ethos™-U backend targets Edge/IoT-type AI use-cases by enabli [Arm® Ethos™-U55 NPU](https://www.arm.com/products/silicon-ip-cpu/ethos/ethos-u55), [Arm® Ethos™-U65 NPU](https://www.arm.com/products/silicon-ip-cpu/ethos/ethos-u65), and [Arm® Ethos™-U85 NPU](https://www.arm.com/products/silicon-ip-cpu/ethos/ethos-u85), leveraging [TOSA](https://www.mlplatform.org/tosa/) and the [ethos-u-vela](https://pypi.org/project/ethos-u-vela/) graph compiler. This document is a technical reference for using the Ethos-U backend, for a top level view with code examples -please refer to the [Arm Ethos-U Backend Tutorial](https://docs.pytorch.org/executorch/stable/tutorial-arm-ethos-u.html). +please refer to the [Arm Ethos-U Backend Tutorial](tutorials/ethos-u-getting-started.md). ## Features @@ -111,7 +111,7 @@ For more information on quantization, see [Quantization](arm-ethos-u-quantizatio ## Runtime Integration -An example runtime application is available in [examples/arm/executor_runner](https://github.com/pytorch/executorch/blob/main/examples/arm/executor_runner/), and the steps requried for building and deploying it on a FVP it is explained in the previously mentioned [Arm Ethos-U Backend Tutorial](https://docs.pytorch.org/executorch/stable/tutorial-arm-ethos-u.html). +An example runtime application is available in [examples/arm/executor_runner](https://github.com/pytorch/executorch/blob/main/examples/arm/executor_runner/), and the steps required for building and deploying it on an FVP are explained in the previously mentioned [Arm Ethos-U Backend Tutorial](tutorials/ethos-u-getting-started.md). 
The example application is recommended to use for testing basic functionality of your lowered models, as well as a starting point for developing runtime integrations for your own targets. For an in-depth explanation of the architecture of the executor_runner and the steps required for doing such an integration, please refer to [Ethos-U porting guide](https://github.com/pytorch/executorch/blob/main/examples/arm/ethos-u-porting-guide.md). diff --git a/docs/source/backends/arm-ethos-u/arm-ethos-u-troubleshooting.md b/docs/source/backends/arm-ethos-u/arm-ethos-u-troubleshooting.md index e6d35c0646e..9fe485e9f04 100644 --- a/docs/source/backends/arm-ethos-u/arm-ethos-u-troubleshooting.md +++ b/docs/source/backends/arm-ethos-u/arm-ethos-u-troubleshooting.md @@ -24,7 +24,7 @@ You can see how this coupling between the memory mode and runtime application i ## Using Bundled.io and ETdump -The arm_executor_runner supports [bundled-io](https://docs.pytorch.org/executorch/0.4/bundled-io.html) and [ETdump](https://docs.pytorch.org/executorch/stable/etdump.html) debugging tools. +The arm_executor_runner supports [bundled-io](https://docs.pytorch.org/executorch/stable/bundled-io.html) and [ETdump](https://docs.pytorch.org/executorch/stable/etdump.html) debugging tools. To enable bundled-io, set `-DEXECUTORCH_BUILD_DEVTOOLS=ON` when building Executorch and `-DET_BUNDLE_IO=ON` when building the executor_runner. To enable ETdump, set `-DEXECUTORCH_BUILD_ARM_ETDUMP=ON` when building Executorch and `-DEXECUTORCH_ENABLE_EVENT_TRACER=ON` when building the executor_runner. 
diff --git a/docs/source/backends/nxp/nxp-overview.md b/docs/source/backends/nxp/nxp-overview.md index 2bf66e28e5c..6070f86e458 100644 --- a/docs/source/backends/nxp/nxp-overview.md +++ b/docs/source/backends/nxp/nxp-overview.md @@ -39,7 +39,7 @@ $ ./examples/nxp/setup.sh To test the eIQ Neutron Backend, both AoT flow for model preparation and Runtime for execution, refer to the [Getting started with eIQ Neutron NPU ExecuTorch backend](tutorials/nxp-basic-tutorial.md) -For a quick overview how to convert a custom PyTorch model, take a look at our [example python script](https://github.com/pytorch/executorch/tree/release/1.0/examples/nxp/aot_neutron_compile.py). +For a quick overview how to convert a custom PyTorch model, take a look at our [example python script](https://github.com/pytorch/executorch/tree/main/examples/nxp/aot_neutron_compile.py). ## Runtime Integration diff --git a/docs/source/backends/nxp/nxp-partitioner.rst b/docs/source/backends/nxp/nxp-partitioner.rst index 9aa65b1d0d9..c568959883f 100644 --- a/docs/source/backends/nxp/nxp-partitioner.rst +++ b/docs/source/backends/nxp/nxp-partitioner.rst @@ -28,7 +28,7 @@ Following fields can be set: Custom Delegation Options ------------------------- By default the Neutron backend is defensive, what means it does not delegate operators which cannot be decided statically during partitioning. But as the model author you typically have insight into the model and so you can allow opportunistic delegation for some cases. For list of options, see -`CustomDelegationOptions `_ +`CustomDelegationOptions `_ ================ Operator Support @@ -37,7 +37,7 @@ Operator Support Operators are the building blocks of the ML model. See `IRs `_ for more information on the PyTorch operator set. This section lists the Edge operators supported by the Neutron backend. 
-For detailed constraints of the operators see the conditions in the ``is_supported_*`` functions in the `Node converters `_ +For detailed constraints of the operators see the ``is_supported`` / ``_is_supported_in_IR`` / ``_is_supported_on_target`` checks in the `Node converters `_ .. csv-table:: Operator Support diff --git a/docs/source/backends/xnnpack/xnnpack-partitioner.rst b/docs/source/backends/xnnpack/xnnpack-partitioner.rst index a0881aa3a6a..85dc3bf9c61 100644 --- a/docs/source/backends/xnnpack/xnnpack-partitioner.rst +++ b/docs/source/backends/xnnpack/xnnpack-partitioner.rst @@ -2,10 +2,10 @@ Partitioner API =============== -The XNNPACK partitioner API allows for configuration of the model delegation to XNNPACK. Passing an ``XnnpackPartitioner`` instance with no additional parameters will run as much of the model as possible on the XNNPACK backend. This is the most common use-case. For advanced use cases, the partitioner exposes the following options via the `constructor `_: +The XNNPACK partitioner API allows for configuration of the model delegation to XNNPACK. Passing an ``XnnpackPartitioner`` instance with no additional parameters will run as much of the model as possible on the XNNPACK backend. This is the most common use-case. For advanced use cases, the partitioner exposes the following options via the `constructor `_: -- ``configs``: Control which operators are delegated to XNNPACK. By default, all available operators all delegated. See `../config/__init__.py `_ for an exhaustive list of available operator configs. -- ``config_precisions``: Filter operators by data type. By default, delegate all precisions. One or more of ``ConfigPrecisionType.FP32``, ``ConfigPrecisionType.STATIC_QUANT``, or ``ConfigPrecisionType.DYNAMIC_QUANT``. See `ConfigPrecisionType `_. +- ``configs``: Control which operators are delegated to XNNPACK. By default, all available operators are delegated. 
See `../config/__init__.py `_ for an exhaustive list of available operator configs. +- ``config_precisions``: Filter operators by data type. By default, delegate all precisions. One or more of ``ConfigPrecisionType.FP32``, ``ConfigPrecisionType.STATIC_QUANT``, or ``ConfigPrecisionType.DYNAMIC_QUANT``. See `ConfigPrecisionType `_. - ``per_op_mode``: If true, emit individual delegate calls for every operator. This is an advanced option intended to reduce memory overhead in some contexts at the cost of a small amount of runtime overhead. Defaults to false. - ``verbose``: If true, print additional information during lowering. diff --git a/docs/source/bundled-io.md b/docs/source/bundled-io.md index d901710bfb7..2597b991920 100644 --- a/docs/source/bundled-io.md +++ b/docs/source/bundled-io.md @@ -199,17 +199,17 @@ This stage mainly focuses on executing the model with the bundled inputs and com ### Get ExecuTorch Program Pointer from `BundledProgram` Buffer We need the pointer to ExecuTorch program to do the execution. To unify the process of loading and executing `BundledProgram` and Program flatbuffer, we create an API for this -`executorch::bundled_program::get_program_data`. Check out an [example usage](https://github.com/pytorch/executorch/blob/release/1.0/examples/devtools/example_runner/example_runner.cpp#L128-L137) of this API. +`executorch::bundled_program::get_program_data`. Check out an [example usage](https://github.com/pytorch/executorch/blob/main/examples/devtools/example_runner/example_runner.cpp#L128-L137) of this API. ### Load Bundled Input to Method -To execute the program on the bundled input, we need to load the bundled input into the method. Here we provided an API called `executorch::bundled_program::load_bundled_input`. Check out an [example usage](https://github.com/pytorch/executorch/blob/release/1.0/examples/devtools/example_runner/example_runner.cpp#L253-L259) of this API. 
+To execute the program on the bundled input, we need to load the bundled input into the method. Here we provided an API called `executorch::bundled_program::load_bundled_input`. Check out an [example usage](https://github.com/pytorch/executorch/blob/main/examples/devtools/example_runner/example_runner.cpp#L253-L259) of this API. ### Verify the Method's Output. -We call `executorch::bundled_program::verify_method_outputs` to verify the method's output with bundled expected outputs. Check out an [example usage](https://github.com/pytorch/executorch/blob/release/1.0/examples/devtools/example_runner/example_runner.cpp#L301-L307) of this API. +We call `executorch::bundled_program::verify_method_outputs` to verify the method's output with bundled expected outputs. Check out an [example usage](https://github.com/pytorch/executorch/blob/main/examples/devtools/example_runner/example_runner.cpp#L301-L307) of this API. ### Runtime Example -Please checkout our [example runner](https://github.com/pytorch/executorch/blob/release/0.6/examples/devtools/README.md#bundledprogram) for a bundled program. You could run these commands to test with the BundledProgram binary (`.bpte`) file you generated in the previous step: +Please check out our [example runner](https://github.com/pytorch/executorch/blob/main/examples/devtools/README.md#bundledprogram) for a bundled program. You could run these commands to test with the BundledProgram binary (`.bpte`) file you generated in the previous step: ```bash cd executorch @@ -218,7 +218,7 @@ cd executorch ``` It is expected to see no output from running the above mentioned snippet. -For a detailed example of how the runner should be like, please refer to our [example runner](https://github.com/pytorch/executorch/blob/release/1.0/examples/devtools/example_runner/example_runner.cpp). 
+For a detailed example of what the runner should look like, please refer to our [example runner](https://github.com/pytorch/executorch/blob/main/examples/devtools/example_runner/example_runner.cpp). ### Try the Complete Workflow diff --git a/docs/source/compiler-custom-compiler-passes.md b/docs/source/compiler-custom-compiler-passes.md index ff0013a1929..aaba70b02af 100644 --- a/docs/source/compiler-custom-compiler-passes.md +++ b/docs/source/compiler-custom-compiler-passes.md @@ -25,7 +25,7 @@ Our projection on the frequency of these use cases are: For level 1 uses cases (creating one-to-X mappings, performing forwards iterations, and looking at local node information), we can utilize a helper class called -[`ExportPass`](https://github.com/pytorch/executorch/blob/d9eef24bb720804aa7b400b05241487510ae0dc2/exir/pass_base.py#L44). +[`ExportPass`](https://github.com/pytorch/executorch/blob/main/exir/pass_base.py#L655). This is an [interpreter-based](https://pytorch.org/docs/stable/fx.html#the-interpreter-pattern) way where we execute each node and recreate the graph except with @@ -35,7 +35,7 @@ metadata such as stack trace, FakeTensor values, and torch.nn.Module hierarchy are preserved and updated depending on the transformations made. To implement this pass, we can create a subclass of -[`ExportPass`](https://github.com/pytorch/executorch/blob/d9eef24bb720804aa7b400b05241487510ae0dc2/exir/pass_base.py#L44) +[`ExportPass`](https://github.com/pytorch/executorch/blob/main/exir/pass_base.py#L655) and implement the exposed functions. When called with a graph module, it will run the graph module and create a new graph containing the changes specified by the pass. 
This means that the graph module passed in must be runnable on CPU, @@ -171,7 +171,7 @@ class ScalarToTensorPass(ExportPass): ### Level 2 For creating many-to-one mappings, we can utilize FX's [subgraph -rewriter](https://github.com/pytorch/pytorch/blob/8597d37536ef11bdf6b0a539ab79af876e1c92f6/torch/fx/subgraph_rewriter.py#L77). +rewriter](https://github.com/pytorch/pytorch/blob/main/torch/fx/subgraph_rewriter.py#L96). Given a `pattern`, it creates a subgraph of operators matching to the pattern, and then replaces each matched subgraph with the `replacement`. @@ -229,7 +229,7 @@ class ReplacedPatterns: ### Level 3 For the third way of creating a pass, we can utilize the most basic -[`PassBase`](https://github.com/pytorch/pytorch/blob/8597d37536ef11bdf6b0a539ab79af876e1c92f6/torch/fx/passes/infra/pass_base.py#L22). +[`PassBase`](https://github.com/pytorch/pytorch/blob/main/torch/fx/passes/infra/pass_base.py#L28). To create a pass, we can subclass this and implement the function `call` with the pass contents. Additionally, we can implement the functions `requires` and `ensures` which will be called before and after the function `call`. Note that @@ -315,7 +315,7 @@ with IR Spec, so be careful when using them. For finding subgraphs within a graph that match a specific pattern, we can utilize FX's -[`SubgraphMatcher`](https://github.com/pytorch/pytorch/blob/8597d37536ef11bdf6b0a539ab79af876e1c92f6/torch/fx/passes/utils/matcher_utils.py#L51). +[`SubgraphMatcher`](https://github.com/pytorch/pytorch/blob/main/torch/fx/passes/utils/matcher_utils.py#L63). Class Attributes: @@ -382,7 +382,7 @@ class InternalMatch(): To find the largest subgraphs of nodes that support a specific invariant, we can utilize FX's -[`CapabilityBasedPartitioner`](https://github.com/pytorch/pytorch/blob/8597d37536ef11bdf6b0a539ab79af876e1c92f6/torch/fx/passes/infra/partitioner.py#L34C1-L34C1). 
+[`CapabilityBasedPartitioner`](https://github.com/pytorch/pytorch/blob/main/torch/fx/passes/infra/partitioner.py#L65). Class Attributes @@ -399,14 +399,14 @@ Class Attributes that are allowed to be in a single node partition. The -[`OperatorSupportBase`](https://github.com/pytorch/pytorch/blob/8597d37536ef11bdf6b0a539ab79af876e1c92f6/torch/fx/passes/operator_support.py#L28) +[`OperatorSupportBase`](https://github.com/pytorch/pytorch/blob/main/torch/fx/passes/operator_support.py#L37) class is used by the partitioner to determine if a specific node in the graph belongs in the partition. This is done by overriding the `is_node_supported` function. You can -chain multiple `OperatorSuppportBase` by using -[`chain`](https://github.com/pytorch/pytorch/blob/8597d37536ef11bdf6b0a539ab79af876e1c92f6/torch/fx/passes/operator_support.py#L150)(which +chain multiple `OperatorSupportBase` by using +[`chain`](https://github.com/pytorch/pytorch/blob/main/torch/fx/passes/operator_support.py#L159)(which returns False if any of the OperatorSupportBase return False) and -[`any_chain`](https://github.com/pytorch/pytorch/blob/8597d37536ef11bdf6b0a539ab79af876e1c92f6/torch/fx/passes/operator_support.py#L164) +[`any_chain`](https://github.com/pytorch/pytorch/blob/main/torch/fx/passes/operator_support.py#L172) (which returns True if any of the OperatorSupportBase returns True). Consider the following example: @@ -440,7 +440,7 @@ not allow `call_module` nodes. 
### Combined We also provide a combined helper function: -[`generate_pattern_op_partitions`](https://github.com/pytorch/executorch/blob/d9eef24bb720804aa7b400b05241487510ae0dc2/exir/backend/canonical_partitioners/pattern_op_partitioner.py#L59) +[`generate_pattern_op_partitions`](https://github.com/pytorch/executorch/blob/main/exir/backend/canonical_partitioners/pattern_op_partitioner.py#L107) Args: * `graph_module (fx.GraphModule)`: Module that we want to partition diff --git a/docs/source/compiler-memory-planning.md b/docs/source/compiler-memory-planning.md index 5c30defada7..5a34634beec 100644 --- a/docs/source/compiler-memory-planning.md +++ b/docs/source/compiler-memory-planning.md @@ -82,7 +82,7 @@ program = edge_program.to_executorch( ) ``` -Users attempting to write a custom memory planning algorithm should start by looking at [the greedy algorithm's implementation](https://github.com/pytorch/executorch/blob/d62c41ca86435e5316e7ed292b6d68aff27a2fb7/exir/memory_planning.py#L459C1-L459C12). +Users attempting to write a custom memory planning algorithm should start by looking at [the greedy algorithm's implementation](https://github.com/pytorch/executorch/blob/main/exir/memory_planning.py#L801). 
## Debugging Tool diff --git a/docs/source/conf.py b/docs/source/conf.py index 1a2ef3e5e5f..75757b7da27 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -283,13 +283,23 @@ "tutorials/export-to-executorch-tutorial": "../using-executorch-export.html", "build-run-vulkan": "backends/vulkan/vulkan-overview.html", "backends-vulkan": "backends/vulkan/vulkan-overview.html", - "executorch-arm-delegate-tutorial": "backends-arm-ethos-u.html", - "build-run-coreml": "backends-coreml.html", + "executorch-arm-delegate-tutorial": "backends/arm-ethos-u/tutorials/ethos-u-getting-started.html", + "build-run-coreml": "backends/coreml/coreml-overview.html", "build-run-mediatek-backend": "backends-mediatek.html", - "build-run-mps": "backends-mps.html", + "build-run-mps": "backends/mps/mps-overview.html", "build-run-qualcomm-ai-engine-direct-backend": "backends-qualcomm.html", "build-run-xtensa": "backends-cadence.html", "apple-runtime": "using-executorch-ios.html", + "backends-arm-ethos-u": "backends/arm-ethos-u/arm-ethos-u-overview.html", + "backends-arm-vgf": "backends/arm-vgf/arm-vgf-overview.html", + "backends-coreml": "backends/coreml/coreml-overview.html", + "backends-mps": "backends/mps/mps-overview.html", + "backends-xnnpack": "backends/xnnpack/xnnpack-overview.html", + "backend-delegates-xnnpack-reference": "backends/xnnpack/xnnpack-arch-internals.html", + "llm/llama-demo-android": "../using-executorch-android.html", + "tutorial-arm-ethos-u": "backends/arm-ethos-u/tutorials/ethos-u-getting-started.html", + "tutorial-arm-vgf": "backends/arm-vgf/tutorials/vgf-getting-started.html", + "visualization": "visualize.html", } # Custom directives defintions to create cards on main landing page diff --git a/docs/source/executorch-runtime-api-reference.rst b/docs/source/executorch-runtime-api-reference.rst index 8853e5444eb..42f75e500e3 100644 --- a/docs/source/executorch-runtime-api-reference.rst +++ b/docs/source/executorch-runtime-api-reference.rst @@ -40,3 +40,26 @@ 
Values .. doxygenclass:: executorch::runtime::etensor::Tensor :members: + +Module Extension +---------------- + +The Module extension provides a higher-level C++ facade for loading programs, +setting inputs and outputs, and executing methods with common runtime defaults. + +.. doxygenclass:: executorch::extension::module::Module + :members: + +.. doxygenclass:: executorch::extension::bundled_module::BundledModule + :members: + +Tensor Extension +---------------- + +The Tensor extension provides managed tensor helpers for C++ applications that +need to create, alias, resize, or index tensors before passing them to runtime +APIs. + +.. doxygennamespace:: executorch::extension + :members: + :content-only: diff --git a/docs/source/platforms-desktop.md b/docs/source/platforms-desktop.md index ba22786576f..b004d47c2ee 100644 --- a/docs/source/platforms-desktop.md +++ b/docs/source/platforms-desktop.md @@ -11,11 +11,11 @@ ExecuTorch supports desktop and laptop deployment across Linux, macOS, and Windo ### Linux - [XNNPACK (CPU)](backends/xnnpack/xnnpack-overview.md) - [OpenVINO (Intel)](build-run-openvino) -- [ARM Ethos-U (ARM64)](backends-arm-ethos-u) +- [ARM Ethos-U (ARM64)](backends/arm-ethos-u/arm-ethos-u-overview.md) ### macOS -- [CoreML (recommended)](backends-coreml) -- [MPS (Apple Silicon)](backends-mps) +- [Core ML (recommended)](backends/coreml/coreml-overview.md) +- [MPS (Apple Silicon)](backends/mps/mps-overview.md) - [XNNPACK (CPU)](backends/xnnpack/xnnpack-overview.md) ### Windows diff --git a/docs/source/platforms-embedded.md b/docs/source/platforms-embedded.md index 5ea248fc0d9..f766eddda82 100644 --- a/docs/source/platforms-embedded.md +++ b/docs/source/platforms-embedded.md @@ -10,10 +10,10 @@ ExecuTorch supports embedded devices from microcontrollers to edge devices. 
### Microcontrollers - [Cadence Xtensa Backend](backends-cadence) -- [ARM Ethos-U NPU Backend](backends-arm-ethos-u) +- [ARM Ethos-U NPU Backend](backends/arm-ethos-u/arm-ethos-u-overview.md) - [Custom Backend Development](backend-delegates-integration) ### Edge Devices -- [ARM Ethos-U NPU Backend](backends-arm-ethos-u) -- [NXP eIQ Neutron Backend](backend-nxp) +- [ARM Ethos-U NPU Backend](backends/arm-ethos-u/arm-ethos-u-overview.md) +- [NXP eIQ Neutron Backend](backends/nxp/nxp-overview.md) - [Custom Hardware Integration](backend-delegates-integration) diff --git a/docs/source/tools-section.md b/docs/source/tools-section.md index c54b4933c44..6d8061dd33a 100644 --- a/docs/source/tools-section.md +++ b/docs/source/tools-section.md @@ -13,7 +13,7 @@ In this section, explore ExecuTorch's comprehensive developer tools for profilin - {doc}`model-inspector` — Model Inspector - {doc}`memory-planning-inspection` — Memory Planning Inspection - {doc}`devtools-tutorial` — Development Utilities -- {doc}`visualization` — Model Visualization +- {doc}`visualize` — Model Visualization ```{toctree} :hidden: @@ -29,4 +29,4 @@ model-debugging model-inspector memory-planning-inspection devtools-tutorial -visualization +visualize diff --git a/docs/source/using-executorch-android.md b/docs/source/using-executorch-android.md index 443015b47be..ef55ade68aa 100644 --- a/docs/source/using-executorch-android.md +++ b/docs/source/using-executorch-android.md @@ -82,7 +82,7 @@ Starting from 2025-04-12, you can download nightly `main` branch snapshots: * `executorch.aar`: `https://ossci-android.s3.amazonaws.com/executorch/release/snapshot-{YYYYMMDD}/executorch.aar` * `executorch.aar.sha256sums`: `https://ossci-android.s3.amazonaws.com/executorch/release/snapshot-{YYYYMMDD}/executorch.aar.sha256sums` * Replace `YYYYMMDD` with the actual date you want to use. 
-* AAR file is generated by [this workflow](https://github.com/pytorch/executorch/blob/c66b37d010c88a113560693b14dc6bd112593c11/.github/workflows/android-release-artifacts.yml#L14-L15). +* AAR file is generated by [this workflow](https://github.com/pytorch/executorch/blob/main/.github/workflows/android-release-artifacts.yml). For example: diff --git a/docs/source/using-executorch-export.md b/docs/source/using-executorch-export.md index c8814b74eda..d37dfae2ef7 100644 --- a/docs/source/using-executorch-export.md +++ b/docs/source/using-executorch-export.md @@ -38,7 +38,7 @@ Commonly used hardware backends are listed below. For mobile, consider using XNN - [Vulkan (Android GPU)](backends/vulkan/vulkan-overview.md) - [Qualcomm NPU](backends-qualcomm.md) - [MediaTek NPU](backends-mediatek.md) -- [Arm Ethos-U NPU](backends-arm-ethos-u.md) +- [Arm Ethos-U NPU](backends/arm-ethos-u/arm-ethos-u-overview.md) - [Cadence DSP](backends-cadence.md) ## Model Preparation diff --git a/examples/models/llama/rope.py b/examples/models/llama/rope.py index 7fc111e2c34..5b3a2d03874 100644 --- a/examples/models/llama/rope.py +++ b/examples/models/llama/rope.py @@ -154,21 +154,32 @@ def hf_precompute_freqs_cis( # Partial rotary embeddings. dim = int(dim * partial_rotary_factor) - # Compute the RoPE table in fp64 to minimize ULP-level drift; cast to fp32 - # once at the end. Phi-4 Mini's narrow decode-time logit margins make the - # exported model sensitive to 1-ULP differences in freqs_cos / freqs_sin - # under sampling, especially on the Vulkan delegate. + # fp64 precompute is required whenever cos/sin will be scaled by a + # non-trivial attention_factor (LongRoPE on Phi-3 / Phi-4 family). There, + # fp32 ULP-level rounding in the table is load-bearing on Vulkan under + # sampling -- a fp32-only regression manifests as decode-time n-gram + # looping, not a unit-test red. 
For vanilla HF RoPE, fp32 throughout + # produces cos/sin tables bit-identical to the non-HF precompute_freqs_cis + # path, which the static-attention vs MHA parity tests rely on. + # + # If you add a new model that needs cos/sin scaling but does not set + # short_factor / long_factor / attention_factor, extend the gate below. + longrope_active = (short_factor is not None) or (long_factor is not None) + needs_fp64 = longrope_active or ( + attention_factor is not None and attention_factor != 1.0 + ) + compute_dtype = torch.float64 if needs_fp64 else torch.float32 + inv_freq = 1.0 / ( theta ** ( - torch.arange(0, dim, 2, device=device, dtype=torch.int64).to(torch.float64) + torch.arange(0, dim, 2, device=device, dtype=torch.int64).to(compute_dtype) / dim ) ) # LongRoPE: divide inv_freq element-wise by short_factor or long_factor. # Selection mirrors HF: long_factor when seq_len > original_max_position_embeddings. - longrope_active = (short_factor is not None) or (long_factor is not None) if longrope_active: chosen = ( long_factor @@ -178,7 +189,7 @@ def hf_precompute_freqs_cis( if chosen is None: # Fall back to whichever factor was provided. chosen = short_factor if long_factor is None else long_factor - ext_factors = torch.tensor(chosen, dtype=torch.float64, device=device) + ext_factors = torch.tensor(chosen, dtype=compute_dtype, device=device) assert ext_factors.numel() == inv_freq.numel(), ( f"LongRoPE factor length {ext_factors.numel()} must equal dim/2 " f"({inv_freq.numel()})" @@ -200,8 +211,8 @@ def hf_precompute_freqs_cis( ) # pyre-ignore Undefined attribute [16]: `float` has no attribute `device`. 
- t = torch.arange(end, device=inv_freq.device, dtype=torch.int64).to(torch.float64) - freqs = torch.outer(t, inv_freq).to(torch.float64) # pyre-ignore + t = torch.arange(end, device=inv_freq.device, dtype=torch.int64).to(compute_dtype) + freqs = torch.outer(t, inv_freq).to(compute_dtype) # pyre-ignore emb = torch.cat((freqs, freqs), dim=-1) cos_tab = torch.cos(emb) sin_tab = torch.sin(emb) diff --git a/examples/models/llama/runner/static_attention_io_manager.h b/examples/models/llama/runner/static_attention_io_manager.h index c4e851f0b0c..6f631df3ff0 100644 --- a/examples/models/llama/runner/static_attention_io_manager.h +++ b/examples/models/llama/runner/static_attention_io_manager.h @@ -14,6 +14,7 @@ #include #include +#include #include #include #include @@ -53,8 +54,8 @@ class StaticKVCache { style_(style), input_ptrs_(n_caches_), output_ptrs_(n_caches_) { - size_t total_cache_len = - std::accumulate(cache_lengths_.begin(), cache_lengths_.end(), 0); + size_t total_cache_len = std::accumulate( + cache_lengths_.begin(), cache_lengths_.end(), size_t(0)); cache_data_size_ = total_cache_len * n_heads_per_cache_ * head_dim_; update_data_size_ = n_caches_ * n_heads_per_cache_ * max_input_len_ * head_dim_; @@ -867,6 +868,12 @@ class StaticAttentionIOManager { void set_input(executorch::runtime::Method& method, size_t idx, T* data) { auto methodMeta = method.method_meta(); auto inputMeta = methodMeta.input_tensor_meta(idx); + ET_CHECK_MSG( + sizeof(T) == executorch::runtime::elementSize(inputMeta->scalar_type()), + "set_input: sizeof(T)=%zu but model expects element size %zu for input %zu", + sizeof(T), + executorch::runtime::elementSize(inputMeta->scalar_type()), + idx); auto impl = ::executorch::runtime::etensor::TensorImpl( inputMeta->scalar_type(), inputMeta->sizes().size(), diff --git a/examples/models/moshi/mimi/BUCK b/examples/models/moshi/mimi/BUCK index 9774655f951..1d52649166f 100644 --- a/examples/models/moshi/mimi/BUCK +++ b/examples/models/moshi/mimi/BUCK 
@@ -1,5 +1,4 @@ load("@fbcode_macros//build_defs:build_file_migration.bzl", "fbcode_target", "non_fbcode_target") -load("@fbsource//tools/target_determinator/macros:fbcode_ci_helpers.bzl", "fbcode_ci") load("@fbsource//tools/target_determinator/macros:ci.bzl", "ci") load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") @@ -8,7 +7,9 @@ oncall("executorch") fbcode_target(_kind = runtime.python_test, name = "test_mimi", srcs = ["test_mimi.py"], - labels = ci.labels(fbcode_ci.use_opt_instead_of_dev()), + # Skipped in fbcode CI: setUpClass downloads the model from HuggingFace, + # which requires network access unavailable in this CI environment. Still runs in OSS CI. + labels = [ci.skip_target()], deps = [ "fbsource//third-party/pypi/huggingface-hub:huggingface-hub", "fbsource//third-party/pypi/moshi:moshi", diff --git a/examples/models/moshi/mimi/install_requirements.sh b/examples/models/moshi/mimi/install_requirements.sh index eb8fe96ed05..9fc12f64bc9 100755 --- a/examples/models/moshi/mimi/install_requirements.sh +++ b/examples/models/moshi/mimi/install_requirements.sh @@ -7,16 +7,10 @@ set -x -SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) -# torch_pin lives at the executorch repo root. -cd "$SCRIPT_DIR/../../../.." 
- -TORCHCODEC_PKG=$(python -c "from torch_pin import torchcodec_spec; print(torchcodec_spec())") -TORCHCODEC_INDEX=$(python -c "from torch_pin import torch_index_url_base; print(torch_index_url_base())") - sudo apt install ffmpeg -y -pip install "$TORCHCODEC_PKG" --extra-index-url "${TORCHCODEC_INDEX}/cpu" +pip install torchcodec==0.11.0 --extra-index-url https://download.pytorch.org/whl/test/cpu pip install moshi==0.2.11 pip install bitsandbytes soundfile einops # Run llama2/install requirements for torchao deps +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) bash "$SCRIPT_DIR"/../../llama/install_requirements.sh diff --git a/examples/qualcomm/oss_scripts/llama/runner/lhd_token_generator.h b/examples/qualcomm/oss_scripts/llama/runner/lhd_token_generator.h index e97f64b7c1d..796dde88014 100644 --- a/examples/qualcomm/oss_scripts/llama/runner/lhd_token_generator.h +++ b/examples/qualcomm/oss_scripts/llama/runner/lhd_token_generator.h @@ -102,6 +102,9 @@ class LhdTokenGenerator : public TokenGenerator { AttentionSinkRopeRunner* attention_sink_rope_runner) override; private: + // Bring base class's virtual prepare_io into scope so the overload below + // does not hide it (-Woverloaded-virtual). + using TokenGenerator::prepare_io; /** * @brief Fill in I/O buffers with prompt token and position. * @param cur_token Current token. 
diff --git a/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_lhd_token_generator.h b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_lhd_token_generator.h index 83da9e7a6ba..7494afec6da 100644 --- a/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_lhd_token_generator.h +++ b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_lhd_token_generator.h @@ -108,6 +108,9 @@ class MultimodalLhdTokenGenerator AttentionSinkRopeRunner* attention_sink_rope_runner) override; private: + // Bring base class's virtual prepare_io into scope so the overload below + // does not hide it (-Woverloaded-virtual). + using TokenGenerator::prepare_io; /** * @brief Fill in I/O buffers with prompt token and position. * @param cur_token Current token. diff --git a/examples/qualcomm/oss_scripts/llama/runner/prompt_processor.h b/examples/qualcomm/oss_scripts/llama/runner/prompt_processor.h index 0790985d231..599f7050d83 100644 --- a/examples/qualcomm/oss_scripts/llama/runner/prompt_processor.h +++ b/examples/qualcomm/oss_scripts/llama/runner/prompt_processor.h @@ -40,6 +40,8 @@ class PromptProcessor { const std::string& method_name, Metadata metadata); + virtual ~PromptProcessor() = default; + /** * @brief Initialize I/O tensor and allocate I/O data buffer. 
* @param buffer_manager Pointer to IMemAlloc instance; by default, it uses a diff --git a/examples/qualcomm/oss_scripts/llama/runner/runner.cpp b/examples/qualcomm/oss_scripts/llama/runner/runner.cpp index 0e9b7860dbd..0a4a8b9abb5 100644 --- a/examples/qualcomm/oss_scripts/llama/runner/runner.cpp +++ b/examples/qualcomm/oss_scripts/llama/runner/runner.cpp @@ -102,6 +102,7 @@ Runner::Runner( std::unique_ptr tokenizer, std::unique_ptr attention_sink_rope_module) : module_(std::move(module)), + attention_sink_rope_module_(std::move(attention_sink_rope_module)), ngram_(ngram), window_(window), gcap_(gcap), @@ -111,8 +112,7 @@ Runner::Runner( temperature_(temperature), eval_mode_(static_cast(eval_mode)), shared_buffer_(shared_buffer), - tokenizer_(std::move(tokenizer)), - attention_sink_rope_module_(std::move(attention_sink_rope_module)) { + tokenizer_(std::move(tokenizer)) { stats_.reset(); if (decoder_model_version == "llama2") { diff --git a/exir/program/_program.py b/exir/program/_program.py index c68d0eed945..950e203c86c 100644 --- a/exir/program/_program.py +++ b/exir/program/_program.py @@ -1081,7 +1081,7 @@ def _sanity_check_graph_for_non_decomp_ops( logging.warning(warning_str) -def _remove_invalid_ops_for_not_decompose( +def _remove_invalid_ops_for_not_decompose( # noqa: C901 preserve_ops: List[torch._ops.OpOverload], ) -> List[torch._ops.OpOverload]: _logged_warnings = set() @@ -1124,6 +1124,16 @@ def keep(op): ) return False + # Fallback: torchgen does not detect alias annotations on ops + # returning lists of aliased tensors (e.g. split.Tensor returns + # Tensor(a)[]). Check op._schema.returns directly. + for ret in schema.returns: + if ret.alias_info is not None: + log_warning( + f"Op {op} was requested for preservation by partitioner. This request is ignored because it aliases output." 
+ ) + return False + # Explicit block list of ops that don't work if asked for # preservation if op in [ diff --git a/exir/tests/test_passes.py b/exir/tests/test_passes.py index f683384f8f9..8a084ba491a 100644 --- a/exir/tests/test_passes.py +++ b/exir/tests/test_passes.py @@ -940,6 +940,43 @@ def body(i, h, h_accum): torch.allclose(prog.exported_program().module()(inp), model(inp)) ) + def test_remove_invalid_ops_filters_aliased_list_returns(self) -> None: + """Verify _remove_invalid_ops_for_not_decompose filters ops that return + aliased tensor lists (e.g. split, chunk) even when torchgen's + aliased_return_names() fails to detect them. Regression test for + https://github.com/pytorch/executorch/issues/11723 + """ + from executorch.exir.program._program import ( + _remove_invalid_ops_for_not_decompose, + ) + + # These ops return Tensor(a)[] — a list of aliased views. + # torchgen's aliased_return_names() misses the alias annotation on + # list returns, so the fallback check on op._schema.returns is needed. + aliased_list_ops = [ + torch.ops.aten.split.Tensor, + torch.ops.aten.chunk.default, + torch.ops.aten.tensor_split.sections, + torch.ops.aten.split_with_sizes.default, + ] + for op in aliased_list_ops: + result = _remove_invalid_ops_for_not_decompose([op]) + self.assertNotIn( + op, + result, + f"{op} should be filtered out because it returns aliased tensors", + ) + + # Non-aliased ops should be preserved. 
+ preserved_ops = [torch.ops.aten.linear.default] + for op in preserved_ops: + result = _remove_invalid_ops_for_not_decompose([op]) + self.assertIn( + op, + result, + f"{op} should be preserved because it has no aliased returns", + ) + def test_convert_symb_ops(self) -> None: class Foo(torch.nn.Module): def forward(self, x: torch.Tensor) -> torch.Tensor: diff --git a/extension/llm/apple/BUCK b/extension/llm/apple/BUCK index 36da3c77935..667082e27d1 100644 --- a/extension/llm/apple/BUCK +++ b/extension/llm/apple/BUCK @@ -16,7 +16,17 @@ non_fbcode_target(_kind = fb_apple_library, ], sdks = IOS, visibility = EXECUTORCH_CLIENTS, - test_labels = ["long_running"], + # `glacial` raises the per-XCTestCase timeout from 1800s -> 5400s (90 min) + # via fbobjc/Tools/xctest_runner: TEST_CASE_TIMEOUT(60s) * 30 * 3. + # Required because LLM inference (LLaMA, Phi4, Gemma, LLaVA, Voxtral) + # on iOS-sim CPU regularly exceeds 30 minutes for a full forward pass. + test_labels = ["glacial"], + # Rule-level wall-clock for the whole auto-generated test bundle: + # ExecuTorchLLMTests currently contains 13 XCTestCase methods, and + # individual methods can exceed 30 minutes on iOS-sim CPU. This 4h + # budget is intended as the total bundle/shard wall-clock, including + # xctest setup/teardown overhead; it is not based on "5 testcases". + test_test_rule_timeout_ms = 14400000, test_deps = [ ":ExecuTorchLLMTestResource", "//xplat/executorch/backends/xnnpack:xnnpack_backendApple", diff --git a/extension/training/examples/XOR/targets.bzl b/extension/training/examples/XOR/targets.bzl index 4a85c34c1bb..f332dc49b3f 100644 --- a/extension/training/examples/XOR/targets.bzl +++ b/extension/training/examples/XOR/targets.bzl @@ -1,4 +1,4 @@ -load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") +load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "is_xplat", "runtime") def define_common_targets(): """Defines targets that should be shared between fbcode and xplat. 
@@ -23,30 +23,34 @@ def define_common_targets(): define_static_target = True, ) - runtime.python_library( - name = "model", - srcs = ["model.py"], - visibility = [], # Private - deps = [ - "//caffe2:torch", - ], - ) + # The Python export targets depend on `//caffe2:torch` and + # `//executorch/exir:lib`, neither of which exist as xplat (fbsource) + # targets. Restrict these to fbcode only. + if not is_xplat(): + runtime.python_library( + name = "model", + srcs = ["model.py"], + visibility = [], # Private + deps = [ + "//caffe2:torch", + ], + ) - runtime.python_library( - name = "export_model_lib", - srcs = ["export_model.py"], - visibility = ["//executorch/extension/training/examples/XOR/..."], - deps = [ - ":model", - "//caffe2:torch", - "//executorch/exir:lib", - ], - ) + runtime.python_library( + name = "export_model_lib", + srcs = ["export_model.py"], + visibility = ["//executorch/extension/training/examples/XOR/..."], + deps = [ + ":model", + "//caffe2:torch", + "//executorch/exir:lib", + ], + ) - runtime.python_binary( - name = "export_model", - main_module = "executorch.extension.training.examples.XOR.export_model", - deps = [ - ":export_model_lib", - ], - ) + runtime.python_binary( + name = "export_model", + main_module = "executorch.extension.training.examples.XOR.export_model", + deps = [ + ":export_model_lib", + ], + ) diff --git a/install_executorch.py b/install_executorch.py index d305a06bd28..140a1163020 100644 --- a/install_executorch.py +++ b/install_executorch.py @@ -174,11 +174,7 @@ def _parse_args() -> argparse.Namespace: parser.add_argument( "--use-pt-pinned-commit", action="store_true", - help="install plain `torch` (whatever pip resolves by default; CI " - "uses this when torch is already built from source against the " - "pinned ref in pytorch.txt). 
Without this flag, install the specific " - "pinned version from the channel selected in torch_pin.py " - "(nightly / test / release).", + help="build from the pinned PyTorch commit instead of nightly", ) parser.add_argument( "--editable", @@ -221,14 +217,13 @@ def main(args): return check_and_update_submodules() - # By default install the specific pinned version from the channel selected - # in torch_pin.py. With --use-pt-pinned-commit, install plain `torch` (pip's - # default resolution); CI uses this when torch is already built from source - # against the pinned ref in pytorch.txt. - install_pinned_version = not args.use_pt_pinned_commit + # This option is used in CI to make sure that PyTorch build from the pinned commit + # is used instead of nightly. CI jobs wouldn't be able to catch regression from the + # latest PT commit otherwise + use_pytorch_nightly = not args.use_pt_pinned_commit # Step 1: Install core dependencies first - install_requirements(install_pinned_version) + install_requirements(use_pytorch_nightly) # Step 2: Install core package package_spec = "." @@ -253,7 +248,7 @@ def main(args): # Step 3: Extra (optional) packages that is only useful for running examples. if not args.minimal: - install_optional_example_requirements(install_pinned_version) + install_optional_example_requirements(use_pytorch_nightly) if __name__ == "__main__": diff --git a/install_requirements.py b/install_requirements.py index 1e8ab5c2d6f..b30068cbdb8 100644 --- a/install_requirements.py +++ b/install_requirements.py @@ -12,18 +12,9 @@ from install_utils import determine_torch_url, is_intel_mac_os, python_is_compatible -from torch_pin import ( - CHANNEL, - torch_index_url_base, - torch_spec, - torchaudio_spec, - torchvision_spec, -) - -# Only RC wheels at /whl/test/ get re-uploaded under the same version, so -# pip's local cache can serve stale content. Nightly and release wheels are -# immutable per their identifier. 
-_NO_CACHE_DIR_FLAG = ["--no-cache-dir"] if CHANNEL == "test" else [] +# The pip repository that hosts test (release candidate) torch packages. +# This will be dynamically set based on CUDA availability and CUDA backend enabled/disabled. +TORCH_URL_BASE = "https://download.pytorch.org/whl/test" # Since ExecuTorch often uses main-branch features of pytorch, only the nightly # pip versions will have the required features. @@ -32,18 +23,17 @@ # NIGHTLY_VERSION, you should re-run this script to install the necessary # package versions. # -# NOTE: If you change torch_pin.py, the pre-commit hook runs -# .github/scripts/update_pytorch_pin.py to refresh -# .ci/docker/ci_commit_pins/pytorch.txt and the c10 grafted headers. -# If you bypass the hook, run that script manually. +# NOTE: If you're changing, make the corresponding change in .ci/docker/ci_commit_pins/pytorch.txt +# by picking the hash from the same date in +# https://hud.pytorch.org/hud/pytorch/pytorch/nightly/ @lint-ignore # # NOTE: If you're changing, make the corresponding supported CUDA versions in # SUPPORTED_CUDA_VERSIONS in install_utils.py if needed. -def install_requirements(install_pinned_version): - # No prebuilt wheels are available for Intel macOS, regardless of channel. - if install_pinned_version and is_intel_mac_os(): +def install_requirements(use_pytorch_nightly): + # No prebuilt wheels for Intel macOS; error out with build-from-source guidance. + if use_pytorch_nightly and is_intel_mac_os(): print( "ERROR: Prebuilt PyTorch wheels are no longer available for Intel-based macOS.\n" "Please build from source by following https://docs.pytorch.org/executorch/main/using-executorch-building-from-source.html", @@ -52,26 +42,25 @@ def install_requirements(install_pinned_version): sys.exit(1) # Determine the appropriate PyTorch URL based on CUDA delegate status - torch_url = determine_torch_url(torch_index_url_base()) + torch_url = determine_torch_url(TORCH_URL_BASE) # pip packages needed by exir. 
TORCH_PACKAGE = [ - # Default: install the specific pinned version from the channel selected - # in torch_pin.py. With --use-pt-pinned-commit, pass plain "torch" and - # let pip resolve its default (CI's source-build is already installed). - (torch_spec() if install_pinned_version else "torch"), + # Setting use_pytorch_nightly to false to test the pinned PyTorch commit. Note + # that we don't need to set any version number there because they have already + # been installed on CI before this step, so pip won't reinstall them + ("torch==2.11.0" if use_pytorch_nightly else "torch"), ] # Install the requirements for core ExecuTorch package. - # `--extra-index-url` tells pip to look for package versions on the - # provided URL if they aren't available on the default URL. + # `--extra-index-url` tells pip to look for package + # versions on the provided URL if they aren't available on the default URL. subprocess.run( [ sys.executable, "-m", "pip", "install", - *_NO_CACHE_DIR_FLAG, "-r", "requirements-dev.txt", *TORCH_PACKAGE, @@ -117,14 +106,14 @@ def install_requirements(install_pinned_version): ) -def install_optional_example_requirements(install_pinned_version): +def install_optional_example_requirements(use_pytorch_nightly): # Determine the appropriate PyTorch URL based on CUDA delegate status - torch_url = determine_torch_url(torch_index_url_base()) + torch_url = determine_torch_url(TORCH_URL_BASE) print("Installing torch domain libraries") DOMAIN_LIBRARIES = [ - (torchvision_spec() if install_pinned_version else "torchvision"), - (torchaudio_spec() if install_pinned_version else "torchaudio"), + ("torchvision==0.26.0" if use_pytorch_nightly else "torchvision"), + ("torchaudio==2.11.0" if use_pytorch_nightly else "torchaudio"), ] # Then install domain libraries subprocess.run( @@ -133,7 +122,6 @@ def install_optional_example_requirements(install_pinned_version): "-m", "pip", "install", - *_NO_CACHE_DIR_FLAG, *DOMAIN_LIBRARIES, "--extra-index-url", torch_url, @@ 
-164,11 +152,7 @@ def main(args): parser.add_argument( "--use-pt-pinned-commit", action="store_true", - help="install plain `torch` (whatever pip resolves by default; CI " - "uses this when torch is already built from source against the " - "pinned ref in pytorch.txt). Without this flag, install the specific " - "pinned version from the channel selected in torch_pin.py " - "(nightly / test / release).", + help="build from the pinned PyTorch commit instead of nightly", ) parser.add_argument( "--example", @@ -176,10 +160,10 @@ def main(args): help="Also installs required packages for running example scripts.", ) args = parser.parse_args(args) - install_pinned_version = not bool(args.use_pt_pinned_commit) - install_requirements(install_pinned_version) + use_pytorch_nightly = not bool(args.use_pt_pinned_commit) + install_requirements(use_pytorch_nightly) if args.example: - install_optional_example_requirements(install_pinned_version) + install_optional_example_requirements(use_pytorch_nightly) if __name__ == "__main__": diff --git a/kernels/optimized/cpu/op_grid_sampler_2d.cpp b/kernels/optimized/cpu/op_grid_sampler_2d.cpp index aebfd292bab..7ec45860985 100644 --- a/kernels/optimized/cpu/op_grid_sampler_2d.cpp +++ b/kernels/optimized/cpu/op_grid_sampler_2d.cpp @@ -338,10 +338,11 @@ Tensor& opt_grid_sampler_2d_out( // The NEON paths index input/grid/out directly assuming a contiguous NCHW // default-dim-order layout — no use of .strides() or .dim_order(). Fall // back to portable for anything else. 
- const bool fast_eligible = tensor_is_default_dim_order(input) && - tensor_is_default_dim_order(grid) && tensor_is_default_dim_order(out) && - tensor_is_contiguous(input) && tensor_is_contiguous(grid) && - tensor_is_contiguous(out); + const bool fast_eligible = input.dim() == 4 && grid.dim() == 4 && + grid.size(3) == 2 && input.size(0) == grid.size(0) && + tensor_is_default_dim_order(input) && tensor_is_default_dim_order(grid) && + tensor_is_default_dim_order(out) && tensor_is_contiguous(input) && + tensor_is_contiguous(grid) && tensor_is_contiguous(out); // The fast paths read input/grid and write out as a single dtype: float for // the fp32 NEON path, fp16 for both the fp16 HW path (which raw-casts the diff --git a/kernels/portable/test/op_upsample_bilinear2d_aa_test.py b/kernels/portable/test/op_upsample_bilinear2d_aa_test.py index f86aa35465c..c6e09af3b5c 100644 --- a/kernels/portable/test/op_upsample_bilinear2d_aa_test.py +++ b/kernels/portable/test/op_upsample_bilinear2d_aa_test.py @@ -19,6 +19,20 @@ class UpsampleBilinear2dAATest(unittest.TestCase): + def setUp(self) -> None: + # Save RNG state so we can restore it in tearDown; without this, + # `torch.manual_seed` would leak determinism into other test + # modules that share the same process. + self._torch_rng_state = torch.get_rng_state() + # Pin RNG so torch.randn / torch.randint inputs are deterministic. + # Without this, the parity tests below occasionally see input values + # that produce ATen-vs-ExecuTorch differences just above the + # configured atol, surfacing as flakes on the test-issues dashboard. 
+ torch.manual_seed(0) + + def tearDown(self) -> None: + torch.set_rng_state(self._torch_rng_state) + def run_upsample_aa_test( self, inp: torch.Tensor, @@ -126,7 +140,10 @@ def test_upsample_bilinear2d_aa_aten_parity_u8(self): input_tensor, output_size=(4, 4), align_corners=False, - atol=3.5, # Relaxed tolerance for uint8 due to implementation differences in anti-aliasing + # uint8 quantization: a +/-1 step at the kernel level rounds to a + # full unit in the output, so observed deltas vs. ATen can reach + # ~4 units even though the underlying float disagreement is small. + atol=5, ) def test_upsample_bilinear2d_aa_downsampling(self): @@ -144,7 +161,10 @@ def test_upsample_bilinear2d_aa_aggressive_downsampling(self): input_tensor, output_size=(2, 2), align_corners=False, - atol=0.4, # Relaxed tolerance due to implementation differences in separable vs direct interpolation + # Aggressive 4x downsampling magnifies the separable-vs-direct + # interpolation differences between ExecuTorch and ATen; observed + # max abs error reaches ~0.6 for typical N(0,1) inputs. 
+ atol=1.0, ) def test_upsample_bilinear2d_aa_asymmetric_downsampling(self): diff --git a/kernels/test/op_add_test.cpp b/kernels/test/op_add_test.cpp index 5561ad67b66..60faa4efb47 100644 --- a/kernels/test/op_add_test.cpp +++ b/kernels/test/op_add_test.cpp @@ -816,8 +816,8 @@ TEST_F(OpAddOutKernelTest, DynamicShapeUpperBoundLargerThanExpected) { EXPECT_TENSOR_CLOSE(out, expected_result); } -TEST_F(OpAddOutKernelTest, DynamicShapeUnbound) { - GTEST_SKIP() << "Dynamic shape not supported"; +// DISABLED: Dynamic shape not supported +TEST_F(OpAddOutKernelTest, DISABLED_DynamicShapeUnbound) { TensorFactory tf; Tensor x = tf.make( diff --git a/kernels/test/op_addmm_test.cpp b/kernels/test/op_addmm_test.cpp index a2251784c17..ff02d9c0a79 100644 --- a/kernels/test/op_addmm_test.cpp +++ b/kernels/test/op_addmm_test.cpp @@ -529,8 +529,8 @@ TEST_F(OpAddmmOutTest, DynamicShapeUpperBoundLargerThanExpected) { EXPECT_TENSOR_CLOSE(out, expected_result); } -TEST_F(OpAddmmOutTest, DynamicShapeUnbound) { - GTEST_SKIP() << "Dynamic shape unbound not supported"; +// DISABLED: Dynamic shape unbound not supported +TEST_F(OpAddmmOutTest, DISABLED_DynamicShapeUnbound) { TensorFactory tf; Tensor x = tf.make( diff --git a/kernels/test/op_bitwise_not_test.cpp b/kernels/test/op_bitwise_not_test.cpp index 1b73574f9ff..702486f0d2a 100644 --- a/kernels/test/op_bitwise_not_test.cpp +++ b/kernels/test/op_bitwise_not_test.cpp @@ -155,8 +155,8 @@ TEST_F(OpBitwiseNotOutTest, DynamicShapeUpperBoundLargerThanExpected) { EXPECT_TENSOR_EQ(out, expected); } -TEST_F(OpBitwiseNotOutTest, DynamicShapeUnbound) { - GTEST_SKIP() << "Dynamic shape unbound not supported"; +// DISABLED: Dynamic shape unbound not supported +TEST_F(OpBitwiseNotOutTest, DISABLED_DynamicShapeUnbound) { /* %python out_args = "{1, 1}, torch::executor::TensorShapeDynamism::DYNAMIC_UNBOUND" %rewrite(unary_op) */ diff --git a/kernels/test/op_bmm_test.cpp b/kernels/test/op_bmm_test.cpp index edf2703e393..c870c412035 100644 --- 
a/kernels/test/op_bmm_test.cpp +++ b/kernels/test/op_bmm_test.cpp @@ -407,8 +407,8 @@ TEST_F(OpBmmOutTest, DynamicShapeUpperBoundLargerThanExpected) { EXPECT_TENSOR_CLOSE(out, expected_result); } -TEST_F(OpBmmOutTest, DynamicShapeUnbound) { - GTEST_SKIP() << "Dynamic shape unbound not supported"; +// DISABLED: Dynamic shape unbound not supported +TEST_F(OpBmmOutTest, DISABLED_DynamicShapeUnbound) { TensorFactory tf; auto x = tf.make( diff --git a/kernels/test/op_clamp_test.cpp b/kernels/test/op_clamp_test.cpp index 81138fc8a55..aeb44f1d7ab 100644 --- a/kernels/test/op_clamp_test.cpp +++ b/kernels/test/op_clamp_test.cpp @@ -457,8 +457,8 @@ TEST_F(OpClampOutTest, DynamicShapeUpperBoundSameAsExpected) { EXPECT_TENSOR_CLOSE(out, expected_result); } -TEST_F(OpClampOutTest, DynamicShapeUpperBoundLargerThanExpected) { - GTEST_SKIP() << "Dynamic shape not supported"; +// DISABLED: Dynamic shape not supported +TEST_F(OpClampOutTest, DISABLED_DynamicShapeUpperBoundLargerThanExpected) { TensorFactory tf; auto x = tf.make( @@ -480,8 +480,8 @@ TEST_F(OpClampOutTest, DynamicShapeUpperBoundLargerThanExpected) { EXPECT_TENSOR_CLOSE(out, expected_result); } -TEST_F(OpClampOutTest, DynamicShapeUnbound) { - GTEST_SKIP() << "Dynamic shape not supported"; +// DISABLED: Dynamic shape not supported +TEST_F(OpClampOutTest, DISABLED_DynamicShapeUnbound) { TensorFactory tf; auto x = tf.make( diff --git a/kernels/test/op_clone_test.cpp b/kernels/test/op_clone_test.cpp index 43e4576548a..57a8aed2d6c 100644 --- a/kernels/test/op_clone_test.cpp +++ b/kernels/test/op_clone_test.cpp @@ -209,8 +209,8 @@ TEST_F(OpCloneTest, DynamicShapeUpperBoundLargerThanExpected) { EXPECT_TENSOR_CLOSE(out, expected_result); } -TEST_F(OpCloneTest, DynamicShapeUnbound) { - GTEST_SKIP() << "Dynamic shape unbound not supported"; +// DISABLED: Dynamic shape unbound not supported +TEST_F(OpCloneTest, DISABLED_DynamicShapeUnbound) { TensorFactory tf; Tensor x = tf.make( diff --git a/kernels/test/op_cumsum_test.cpp 
b/kernels/test/op_cumsum_test.cpp index 3e0ec164d04..720f7bd98e9 100644 --- a/kernels/test/op_cumsum_test.cpp +++ b/kernels/test/op_cumsum_test.cpp @@ -260,8 +260,8 @@ TEST_F(OpCumSumOutTest, DynamicShapeUpperBoundLargerThanExpected) { EXPECT_TENSOR_CLOSE(out, expected_result); } -TEST_F(OpCumSumOutTest, DynamicShapeUnbound) { - GTEST_SKIP() << "Dynamic shape unbound not supported"; +// DISABLED: Dynamic shape unbound not supported +TEST_F(OpCumSumOutTest, DISABLED_DynamicShapeUnbound) { TensorFactory tf; Tensor x = tf.make( diff --git a/kernels/test/op_detach_copy_test.cpp b/kernels/test/op_detach_copy_test.cpp index d5c558afd9d..fba497c75ab 100644 --- a/kernels/test/op_detach_copy_test.cpp +++ b/kernels/test/op_detach_copy_test.cpp @@ -190,8 +190,8 @@ TEST_F(OpDetachCopyOutTest, DynamicShapeUpperBoundLargerThanExpected) { EXPECT_TENSOR_CLOSE(out, expected_result); } -TEST_F(OpDetachCopyOutTest, DynamicShapeUnbound) { - GTEST_SKIP() << "Dynamic shape unbound not supported"; +// DISABLED: Dynamic shape unbound not supported +TEST_F(OpDetachCopyOutTest, DISABLED_DynamicShapeUnbound) { TensorFactory tf; Tensor x = tf.make( diff --git a/kernels/test/op_div_test.cpp b/kernels/test/op_div_test.cpp index 94f26d1b301..84d33fa2757 100644 --- a/kernels/test/op_div_test.cpp +++ b/kernels/test/op_div_test.cpp @@ -526,8 +526,8 @@ TEST_F(OpDivOutTest, BroadcastNDTest) { test_broadcast_3D(); } -TEST_F(OpDivOutTest, DynamicShapeUnbound) { - GTEST_SKIP() << "Dynamic shape not supported"; +// DISABLED: Dynamic shape not supported +TEST_F(OpDivOutTest, DISABLED_DynamicShapeUnbound) { TensorFactory tf; Tensor x = tf.make( diff --git a/kernels/test/op_floor_divide_test.cpp b/kernels/test/op_floor_divide_test.cpp index 8be1168eee1..166f7fdd4f9 100644 --- a/kernels/test/op_floor_divide_test.cpp +++ b/kernels/test/op_floor_divide_test.cpp @@ -175,8 +175,8 @@ TEST_F(OpFloorDivideTest, MismatchedOutputShapesDies) { ET_EXPECT_KERNEL_FAILURE(context_, op_floor_divide_out(a, b, out)); } 
-TEST_F(OpFloorDivideTest, BroadcastDimSizeIsOneAB) { - GTEST_SKIP() << "Dynamic shape not supported"; +// DISABLED: Dynamic shape not supported +TEST_F(OpFloorDivideTest, DISABLED_BroadcastDimSizeIsOneAB) { TensorFactory tf; Tensor x = tf.make( @@ -195,8 +195,8 @@ TEST_F(OpFloorDivideTest, BroadcastDimSizeIsOneAB) { EXPECT_TENSOR_CLOSE(out, expected_result); } -TEST_F(OpFloorDivideTest, BroadcastDimSizeMissingAB) { - GTEST_SKIP() << "Dynamic shape not supported"; +// DISABLED: Dynamic shape not supported +TEST_F(OpFloorDivideTest, DISABLED_BroadcastDimSizeMissingAB) { TensorFactory tf; Tensor x = tf.make( @@ -215,8 +215,8 @@ TEST_F(OpFloorDivideTest, BroadcastDimSizeMissingAB) { EXPECT_TENSOR_CLOSE(out, expected_result); } -TEST_F(OpFloorDivideTest, BroadcastDimSizeIsOneBA) { - GTEST_SKIP() << "Dynamic shape not supported"; +// DISABLED: Dynamic shape not supported +TEST_F(OpFloorDivideTest, DISABLED_BroadcastDimSizeIsOneBA) { TensorFactory tf; Tensor x = tf.make({1, 2}, {0.522396445274353, 0.6753279566764832}); @@ -235,8 +235,8 @@ TEST_F(OpFloorDivideTest, BroadcastDimSizeIsOneBA) { EXPECT_TENSOR_CLOSE(out, expected_result); } -TEST_F(OpFloorDivideTest, BroadcastDimSizeMissingBA) { - GTEST_SKIP() << "Dynamic shape not supported"; +// DISABLED: Dynamic shape not supported +TEST_F(OpFloorDivideTest, DISABLED_BroadcastDimSizeMissingBA) { TensorFactory tf; Tensor x = tf.make({1, 2}, {0.522396445274353, 0.6753279566764832}); @@ -255,8 +255,8 @@ TEST_F(OpFloorDivideTest, BroadcastDimSizeMissingBA) { EXPECT_TENSOR_CLOSE(out, expected_result); } -TEST_F(OpFloorDivideTest, DynamicShapeUpperBoundSameAsExpected) { - GTEST_SKIP() << "Dynamic shape not supported"; +// DISABLED: Dynamic shape not supported +TEST_F(OpFloorDivideTest, DISABLED_DynamicShapeUpperBoundSameAsExpected) { TensorFactory tf; Tensor x = tf.make( @@ -283,8 +283,8 @@ TEST_F(OpFloorDivideTest, DynamicShapeUpperBoundSameAsExpected) { EXPECT_TENSOR_CLOSE(out, expected_result); } -TEST_F(OpFloorDivideTest, 
DynamicShapeUpperBoundLargerThanExpected) { - GTEST_SKIP() << "Dynamic shape not supported"; +// DISABLED: Dynamic shape not supported +TEST_F(OpFloorDivideTest, DISABLED_DynamicShapeUpperBoundLargerThanExpected) { TensorFactory tf; Tensor x = tf.make( @@ -311,8 +311,8 @@ TEST_F(OpFloorDivideTest, DynamicShapeUpperBoundLargerThanExpected) { EXPECT_TENSOR_CLOSE(out, expected_result); } -TEST_F(OpFloorDivideTest, DynamicShapeUnbound) { - GTEST_SKIP() << "Dynamic shape not supported"; +// DISABLED: Dynamic shape not supported +TEST_F(OpFloorDivideTest, DISABLED_DynamicShapeUnbound) { TensorFactory tf; Tensor x = tf.make( diff --git a/kernels/test/op_full_like_test.cpp b/kernels/test/op_full_like_test.cpp index 6e7692f5347..23ac4e685f9 100644 --- a/kernels/test/op_full_like_test.cpp +++ b/kernels/test/op_full_like_test.cpp @@ -181,8 +181,8 @@ TEST_F(OpFullLikeTest, DynamicShapeUpperBoundLargerThanExpected) { EXPECT_TENSOR_CLOSE(out, expected_result); } -TEST_F(OpFullLikeTest, DynamicShapeUnbound) { - GTEST_SKIP() << "Dynamic shape unbound not supported"; +// DISABLED: Dynamic shape unbound not supported +TEST_F(OpFullLikeTest, DISABLED_DynamicShapeUnbound) { TensorFactory tf; Tensor x = tf.make( diff --git a/kernels/test/op_gelu_test.cpp b/kernels/test/op_gelu_test.cpp index 8fae399fb18..9303b034ca2 100644 --- a/kernels/test/op_gelu_test.cpp +++ b/kernels/test/op_gelu_test.cpp @@ -213,8 +213,8 @@ TEST_F(OpGeluTest, DynamicShapeUpperBoundSameAsExpected) { EXPECT_TENSOR_CLOSE(out, expected_result); } -TEST_F(OpGeluTest, DynamicShapeUpperBoundLargerThanExpected) { - GTEST_SKIP() << "Dynamic shape not supported"; +// DISABLED: Dynamic shape not supported +TEST_F(OpGeluTest, DISABLED_DynamicShapeUpperBoundLargerThanExpected) { TensorFactory tf; Tensor x = tf.make( @@ -240,8 +240,8 @@ TEST_F(OpGeluTest, DynamicShapeUpperBoundLargerThanExpected) { EXPECT_TENSOR_CLOSE(out, expected_result); } -TEST_F(OpGeluTest, DynamicShapeUnbound) { - GTEST_SKIP() << "Dynamic shape not 
supported"; +// DISABLED: Dynamic shape not supported +TEST_F(OpGeluTest, DISABLED_DynamicShapeUnbound) { TensorFactory tf; Tensor x = tf.make( diff --git a/kernels/test/op_glu_test.cpp b/kernels/test/op_glu_test.cpp index ac931302f98..9bee3a6a5a2 100644 --- a/kernels/test/op_glu_test.cpp +++ b/kernels/test/op_glu_test.cpp @@ -200,8 +200,8 @@ TEST_F(OpGluOutTest, AllNonFloatOutputDTypeDies) { #undef TEST_ENTRY } -TEST_F(OpGluOutTest, DynamicShapeUpperBoundSameAsExpected) { - GTEST_SKIP() << "Dynamic shape not supported"; +// DISABLED: Dynamic shape not supported +TEST_F(OpGluOutTest, DISABLED_DynamicShapeUpperBoundSameAsExpected) { TensorFactory tf; Tensor x = tf.make( @@ -253,8 +253,8 @@ TEST_F(OpGluOutTest, DynamicShapeUpperBoundLargerThanExpected) { EXPECT_TENSOR_CLOSE(out, expected_result); } -TEST_F(OpGluOutTest, DynamicShapeUnbound) { - GTEST_SKIP() << "Dynamic shape unbound not supported"; +// DISABLED: Dynamic shape unbound not supported +TEST_F(OpGluOutTest, DISABLED_DynamicShapeUnbound) { TensorFactory tf; Tensor x = tf.make( diff --git a/kernels/test/op_linear_test.cpp b/kernels/test/op_linear_test.cpp index 0ad5790a550..9b0ba782271 100644 --- a/kernels/test/op_linear_test.cpp +++ b/kernels/test/op_linear_test.cpp @@ -338,8 +338,8 @@ TEST_F(OpLinearOutTest, DynamicShapeUpperBoundLargerThanExpected) { EXPECT_TENSOR_CLOSE(out, expected_result); } -TEST_F(OpLinearOutTest, DynamicShapeUnbound) { - GTEST_SKIP() << "Dynamic shape not supported"; +// DISABLED: Dynamic shape not supported +TEST_F(OpLinearOutTest, DISABLED_DynamicShapeUnbound) { TensorFactory tf; Tensor x = tf.make( diff --git a/kernels/test/op_log_softmax_test.cpp b/kernels/test/op_log_softmax_test.cpp index 3bcbee96a1c..84255b8a29c 100644 --- a/kernels/test/op_log_softmax_test.cpp +++ b/kernels/test/op_log_softmax_test.cpp @@ -421,8 +421,8 @@ TEST_F(OpLogSoftmaxOutTest, DynamicShapeUpperBoundLargerThanExpected) { EXPECT_TENSOR_CLOSE(out, expected_result); } -TEST_F(OpLogSoftmaxOutTest, 
DynamicShapeUnbound) { - GTEST_SKIP() << "Dynamic shape not supported"; +// DISABLED: Dynamic shape not supported +TEST_F(OpLogSoftmaxOutTest, DISABLED_DynamicShapeUnbound) { TensorFactory tf; Tensor x = tf.make( diff --git a/kernels/test/op_logit_test.cpp b/kernels/test/op_logit_test.cpp index 1bb0a43a37d..0056e984bb7 100644 --- a/kernels/test/op_logit_test.cpp +++ b/kernels/test/op_logit_test.cpp @@ -259,8 +259,8 @@ TEST_F(OpLogitOutTest, DynamicShapeUpperBoundLargerThanExpected) { EXPECT_TENSOR_CLOSE(out, expected_result); } -TEST_F(OpLogitOutTest, DynamicShapeUnbound) { - GTEST_SKIP() << "Dynamic shape unbound not supported"; +// DISABLED: Dynamic shape unbound not supported +TEST_F(OpLogitOutTest, DISABLED_DynamicShapeUnbound) { TensorFactory tf; Tensor x = tf.make( diff --git a/kernels/test/op_masked_fill_test.cpp b/kernels/test/op_masked_fill_test.cpp index 41962ba5ed8..b36b54c2b81 100644 --- a/kernels/test/op_masked_fill_test.cpp +++ b/kernels/test/op_masked_fill_test.cpp @@ -377,8 +377,8 @@ TEST_F(OpMaskedFillTest, DynamicShapeUpperBoundLargerThanExpected) { EXPECT_TENSOR_CLOSE(out, expected_result); } -TEST_F(OpMaskedFillTest, DynamicShapeUnbound) { - GTEST_SKIP() << "Dynamic shape unbound not supported"; +// DISABLED: Dynamic shape unbound not supported +TEST_F(OpMaskedFillTest, DISABLED_DynamicShapeUnbound) { TensorFactory tf; TensorFactory bool_tf; diff --git a/kernels/test/op_mean_test.cpp b/kernels/test/op_mean_test.cpp index 65d21b45518..23f4b675d68 100644 --- a/kernels/test/op_mean_test.cpp +++ b/kernels/test/op_mean_test.cpp @@ -465,8 +465,8 @@ TEST_F(OpMeanOutTest, DynamicShapeUpperBoundLargerThanExpected) { EXPECT_TENSOR_CLOSE(out, expected_result); } -TEST_F(OpMeanOutTest, DynamicShapeUnbound) { - GTEST_SKIP() << "Dynamic shape unbound not supported"; +// DISABLED: Dynamic shape unbound not supported +TEST_F(OpMeanOutTest, DISABLED_DynamicShapeUnbound) { TensorFactory tf; Tensor x = tf.make( diff --git a/kernels/test/op_mm_test.cpp 
b/kernels/test/op_mm_test.cpp index 63d06143b5d..62d5ed29e26 100644 --- a/kernels/test/op_mm_test.cpp +++ b/kernels/test/op_mm_test.cpp @@ -255,8 +255,8 @@ TEST_F(OpMmOutTest, DynamicShapeUpperBoundLargerThanExpected) { EXPECT_TENSOR_CLOSE(out, expected_result); } -TEST_F(OpMmOutTest, DynamicShapeUnbound) { - GTEST_SKIP() << "Dynamic shape not supported"; +// DISABLED: Dynamic shape not supported +TEST_F(OpMmOutTest, DISABLED_DynamicShapeUnbound) { TensorFactory tf; Tensor x = tf.make( diff --git a/kernels/test/op_mul_test.cpp b/kernels/test/op_mul_test.cpp index 28baa0cbd16..4553f8a53b6 100644 --- a/kernels/test/op_mul_test.cpp +++ b/kernels/test/op_mul_test.cpp @@ -711,8 +711,8 @@ TEST_F(OpMulOutTest, DynamicShapeUpperBoundLargerThanExpected) { EXPECT_TENSOR_CLOSE(out, expected_result); } -TEST_F(OpMulOutTest, DynamicShapeUnbound) { - GTEST_SKIP() << "Dynamic shape not supported"; +// DISABLED: Dynamic shape not supported +TEST_F(OpMulOutTest, DISABLED_DynamicShapeUnbound) { TensorFactory tf; Tensor x = tf.make( diff --git a/kernels/test/op_relu_test.cpp b/kernels/test/op_relu_test.cpp index 7d3cfc696b2..128c4388615 100644 --- a/kernels/test/op_relu_test.cpp +++ b/kernels/test/op_relu_test.cpp @@ -288,8 +288,8 @@ TEST_F(OpReluTest, DynamicShapeUpperBoundLargerThanExpected) { EXPECT_TENSOR_CLOSE(out, expected_result); } -TEST_F(OpReluTest, DynamicShapeUnbound) { - GTEST_SKIP() << "Unbound dynamic shape not supported"; +// DISABLED: Unbound dynamic shape not supported +TEST_F(OpReluTest, DISABLED_DynamicShapeUnbound) { TensorFactory tf; Tensor x = tf.make( diff --git a/kernels/test/op_round_test.cpp b/kernels/test/op_round_test.cpp index e05f3a68d40..cbf9b6515d9 100644 --- a/kernels/test/op_round_test.cpp +++ b/kernels/test/op_round_test.cpp @@ -230,8 +230,8 @@ TEST_F(OpRoundTest, DynamicShapeUpperBoundLargerThanExpected) { EXPECT_TENSOR_EQ(out, expected); } -TEST_F(OpRoundTest, DynamicShapeUnbound) { - GTEST_SKIP() << "Dynamic shape unbound not supported"; +// 
DISABLED: Dynamic shape unbound not supported +TEST_F(OpRoundTest, DISABLED_DynamicShapeUnbound) { /* %python out_args = "{1, 1}, torch::executor::TensorShapeDynamism::DYNAMIC_UNBOUND" %rewrite(unary_op) */ diff --git a/kernels/test/op_softmax_test.cpp b/kernels/test/op_softmax_test.cpp index a5d26d0a4f9..3c61acb7d29 100644 --- a/kernels/test/op_softmax_test.cpp +++ b/kernels/test/op_softmax_test.cpp @@ -302,8 +302,8 @@ TEST_F(OpSoftmaxOutTest, DynamicShapeUpperBoundLargerThanExpected) { EXPECT_TENSOR_CLOSE(out, expected_result); } -TEST_F(OpSoftmaxOutTest, DynamicShapeUnbound) { - GTEST_SKIP() << "Dynamic shape unbound not supported"; +// DISABLED: Dynamic shape unbound not supported +TEST_F(OpSoftmaxOutTest, DISABLED_DynamicShapeUnbound) { TensorFactory tf; Tensor x = tf.make( diff --git a/kernels/test/op_split_copy_test.cpp b/kernels/test/op_split_copy_test.cpp index 76b29fa30bb..2dd112b1ace 100644 --- a/kernels/test/op_split_copy_test.cpp +++ b/kernels/test/op_split_copy_test.cpp @@ -563,14 +563,16 @@ TEST_F(OpSplitCopyTensorOutTest, DynamicShapeUpperBoundSameAsExpected) { {2, 3}, torch::executor::TensorShapeDynamism::DYNAMIC_BOUND); } -TEST_F(OpSplitCopyTensorOutTest, DynamicShapeUpperBoundLargerThanExpected) { - GTEST_SKIP() << "Dynamic shape not supported"; +// DISABLED: Dynamic shape not supported +TEST_F( + OpSplitCopyTensorOutTest, + DISABLED_DynamicShapeUpperBoundLargerThanExpected) { test_dynamic_shape( {10, 10}, torch::executor::TensorShapeDynamism::DYNAMIC_BOUND); } -TEST_F(OpSplitCopyTensorOutTest, DynamicShapeUnbound) { - GTEST_SKIP() << "Dynamic shape not supported"; +// DISABLED: Dynamic shape not supported +TEST_F(OpSplitCopyTensorOutTest, DISABLED_DynamicShapeUnbound) { test_dynamic_shape( {1, 1}, torch::executor::TensorShapeDynamism::DYNAMIC_UNBOUND); } diff --git a/kernels/test/op_sub_test.cpp b/kernels/test/op_sub_test.cpp index c8e7c69c443..41ebc2f2733 100644 --- a/kernels/test/op_sub_test.cpp +++ b/kernels/test/op_sub_test.cpp @@ -637,8 
+637,8 @@ TEST_F(OpSubOutTest, DynamicShapeUpperBoundLargerThanExpected) { EXPECT_TENSOR_CLOSE(out, expected_result); } -TEST_F(OpSubOutTest, DynamicShapeUnbound) { - GTEST_SKIP() << "Dynamic shape not supported"; +// DISABLED: Dynamic shape not supported +TEST_F(OpSubOutTest, DISABLED_DynamicShapeUnbound) { TensorFactory tf; Tensor x = tf.make( diff --git a/kernels/test/op_unbind_copy_test.cpp b/kernels/test/op_unbind_copy_test.cpp index 70825537490..c98edc5e1f7 100644 --- a/kernels/test/op_unbind_copy_test.cpp +++ b/kernels/test/op_unbind_copy_test.cpp @@ -363,14 +363,16 @@ TEST_F(OpUnbindCopyIntOutTest, DynamicShapeUpperBoundSameAsExpected) { {2, 4}, torch::executor::TensorShapeDynamism::DYNAMIC_BOUND); } -TEST_F(OpUnbindCopyIntOutTest, DynamicShapeUpperBoundLargerThanExpected) { - GTEST_SKIP() << "Dynamic shape not supported"; +// DISABLED: Dynamic shape not supported +TEST_F( + OpUnbindCopyIntOutTest, + DISABLED_DynamicShapeUpperBoundLargerThanExpected) { test_dynamic_shape( {10, 10}, torch::executor::TensorShapeDynamism::DYNAMIC_BOUND); } -TEST_F(OpUnbindCopyIntOutTest, DynamicShapeUnbound) { - GTEST_SKIP() << "Dynamic shape not supported"; +// DISABLED: Dynamic shape not supported +TEST_F(OpUnbindCopyIntOutTest, DISABLED_DynamicShapeUnbound) { test_dynamic_shape( {1, 1}, torch::executor::TensorShapeDynamism::DYNAMIC_UNBOUND); } diff --git a/kernels/test/op_var_mean_test.cpp b/kernels/test/op_var_mean_test.cpp index 7049c21d65b..05a0281a090 100644 --- a/kernels/test/op_var_mean_test.cpp +++ b/kernels/test/op_var_mean_test.cpp @@ -635,8 +635,8 @@ TEST_F(OpVarMeanCorrectionOutTest, DynamicShapeUpperBoundLargerThanExpected) { EXPECT_TENSOR_CLOSE(mean_out, expected_mean); } -TEST_F(OpVarMeanCorrectionOutTest, DynamicShapeUnbound) { - GTEST_SKIP() << "Dynamic shape unbound not supported"; +// DISABLED: Dynamic shape unbound not supported +TEST_F(OpVarMeanCorrectionOutTest, DISABLED_DynamicShapeUnbound) { TensorFactory tf; Tensor x = tf.make({3, 2}, {0.49, 0.40, 
0.56, 0.38, 0.49, 0.56}); diff --git a/kernels/test/op_var_test.cpp b/kernels/test/op_var_test.cpp index bfa73bfe15c..63e7e94f982 100644 --- a/kernels/test/op_var_test.cpp +++ b/kernels/test/op_var_test.cpp @@ -449,8 +449,8 @@ TEST_F(OpVarOutTest, DynamicShapeUpperBoundLargerThanExpected) { EXPECT_TENSOR_CLOSE(out, expected_result); } -TEST_F(OpVarOutTest, DynamicShapeUnbound) { - GTEST_SKIP() << "Dynamic shape unbound not supported"; +// DISABLED: Dynamic shape unbound not supported +TEST_F(OpVarOutTest, DISABLED_DynamicShapeUnbound) { TensorFactory tf; Tensor x = tf.make({3, 2}, {0.49, 0.40, 0.56, 0.38, 0.49, 0.56}); diff --git a/profiler/BUCK b/profiler/BUCK index 1f6bc6cd9e6..7f7a7f42ab4 100644 --- a/profiler/BUCK +++ b/profiler/BUCK @@ -20,7 +20,9 @@ fbcode_target(_kind = runtime.python_library, ], base_module = "executorch.profiler", visibility = ["PUBLIC"], - external_deps = ["prettytable"], + deps = [ + "fbsource//third-party/pypi/prettytable:prettytable", + ], ) fbcode_target(_kind = runtime.python_library, diff --git a/scripts/run_android_emulator.sh b/scripts/run_android_emulator.sh index 29c2425cd0e..041c2f17b94 100755 --- a/scripts/run_android_emulator.sh +++ b/scripts/run_android_emulator.sh @@ -10,21 +10,43 @@ set -ex # This script is originally adopted from https://github.com/pytorch/pytorch/blob/main/android/run_tests.sh ADB_PATH=$ANDROID_HOME/platform-tools/adb +adb_shell_with_retries() { + local attempts="$1" + shift + + for ((i = 1; i <= attempts; i++)); do + if "$ADB_PATH" shell "$@"; then + return 0 + fi + sleep 5 + "$ADB_PATH" wait-for-device + done + + return 1 +} + echo "Waiting for emulator boot to complete" # shellcheck disable=SC2016 $ADB_PATH wait-for-device shell 'while [[ -z $(getprop sys.boot_completed) ]]; do sleep 5; done;' +$ADB_PATH wait-for-device + +echo "Unlock emulator and disable animations" +adb_shell_with_retries 5 input keyevent 82 || true +adb_shell_with_retries 5 settings put global window_animation_scale 0.0 || true 
+adb_shell_with_retries 5 settings put global transition_animation_scale 0.0 || true +adb_shell_with_retries 5 settings put global animator_duration_scale 0.0 || true # The device will be created by ReactiveCircus/android-emulator-runner GHA echo "List all running emulators" $ADB_PATH devices -adb uninstall org.pytorch.executorch.test || true -adb install -t android-test-debug-androidTest.apk +"$ADB_PATH" uninstall org.pytorch.executorch.test || true +"$ADB_PATH" install -t android-test-debug-androidTest.apk -adb logcat -c -adb shell am instrument -w -r \ +"$ADB_PATH" logcat -c +"$ADB_PATH" shell am instrument -w -r \ org.pytorch.executorch.test/androidx.test.runner.AndroidJUnitRunner >result.txt 2>&1 -adb logcat -d > logcat.txt +"$ADB_PATH" logcat -d > logcat.txt cat logcat.txt grep -q FAILURES result.txt && cat result.txt grep -q FAILURES result.txt && exit -1 diff --git a/torch_pin.py b/torch_pin.py index 856a67c1990..3575d9a376d 100644 --- a/torch_pin.py +++ b/torch_pin.py @@ -1,59 +1,2 @@ -# CHANNEL selects the wheel source for torch and its domain libraries. -# "nightly" — dev builds from /whl/nightly. NIGHTLY_VERSION is appended to -# every package spec, and CI source-builds pytorch from the -# pinned SHA in pytorch.txt to catch upstream regressions. -# "test" — release candidates from /whl/test. -# "release" — stable releases from /whl. -# For "test" and "release", NIGHTLY_VERSION is ignored and CI installs the -# published wheels directly (no source build). -# -# Example — pinning to a 2.12 release candidate when nightly is broken: -# 1. Set CHANNEL = "test". -# 2. Set the four version constants to the RC's major.minor.patch -# (look up matching versions on https://download.pytorch.org/whl/test/). -# 3. Re-run install_requirements.sh; commit. The pre-commit hook calls -# .github/scripts/update_pytorch_pin.py, which writes torch_branch() -# (e.g. "release/2.12") into .ci/docker/ci_commit_pins/pytorch.txt and -# re-syncs grafted c10 headers. 
-CHANNEL = "test" - TORCH_VERSION = "2.11.0" -TORCHAUDIO_VERSION = "2.11.0" -TORCHCODEC_VERSION = "0.11.0" -TORCHVISION_VERSION = "0.26.0" - -NIGHTLY_VERSION = "dev20260318" - - -def _spec(name: str, version: str) -> str: - if CHANNEL == "nightly": - return f"{name}=={version}.{NIGHTLY_VERSION}" - return f"{name}=={version}" - - -def torch_spec() -> str: - return _spec("torch", TORCH_VERSION) - - -def torchaudio_spec() -> str: - return _spec("torchaudio", TORCHAUDIO_VERSION) - - -def torchcodec_spec() -> str: - return _spec("torchcodec", TORCHCODEC_VERSION) - - -def torchvision_spec() -> str: - return _spec("torchvision", TORCHVISION_VERSION) - - -def torch_index_url_base() -> str: - if CHANNEL == "release": - return "https://download.pytorch.org/whl" - return f"https://download.pytorch.org/whl/{CHANNEL}" - - -def torch_branch() -> str: - # PyTorch uses "release/M.N" branches; derive from the pinned version. - # Used by update_pytorch_pin.py to write into pytorch.txt for test/release. - return f"release/{TORCH_VERSION.rsplit('.', 1)[0]}" +# NIGHTLY_VERSION = "dev20260318" Temporarily pinning to stable release candidate. Revert https://github.com/pytorch/executorch/pull/18287